In [61]:
import pymupdf4llm
import sys, pathlib, pymupdf
import re
import pandas as pd

In [62]:
# Remove column width limit
pd.set_option('display.max_colwidth', None)

# Optional: show all columns (if you have many)
pd.set_option('display.max_columns', None)

# Optional: show all rows
pd.set_option('display.max_rows', None)

In [63]:
eu_finance_fname = "200309-sustainable-finance-teg-final-report-taxonomy-annexes_en.pdf"

## Load pdf-data

In [64]:

with pymupdf.open(f"data/{eu_finance_fname}") as doc:  # open document
    text = chr(12).join([page.get_text() for page in doc])

In [65]:
# write as a binary file to support non-ASCII characters
# pathlib.Path(f"data/{eu_finance_fname}.txt").write_bytes(text.encode())

## Split into Paragraphs

In [66]:
#use four space as paragraph delimiter to convert the text into list of paragraphs.
paragraphs = re.split('\s{4,}', text)
len(paragraphs)

2148

### Clean data

In [67]:

def clean_text(text: str):
    # Remove all non-ASCII characters
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove two-digit hex patterns
    # cleaned = re.sub(r'\b[0-9A-Fa-f]{2}\b', '', cleaned)

    # Keep only letters, numbers, and basic punctuation
    cleaned = re.sub(r'[^\w\s.,!?\'\"()-]', '', cleaned)

    # \t = tab
    # \n newline
    # \u2022 = • (Unicode bullet)
    # - and * are common bullet symbols
    cleaned = re.sub(r'[\t\n\u2022\-*•]', '', cleaned)
    return cleaned


In [68]:
cleaned_paragraphs = [clean_text(p) for p in paragraphs]
len(cleaned_paragraphs)
cleaned_paragraphs2 = [p for p in cleaned_paragraphs if len(p) > 30]
len(cleaned_paragraphs2)

1631

### Pandas dataframe

In [69]:
# Convert to DataFrame
df = pd.DataFrame(cleaned_paragraphs2, columns=["paragraph"])

# Show the result
df.head(0)

Unnamed: 0,paragraph


In [70]:
df.head()

Unnamed: 0,paragraph
0,Updated methodology Updated Technical Screening Criteria March 2020
1,"2 About this report This document includes an updated Part B Methodology from the June 2019 report and an updated Part F Full list of technical screening criteria. The other original sections from the June 2019 report can be found as labelled in the June 2019 report. PART A Explanation of the Taxonomy approach. This section sets out the role and importance of sustainable finance in Europe from a policy and investment perspective, the rationale for the development of an EU Taxonomy, the daft regulation and the mandate of the TEG. PART B Methodology. This explains the methodologies for developing technical screening criteria for climate change mitigation objectives, adaptation objectives and do no significant harm to other environmental objectives in the legislative proposal. This has been updated since 2019. PART C Taxonomy user and use case analysis. This section provides practical guidance to potential users of the Taxonomy, including case studies. PART D Economic impacts of the Taxonomy. This section provides the TEGs analysis of the likely economic impacts of establishing an EU Taxonomy. PART E Next steps for the Taxonomy. This section elaborates on unresolved issues and potential ways forward for the Taxonomy and the technical work of the Platform on Sustainable Finance. PART F Full list of technical screening criteria. This annex sets out the sector and economic activityspecific technical screening criteria and rationale for the TEGs analysis. These have been updated since 2019."
2,"Disclaimer This report represents the overall view of the members of the Technical Expert Group, and although it represents such a consensus, it may not necessarily, on all details, represent the individual views of member institutions or experts. The views reflected in this Report are the views of the experts only. This report does not reflect the views of the European Commission or its services."
3,"3 Contents Methodology statements ....................................................................10 1. Substantial contribution to Climate change mitigation .......................................................... 10 1.1 Work process conceptual approach ............................................................................ 10 1.2 Defining substantial contribution to climate change mitigation ....................................... 14 1.3 Eligibility of finance for activities contributing substantially to mitigation ........................ 16 1.4 Further development ....................................................................................................... 16 2. Substantial contribution to Climate change adaptation ......................................................... 18 2.1 Work process conceptual approach ............................................................................ 18 2.2 Defining substantial contribution to climate change adaptation ..................................... 20 2.3 Screening criteria for activities making a substantial contribution to adaptation ............ 25 2.4 Eligibility of finance for activities contributing substantially to adaptation ....................... 27 2.5 Classification of climaterelated hazards ........................................................................ 28 3. Do no significant harm (DNSH) ............................................................................................. 29 3.1 DNSH to environmental objectives 36 ........................................................................... 29 3.2 DNSH to climate change adaptation ............................................................................... 29 3.3 DNSH to environmental objectives 36 ........................................................................... 31 Eligible NACE Codes Points of Note .................................................36 3.4 Use of NACE ................................................................................................................... 36 3.5 Assessing the Taxonomy criteria for Green Debt and Loans ......................................... 37 Technical screening criteria substantial contribution to climate change mitigation ...............................................................................39 1. Forestry ................................................................................................................................. 40 1.1 Afforestation .................................................................................................................... 52 1.2 Rehabilitation, Restoration .............................................................................................. 60 1.3 Reforestation ................................................................................................................... 68 1.4 Existing forest management ........................................................................................... 76 1.5 Conservation forest ......................................................................................................... 85 2. Agriculture ........................................................................................................................... 102"
4,"4 2.1 Growing of perennial crops ........................................................................................... 112 2.3 Growing of nonperennial crops .................................................................................... 126 2.4 Livestock production ..................................................................................................... 140 3. Manufacturing...................................................................................................................... 155 3.1 Manufacture of Low carbon technologies ..................................................................... 162 3.2 Manufacture of Cement ................................................................................................ 167 3.3 Manufacture of Aluminium ............................................................................................ 172 3.4 Manufacture of Iron and Steel ...................................................................................... 176 3.5 Manufacture of Hydrogen ............................................................................................. 180 3.6 Manufacture of other inorganic basic chemicals .......................................................... 183 3.7 Manufacture of other organic basic chemicals ............................................................. 189 3.8 Manufacture of fertilizers and nitrogen compounds ...................................................... 196 3.9 Manufacture of plastics in primary form ........................................................................ 200 4. Electricity, gas, steam and air conditioning supply ............................................................. 205 4.1 Production of Electricity from Solar PV ......................................................................... 212 4.2 Production of Electricity from Concentrated Solar Power ............................................. 215 4.3 Production of Electricity from Wind Power.................................................................... 218 4.4 Production of Electricity from Ocean Energy ................................................................ 221 4.5 Production of Electricity from Hydropower.................................................................... 224 4.6 Production of Electricity from Geothermal .................................................................... 228 4.7 Production of Electricity from Gas (not exclusive to natural gas) ................................. 231 4.8 Production of Electricity from Bioenergy (Biomass, Biogas and Biofuels) ................... 234 4.9 Transmission and Distribution of Electricity .................................................................. 238 4.10 Storage of Electricity ..................................................................................................... 243 4.11 Storage of Thermal Energy ........................................................................................... 245 4.12 Storage of Hydrogen ..................................................................................................... 247 4.13 Manufacture of Biomass, Biogas or Biofuels ................................................................ 249 4.14 Retrofit of Gas Transmission and Distribution Networks .............................................. 252 4.15 District HeatingCooling Distribution ............................................................................. 255 4.16 Installation and operation of Electric Heat Pumps ........................................................ 258 4.17 Cogeneration of HeatCool and Power from Concentrated Solar Power ..................... 260 4.18 Cogeneration of HeatCool and Power from Geothermal Energy ................................ 263 4.19 Cogeneration of HeatCool and Power from Gas (not exclusive to natural gas) .......... 266"


In [71]:
df.to_csv("data/paragraphs.csv", index=False)