# Download Sustainability Reports

In [None]:
# .venv\Scripts\Activate # to activate the virtual environment
!pip freeze > requirements.txt

In [1]:
import pandas as pd
import requests

## Reports for 2024
Source: Sustainability Reporting Navigator (they crowd-source list of CSRD-compliant reports for fiscal years starting on 01/01/2024)

Download pdf with a list of all reports under https://www.sustainabilityreportingnavigator.com/#/csrdreports 

In [8]:
# Open the csv data file
reports_24 = pd.read_csv('esg_reports_2024.csv')
print(len(reports_24))

277


## Reports for 2010 until 2023
Source: Donau, Charlotte-Louise, Fikir Worku Edossa, Joachim Gassen, Gaia Melloni, Inga Meringdal, Bianca Minuth, Arianna Piscella, Paul Pronobis and Victor Wagner (2023): SRN Document Database, https://github.com/trr266/srn_docs.

"Our objective is to develop this repository into a collaborative data platform that provides extensive coverage of sustainability-related documents published by European publicly-listed firms."

In [3]:
# Code from SRN API documentation https://github.com/trr266/srn_docs/blob/main/srn_docs_api.py

srn_api_url = "https://api.sustainabilityreportingnavigator.com/api/"

def get_srn_companies():
    """
    Returns a list of companies that are included in SRN Document Database.

    Returns:
        [list{dict}]: A list containg company level metadata
    """
    response = requests.get(srn_api_url + "companies")
    return response.json()


def get_srn_documents():
    """
    Returns a list of documents that are included in SRN Document Database.

    Returns:
        [list{dict}]: A list containg document level metadata
    """
    response = requests.get(srn_api_url + "documents")
    return response.json()


def download_document(id, fpath, timeout=60):
    """
    Retreives a certain document from the SRN Document Database and 
    stores it at the provided file path.

    Args:
        id (str): The SRN document id.
        fpath (str): A sting containt the file path where you want to
            store the file.
        timeout (int, optional): Sometimes, a download API call might
            nlock because of a dying connection or because the data
            is not available. If a timeout is reached, the according
            API request will raise an exception and exit. 
            Defaults to 60 seconds.
    """
    response = requests.get(
        srn_api_url + f"documents/{id}/download", 
        timeout=timeout
    )
    with open(fpath, 'wb') as f: f.write(response.content)


if __name__ == "__main__":
    companies = get_srn_companies()
    documents = get_srn_documents()
    print("Searching comapny with a name containing 'Allianz'")
    matches = [c for c in companies if 'Allianz' in c['name']]
    print(
        f"Found {len(matches)} match(es). " +
        "Retrieving the documents for the first match."
    )
    docs = [d for d in documents if d['company_id'] == matches[0]['id']] 
    FPATH = 'test_srn_docs.pdf'
    print(
        f"Found {len(docs)} documents. " +
        "Retrieving the first document from the list " +
        f"and storing it as '{FPATH}'."
    )
    download_document(docs[0]['id'], FPATH)
    print("done!")

Searching comapny with a name containing 'Allianz'
Found 1 match(es). Retrieving the documents for the first match.
Found 30 documents. Retrieving the first document from the list and storing it as 'test_srn_docs.pdf'.
done!


In [4]:
# get a list of all companies in the SRN database
srn_companies = get_srn_companies()
print(len(srn_companies))
print(srn_companies[0])

922
{'id': '8dee5d4e-2b5d-44c4-a78d-2d5d8dd92df1', 'name': '1&1', 'isin': 'DE0005545503', 'country': 'Germany', 'sector': 'Media & Entertainment', 'href': '', 'href_logo': '', 'company_type': 'public', 'indices': ['c233b29e-f073-426f-88cf-9e9d5e645e6e']}


In [5]:
# retrieve all documents from the SRN database
documents = get_srn_documents()
print(len(documents))
print(documents[0])

11922
{'id': '8e22a8db-51ec-49f2-9118-52b5f3e745bd', 'name': 'A.P. Møller-Mærsk Sustainability Report 2022', 'href': 'https://www.maersk.com/~/media_sc9/maersk/corporate/sustainability/files/resources/2022/maersk-sustainability-yearly-report_2022.pdf?la=de-de&hash=C82244C7CF694E2B8D83CDC7BAC8306D', 'type': 'SR', 'year': '2022', 'source': 'url_cached', 'company_id': '4e2266f6-6bc9-469f-bfa0-344873b81fc6', 'created_at': '2023-10-14T10:30:05.879596', 'created_by_info': None}


## Prepare datasets for further investigation

### 1. For all companies available

In [6]:
all_sr = pd.DataFrame(srn_companies)
all_sr = all_sr.drop(columns=['isin', 'href', 'href_logo', 'indices']) # drop unecessary columns

# normalize company name to lowercase and remove whitespace
all_sr['normalized_name'] = all_sr['name'].str.lower()
all_sr['normalized_name'] = all_sr['normalized_name'].str.replace(' ', '')

# also remove * in the names in reports_24 & normalize the name
reports_24['company_withAccessInfo'] = reports_24['company_withAccessInfo'].str.replace('*', '')
reports_24['normalized_name'] = reports_24['company_withAccessInfo'].str.lower()
reports_24['normalized_name'] = reports_24['normalized_name'].str.replace(' ', '')

# add a column type = CSRD to the reports_24 dataframe
reports_24['type'] = 'CSRD'
# add a column year = 2024
reports_24['year'] = 2024	

# merge the two datasets on the normalized name
all_sr = pd.merge(all_sr, reports_24, on='normalized_name', how='outer')

all_sr.head()

Unnamed: 0.1,id,name,country_x,sector_x,company_type,normalized_name,Unnamed: 0,company_withAccessInfo,link,country_y,sector_y,industry,publication date,pages PDF,auditor,type,year
0,8dee5d4e-2b5d-44c4-a78d-2d5d8dd92df1,1&1,Germany,Media & Entertainment,public,1&1,,,,,,,,,,,
1,def442c8-8f64-42c3-af85-728971264d7e,3i Group PLC,United Kingdom,Financial Services,public,3igroupplc,,,,,,,,,,,
2,b21cc316-6693-4dc9-a5d5-fabf650b5787,3M,United States,Basic Materials & Mining,public,3m,,,,,,,,,,,
3,1bcd009f-f39f-44a5-8d61-69e531bffcb9,4Workers Sp. z o.o.,Poland,Clothing & Footwear,private,4workerssp.zo.o.,,,,,,,,,,,
4,4e2266f6-6bc9-469f-bfa0-344873b81fc6,A.P. Moeller-Maersk,Denmark,Automobiles & Other Transport Vehicles,public,a.p.moeller-maersk,,,,,,,,,,,


In [7]:
### controll whether the two dataframes contain any mismatches in the information they provide

# drop all rows where the name and company_withAccessInfo columns are not the same
control = all_sr.dropna()

# show whether name and company_withAccessInfo are the same
control['names_match'] = control['name'] == control['company_withAccessInfo']
mismatches = control[control['names_match'] == False]
print(mismatches[['name', 'company_withAccessInfo']])


            name company_withAccessInfo
29        Adidas                 adidas
48       Aixtron                AIXTRON
582   Kion Group             KION Group
587         KONE                   Kone
781         Relx                   RELX
981    TietoEVRY              Tietoevry
1042       Vinci                  VINCI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['names_match'] = control['name'] == control['company_withAccessInfo']


Only differences in writing not in meaning.

In [8]:
# control the countries
control['country_match'] = control['country_x'] == control['country_y']
mismatches = control[control['country_match'] == False]
print(mismatches[['name', 'country_x', 'country_y']])

       name country_x   country_y
44   Airbus   Germany      France
535  InPost    Poland  Luxembourg
694   Nokia    Sweden     Finland
844     SEB    France      Sweden


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['country_match'] = control['country_x'] == control['country_y']


! need further investigation which information is truthfull

In [9]:
# control the sector
control['sector_match'] = control['sector_x'] == control['sector_y']
mismatches = control[control['sector_match'] == False]
print(mismatches[['name', 'sector_x', 'sector_y']])

                name                                sector_x  \
29            Adidas                     Clothing & Footwear   
44            Airbus                         Heavy Machinery   
48           Aixtron                             Electronics   
53             Alfen                 Building & Construction   
59           Allianz                               Insurance   
...              ...                                     ...   
1018             UPM                  Forestry & Agriculture   
1042           Vinci                 Building & Construction   
1052      Volkswagen  Automobiles & Other Transport Vehicles   
1077  Wolters Kluwer          Industrial Products & Services   
1080        Wärtsilä                    Industrial Machinery   

                                      sector_y  
29                              Consumer Goods  
44                     Resource Transformation  
48                 Technology & Communications  
53                     Resource Tra

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['sector_match'] = control['sector_x'] == control['sector_y']


! need further investigation also with respect to the industry

In [11]:
# impute missing information to columns we want to keep
# company name
all_sr['name'] = all_sr['name'].fillna(all_sr['company_withAccessInfo'])
# country
all_sr['country_x'] = all_sr['country_x'].fillna(all_sr['country_y'])
# sector
all_sr['sector_x'] = all_sr['sector_x'].fillna(all_sr['sector_y'])

# finish the dataframe of reports for 2024 by keeping only the columns we need
all_sr = all_sr[['id', 'name', 'normalized_name', 'country_x', 'sector_x', 'company_type', 'link', 'year', 'pages PDF', 'auditor', 'type']]
all_sr = all_sr.rename(columns={'id': 'company_id', 'name': 'company_name', 'country_x': 'country', 'sector_x': 'sector', 'pages PDF': 'pages_pdf'})

In [13]:
### add reports from before 2024
documents = pd.DataFrame(documents)

# merge the two datasets on company_id
all_sr = pd.merge(all_sr, documents, on='company_id', how='outer')

# impute missing information to columns we want to keep
# link
all_sr['link'] = all_sr['link'].fillna(all_sr['href'])
# type
all_sr['type'] = all_sr['type_x'].fillna(all_sr['type_y'])
# year
all_sr['year'] = all_sr['year_x'].fillna(all_sr['year_y'])

# finish the dataframe by keeping only the columns we need
all_sr = all_sr[['company_id', 'company_name', 'normalized_name', 'country', 'sector', 'company_type', 'link', 'type', 'pages_pdf', 'year', 'auditor', 'id']]
all_sr = all_sr.rename(columns={'id': 'document_id'})


In [14]:
# drop missing in company_name
all_sr = all_sr.dropna(subset=['company_name'])

In [15]:
# drop all missings and empty strings in the link column
all_sr = all_sr.dropna(subset=['link'])
all_sr = all_sr[all_sr['link'] != '']

In [16]:
# print links in ascending order
all_sr = all_sr.sort_values(by=['link'])
print(all_sr[['company_name', 'link']].head(10))

        company_name                                               link
3861          Smiths  *https://www.smiths.com/-/media/files/smiths-c...
2698         Qinetiq                                                  0
7688             EQT       2517f65322a56bad909f90dd00f55506db442bea.pdf
10417     SUEZ Group             No internet link available (only PDFs)
17     Admiral Group  admiralgroup.co.uk/sites/default/files_public/...
16     Admiral Group  admiralgroup.co.uk/sites/default/files_public/...
4722    Adecco Group  annualreport2012.adecco.com/fileadmin/user_upl...
4727    Adecco Group  annualreport2013.adecco.com/fileadmin/user_upl...
4721    Adecco Group  annualreport2014.adecco.com/fileadmin/user_upl...
4724    Adecco Group  annualreport2015.adecco.com/fileadmin/user_upl...


### 2. Only companies with CSRD reports in 2024 and sustainability reports before 2024

# Extract pdf text

In [None]:
def download_document(id, fpath, timeout=60):
    """
    Retreives a certain document from the SRN Document Database and 
    stores it at the provided file path.

    Args:
        id (str): The SRN document id.
        fpath (str): A sting containt the file path where you want to
            store the file.
        timeout (int, optional): Sometimes, a download API call might
            nlock because of a dying connection or because the data
            is not available. If a timeout is reached, the according
            API request will raise an exception and exit. 
            Defaults to 60 seconds.
    """
    response = requests.get(
        srn_api_url + f"documents/{id}/download", 
        timeout=timeout
    )
    with open(fpath, 'wb') as f: f.write(response.content)

In [25]:
# dowload five pdfs
for i in range(5):
    # get the document id
    doc_id = all_sr.iloc[i]['document_id']
    # get company name
    company_name = all_sr.iloc[i]['company_name']
    # get year
    year = all_sr.iloc[i]['year']
    # get the file path
    fpath = f"document_{company_name}{year}.pdf"
    # download the document
    download_document(doc_id, fpath)
    print(f"Downloaded document {doc_id} to {fpath}")

Downloaded document c43e45ee-0246-4e86-9596-0f08b7c5ede2 to document_Smiths2011.pdf
Downloaded document 165fb0c3-ee68-43a7-8b62-af05d15081a6 to document_Qinetiq2022.pdf
Downloaded document e7ea3b2e-34ed-4ab8-a35e-14e75018ace7 to document_EQT2022.pdf
Downloaded document 04529c13-9db0-4cc4-8b37-1ac5149ba17e to document_SUEZ Group2019.pdf
Downloaded document 641c9576-2512-4e80-8150-c3cb7583bc35 to document_Admiral Group2013.pdf


--> did not work

In [17]:
# randomly select 20 reports from all_sr dataframe
sample = all_sr[all_sr['type'].isin(['CSRD', 'SR'])].sample(n=20, random_state=1)
sample.head()

Unnamed: 0,company_id,company_name,normalized_name,country,sector,company_type,link,type,pages_pdf,year,auditor,document_id
7083,981ca55f-11a4-4b79-95c9-af66c676e083,Londonmetric Property PLC,londonmetricpropertyplc,United Kingdom,Financial Services,,https://www.londonmetric.com/sites/london-metr...,SR,,2020.0,,99085e04-47ef-41e7-873f-79bc7fe8df3a
4394,606dffee-a3af-45ac-98e1-94332e916738,Investor,investor,Sweden,Financial Services,public,https://www.investorab.com/media/t11dahrs/cop-...,SR,,2020.0,,dbb53e5b-069f-4eaf-a43f-940f599bb7da
4746,64f716cf-80ba-40c5-8dfb-38de6d85c73a,PHP Primay Health Properties,phpprimayhealthproperties,United Kingdom,Financial Services,public,https://www.phpgroup.co.uk/application/files/3...,SR,,2022.0,,c734d646-b4d2-4417-b4d9-8dd213b52d2a
10917,ecdd8b16-543e-4f0f-ae3c-edad528974d0,Aixtron,aixtron,Germany,Electronics,public,https://www.aixtron.com/investoren/publikation...,CSRD,87.0,2024.0,KPMG,78a5f52c-d8a1-4c52-9cc5-c0bb0a4d9da2
7837,a899ee2f-da85-4220-803d-64fdd8a70c3e,Yara,yara,Norway,"Chemicals, Fuels & Biofuels",public,https://www.yara.com/siteassets/investors/057-...,SR,,2013.0,,2a3d1d49-7787-46ad-93eb-270c25fd3a94


# Extract information from sustainability reports

During search found several aplication examples of pdf information retrieval:
- from SRN: https://github.com/trr266/srn_docs/blob/main/extract_text_from_docs.py 
- from Chatreport (Ni et al., 2023): https://github.com/EdisonNi-hku/chatreport/blob/main/code/document.py 

Decided to base my code on Chatrport because:
- SRN only extracts raw text, Chatreport uses vector embeddings and semantic search
- Chatreport uses PDF Parsing with `PyMuPDF` --> faster, more reliable and accurent in comparioson to pdfminer (used in SRN)
- Chatreport extracts metadata like report title, chapter names, and images & uses Vector Search & Retrieval



In [None]:
# Setup & Imports
! pip install pymupdf faiss-cpu langchain sentence-transformers


In [None]:
# Prpare Environment & Output Directories
import os

os.makedirs('reports_db', exist_ok=True)
os.makedirs('retrieved_chunks', exist_ok=True)
os.makedirs('saved_pdfs', exist_ok=True)


In [None]:
# Iterate through each pdf
from tqdm import tqdm

all_reports = []
for i, row in tqdm(sample.iterrows(), total=len(sample)):
    url = row['link']
    company_name = row.get('company', f'company_{i}')  # fallback if no company name
    db_path = f'reports_db/{company_name}_faiss'
    retrieved_path = f'retrieved_chunks/{company_name}'
    store_pdf_path = f'saved_pdfs/{company_name}.pdf'
    
    try:
        report = Report(
            url=url,
            title=company_name,
            store_path=store_pdf_path,
            db_path=db_path,
            retrieved_chunks_path=retrieved_path,
        )
        all_reports.append(report)
    except Exception as e:
        print(f"Failed to process {company_name} - {url} | Error: {e}")


In [None]:
from your_module import Report  # adjust if class is in same file

# Directory to store output data
db_path_root = "./faiss_dbs"
retrieved_chunks_path_root = "./retrieved_chunks"

for idx, row in sample.iterrows():
    url = row['link']
    company_name = row.get('company_name', f"report_{idx}")  # fallback name
    db_path = f"{db_path_root}/{company_name}_db"
    retrieved_path = f"{retrieved_chunks_path_root}/{company_name}"

    print(f"Processing {company_name}...")

    try:
        report = Report(
            url=url,
            db_path=db_path,
            retrieved_chunks_path=retrieved_path
        )
        print(f"✅ Done: {company_name}")
    except Exception as e:
        print(f"❌ Failed to process {company_name}: {e}")


# PyMuPDF

Additional information:
- extract text as blocks https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/text-extraction/PDF2TextBlocks.py


In [18]:
import pymupdf

doc = pymupdf.open("Lonza_2021.pdf") # open a document

In [21]:
out = open("output.txt", "wb") # create a text output
for page in doc: # iterate the document pages
    text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
    out.write(text) # write text of page
    out.write(bytes((12,))) # write page delimiter (form feed 0x0C)
out.close() # close the output file

In [35]:
import fitz  # PyMuPDF

def extract_text_from_pdf(filename):
    ofile = filename + ".txt"
    doc = fitz.open(filename)
    with open(ofile, "wb") as fout:
        for page in doc:
            fout.write(page.get_text().encode("utf-8") + bytes((12,)))

# Usage
extract_text_from_pdf("Lonza_2021.pdf")

In [27]:
import pymupdf
from pprint import pprint

doc = pymupdf.open("Lonza_2021.pdf") # open document
page = doc[11] # get the 1st page of the document
tabs = page.find_tables() # locate and extract any tables on page
print(f"{len(tabs.tables)} found on {page}") # display number of found tables

if tabs.tables:  # at least one table found?
   pprint(tabs[11].extract())  # print content of first table

0 found on page 11 of Lonza_2021.pdf


In [30]:
from pdf2docx import Converter

pdf_file = 'Lonza_2021.pdf'

cv = Converter(pdf_file)
tables = cv.extract_tables(start=0, end=7)
cv.close()

for table in tables:
    print(table)

[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m
[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/7) Page 1
[INFO] (2/7) Page 2
[INFO] (3/7) Page 3
[INFO] (4/7) Page 4
[INFO] (5/7) Page 5
[INFO] (6/7) Page 6
[INFO] (7/7) Page 7


# PyMuPDF4LLM

In [2]:
import pymupdf4llm

In [3]:
md_text = pymupdf4llm.to_markdown("Lonza_2021.pdf")
print(md_text)

**Enabling a Healthier World**
## Sustainability Report
###### 2021



-----

**Sustainability Report 2021**

2


Mangrove forest, Thailand


-----

### Contents


26 **Environment**
28 Energy
30 Water
32 Greenhouse Gas
Emissions
37 Waste
39 Innovating for
Sustainability


**Contents**

3


4 **Our Commitment to** 
**Sustainability**
6 About Us
11 Our Journey
14 Our Stakeholders

42 **People and Society**
42 Occupational Health
and Safety
45 Diversity and Equal
Opportunity
46 Employee
Recruitment,

Retention and

Development
48 Employee
Engagement
48 Investing in
Communities


18 **Responsibility**
18 Risk Management
22 Anti-Bribery and
Anti-Corruption
24 Supply Chain
Responsibility

52 **GRI** **and** **TCFD**
**Overview**


-----

**Sustainability Report 2021**
###### Our Commitment to Sustainability

**Dear Friends of Lonza,**

Welcome to our Sustainability Report 2021.

This year was important for sustainability across our global
network. We have redesigned our

In [4]:
import pathlib

output_file = pathlib.Path("output.md")
output_file.write_bytes(md_text.encode())

171082

In [11]:
# output in LlamaIndex format

md_read = pymupdf4llm.LlamaMarkdownReader()
data = md_read.load_data("Lonza_2021.pdf")

# The result 'data' is of type List[LlamaIndexDocument]
# Every list item contains metadata and the markdown text of 1 page.

Successfully imported LlamaIndex


In [None]:
# Table extraction

md_text_tables = pymupdf4llm.to_markdown(
    doc="Lonza_2021.pdf"
)


md_text_tables

'**Enabling a Healthier World**\n## Sustainability Report\n###### 2021\n\n\n\n-----\n\n**Sustainability Report 2021**\n\n2\n\n\nMangrove forest, Thailand\n\n\n-----\n\n### Contents\n\n\n26 **\x07Environment**\n28 \x07Energy\n30 \x07Water\n32 \x07Greenhouse Gas\nEmissions\n37 \x07Waste\n39 \x07Innovating for\nSustainability\n\n\n**Contents**\n\n3\n\n\n4 **Our Commitment to** \x07\n**Sustainability**\n6 \x07About Us\n11 \x07Our Journey\n14 \x07Our Stakeholders\n\n42 **\x07People and Society**\n42 \x07Occupational Health\nand Safety\n45 \x07Diversity and Equal\nOpportunity\n46 \x07Employee\nRecruitment,\n\nRetention and\n\nDevelopment\n48 \x07Employee\nEngagement\n48 \x07Investing in\nCommunities\n\n\n18 **\x07Responsibility**\n18 \x07Risk Management\n22 \x07Anti-Bribery and\nAnti-Corruption\n24 \x07Supply Chain\nResponsibility\n\n52 **\x07GRI** **and** **TCFD**\n**Overview**\n\n\n-----\n\n**Sustainability Report 2021**\n###### Our Commitment to Sustainability\n\n**Dear Friends of Lonza,*

In [None]:
# Image extraction
md_text_images = pymupdf4llm.to_markdown(
    doc="Lonza_2021.pdf",
    pages=[5, 9],
    page_chunks=True, # dictionary for each page
    write_images=True,
    image_path="images", # images will be saved in this directory
    force_text=False, # force text extraction
    image_format="png",
    dpi=300
)

In [None]:
# Document structure extraction
md_text_structure = pymupdf4llm.to_markdown(
    doc="Lonza_2021.pdf",
    pages=[0, 9],
    page_chunks=True,
    write_images=True,
    image_path="images", 
    image_format="png",
    dpi=300,
    extract_words=True
)