# Download Sustainability Reports

In [2]:
import pandas as pd
import requests

## Reports for 2024
Source: Sustainability Reporting Navigator (crowd-source list of CSRD-compliant reports for fiscal years starting on 01/01/2024)

Download CSV with information on all reports https://www.sustainabilityreportingnavigator.com/#/csrdreports 

In [3]:
# Open the csv data file
reports_24 = pd.read_csv('esg_reports_2024.csv')
print(len(reports_24))

277


## Reports for 2010 until 2023
Source: Donau, Charlotte-Louise, Fikir Worku Edossa, Joachim Gassen, Gaia Melloni, Inga Meringdal, Bianca Minuth, Arianna Piscella, Paul Pronobis and Victor Wagner (2023): SRN Document Database, https://github.com/trr266/srn_docs.

- "Our objective is to develop this repository into a collaborative data platform that provides extensive coverage of sustainability-related documents published by European publicly-listed firms."
- "We try to collect all documents that contain relevant sustainability information. This includes but is not limited to annual and sustainability reports (AR and SR). For some firms it also includes additional reports like integreated reports (IR), Carbon Diclosure Project data (CDP), and other reporting formats."

In [3]:
# Code from SRN API documentation https://github.com/trr266/srn_docs/blob/main/srn_docs_api.py

srn_api_url = "https://api.sustainabilityreportingnavigator.com/api/"

def get_srn_companies():
    """
    Returns a list of companies that are included in SRN Document Database.

    Returns:
        [list{dict}]: A list containg company level metadata
    """
    response = requests.get(srn_api_url + "companies")
    return response.json()


def get_srn_documents():
    """
    Returns a list of documents that are included in SRN Document Database.

    Returns:
        [list{dict}]: A list containg document level metadata
    """
    response = requests.get(srn_api_url + "documents")
    return response.json()


def download_document(id, fpath, timeout=60):
    """
    Retreives a certain document from the SRN Document Database and 
    stores it at the provided file path.

    Args:
        id (str): The SRN document id.
        fpath (str): A sting containt the file path where you want to
            store the file.
        timeout (int, optional): Sometimes, a download API call might
            nlock because of a dying connection or because the data
            is not available. If a timeout is reached, the according
            API request will raise an exception and exit. 
            Defaults to 60 seconds.
    """
    response = requests.get(
        srn_api_url + f"documents/{id}/download", 
        timeout=timeout
    )
    with open(fpath, 'wb') as f: f.write(response.content)


In [4]:
# get a list of all companies in the SRN database
srn_companies = get_srn_companies()
print(len(srn_companies))
print(srn_companies[0])

922
{'id': '8dee5d4e-2b5d-44c4-a78d-2d5d8dd92df1', 'name': '1&1', 'isin': 'DE0005545503', 'country': 'Germany', 'sector': 'Media & Entertainment', 'href': '', 'href_logo': '', 'company_type': 'public', 'indices': ['c233b29e-f073-426f-88cf-9e9d5e645e6e']}


In [4]:
# retrieve all documents from the SRN database
documents = get_srn_documents()
print(len(documents))
print(documents[0])

11931
{'id': '8e22a8db-51ec-49f2-9118-52b5f3e745bd', 'name': 'A.P. Møller-Mærsk Sustainability Report 2022', 'href': 'https://www.maersk.com/~/media_sc9/maersk/corporate/sustainability/files/resources/2022/maersk-sustainability-yearly-report_2022.pdf?la=de-de&hash=C82244C7CF694E2B8D83CDC7BAC8306D', 'type': 'SR', 'year': '2022', 'source': 'url_cached', 'company_id': '4e2266f6-6bc9-469f-bfa0-344873b81fc6', 'created_at': '2023-10-14T10:30:05.879596', 'created_by_info': None}


## Prepare datasets for further investigation

### 1. For all companies available

In [6]:
all_sr = pd.DataFrame(srn_companies)
all_sr = all_sr.drop(columns=['isin', 'href', 'href_logo', 'indices']) # drop unecessary columns

# normalize company name to lowercase and remove whitespace
all_sr['normalized_name'] = all_sr['name'].str.lower()
all_sr['normalized_name'] = all_sr['normalized_name'].str.replace(' ', '')

# also remove * in the names in reports_24 & normalize the name
reports_24['company_withAccessInfo'] = reports_24['company_withAccessInfo'].str.replace('*', '')
reports_24['normalized_name'] = reports_24['company_withAccessInfo'].str.lower()
reports_24['normalized_name'] = reports_24['normalized_name'].str.replace(' ', '')

# add a column type = CSRD to the reports_24 dataframe
reports_24['type'] = 'CSRD'
# add a column year = 2024
reports_24['year'] = 2024	

# merge the two datasets on the normalized name
all_sr = pd.merge(all_sr, reports_24, on='normalized_name', how='outer')

all_sr.head()

Unnamed: 0.1,id,name,country_x,sector_x,company_type,normalized_name,Unnamed: 0,company_withAccessInfo,link,country_y,sector_y,industry,publication date,pages PDF,auditor,type,year
0,8dee5d4e-2b5d-44c4-a78d-2d5d8dd92df1,1&1,Germany,Media & Entertainment,public,1&1,,,,,,,,,,,
1,def442c8-8f64-42c3-af85-728971264d7e,3i Group PLC,United Kingdom,Financial Services,public,3igroupplc,,,,,,,,,,,
2,b21cc316-6693-4dc9-a5d5-fabf650b5787,3M,United States,Basic Materials & Mining,public,3m,,,,,,,,,,,
3,1bcd009f-f39f-44a5-8d61-69e531bffcb9,4Workers Sp. z o.o.,Poland,Clothing & Footwear,private,4workerssp.zo.o.,,,,,,,,,,,
4,4e2266f6-6bc9-469f-bfa0-344873b81fc6,A.P. Moeller-Maersk,Denmark,Automobiles & Other Transport Vehicles,public,a.p.moeller-maersk,,,,,,,,,,,


In [7]:
### controll whether the two dataframes contain any mismatches in the information they provide

# drop all rows where the name and company_withAccessInfo columns are not the same
control = all_sr.dropna()

# show whether name and company_withAccessInfo are the same
control['names_match'] = control['name'] == control['company_withAccessInfo']
mismatches = control[control['names_match'] == False]
print(mismatches[['name', 'company_withAccessInfo']])


            name company_withAccessInfo
29        Adidas                 adidas
48       Aixtron                AIXTRON
582   Kion Group             KION Group
587         KONE                   Kone
781         Relx                   RELX
981    TietoEVRY              Tietoevry
1042       Vinci                  VINCI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['names_match'] = control['name'] == control['company_withAccessInfo']


Only differences in writing not in meaning.

In [8]:
# control the countries
control['country_match'] = control['country_x'] == control['country_y']
mismatches = control[control['country_match'] == False]
print(mismatches[['name', 'country_x', 'country_y']])

       name country_x   country_y
44   Airbus   Germany      France
535  InPost    Poland  Luxembourg
694   Nokia    Sweden     Finland
844     SEB    France      Sweden


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['country_match'] = control['country_x'] == control['country_y']


! need further investigation which information is truthfull

In [9]:
# control the sector
control['sector_match'] = control['sector_x'] == control['sector_y']
mismatches = control[control['sector_match'] == False]
print(mismatches[['name', 'sector_x', 'sector_y']])

                name                                sector_x  \
29            Adidas                     Clothing & Footwear   
44            Airbus                         Heavy Machinery   
48           Aixtron                             Electronics   
53             Alfen                 Building & Construction   
59           Allianz                               Insurance   
...              ...                                     ...   
1018             UPM                  Forestry & Agriculture   
1042           Vinci                 Building & Construction   
1052      Volkswagen  Automobiles & Other Transport Vehicles   
1077  Wolters Kluwer          Industrial Products & Services   
1080        Wärtsilä                    Industrial Machinery   

                                      sector_y  
29                              Consumer Goods  
44                     Resource Transformation  
48                 Technology & Communications  
53                     Resource Tra

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['sector_match'] = control['sector_x'] == control['sector_y']


! need further investigation also with respect to the industry

In [11]:
# impute missing information to columns we want to keep
# company name
all_sr['name'] = all_sr['name'].fillna(all_sr['company_withAccessInfo'])
# country
all_sr['country_x'] = all_sr['country_x'].fillna(all_sr['country_y'])
# sector
all_sr['sector_x'] = all_sr['sector_x'].fillna(all_sr['sector_y'])

# finish the dataframe of reports for 2024 by keeping only the columns we need
all_sr = all_sr[['id', 'name', 'normalized_name', 'country_x', 'sector_x', 'company_type', 'link', 'year', 'pages PDF', 'auditor', 'type']]
all_sr = all_sr.rename(columns={'id': 'company_id', 'name': 'company_name', 'country_x': 'country', 'sector_x': 'sector', 'pages PDF': 'pages_pdf'})

In [13]:
### add reports from before 2024
documents = pd.DataFrame(documents)

# merge the two datasets on company_id
all_sr = pd.merge(all_sr, documents, on='company_id', how='outer')

# impute missing information to columns we want to keep
# link
all_sr['link'] = all_sr['link'].fillna(all_sr['href'])
# type
all_sr['type'] = all_sr['type_x'].fillna(all_sr['type_y'])
# year
all_sr['year'] = all_sr['year_x'].fillna(all_sr['year_y'])

# finish the dataframe by keeping only the columns we need
all_sr = all_sr[['company_id', 'company_name', 'normalized_name', 'country', 'sector', 'company_type', 'link', 'type', 'pages_pdf', 'year', 'auditor', 'id']]
all_sr = all_sr.rename(columns={'id': 'document_id'})


In [14]:
# drop missing in company_name
all_sr = all_sr.dropna(subset=['company_name'])

In [15]:
# drop all missings and empty strings in the link column
all_sr = all_sr.dropna(subset=['link'])
all_sr = all_sr[all_sr['link'] != '']

In [16]:
# print links in ascending order
all_sr = all_sr.sort_values(by=['link'])
print(all_sr[['company_name', 'link']].head(10))

        company_name                                               link
3861          Smiths  *https://www.smiths.com/-/media/files/smiths-c...
2698         Qinetiq                                                  0
7688             EQT       2517f65322a56bad909f90dd00f55506db442bea.pdf
10417     SUEZ Group             No internet link available (only PDFs)
17     Admiral Group  admiralgroup.co.uk/sites/default/files_public/...
16     Admiral Group  admiralgroup.co.uk/sites/default/files_public/...
4722    Adecco Group  annualreport2012.adecco.com/fileadmin/user_upl...
4727    Adecco Group  annualreport2013.adecco.com/fileadmin/user_upl...
4721    Adecco Group  annualreport2014.adecco.com/fileadmin/user_upl...
4724    Adecco Group  annualreport2015.adecco.com/fileadmin/user_upl...


### 2. Only companies with CSRD reports in 2024 and sustainability reports before 2024

# Extract pdf text

### with Docling

In [8]:
# randomly select 10 reports from 2024
sample = reports_24.sample(n=10, random_state=123)
sample.head(10)

Unnamed: 0.1,Unnamed: 0,company_withAccessInfo,link,country,sector,industry,publication date,pages PDF,auditor
254,287,argenx SE*,https://argenx.com/content/dam/argenx-corp/med...,Netherlands,Health Care,Biotechnology & Pharmaceuticals,2025-03-20,70,Deloitte
220,19,Continental AG,https://annualreport.continental.com/2024/en/s...,Germany,Transportation,Auto Parts,2025-03-18,125,PwC
201,107,DSB,https://www.dsb.dk/globalassets/arsrapport/202...,Denmark,Transportation,Rail Transportation,2025-02-07,106,EY
171,151,Fresenius,https://www.fresenius.com/sites/default/files/...,Germany,Health Care,Health Care Delivery,2025-03-26,154,PwC
269,222,Aena,https://www.aena.es/sites/Satellite?blobcol=ur...,Spain,Transportation,Air Freight & Logistics,2025-02-26,303,KPMG
209,166,Demant,https://assets-we.cas.dgs.com/-/media/demant/s...,Denmark,Health Care,Medical Equipment & Supplies,2025-02-05,66,PwC
20,135,Tryg,https://tryg.com/sites/tryg.com/files/2025-01/...,Denmark,Financials,Insurance,2025-01-23,79,PwC
30,277,TenneT Holding B.V.*,https://tennet-drupal.s3.eu-central-1.amazonaw...,Netherlands,Infrastructure,Electric Utilities & Power Generators,2025-03-06,60,Deloitte
273,61,Acomo,https://www.acomo.nl/wp-content/uploads/2025/0...,Netherlands,Food & Beverage,Processed Foods,2025-03-07,56,EY
42,270,Svenska Handelsbanken AB*,https://www.handelsbanken.com/tron/xgpu/info/c...,Sweden,Financials,Commercial Banks,2025-02-26,120,Deloitte


In [None]:
import pymupdf
doc = pymupdf.open("argenx-2024.pdf") # open the document
# select 100 pages of the document

doc.save("sample_pages.pdf") # save the document

In [6]:
# Convert the selected page to markdown
from docling.document_converter import DocumentConverter
import time

start = time.time()
source = "argenx-2024.pdf"
converter = DocumentConverter()
result = converter.convert(source)
#result.document.export_to_markdown()
print("Time taken:", (time.time() - start)/60 , "minutes")	
print(result.document.export_to_markdown())


KeyboardInterrupt: 

In [4]:
print(result)

input=InputDocument(file=PureWindowsPath('sample_pages.pdf'), document_hash='0c81e21e3111635b848d54b6e0938708bfd4769732c5d2512028e1ac17f30333', valid=True, limits=DocumentLimits(max_num_pages=9223372036854775807, max_file_size=9223372036854775807, page_range=(1, 9223372036854775807)), format=<InputFormat.PDF: 'pdf'>, filesize=81562629, page_count=10) status=<ConversionStatus.SUCCESS: 'success'> errors=[] pages=[Page(page_no=0, size=Size(width=594.719970703125, height=841.6799926757812), cells=[PdfTextCell(index=0, rgba=ColorRGBA(r=0, g=0, b=0, a=255), rect=BoundingRectangle(r_x0=39.6, r_y0=169.2359926757813, r_x1=169.674, r_y1=169.2359926757813, r_x2=169.674, r_y2=137.7399926757812, r_x3=39.6, r_y3=137.7399926757812, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), text='Annual ', orig='Annual ', text_direction=<TextDirection.LEFT_TO_RIGHT: 'left_to_right'>, confidence=1.0, from_ocr=False, rendering_mode=<PdfCellRenderingMode.UNKNOWN: -1>, widget=False, font_key='/F1', font_name='/AAAQS

In [9]:
# use docling to extract the text from the pdfs
from docling.document_converter import DocumentConverter
from tqdm import tqdm
import json

# Prepare converter
converter = DocumentConverter()

# Output list for storing results
results = []

# Loop through the first 10 rows
for idx, row in tqdm(sample.iterrows()):
    pdf_url = row["link"]
    company_name = row["company_withAccessInfo"]
    try:
        result = converter.convert(pdf_url)
        markdown_text = result.document.export_to_markdown()
        results.append({
            "index": idx,
            "company_name": company_name,
            "link": pdf_url,
            "text": markdown_text
        })
    except Exception as e:
        results.append({
            "index": idx,
            "link": pdf_url,
            "error": str(e)
        })

  from .autonotebook import tqdm as notebook_tqdm


: 

: 

In [None]:
# Save results to a JSONL file (scalable format)
with open("docling_output_sample.jsonl", "w", encoding="utf-8") as f:
    for entry in results:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

In [None]:
with open("docling_output_sample.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

df_out = pd.DataFrame(data)


### with PyMuPDF
based on: https://github.com/EdisonNi-hku/chatreport/blob/main/code/document.py 

In [None]:
import pymupdf
import requests
import os
import io

In [23]:
url = sample['link'].values[5]
print(url)

https://assets-we.cas.dgs.com/-/media/demant/shared/new-library-2022/financial-reports/annual-report-english/demant-annual-report-2024_2.pdf?la=en&rev=0487&hash=BF20FE17B1586ABE0B06D808F899EC67


In [24]:
# from load_pdf
response = requests.get(url)
pdf_bytes = io.BytesIO(response.content)
pdf = pymupdf.open(stream=pdf_bytes, filetype='pdf')  # Open the PDF from bytes

In [25]:
# from extract_text
text_list = [page.get_text() for page in pdf]
all_text = ''.join(text_list)

In [29]:
print(text_list[0])  # Print the text of the first page

 
 
 
 
 
 
Annual 
Report 
2024 
Peter, hearing aid user, pensioner, and ironman 
Demant A/S 
Kongebakken 9 
2765 Smørum 
Denmark 
CVR no. 71186911 
1 January – 31 December 2024 



In [26]:
# from get_title
doc = pdf
max_font_size = 0
max_string = ''
font_sizes = [0]

for page in doc:
    text = page.get_text("dict")
    for block in text.get('blocks', []):
        if block["type"] == 0 and block["lines"]:
            span = block["lines"][0]["spans"][0]
            size = span['size']
            font_sizes.append(size)
            if size > max_font_size:
                max_font_size = size
                max_string = span['text']
font_sizes.sort()
probable_title = ''
for page in doc:
    text = page.get_text("dict")
    for block in text.get("blocks", []):
        if block["type"] == 0 and block["lines"]:
            span = block["lines"][0]["spans"][0]
            size = span["size"]
            cur_text = span["text"]
            if abs(size - font_sizes[-1]) < 0.3 or abs(size - font_sizes[-2]) < 0.3:
                if len(cur_text.strip()) > 4:
                    probable_title += (' ' + cur_text.strip())
title = probable_title.strip()
print("Title:", title)


Title: Management statement Overview Market and strategy Financial performance Corporate governance Sustainability statement Sustainability Environment Social Governance Additional information Financial statements Consolidated financial Parent financial Signatures


In [27]:
# get chapter names
roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
digit_num = [str(d + 1) for d in range(10)]

chapter_names = []
for line in all_text.split('\n'):
    parts = line.split(' ')
    if '.' in line and 1 < len(parts) < 5:
        point_split = line.split('.')
        if 1 < len(point_split) < 5:
            if point_split[0] in roman_num or point_split[0] in digit_num:
                chapter_names.append(line.strip())
print(chapter_names)

['10.6%', '2.3', '2.1', '2.9', '1.9', '2.8', '10.9', '10.3', '9.4', '8.8', '8.5', '4.11', '10.06', '4.68', '10.99', '8.04', '9.21', '10.70', '4.68', '2.3', '2.3', '2.1', '2.4', '2.9', '10.9', '10.6', '10.3', '9.9', '9.4', '7.08', '6.18', '6.50', '6.14', '4.99', '5.47', '5.47', '5.44', '2.60', '4.61', '2.5.', '4.64', '2.5.', '8.1', '3.4', '7.2', '10.0', '1.62%', '1.09%', '10.9', '1.5', '1.5', '1.1', '3.4 / 6.1', '1.9', '4.2', '4.2', '5.1', '6.2', '1.4', '1.4', '1.4', '10.99', '8.04', '1.4', '10.99', '8.04', '4.3 / 4.4', '5.2', '7.1', '4.3 / 7.3', '7.4', '4.3 / 4.4', '4.3', '7.1', '4.3 / 7.3', '7.4', '6.2', '3.1', '3.2', '3.3', '3.4', '4.3 / 4.5', '5.2', '3.5', '1.5', '1.6 / 4.3', '4.3', '4.3 / 4.4', '4.3 / 4.4', '6.2', '1.8', '4.4', '4.4', '4.4', '3.3 / 4.4', '6.2', '4.3 / 4.4', '1.2 Employees', '1.5 Inventories', '1.6 Trade receivables', '1.7 Customer loans', '3.1 Intangible assets', '3.3 Leases', '3.6 Impairment testing', '5.2 Deferred tax', '7.1 Provisions', '7.3 Other liabilities', 

### with ReportProcessor module

In [31]:
from report_processor import ReportProcessor

def extract_text_from_link(row):
    try:
        rp = ReportProcessor(url=row['link'])
        rp.load_pdf()
        rp.extract_text()
        return rp.all_text  # or rp.text_list if you want page-wise text
    except Exception as e:
        print(f"Failed for {row['link']}: {e}")
        return None

# Add a column with the raw extracted text
sample['extracted_text'] = sample.apply(extract_text_from_link, axis=1)