# Download Sustainability Reports

In [1]:
import pandas as pd
import requests

## Reports for 2024
Source: Sustainability Reporting Navigator (crowd-source list of CSRD-compliant reports for fiscal years starting on 01/01/2024)

Download CSV with information on all reports https://www.sustainabilityreportingnavigator.com/#/csrdreports 

In [2]:
# Open the csv data file
reports_24 = pd.read_csv('esg_reports_2024.csv')
print(len(reports_24))

277


## Reports for 2010 until 2023
Source: Donau, Charlotte-Louise, Fikir Worku Edossa, Joachim Gassen, Gaia Melloni, Inga Meringdal, Bianca Minuth, Arianna Piscella, Paul Pronobis and Victor Wagner (2023): SRN Document Database, https://github.com/trr266/srn_docs.

- "Our objective is to develop this repository into a collaborative data platform that provides extensive coverage of sustainability-related documents published by European publicly-listed firms."
- "We try to collect all documents that contain relevant sustainability information. This includes but is not limited to annual and sustainability reports (AR and SR). For some firms it also includes additional reports like integreated reports (IR), Carbon Diclosure Project data (CDP), and other reporting formats."

In [3]:
# Code from SRN API documentation https://github.com/trr266/srn_docs/blob/main/srn_docs_api.py

srn_api_url = "https://api.sustainabilityreportingnavigator.com/api/"

def get_srn_companies():
    """
    Returns a list of companies that are included in SRN Document Database.

    Returns:
        [list{dict}]: A list containg company level metadata
    """
    response = requests.get(srn_api_url + "companies")
    return response.json()


def get_srn_documents():
    """
    Returns a list of documents that are included in SRN Document Database.

    Returns:
        [list{dict}]: A list containg document level metadata
    """
    response = requests.get(srn_api_url + "documents")
    return response.json()


def download_document(id, fpath, timeout=60):
    """
    Retreives a certain document from the SRN Document Database and 
    stores it at the provided file path.

    Args:
        id (str): The SRN document id.
        fpath (str): A sting containt the file path where you want to
            store the file.
        timeout (int, optional): Sometimes, a download API call might
            nlock because of a dying connection or because the data
            is not available. If a timeout is reached, the according
            API request will raise an exception and exit. 
            Defaults to 60 seconds.
    """
    response = requests.get(
        srn_api_url + f"documents/{id}/download", 
        timeout=timeout
    )
    with open(fpath, 'wb') as f: f.write(response.content)


In [4]:
# get a list of all companies in the SRN database
srn_companies = get_srn_companies()
print(len(srn_companies))
print(srn_companies[0])

922
{'id': '8dee5d4e-2b5d-44c4-a78d-2d5d8dd92df1', 'name': '1&1', 'isin': 'DE0005545503', 'country': 'Germany', 'sector': 'Media & Entertainment', 'href': '', 'href_logo': '', 'company_type': 'public', 'indices': ['c233b29e-f073-426f-88cf-9e9d5e645e6e']}


In [4]:
# retrieve all documents from the SRN database
documents = get_srn_documents()
print(len(documents))
print(documents[0])

11931
{'id': '8e22a8db-51ec-49f2-9118-52b5f3e745bd', 'name': 'A.P. Møller-Mærsk Sustainability Report 2022', 'href': 'https://www.maersk.com/~/media_sc9/maersk/corporate/sustainability/files/resources/2022/maersk-sustainability-yearly-report_2022.pdf?la=de-de&hash=C82244C7CF694E2B8D83CDC7BAC8306D', 'type': 'SR', 'year': '2022', 'source': 'url_cached', 'company_id': '4e2266f6-6bc9-469f-bfa0-344873b81fc6', 'created_at': '2023-10-14T10:30:05.879596', 'created_by_info': None}


## Prepare datasets for further investigation

### 1. For all companies available

In [6]:
all_sr = pd.DataFrame(srn_companies)
all_sr = all_sr.drop(columns=['isin', 'href', 'href_logo', 'indices']) # drop unecessary columns

# normalize company name to lowercase and remove whitespace
all_sr['normalized_name'] = all_sr['name'].str.lower()
all_sr['normalized_name'] = all_sr['normalized_name'].str.replace(' ', '')

# also remove * in the names in reports_24 & normalize the name
reports_24['company_withAccessInfo'] = reports_24['company_withAccessInfo'].str.replace('*', '')
reports_24['normalized_name'] = reports_24['company_withAccessInfo'].str.lower()
reports_24['normalized_name'] = reports_24['normalized_name'].str.replace(' ', '')

# add a column type = CSRD to the reports_24 dataframe
reports_24['type'] = 'CSRD'
# add a column year = 2024
reports_24['year'] = 2024	

# merge the two datasets on the normalized name
all_sr = pd.merge(all_sr, reports_24, on='normalized_name', how='outer')

all_sr.head()

Unnamed: 0.1,id,name,country_x,sector_x,company_type,normalized_name,Unnamed: 0,company_withAccessInfo,link,country_y,sector_y,industry,publication date,pages PDF,auditor,type,year
0,8dee5d4e-2b5d-44c4-a78d-2d5d8dd92df1,1&1,Germany,Media & Entertainment,public,1&1,,,,,,,,,,,
1,def442c8-8f64-42c3-af85-728971264d7e,3i Group PLC,United Kingdom,Financial Services,public,3igroupplc,,,,,,,,,,,
2,b21cc316-6693-4dc9-a5d5-fabf650b5787,3M,United States,Basic Materials & Mining,public,3m,,,,,,,,,,,
3,1bcd009f-f39f-44a5-8d61-69e531bffcb9,4Workers Sp. z o.o.,Poland,Clothing & Footwear,private,4workerssp.zo.o.,,,,,,,,,,,
4,4e2266f6-6bc9-469f-bfa0-344873b81fc6,A.P. Moeller-Maersk,Denmark,Automobiles & Other Transport Vehicles,public,a.p.moeller-maersk,,,,,,,,,,,


In [7]:
### controll whether the two dataframes contain any mismatches in the information they provide

# drop all rows where the name and company_withAccessInfo columns are not the same
control = all_sr.dropna()

# show whether name and company_withAccessInfo are the same
control['names_match'] = control['name'] == control['company_withAccessInfo']
mismatches = control[control['names_match'] == False]
print(mismatches[['name', 'company_withAccessInfo']])


            name company_withAccessInfo
29        Adidas                 adidas
48       Aixtron                AIXTRON
582   Kion Group             KION Group
587         KONE                   Kone
781         Relx                   RELX
981    TietoEVRY              Tietoevry
1042       Vinci                  VINCI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['names_match'] = control['name'] == control['company_withAccessInfo']


Only differences in writing not in meaning.

In [8]:
# control the countries
control['country_match'] = control['country_x'] == control['country_y']
mismatches = control[control['country_match'] == False]
print(mismatches[['name', 'country_x', 'country_y']])

       name country_x   country_y
44   Airbus   Germany      France
535  InPost    Poland  Luxembourg
694   Nokia    Sweden     Finland
844     SEB    France      Sweden


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['country_match'] = control['country_x'] == control['country_y']


! need further investigation which information is truthfull

In [9]:
# control the sector
control['sector_match'] = control['sector_x'] == control['sector_y']
mismatches = control[control['sector_match'] == False]
print(mismatches[['name', 'sector_x', 'sector_y']])

                name                                sector_x  \
29            Adidas                     Clothing & Footwear   
44            Airbus                         Heavy Machinery   
48           Aixtron                             Electronics   
53             Alfen                 Building & Construction   
59           Allianz                               Insurance   
...              ...                                     ...   
1018             UPM                  Forestry & Agriculture   
1042           Vinci                 Building & Construction   
1052      Volkswagen  Automobiles & Other Transport Vehicles   
1077  Wolters Kluwer          Industrial Products & Services   
1080        Wärtsilä                    Industrial Machinery   

                                      sector_y  
29                              Consumer Goods  
44                     Resource Transformation  
48                 Technology & Communications  
53                     Resource Tra

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['sector_match'] = control['sector_x'] == control['sector_y']


! need further investigation also with respect to the industry

In [11]:
# impute missing information to columns we want to keep
# company name
all_sr['name'] = all_sr['name'].fillna(all_sr['company_withAccessInfo'])
# country
all_sr['country_x'] = all_sr['country_x'].fillna(all_sr['country_y'])
# sector
all_sr['sector_x'] = all_sr['sector_x'].fillna(all_sr['sector_y'])

# finish the dataframe of reports for 2024 by keeping only the columns we need
all_sr = all_sr[['id', 'name', 'normalized_name', 'country_x', 'sector_x', 'company_type', 'link', 'year', 'pages PDF', 'auditor', 'type']]
all_sr = all_sr.rename(columns={'id': 'company_id', 'name': 'company_name', 'country_x': 'country', 'sector_x': 'sector', 'pages PDF': 'pages_pdf'})

In [13]:
### add reports from before 2024
documents = pd.DataFrame(documents)

# merge the two datasets on company_id
all_sr = pd.merge(all_sr, documents, on='company_id', how='outer')

# impute missing information to columns we want to keep
# link
all_sr['link'] = all_sr['link'].fillna(all_sr['href'])
# type
all_sr['type'] = all_sr['type_x'].fillna(all_sr['type_y'])
# year
all_sr['year'] = all_sr['year_x'].fillna(all_sr['year_y'])

# finish the dataframe by keeping only the columns we need
all_sr = all_sr[['company_id', 'company_name', 'normalized_name', 'country', 'sector', 'company_type', 'link', 'type', 'pages_pdf', 'year', 'auditor', 'id']]
all_sr = all_sr.rename(columns={'id': 'document_id'})


In [14]:
# drop missing in company_name
all_sr = all_sr.dropna(subset=['company_name'])

In [15]:
# drop all missings and empty strings in the link column
all_sr = all_sr.dropna(subset=['link'])
all_sr = all_sr[all_sr['link'] != '']

In [16]:
# print links in ascending order
all_sr = all_sr.sort_values(by=['link'])
print(all_sr[['company_name', 'link']].head(10))

        company_name                                               link
3861          Smiths  *https://www.smiths.com/-/media/files/smiths-c...
2698         Qinetiq                                                  0
7688             EQT       2517f65322a56bad909f90dd00f55506db442bea.pdf
10417     SUEZ Group             No internet link available (only PDFs)
17     Admiral Group  admiralgroup.co.uk/sites/default/files_public/...
16     Admiral Group  admiralgroup.co.uk/sites/default/files_public/...
4722    Adecco Group  annualreport2012.adecco.com/fileadmin/user_upl...
4727    Adecco Group  annualreport2013.adecco.com/fileadmin/user_upl...
4721    Adecco Group  annualreport2014.adecco.com/fileadmin/user_upl...
4724    Adecco Group  annualreport2015.adecco.com/fileadmin/user_upl...


### 2. Only companies with CSRD reports in 2024 and sustainability reports before 2024

# Extract pdf text

In [8]:
# Get the row with the minimum number of pages
shortest_report = reports_24.loc[reports_24['pages PDF'].idxmin()]

# Get the link
print("Link to the shortest report:", shortest_report['link'])
print(shortest_report)

Link to the shortest report: https://www.karnovgroup.com/en/wp-content/uploads/sites/2/2025/02/karnovgroup-publication-of-karnov-groups-annual-report-and-sustainability-report-2024-250331.pdf
Unnamed: 0                                                              253
company_withAccessInfo                                        Karnov Group*
link                      https://www.karnovgroup.com/en/wp-content/uplo...
country                                                              Sweden
sector                                          Technology & Communications
industry                                          Internet Media & Services
publication date                                                 2025-03-31
pages PDF                                                                22
auditor                                                                 PwC
Name: 139, dtype: object


In [5]:
# randomly select 10 reports from 2024
sample = reports_24.sample(n=20, random_state=123)
sample.head(20)

Unnamed: 0.1,Unnamed: 0,company_withAccessInfo,link,country,sector,industry,publication date,pages PDF,auditor
254,287,argenx SE*,https://argenx.com/content/dam/argenx-corp/med...,Netherlands,Health Care,Biotechnology & Pharmaceuticals,2025-03-20,70,Deloitte
220,19,Continental AG,https://annualreport.continental.com/2024/en/s...,Germany,Transportation,Auto Parts,2025-03-18,125,PwC
201,107,DSB,https://www.dsb.dk/globalassets/arsrapport/202...,Denmark,Transportation,Rail Transportation,2025-02-07,106,EY
171,151,Fresenius,https://www.fresenius.com/sites/default/files/...,Germany,Health Care,Health Care Delivery,2025-03-26,154,PwC
269,222,Aena,https://www.aena.es/sites/Satellite?blobcol=ur...,Spain,Transportation,Air Freight & Logistics,2025-02-26,303,KPMG
209,166,Demant,https://assets-we.cas.dgs.com/-/media/demant/s...,Denmark,Health Care,Medical Equipment & Supplies,2025-02-05,66,PwC
20,135,Tryg,https://tryg.com/sites/tryg.com/files/2025-01/...,Denmark,Financials,Insurance,2025-01-23,79,PwC
30,277,TenneT Holding B.V.*,https://tennet-drupal.s3.eu-central-1.amazonaw...,Netherlands,Infrastructure,Electric Utilities & Power Generators,2025-03-06,60,Deloitte
273,61,Acomo,https://www.acomo.nl/wp-content/uploads/2025/0...,Netherlands,Food & Beverage,Processed Foods,2025-03-07,56,EY
42,270,Svenska Handelsbanken AB*,https://www.handelsbanken.com/tron/xgpu/info/c...,Sweden,Financials,Commercial Banks,2025-02-26,120,Deloitte


In [10]:
# Convert the report to markdown
from docling.document_converter import DocumentConverter
import time

start = time.time()
source = "karnovgroup-2024.pdf"
converter = DocumentConverter()
result = converter.convert(source)
#result.document.export_to_markdown()
print("Time taken:", (time.time() - start)/60:.2f , "minutes")	
print(result.document.export_to_markdown())

2025-05-15 14:32:43.582500: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-15 14:32:43.595649: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747312363.610216  335345 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747312363.614700  335345 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-15 14:32:43.631461: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Time taken: 3.0718769232432046 minutes
ANNUAL REPORT 2024

CLEARING THEPATH TOJUSTICE

IIIKARNOV GROUP

## Contents

| Karnov in Brief                                    |   3 |
|----------------------------------------------------|-----|
| Significant events in 2024                         |   4 |
| Multi-year overview                                |   5 |
| The CEO sets the scene                             |   6 |
| Karnov Group as an investment                      |   8 |
| Value model and financial targets                  |  10 |
| Strategy                                           |  13 |
| A European market with clear drivers               |  18 |
| Products and services                              |  22 |
| The Regions                                        |  26 |
| The Karnov share                                   |  30 |
| Sustainability Report                              |  32 |
| Corporate Governance Report                        |  54 |
| Board of Directors and Mana

In [2]:
# Convert the report to markdown
from docling.document_converter import DocumentConverter
import time

start = time.time()
source = "argenx-2024.pdf"
converter = DocumentConverter()
result = converter.convert(source)
#result.document.export_to_markdown()
print("Time taken:", (time.time() - start) / 60 , "minutes")	
print(result.document.export_to_markdown())

2025-05-15 16:23:56.640964: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-15 16:23:56.654346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747319036.668844 3875094 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747319036.673309 3875094 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-15 16:23:56.689970: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Time taken: 26.646835275491078 minutes
<!-- image -->

Disclaimer PDF print - this document is only a 'printed version' and is not the original annual financial reporting including the audited financial statements pursuant to Article 361 of Book 2 of the Dutch Civil Code.

These original annual financial reporting included in the audited financial statements and the auditor's report thereto, are included in the single report package which can be found at https://www.argenx.com/investors/financial-reports

## 2024 Annual Report including the Annual Financial Statements for the year ended December 31, 2024

This Annual Report is filed with the Dutch Authority for the Financial Markets ( Stichting Autoriteit Financiële Markten , AFM ). The following main items included in our annual report on Form 20-F for the year ended December 31, 2024 ( 2024 20-F ) filed with the United States Securities and Exchange Commission ( SEC ) on or about the date of this Annual Report have not been included 

In [3]:
markdown_text = result.document.export_to_markdown()

with open("argenx_2024.md", "w", encoding="utf-8") as f:
    f.write(markdown_text)

In [None]:
import pymupdf
doc = pymupdf.open("argenx-2024.pdf") # open the document
# select 100 pages of the document

doc.save("sample_pages.pdf") # save the document

In [6]:
# Convert the selected page to markdown
from docling.document_converter import DocumentConverter
import time

start = time.time()
source = "argenx-2024.pdf"
converter = DocumentConverter()
result = converter.convert(source)
#result.document.export_to_markdown()
print("Time taken:", (time.time() - start)/60 , "minutes")	
print(result.document.export_to_markdown())


KeyboardInterrupt: 

In [4]:
print(result)

input=InputDocument(file=PureWindowsPath('sample_pages.pdf'), document_hash='0c81e21e3111635b848d54b6e0938708bfd4769732c5d2512028e1ac17f30333', valid=True, limits=DocumentLimits(max_num_pages=9223372036854775807, max_file_size=9223372036854775807, page_range=(1, 9223372036854775807)), format=<InputFormat.PDF: 'pdf'>, filesize=81562629, page_count=10) status=<ConversionStatus.SUCCESS: 'success'> errors=[] pages=[Page(page_no=0, size=Size(width=594.719970703125, height=841.6799926757812), cells=[PdfTextCell(index=0, rgba=ColorRGBA(r=0, g=0, b=0, a=255), rect=BoundingRectangle(r_x0=39.6, r_y0=169.2359926757813, r_x1=169.674, r_y1=169.2359926757813, r_x2=169.674, r_y2=137.7399926757812, r_x3=39.6, r_y3=137.7399926757812, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), text='Annual ', orig='Annual ', text_direction=<TextDirection.LEFT_TO_RIGHT: 'left_to_right'>, confidence=1.0, from_ocr=False, rendering_mode=<PdfCellRenderingMode.UNKNOWN: -1>, widget=False, font_key='/F1', font_name='/AAAQS

In [9]:
# use docling to extract the text from the pdfs
from docling.document_converter import DocumentConverter
from tqdm import tqdm
import json

# Prepare converter
converter = DocumentConverter()

# Output list for storing results
results = []

# Loop through the first 10 rows
for idx, row in tqdm(sample.iterrows()):
    pdf_url = row["link"]
    company_name = row["company_withAccessInfo"]
    try:
        result = converter.convert(pdf_url)
        markdown_text = result.document.export_to_markdown()
        results.append({
            "index": idx,
            "company_name": company_name,
            "link": pdf_url,
            "text": markdown_text
        })
    except Exception as e:
        results.append({
            "index": idx,
            "link": pdf_url,
            "error": str(e)
        })

  from .autonotebook import tqdm as notebook_tqdm


: 

: 

In [None]:
# Save results to a JSONL file (scalable format)
with open("docling_output_sample.jsonl", "w", encoding="utf-8") as f:
    for entry in results:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

In [None]:
with open("docling_output_sample.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

df_out = pd.DataFrame(data)
