# Download Sustainability Reports

In [6]:
!pip install requests

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Downloading certifi-2025.1.31-py3-none-any.whl (166 kB)
Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl (102 kB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Using cached urllib3-2.3.0-py3-none-any.whl (128 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2025.1.31 charset-normalizer-3.4.1 idna-3.10 requests-2.32.3

In [7]:
# Write the output of `pip freeze` (installed packages with pinned versions)
# to requirements.txt in the current working directory.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
# the kernel's environment — confirm, or consider `%pip`.
!pip freeze > requirements.txt

In [8]:
import pandas as pd
import requests

## Reports for 2024
Source: Sustainability Reporting Navigator, which crowd-sources a list of CSRD-compliant reports for fiscal years starting on 01/01/2024.

Download the PDF with the list of all reports from https://www.sustainabilityreportingnavigator.com/#/csrdreports

In [4]:
# Load the 2024 report list from the local CSV file into a DataFrame.
# The path is relative, so the file must sit in the current working directory.
reports_24 = pd.read_csv('esg_reports_2024.csv')

In [9]:
# Number of rows (one per listed report entry) in the 2024 list
print(reports_24.shape[0])

277


## Reports for 2010 until 2023
Source: Donau, Charlotte-Louise, Fikir Worku Edossa, Joachim Gassen, Gaia Melloni, Inga Meringdal, Bianca Minuth, Arianna Piscella, Paul Pronobis and Victor Wagner (2023): SRN Document Database, https://github.com/trr266/srn_docs.

"Our objective is to develop this repository into a collaborative data platform that provides extensive coverage of sustainability-related documents published by European publicly-listed firms."

In [10]:
# Code from SRN API documentation https://github.com/trr266/srn_docs/blob/main/srn_docs_api.py

# Base URL of the SRN API; the functions below append their endpoint path to it.
srn_api_url = "https://api.sustainabilityreportingnavigator.com/api/"

def get_srn_companies(timeout=60):
    """
    Returns a list of companies that are included in SRN Document Database.

    Args:
        timeout (int, optional): Seconds to wait for the server before
            `requests` raises a timeout exception. Without a timeout the
            call can block indefinitely on a dying connection.
            Defaults to 60 seconds.

    Returns:
        [list{dict}]: A list containing company level metadata

    Raises:
        requests.exceptions.HTTPError: If the API responds with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not respond
            within `timeout` seconds.
    """
    response = requests.get(srn_api_url + "companies", timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error body as data.
    response.raise_for_status()
    return response.json()


def get_srn_documents(timeout=60):
    """
    Returns a list of documents that are included in SRN Document Database.

    Args:
        timeout (int, optional): Seconds to wait for the server before
            `requests` raises a timeout exception. Without a timeout the
            call can block indefinitely on a dying connection.
            Defaults to 60 seconds.

    Returns:
        [list{dict}]: A list containing document level metadata

    Raises:
        requests.exceptions.HTTPError: If the API responds with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not respond
            within `timeout` seconds.
    """
    response = requests.get(srn_api_url + "documents", timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error body as data.
    response.raise_for_status()
    return response.json()


def download_document(id, fpath, timeout=60):
    """
    Retrieves a certain document from the SRN Document Database and
    stores it at the provided file path.

    Args:
        id (str): The SRN document id.
        fpath (str): A string containing the file path where you want to
            store the file.
        timeout (int, optional): Sometimes, a download API call might
            block because of a dying connection or because the data
            is not available. If a timeout is reached, the according
            API request will raise an exception and exit.
            Defaults to 60 seconds.

    Raises:
        requests.exceptions.HTTPError: If the API responds with a 4xx/5xx
            status code. In that case no file is written.
        requests.exceptions.Timeout: If the server does not respond
            within `timeout` seconds.
    """
    response = requests.get(
        srn_api_url + f"documents/{id}/download",
        timeout=timeout
    )
    # Without this check an HTTP error page would be silently saved to
    # `fpath` as if it were the requested document.
    response.raise_for_status()
    with open(fpath, 'wb') as f:
        f.write(response.content)


# Demo from the SRN API documentation: look up one company by name and
# download its first document. In a notebook kernel `__name__` is
# "__main__", so this block runs when the cell is executed.
if __name__ == "__main__":
    companies = get_srn_companies()
    documents = get_srn_documents()
    print("Searching comapny with a name containing 'Allianz'")
    matches = [c for c in companies if 'Allianz' in c['name']]
    if not matches:
        # Indexing matches[0] below would raise IndexError on an empty list.
        print("Found 0 match(es). Nothing to retrieve.")
    else:
        print(
            f"Found {len(matches)} match(es). " +
            "Retrieving the documents for the first match."
        )
        docs = [d for d in documents if d['company_id'] == matches[0]['id']]
        FPATH = 'test_srn_docs.pdf'
        if not docs:
            # Same guard for docs[0]: a company may have no documents listed.
            print("Found 0 documents. Nothing to download.")
        else:
            print(
                f"Found {len(docs)} documents. " +
                "Retrieving the first document from the list " +
                f"and storing it as '{FPATH}'."
            )
            download_document(docs[0]['id'], FPATH)
            print("done!")

Searching comapny with a name containing 'Allianz'
Found 1 match(es). Retrieving the documents for the first match.
Found 30 documents. Retrieving the first document from the list and storing it as 'test_srn_docs.pdf'.
done!


In [16]:
# Fetch every company record from the SRN database and wrap the
# resulting list of dicts directly in a DataFrame.
srn_companies = pd.DataFrame(get_srn_companies())

In [17]:
# Get all companies that are in both the reports_24 dataframe and the SRN database.
# Some names in reports_24 carry a '*' marker, which makes an exact string
# comparison miss them, so the marker and surrounding whitespace are removed
# before matching.
# NOTE(review): assumes '*' is never part of a real company name, and that no
# further normalisation (case, legal suffixes) is needed — confirm.
companies = srn_companies[
    srn_companies['name'].isin(
        reports_24['company_withAccessInfo']
        .str.replace('*', '', regex=False)  # literal '*', not a regex quantifier
        .str.strip()
    )
]

**TODO:** Some company names in `reports_24` contain a `*`; this (and possibly other preprocessing) may need to be handled before matching.