Purpose: usage examples of my pipeline

In [1]:
import pandas as pd
esrs_reports = pd.read_excel("./data_preparation/esrs_reports.xlsx")

In [2]:
print(esrs_reports.head())

         company          isin  country publication_date   auditor  \
0     Netcompany  DK0060952919  Denmark       2025-01-29        EY   
1           Tryg  DK0060636678  Denmark       2025-01-23       PwC   
2  DSV Panalpina  DK0060079531  Denmark       2025-02-04       PwC   
3       Lundbeck  DK0061804697  Denmark       2025-02-04       PwC   
4         Vestas  DK0061539921  Denmark       2025-02-05  Deloitte   

                                                link  \
0     https://netcompany.com/investor/annual-report/   
1  https://tryg.com/sites/tryg.com/files/2025-01/...   
2  https://investor.dsv.com/static-files/d34d3dfc...   
3  https://www.lundbeck.com/content/dam/lundbeck-...   
4  https://www.vestas.com/content/dam/vestas-com/...   

                          SASB_industry  
0             Internet Media & Services  
1                             Insurance  
2               Air Freight & Logistics  
3       Biotechnology & Pharmaceuticals  
4  Wind Technology & Project Dev

In [3]:
test = esrs_reports[10:20]
test

Unnamed: 0,company,isin,country,publication_date,auditor,link,SASB_industry
10,Spar Nord Bank,DK0060036564,Denmark,2025-02-05,Deloitte,https://attachment.news.eu.nasdaq.com/a264f5d2...,Commercial Banks
11,TomTom,NL0013332471,Netherlands,2025-02-05,PwC,https://corporate.tomtom.com/static-files/2391...,Internet Media & Services
12,GN Store Nord,DK0010272632,Denmark,2025-02-06,PwC,https://www.gn.com/-/media/Files/Document-Down...,Medical Equipment & Supplies
13,Ørsted,DK0060094928,Denmark,2025-02-06,PwC,https://cdn.orsted.com/-/media/annual2024/orst...,Electric Utilities & Power Generators
14,Mærsk,DK0010244425,Denmark,2025-02-06,PwC,https://attachment.news.eu.nasdaq.com/a55930e6...,Marine Transportation
15,Solar,DK0010274844,Denmark,2025-02-06,Deloitte,https://attachment.news.eu.nasdaq.com/ac073c66...,Solar Technology & Project Developers
16,Carlsberg,DK0010181759,Denmark,2025-02-06,PwC,https://attachment.news.eu.nasdaq.com/aa5bd828...,Alcoholic Beverages
17,Rockwool,DK0010219153,Denmark,2025-02-06,PwC,https://attachment.news.eu.nasdaq.com/af5fffe3...,Building Products & Furnishings
18,Danske Bank,DK0010274414,Denmark,2025-02-07,Deloitte,https://attachment.news.eu.nasdaq.com/afa0c01a...,Commercial Banks
19,Danica Pension,n.a.,Denmark,2025-02-07,Deloitte,https://danicapension.dk/-/media/pdf/danica-pe...,Insurance


In [3]:
import os
import re
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import login
from rag_system import RAGSystem

# preparing filenames
def prepare_filename(name):
    return re.sub(r'[\\/*?:"<>|]', "", name)


# ----------------------------
# 0) Environment & Auth
# ----------------------------
os.environ["TOKENIZERS_PARALLELISM"] = "false"
dotenv_path = os.path.expanduser("~/thesis/esg_extraction/.env")
load_dotenv(dotenv_path=dotenv_path)
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
login(HF_TOKEN)


# ----------------------------
# 1) Config
# ----------------------------
DB_ROOT = Path("./faiss_dbs")
DB_ROOT.mkdir(parents=True, exist_ok=True)
ESRS_METADATA_PATH = 'EsrsMetadata.xlsx'
REPORTS_CSV_PATH = './data_preparation/esrs_reports.csv' 

# ----------------------------
# 2) Instantiate your system
# ----------------------------
rag = RAGSystem(ESRS_METADATA_PATH)

  backends.update(_get_backends("networkx.backends"))


Initializing RAG System...
Loading embedding model: Qwen/Qwen3-Embedding-0.6B...
Loading generation model: meta-llama/Llama-3.1-8B-Instruct...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


RAG System initialized successfully.


In [7]:
# ----------------------------
# 3) Process each report URL
# ----------------------------
all_results = {}  # nested dict: { report_id: { query_id: {verdict, analysis, sources} } }

import time
start_time = time.time()

for idx, row in test.iterrows():
    url = row.get("link", None)
    company_name = row['company']
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    db_path = str(DB_ROOT / report_id)
    
    if pd.isna(url) or not isinstance(url, str) or not url.strip():
        print(f"Row {idx}, Company {report_id}: no valid 'link' URL -> skipping")
        continue

    print(f"\n=== Running pipeline for: {idx} - {report_id} ===")
    try:
        # Important: use the method your class exposes
        # (expects report_id, db_path, and either pdf_url or pdf_path)
        result = rag.process_and_analyze_report(
            report_id=report_id,
            db_path=db_path,
            pdf_url=url        # we’re using the 'link' column (a URL)
        )
        # merge into big dict
        all_results.update(result)
    except Exception as e:
        print(f"Failed on {report_id}: {e}")

time = (time.time() - start_time)/ 60
print(time, "minutes needed for 10 reports")


=== Running pipeline for: SparNordBank_2024 ===

--- Starting Full Pipeline for: SparNordBank_2024 ---
Creating new vector store at faiss_dbs/SparNordBank_2024...
Clearing GPU cache...
--- Finished Pipeline for: SparNordBank_2024 ---

=== Running pipeline for: TomTom_2024 ===

--- Starting Full Pipeline for: TomTom_2024 ---
Creating new vector store at faiss_dbs/TomTom_2024...
Clearing GPU cache...
--- Finished Pipeline for: TomTom_2024 ---

=== Running pipeline for: GNStoreNord_2024 ===

--- Starting Full Pipeline for: GNStoreNord_2024 ---
Creating new vector store at faiss_dbs/GNStoreNord_2024...
Clearing GPU cache...
--- Finished Pipeline for: GNStoreNord_2024 ---

=== Running pipeline for: Ørsted_2024 ===

--- Starting Full Pipeline for: Ørsted_2024 ---
Creating new vector store at faiss_dbs/Ørsted_2024...
Clearing GPU cache...
--- Finished Pipeline for: Ørsted_2024 ---

=== Running pipeline for: Mærsk_2024 ===

--- Starting Full Pipeline for: Mærsk_2024 ---
Creating new vector st

In [9]:
25/10*500/60

20.833333333333332

In [8]:
print(all_results)

{'SparNordBank_2024': {'S1_A1': {'verdict': 'NO', 'analysis': '[[NO]] The company does not explicitly confirm that all people in its own workforce who could be materially impacted are included in the scope of its disclosure.', 'sources': ['1', '3', '4', '5', '6', '7']}, 'S1_A2': {'verdict': 'NO', 'analysis': '[[NO]] The company does not explicitly describe the types of employees in its own workforce that are subject to material impacts.', 'sources': ['1', '2', '4', '6']}, 'S1_A3': {'verdict': 'NO', 'analysis': '[[NO]] The company does not describe the types of non-employees in its own workforce that are subject to material impacts. Although the company mentions its employees as the most important resource, it does not explicitly mention non-employees such as self-employed individuals or those provided by third-party agencies.', 'sources': ['1', '4']}, 'S1_A4': {'verdict': 'YES', 'analysis': '[[YES]] The company describes activities that result in material positive impacts on its own wo