Purpose: usage examples of my pipeline

In [1]:
import pandas as pd
# Reports that were skipped or failed in the batch run further down; that loop
# appends one metadata row per such report to this CSV.
skipped_reports = pd.read_csv(filepath_or_buffer="skipped_reports.csv")

In [40]:
# Download link of the skipped report stored under index label 20.
skipped_reports.loc[20, 'link']

'https://www.fluidra.com/pdf-viewer.php?file=https://www.fluidra.com/wp-content/uploads/2025/03/3.-Integrated-Report_ENG_vFinal-ENG.pdf'

In [41]:
# Identifier of the same skipped report (index label 20).
skipped_reports.loc[20, 'report_id']

'Fluidra_2024'

In [11]:
### Load metadata of the reports to be analyzed
import pandas as pd
# Only rows from position 99 onwards are kept for this run.
# NOTE(review): presumably the first 99 reports were handled in an earlier run - confirm.
esrs_reports = pd.read_excel("./data_preparation/esrs_reports.xlsx").iloc[99:]
print(len(esrs_reports))
esrs_reports.head()

383


Unnamed: 0,company,isin,country,publication_date,auditor,link,SASB_industry,report_id
99,Volkswagen,DE0007664039,Germany,2025-03-11,EY,https://www.volkswagen-group.com/en/publicatio...,Automobiles,Volkswagen_2024
100,Helvetia,CH0466642201,Switzerland,2025-03-06,KPMG,https://www.helvetia.com/content/dam/os/corpor...,Insurance,Helvetia_2024
101,Iveco,NL0015000LU4,Netherlands,2025-03-05,Deloitte,https://www.ivecogroup.com/-/media/investors/s...,Industrial Machinery & Goods,Iveco_2024
102,Air Liquide,FR0000120073,France,2025-03-07,PwC & KPMG,https://www.airliquide.com/sites/airliquide.co...,Chemicals,AirLiquide_2024
103,ABN Amro,NL0011540547,Netherlands,2025-03-12,EY,https://downloads.ctfassets.net/1u811bvgvthc/4...,Commercial Banks,ABNAmro_2024


In [3]:
import os
import re
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import login
from rag_system import RAGSystem



# ----------------------------
# 0) Environment & Auth
# ----------------------------
# NOTE(review): presumably set to silence the tokenizers fork/parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The Hugging Face token is read from a .env file below the home directory
# rather than being written into the notebook.
dotenv_path = os.path.expanduser("~/thesis/esg_extraction/.env")
load_dotenv(dotenv_path=dotenv_path)
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")  # None if the variable is not set
login(token=HF_TOKEN)


# ----------------------------
# 1) Config
# ----------------------------
# Root folder for the vector stores; the batch loop uses DB_ROOT / report_id.
DB_ROOT = Path("./faiss_dbs")
DB_ROOT.mkdir(parents=True, exist_ok=True)

ESRS_METADATA_PATH = "EsrsMetadata.xlsx"
# Not referenced by the cells visible here (the reports are loaded from the .xlsx).
REPORTS_CSV_PATH = "./data_preparation/esrs_reports.csv"

# JSON Lines file: the batch loop appends one record per report with the keys
# report_id, company, row_index and result.
# Per the original note, result maps query_id -> {verdict, analysis, sources}.
RESULTS_PATH = "all_results.jsonl"
# CSV log of reports that were skipped or failed.
SKIPPED_PATH = "skipped_reports.csv"

# Create the skip log with a header row (the columns of esrs_reports) on first
# use; rows are appended to it later without a header.
if not Path(SKIPPED_PATH).exists():
    pd.DataFrame(columns=esrs_reports.columns).to_csv(SKIPPED_PATH, index=False)

# ----------------------------
# 2) Instantiate your system
# ----------------------------
rag = RAGSystem(ESRS_METADATA_PATH)


  backends.update(_get_backends("networkx.backends"))


Initializing RAG System...
Loading embedding model: Qwen/Qwen3-Embedding-0.6B...
Loading generation model: meta-llama/Llama-3.1-8B-Instruct...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


RAG System initialized successfully.


In [9]:
def _log_skipped(row):
    """Append one report's metadata row to the skip log at SKIPPED_PATH.

    Args:
        row: the pandas Series for one report, as yielded by
            ``esrs_reports.iterrows()``.

    The row is written without a header and without the index column, so it
    lines up with the header-only file created when SKIPPED_PATH is missing.
    """
    row.to_frame().T.to_csv(SKIPPED_PATH, mode="a", header=False, index=False)


# Run the pipeline once per report. Each result is appended to RESULTS_PATH
# as soon as it is available, and every skipped or failed report is appended
# to SKIPPED_PATH.
for idx, row in esrs_reports.iterrows():
    url = row.get("link", None)
    company_name = row['company']
    report_id = row['report_id']

    # Validate the id before it is used as a path segment. A missing cell
    # (NaN) would make `DB_ROOT / report_id` raise a TypeError outside the
    # try block and abort the whole batch; a blank id would resolve to
    # DB_ROOT itself instead of a per-report folder.
    if not isinstance(report_id, str) or not report_id.strip():
        print(f"Row {idx}: no valid 'report_id' -> skipping")
        _log_skipped(row)
        continue

    if pd.isna(url) or not isinstance(url, str) or not url.strip():
        print(f"Row {idx}, Company {report_id}: no valid 'link' URL -> skipping")
        _log_skipped(row)
        continue

    # One vector-store folder per report, below DB_ROOT.
    db_path = str(DB_ROOT / report_id)

    print(f"\n=== Running pipeline for: {idx} - {report_id} ===")
    try:
        # The 'link' column holds a URL, so it is passed as pdf_url.
        result = rag.process_and_analyze_report(
            report_id=report_id,
            db_path=db_path,
            pdf_url=url,
        )

        # Wrap the per-report result with its metadata before saving.
        record = {
            "report_id": report_id,
            "company": company_name,
            "row_index": int(idx),
            "result": result.get(report_id, {}),
        }

        # Serialise first so a non-serialisable result never touches the file,
        # then append the record as one JSON line.
        line = json.dumps(record, ensure_ascii=False)
        with open(RESULTS_PATH, "a", encoding="utf-8") as f:
            f.write(line + "\n")

    except Exception as e:
        # Deliberately broad: one failing report (download timeout, parsing
        # error, ...) must not stop the remaining reports.
        print(f"Failed on {report_id}: {e}")
        _log_skipped(row)


=== Running pipeline for: 89 - Merck_2024 ===
Failed on Merck_2024: HTTPSConnectionPool(host='www.merckgroup.com', port=443): Read timed out. (read timeout=60)

=== Running pipeline for: 90 - Telefónica_2024 ===
Creating new vector store at faiss_dbs/Telefónica_2024...
Create vectorestore at faiss_dbs/Telefónica_2024
Starting augmented generation for 65 Prompts...
Clearing GPU cache...
--- Finished Pipeline for: Telefónica_2024 ---

=== Running pipeline for: 91 - Signify_2024 ===
Creating new vector store at faiss_dbs/Signify_2024...
Create vectorestore at faiss_dbs/Signify_2024
Starting augmented generation for 65 Prompts...
Clearing GPU cache...
--- Finished Pipeline for: Signify_2024 ---

=== Running pipeline for: 92 - UPM_2024 ===
Creating new vector store at faiss_dbs/UPM_2024...
Create vectorestore at faiss_dbs/UPM_2024
Starting augmented generation for 65 Prompts...
Clearing GPU cache...
--- Finished Pipeline for: UPM_2024 ---

=== Running pipeline for: 93 - NokianRenkaat_2024 