# Explore different prompts and their performance

Purpose of the notebook: try out different prompts to find the ones with the best performance

Looked at:
- Prompt Template
- System Prompt
- Queries - Guidelines

# Preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
import urllib3
import tenacity
import configparser
import markdown
import json
import pymupdf
import requests
import os
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
import tiktoken
import asyncio
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_huggingface import HuggingFacePipeline
from torch import cuda, bfloat16
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, pipeline
import torch
from torch import cuda, bfloat16
import transformers
from tqdm import tqdm
from math import ceil
from datasets import Dataset
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support


# Load API credentials from a local .env file so that no secret is hard-coded in the notebook.
# NOTE(review): the .env location is tied to one user's home-directory layout — adjust when running elsewhere.
from dotenv import load_dotenv

dotenv_path = os.path.expanduser("~/thesis/esg_extraction/.env")
load_dotenv(dotenv_path=dotenv_path)

# os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Authenticate against the Hugging Face Hub.
# NOTE(review): HF_TOKEN is None when HUGGINGFACE_TOKEN is missing — confirm how login() behaves in that case.
from huggingface_hub import login
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
login(HF_TOKEN)

# Validation approach
Use sample reports to manually hand-code the "ground truth" that the output of the LLM is compared against.
Source: Sustainability Reporting Navigator (crowd-sourced list of CSRD-compliant reports for fiscal years starting on 01/01/2024)

Downloaded the CSV with information on all reports on 08/04/2025: https://www.sustainabilityreportingnavigator.com/#/csrdreports 

In [2]:
# Load the report overview CSV (path is relative to the notebook's working directory)
# and print the number of rows as a quick sanity check.
reports_24 = pd.read_csv('esg_reports_2024.csv')
print(len(reports_24))

277


In [5]:
# Keep only the two companies that also appear in the hand-coded validation set.
# The trailing "*" in 'Schneider Electric*' is part of the value stored in this column.
sample = reports_24[reports_24['company_withAccessInfo'].isin(['Continental AG', 'Schneider Electric*'])]
sample.head()

Unnamed: 0.1,Unnamed: 0,company_withAccessInfo,link,country,sector,industry,publication date,pages PDF,auditor
58,266,Schneider Electric*,https://www.se.com/ww/en/assets/564/document/5...,France,Resource Transformation,Electrical & Electronic Equipment,2025-03-26,186,PwC & Mazars
220,19,Continental AG,https://annualreport.continental.com/2024/en/s...,Germany,Transportation,Auto Parts,2025-03-18,125,PwC


In [7]:
# Read in the manually hand-coded validation set (based on the sample reports).
# evaluate_verdicts reads the columns report_name, query, verdict and analysis from it.
validation_set = pd.read_excel('validation_dataset.xlsx')
validation_set.head()

Unnamed: 0,report_name,query,verdict,analysis,sources
0,ContinentalAG_2024,S1_A1,YES,"[[YES]] \n""The consolidation at group level of...","[175, 176]"
1,ContinentalAG_2024,S1_A2,YES,[[YES]] \nreport explicitly mentions character...,"[175, 176]"
2,ContinentalAG_2024,S1_A3,NO,[[NO]] \ndoes not describe the types of non-em...,[176]
3,ContinentalAG_2024,S1_A4,YES,"[[YES]]\nworking time arrangements, paying ade...","[110, 173, 174]"
4,ContinentalAG_2024,S1_A5,NO,[[NO]]\ndoes not explicitly describe the types...,"[173, 174]"


In [8]:
# Consistency check: count the validation rows per report so that uneven coverage is visible.
print(validation_set.groupby(['report_name' ]).size())

report_name
ContinentalAG_2024        65
SchneiderElectric_2024    65
dtype: int64


In [9]:
# function to calculate the performance of the YES/NO predictions
def evaluate_verdicts(df_val, predicted_results, verbose=False):
    """Score predicted YES/NO verdicts against the hand-coded validation set.

    Parameters
    ----------
    df_val : pandas.DataFrame
        One row per (report, query) pair with the columns ``report_name``,
        ``query``, ``verdict`` and ``analysis``.
    predicted_results : dict
        Nested mapping ``{report_name: {query_id: {"verdict": ..., "analysis": ...}}}``.
    verbose : bool, default False
        If True, print every dropped row and every mismatch together with the
        true and the predicted analysis.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall`` and ``f1_score`` (precision,
        recall and F1 are weighted averages). All four values are ``None``
        when no row has a usable prediction.
    """
    print("\n--- LLM Evaluation: Evaluating performance on verdicts ---")
    y_true, y_pred = [], []
    mismatches = []
    dropped = 0
    total = len(df_val)

    for _, row in df_val.iterrows():
        report = row["report_name"]
        query = row["query"]
        true_verdict = row["verdict"]
        true_analysis = row["analysis"]

        # Look the prediction up once; a missing report or query yields an empty dict.
        prediction = predicted_results.get(report, {}).get(query, {})
        pred_verdict = prediction.get("verdict")
        pred_analysis = prediction.get("analysis")

        # Drop the row if the predicted verdict is missing or N/A
        if pred_verdict is None or pred_verdict == "N/A":
            if verbose:
                # The message must use pred_analysis; the name `analysis` does not exist in this scope.
                print(f"[DROPPED] {report} | {query} | True verdict: {true_verdict} | Predicted verdict: {pred_verdict}, LLM Analysis: {pred_analysis}")
            dropped += 1
            continue

        y_true.append(true_verdict)
        y_pred.append(pred_verdict)

        if verbose and true_verdict != pred_verdict:
            mismatches.append((report, query, true_verdict, pred_verdict, true_analysis, pred_analysis))

    if verbose and mismatches:
        print("\n Mismatches:")
        for report, query, true, pred, true_analysis, pred_analysis in mismatches:
            print(f"  {report} | {query} ")
            print(f" TRUE VERDICT: {true}, TRUE ANALYSIS   : {true_analysis}")
            print(f" PRED VERDICT: {pred}, PRED ANALYSIS   : {pred_analysis}")

    print(f"\n Dropped {dropped} of {total} queries due to missing verdicts.")

    # Check to avoid errors when nothing is left
    if not y_true:
        print(" No valid data to evaluate.")
        return {
            "accuracy": None,
            "precision": None,
            "recall": None,
            "f1_score": None
        }

    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }

# First Draft
Based on Ni et al., 2023 and Colesanti Senni et al., 2025
- Ni, J., Bingler, J., Colesanti-Senni, C., Kraus, M., Gostlow, G., Schimanski, T., Stammbach, D., Vaghefi, S. A., Wang, Q., Webersinke, N., Wekhof, T., Yu, T., & Leippold, M. (2023). CHATREPORT: Democratizing Sustainability Disclosure Analysis through LLM-based Tools. Swiss Finance Institute Research Paper, No. 23-111. https://doi.org/10.48550/arXiv.2307.15770
- Colesanti Senni, C., Schimanski, T., Bingler, J., Ni, J., & Leippold, M. (2025). Using AI to assess corporate climate transition disclosures. Environmental Research Communications, 7(2), 021010. https://doi.org/10.1088/2515-7620/ad9e88

With the configuration:
- PDF text extraction: PyMuPDF
- Retrieval:
    - Embedding model: OpenAI text-embedding-ada-002
    - top k: 8
    - chunk size: 350
    - chunk overlap: 50
- Generation:
    - Generative LLM: OpenAI gpt-4.1-mini-2025-04-14
    - answer_length: 200
    - temperature: 0

In [None]:
### HELPER FUNCTIONS ###

# preparing filenames
def prepare_filename(name):
    """Return `name` with every backslash, slash, *, ?, :, double quote, <, > and | removed."""
    forbidden_chars = str.maketrans("", "", '\\/*?:"<>|')
    return name.translate(forbidden_chars)
    

def _docs_to_string(docs, with_source=True):
# def _docs_to_string(docs, num_docs=TOP_K, with_source=True):
    output = ""
    # docs = docs[:num_docs]
    for doc in docs:
        output += "Content: {}\n".format(doc.page_content)
        if with_source:
            output += "Source: {}\n".format(doc.metadata['page'])
        output += "\n---\n"
    return output


def _find_answer(full_text):
    try:
        for line in full_text.splitlines():
            if "ANSWER" in line:
                idx = line.find(":") + 1
                return line[idx:].strip().strip('",')
        return full_text.strip()  # fallback if no ANSWER found
    except Exception:
        return full_text.strip()


def _find_verdict(answer_text):
    if not answer_text:
        return "N/A"
    
    # Look for [[YES]] or [[NO]], case-insensitive
    match = re.search(r'\[\[\s*(YES|NO)\s*\]\]', answer_text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    
    return "N/A"


def _find_sources(full_text):
    # Attempt to match a SOURCES line first
    sources_match = re.search(r"SOURCES\s*:\s*\[([^\]]+)\]", full_text)
    if sources_match:
        number_list = re.findall(r'\d+', sources_match.group(1))
        return [int(n) for n in number_list]

    # Fallback: extract all numbers from full text
    return [int(n) for n in re.findall(r'\b\d{3,5}\b', full_text)]  # assumes sources have 3-5 digits


In [11]:
# retrieval settings
# NOTE(review): OpenAIEmbeddings() is created with library defaults; the markdown above names
# text-embedding-ada-002 — confirm that this is the default of the installed version.
embedding_model = OpenAIEmbeddings() # initialize embedding model
TOP_K = 8            # number of chunks retrieved per query
CHUNK_SIZE = 350     # measured in characters (chunk_text passes length_function=len)
CHUNK_OVERLAP = 50   # overlap between consecutive chunks, same unit as CHUNK_SIZE

# generation settings
llm_name = 'gpt-4.1-mini-2025-04-14'
MAX_TOKEN=500        # passed as max_tokens to the chat model
llm = ChatOpenAI(model=llm_name, temperature=0, max_tokens=MAX_TOKEN) # initialize LLM
ANSWER_LENGTH=200    # word limit that is inserted into the prompt ({answer_length})

In [3]:
# Query/guideline pairs capturing the ESRS guidelines, defined following the example of
# Colesanti Senni et al., 2025.
draft = pd.read_excel('../EsrsMetadata.xlsx')
print(draft[:3])

# Create QUERIES dictionary: query_id -> question text
QUERIES = dict(zip(draft["query_id"], draft["query"]))
print(QUERIES)

# Create GUIDELINES dictionary: query_id -> additional explanation inserted into the prompt
GUIDELINES = dict(zip(draft["query_id"], draft["guidelines"]))
print(GUIDELINES)

  topic_id query_id                                              query  \
0     S1_A    S1_A1  Does the company disclose whether all people i...   
1     S1_A    S1_A2  Does the company describe the types of employe...   
2     S1_A    S1_A3  Does the company describe the types of non-emp...   

                                          guidelines      esrs paragraph  \
0  Focus on whether the company explicitly confir...  S1.SBM-3        14   
1  Check whether the company identifies which typ...  S1.SBM-3      14 a   
2  Assess whether the company identifies non-empl...  S1.SBM-3      14 a   

   related_ar  
0  AR 6 - AR7  
1         NaN  
2         NaN  
{'S1_A1': 'Does the company disclose whether all people in its own workforce who could be materially impacted are included in the scope of its disclosure?', 'S1_A2': 'Does the company describe the types of employees in its own workforce that are subject to material impacts?', 'S1_A3': 'Does the company describe the types of non-empl

In [13]:
# Baseline prompt. Placeholders filled by disclosure_prompt.format(): {sources}, {query},
# {guideline} and {answer_length}. The text of the template is sent to the model as-is, so
# any wording change (including typo fixes) is a new prompt variant and changes the results.
# NOTE(review): guideline 8 contains unbalanced double quotes around [[YES]] / [[NO]].
PROMPT_TEMPLATE = ("""
You are a senior sustainabiliy analyst with expertise in the european reporting standards evaluating a company's disclosure on social sustainability.


You are presented with the following sources from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial for answering the question:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

Please enforce to the following guidelines in your answer:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Keep your ANSWER within {answer_length} words.
4. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
5. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
6. Always acknowledge that the information provided is representing the company's view based on its report.
7. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
8. Start your answer with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.

Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

# Wrap the template so that it can be filled per query with .format(...).
disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

# System message that is sent together with every formatted prompt.
SYSTEM_PROMPT = "You are an AI assistant in the role of a Senior Equity Analyst with expertise in sustainability reporting that analyzes companys' annual reports."

In [14]:
### Report Processor Module ###

## 1. Parse the Document ##
def parse_pdf(path=None, url=None):
    """Extract the text of a PDF, page by page.

    Parameters
    ----------
    path : str, optional
        Local path of the PDF file.
    url : str, optional
        URL the PDF is downloaded from. Exactly one of ``path`` and ``url``
        must be given.

    Returns
    -------
    tuple[list[str], str]
        The text of every page (in page order) and the concatenation of all pages.

    Raises
    ------
    AssertionError
        If both or neither of ``path`` and ``url`` are provided.
    requests.HTTPError
        If the download answers with an HTTP error status.
    """
    assert (path is not None) != (url is not None), "Provide either a local path or a URL."

    if path:
        pdf = pymupdf.open(path)
    else:
        # A timeout prevents the notebook from hanging on an unresponsive server.
        response = requests.get(url, timeout=60)
        # Fail loudly on 4xx/5xx instead of handing an HTML error page to the PDF parser.
        response.raise_for_status()
        pdf = pymupdf.open(stream=io.BytesIO(response.content), filetype='pdf')

    try:
        pages = [page.get_text() for page in pdf]
    finally:
        # Release the document (and its file handle) even if text extraction fails.
        pdf.close()

    full_text = ''.join(pages)

    return pages, full_text


## 2. Chunk the text ##
def chunk_text(pages, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split every page into overlapping chunks and record the page of each chunk.

    Parameters
    ----------
    pages : list[str]
        Page texts in page order.
    chunk_size : int
        Maximum chunk length in characters (``length_function=len``).
    chunk_overlap : int
        Number of characters shared by consecutive chunks.

    Returns
    -------
    tuple[list[str], list[dict]]
        The chunks and, aligned by position, one ``{"page": "<1-based page number>"}``
        dict per chunk. Chunks never span a page boundary because every page is
        split on its own.

    Note: the defaults are bound when this cell is executed; re-assigning
    CHUNK_SIZE / CHUNK_OVERLAP later has no effect unless the cell is re-run or
    the values are passed explicitly.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " "],
    )

    chunks = []
    metadata = []

    for page_number, page in enumerate(pages, start=1):
        page_chunks = splitter.split_text(page)
        chunks.extend(page_chunks)
        # Create a separate dict per chunk. `[{...}] * n` would put the SAME dict
        # object into the list n times, so changing one chunk's metadata would
        # silently change the metadata of all chunks of that page.
        metadata.extend({"page": str(page_number)} for _ in page_chunks)

    return chunks, metadata


## 3. Generate and store vector representations ##
def get_vectorstore(chunks, metadata, db_path, embedding_model):
    """Return the FAISS index stored at `db_path`, building and saving it first if absent.

    When `db_path` already exists, `chunks` and `metadata` are ignored and the
    stored index is loaded with `embedding_model`.
    """
    if not os.path.exists(db_path):
        # No stored index yet: embed the chunks and persist the result.
        store = FAISS.from_texts(chunks, embedding_model, metadatas=metadata)
        store.save_local(db_path)
        return store

    # NOTE(review): allow_dangerous_deserialization=True un-pickles the stored index;
    # only load directories that this notebook created itself.
    # NOTE(review): a stored index is reused even if the chunking settings or the
    # embedding model changed since it was written — use a new db_path in that case.
    return FAISS.load_local(db_path, embeddings=embedding_model, allow_dangerous_deserialization=True)

In [15]:
### RetrievalAugmentedGenerator ###

## 4. Retrieve relevant chunks ##
def retrieve_chunks(vectorstore, queries, report_id, top_k=TOP_K):
    """Retrieve the `top_k` chunks for every query of a single report.

    Returns ``{report_id: {query_id: retrieved documents}}`` so that the result
    can be merged into a dict covering several reports via ``dict.update``.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    hits_by_query = {
        query_id: retriever.invoke(question)
        for query_id, question in queries.items()
    }
    return {report_id: hits_by_query}


## 5. Generate Augmented LLM answer ##
def generate_llm_answer(report_list, section_text_dict, llm, answer_length=ANSWER_LENGTH):
    """Ask the chat model every query in QUERIES for every report and parse the answers.

    Parameters
    ----------
    report_list : list[str]
        Report ids to process; each must be a key of ``section_text_dict``.
    section_text_dict : dict
        ``{report_id: {query_id: retrieved documents}}`` as produced by retrieve_chunks.
    llm :
        Chat model; called via ``llm.invoke(messages)``, the reply is read from ``.content``.
    answer_length : int
        Word limit inserted into the prompt.

    Returns
    -------
    dict
        ``{report_id: {query_id: {"verdict", "analysis", "sources"}}}``. Queries
        without retrieved context get verdict and analysis "N/A" and no sources.

    Reads the module-level QUERIES, GUIDELINES, disclosure_prompt and SYSTEM_PROMPT
    at call time.
    """
    final_results_by_report = {}

    for report in report_list:
        print(f"--- Generating Answers for Report: {report} ---")
        report_assessments = {}

        for key, query_text in QUERIES.items():
            context_str = _docs_to_string(section_text_dict[report].get(key, []), with_source=True)

            # Skip if no context is found
            if not context_str.strip():
                print(f"    -> Skipping key '{key}' due to empty context.")
                report_assessments[key] = {
                    "verdict": "N/A",
                    "analysis": "N/A",
                    "sources": []
                }
                continue

            # 1. Format the prompt for the current question
            current_prompt_text = disclosure_prompt.format(
                query=query_text,
                sources=context_str,
                guideline=GUIDELINES.get(key, ""),
                answer_length=answer_length
            )

            current_message = [
                SystemMessage(content=SYSTEM_PROMPT),
                HumanMessage(content=current_prompt_text)
            ]

            # 2. Make a single, synchronous API call
            response = llm.invoke(current_message)
            text = response.content
            if not isinstance(text, str):
                # The text-based fallback parsers below need a string.
                text = str(text)

            # 3. Parse the output
            try:
                parsed_json = json.loads(text)
                if not isinstance(parsed_json, dict):
                    # Valid JSON that is not an object (e.g. a bare list) has no
                    # .get(); route it to the text-based fallback instead of crashing.
                    raise TypeError(f"expected a JSON object, got {type(parsed_json).__name__}")
                answer = parsed_json.get("ANSWER", "")
                sources = parsed_json.get("SOURCES", [])

            except (json.JSONDecodeError, TypeError) as e:
                print(f"[Warning] JSON parsing failed for key '{key}' in report '{report}': {e}")
                answer = _find_answer(text)
                sources = _find_sources(text)

            # A non-string ANSWER (e.g. a list) would make the verdict regex raise.
            if answer is not None and not isinstance(answer, str):
                answer = str(answer)

            verdict = _find_verdict(answer)
            report_assessments[key] = {
                "verdict": verdict,
                "analysis": answer,
                "sources": sources
            }

        final_results_by_report[report] = report_assessments

    return final_results_by_report

In [None]:
# Baseline run: parse -> chunk -> embed -> retrieve per report, then one generation pass
# over all reports, followed by the evaluation against the validation set.
retrieved_chunks = {}  # Holds retrieved text chunks per report
report_ids = []        # Stores report IDs for final LLM analysis

# Loop through each report in your sample DataFrame
for idx, row in sample.iterrows():
    company_name = row['company_withAccessInfo']
    # prepare_filename drops characters such as "*"; spaces are removed as well, which yields
    # ids like "ContinentalAG_2024" (the report_name format used in the validation set).
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    print(f"\n Processing: {report_id}")

    # 1. Parse the document
    start_time = time.time()
    path = f"./sample_reports/{report_id}.pdf"
    pages, _ = parse_pdf(path=path)
    print(f"Step 1: Parsed PDF — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # 2. Chunk the text
    start_time = time.time()
    chunks, metadata = chunk_text(pages)
    print(f"Step 2: Chunked text — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # 3. Generate and store vector representations
    # (get_vectorstore loads the index from db_path instead when that path already exists)
    start_time = time.time()
    db_path = f"./faiss_db_Baseline1/{report_id}"
    vectorstore = get_vectorstore(chunks, metadata, db_path, embedding_model)
    print(f"Step 3: Generated vectorstore — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # 4. Retrieve the relevant chunks
    start_time = time.time()
    result = retrieve_chunks(vectorstore, queries=QUERIES, report_id=report_id)
    retrieved_chunks.update(result)
    report_ids.append(report_id)
    print(f"Step 4: Retrieved chunks — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

# 5. Generate LLM answers
start_time = time.time()
final_analysis = generate_llm_answer(report_ids, retrieved_chunks, llm)
print(f"\nStep 5: Generated LLM answers — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

# Last expression of the cell: the metrics dict is shown as the cell output.
evaluate_verdicts(validation_set, final_analysis)

# Optimising for Llama 8B and Qwen 0.6B

Llama 3.1 8B
- context length: 128k https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
- adjustments needed: batching for an efficient workflow, because the computation runs on our own resources

In [None]:
# reload the settings
# NOTE: chunk_text, retrieve_chunks and generate_llm_answer captured TOP_K / CHUNK_SIZE /
# CHUNK_OVERLAP / ANSWER_LENGTH as default arguments when they were defined; re-assigning the
# constants here does not change those defaults unless the defining cells are re-run.
# retrieval settings
TOP_K = 8
CHUNK_SIZE = 350
CHUNK_OVERLAP = 50

# generation settings
MAX_TOKEN=500      # used as max_new_tokens of the text-generation pipeline
ANSWER_LENGTH=200  # word limit that is inserted into the prompt
BATCH_SIZE=64      # prompts per batch handed to the pipeline
TEMPERATURE=0.01 # For more deterministic, factual output

In [None]:
### loading the model
# Note: llm_name is re-bound here; it previously held the OpenAI model id.
llm_name = "meta-llama/Llama-3.1-8B-Instruct"

# `device` is only used for the status message below; model placement is controlled by
# device_map="auto" in from_pretrained.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(llm_name)
# Reuse the end-of-sequence token as padding token so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left" # important for batching since Llama is a decoder-only architecture

# to reduce memory usage and speed up performance
quantization_config = BitsAndBytesConfig(load_in_4bit=True, # maximizing speed and minimizing memory
                                         bnb_4bit_compute_dtype=torch.bfloat16, # computations in bfloat16
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_quant_type= "nf4"
                                         )

model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

# Text-generation pipeline used by run_batched_inference.
# NOTE(review): whether `temperature` takes effect depends on sampling being enabled in the
# generation settings — confirm for this model.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    return_full_text=True,   # Important for parsing logic
    temperature=TEMPERATURE,        
    max_new_tokens=MAX_TOKEN, 
    batch_size=BATCH_SIZE
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded on cuda:0


In [None]:
# Initialize the embedding model
# The device is hard-coded to the GPU here; see the inline hint for CPU-only machines.
embeddings_qwen = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={'device': 'cuda'} # specify device='cpu' if GPU not available 
)

Device set to use cuda:0


In [None]:
### 5. Inference with Llama ###

    ## A) Prepare Prompts and Metadata ##
def prepare_prompts(report_list, section_text_dict):
    """Build one chat prompt per (report, query) pair for batched inference.

    Returns a tuple ``(prompts, metadata, results)``:
    - prompts: list of ``[system message, user message]`` chat conversations,
    - metadata: list of ``{"report", "key"}`` dicts aligned with ``prompts``,
    - results: ``{report: {}}`` pre-filled with a "NO" entry for every query
      that has no retrieved context (such queries get no prompt).

    NOTE(review): generate_llm_answer records "N/A" for an empty context, this
    function records "NO", so the evaluation counts these rows instead of
    dropping them — confirm that the difference is intended.
    """
    print("--- Step 1: Preparing all prompts ---")
    chat_prompts = []
    prompt_index = []
    # Results container; entries for skipped queries are written right away
    results = dict((report, {}) for report in report_list)

    for report in report_list:
        for key, query_text in QUERIES.items():
            context_str = _docs_to_string(section_text_dict[report].get(key, []))

            if context_str.strip():
                user_text = disclosure_prompt.format(
                    query=query_text,
                    sources=context_str,
                    guideline=GUIDELINES.get(key, ""),
                    answer_length=ANSWER_LENGTH
                )
                system_turn = {"role": "system", "content": SYSTEM_PROMPT}
                user_turn = {"role": "user", "content": user_text}
                chat_prompts.append([system_turn, user_turn])
                prompt_index.append({"report": report, "key": key})
            else:
                print(f"  -> Skipping '{report}/{key}' due to empty context.")
                results[report][key] = {
                    "verdict": "NO",
                    "analysis": "No relevant context was found to answer the question.",
                    "sources": []
                }

    return chat_prompts, prompt_index, results


    ## B) Run batched Inference ##
def run_batched_inference(prompts, generate_text_pipeline, batch_size=BATCH_SIZE):
    """Send all prompts through the generation pipeline and return its responses.

    Returns an empty list when there is nothing to process. Tokenization and
    batching are left to the pipeline; the CUDA cache is emptied after the call.
    """
    print(f"\n--- Step 2: Sending {len(prompts)} prompts to the pipeline ---")

    if prompts:
        started_at = time.time()
        outputs = generate_text_pipeline(prompts, batch_size=batch_size)
        # free cached GPU memory once generation has finished
        torch.cuda.empty_cache()
        elapsed_minutes = (time.time() - started_at) / 60
        print(f"Generated LLM answers — Computation time: {elapsed_minutes:.2f} minutes")
        return outputs

    print("No prompts to process.")
    return []


    ## C) Parse the results ##
def parse_results(responses, metadata, existing_results):
    """Parse the pipeline responses into verdict / analysis / sources entries.

    Parameters
    ----------
    responses : list
        Pipeline output aligned with ``metadata``; the reply text is read from
        ``response[0]['generated_text'][-1]['content']``.
    metadata : list[dict]
        One ``{"report", "key"}`` dict per response.
    existing_results : dict
        ``{report: {query_id: entry}}``; updated in place and returned.

    Returns
    -------
    dict
        ``existing_results`` with one entry per parsed response. If no
        [[YES]] / [[NO]] verdict is found, analysis and sources are set to "N/A".
    """
    print("\n--- Step 3: Parsing all responses ---")
    for meta, response in zip(metadata, responses):
        report = meta["report"]
        key = meta["key"]

        full_text = response[0]['generated_text'][-1]['content']

        parsed_json = None
        json_match = re.search(r'\{.*?\}', full_text, re.DOTALL)

        if json_match:
            json_str = json_match.group(0)
            try:
                parsed_json = json.loads(json_str)
            except json.JSONDecodeError as e:
                # The non-greedy match stops at the FIRST "}", which cuts the object
                # short when the answer text or a nested value contains a brace.
                # Retry with a real JSON parser that starts at the opening brace and
                # ignores any text after the object.
                try:
                    parsed_json, _ = json.JSONDecoder().raw_decode(full_text, json_match.start())
                except json.JSONDecodeError:
                    print(f"\n--- JSON decode error with {report} {key} ---\n{e}\nProblematic JSON:\n{json_str}\nFull text:\n{full_text}\n")
        else:
            print("No JSON found. This was the LLM response:", full_text)

        if isinstance(parsed_json, dict):
            answer = parsed_json.get("ANSWER", "")
            sources = parsed_json.get("SOURCES", [])
            # A non-string ANSWER (e.g. a list) would make the verdict regex raise.
            if answer is not None and not isinstance(answer, str):
                answer = str(answer)
        else:
            # Text-based fallback when no JSON object could be decoded.
            answer = _find_answer(full_text)
            sources = _find_sources(full_text)

        verdict = _find_verdict(answer)
        if verdict in [None, "N/A"]:
            answer = "N/A"
            sources = "N/A"
            print(f"\n--- Verdict not found in {report} {key} ---\nFull LLM response:\n{full_text}\n")

        existing_results[report][key] = {
            "verdict": verdict,
            "analysis": answer,
            "sources": sources
        }

    return existing_results

In [None]:
# Run the whole process again such that the generation is based on new embeddings
# (embeddings_qwen instead of the OpenAI embedding model, with its own index directory).

retrieved_chunks = {}  # Holds retrieved text chunks per report
report_ids = []        # Stores report IDs for final LLM analysis

# Loop through each report in your sample DataFrame
for idx, row in sample.iterrows():
    company_name = row['company_withAccessInfo']
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    print(f"\n Processing: {report_id}")

    # 1. Parse the document
    path = f"./sample_reports/{report_id}.pdf"
    pages, _ = parse_pdf(path=path)
    
    # 2. Chunk the text
    chunks, metadata = chunk_text(pages)
    
    # 3. Generate and store vector representations
    # (a separate directory keeps this index apart from the one of the baseline run)
    start_time = time.time()
    db_path = f"./faiss_db_Baseline2/{report_id}"
    vectorstore = get_vectorstore(chunks, metadata, db_path, embedding_model=embeddings_qwen)
    print(f"Step 3: Generated vectorstore — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # 4. Retrieve the relevant chunks
    start_time = time.time()
    result = retrieve_chunks(vectorstore, queries=QUERIES, report_id=report_id)
    retrieved_chunks.update(result)
    report_ids.append(report_id)
    print(f"Step 4: Retrieved chunks — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

# 5. Generate LLM answers
    # 1. Prepare prompts
# Note: `metadata` is re-bound here — from the chunk metadata of the last report to the
# per-prompt {"report", "key"} records returned by prepare_prompts.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
    # 2. Run inference
model_responses = run_batched_inference(prompts, generate_text)
    # 3. Parse and finalize results
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance
evaluate_verdicts(validation_set, final_analysis)


 Processing: SchneiderElectric_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.04 minutes

 Processing: ContinentalAG_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes
--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 5.32 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[NO]] The company does not explicitly disclose whether and how it ensures that its own practices do not cause or contribute to material negative impacts on its own workforce.",
  "SOURCES": [
    216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,

{'accuracy': 0.7153846153846154,
 'precision': 0.7828682128040646,
 'recall': 0.7153846153846154,
 'f1_score': 0.7226666296198535}

## A) Prompt Templates
- based on Qwen 0.6B and Llama 8B

In [None]:
### A 1) incl. example of an output
# Variant of the baseline prompt that additionally shows the expected JSON output.
# The doubled braces {{ }} are escapes: they are rendered as literal { } when the template is
# formatted, while single-brace names such as {sources} are placeholders.
# This cell re-binds PROMPT_TEMPLATE and disclosure_prompt; prepare_prompts reads
# disclosure_prompt at call time and therefore uses the new variant.

PROMPT_TEMPLATE = ("""
You are a senior sustainabiliy analyst with expertise in the european reporting standards evaluating a company's disclosure on social sustainability.


You are presented with the following sources from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial for answering the question:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

Please enforce to the following guidelines in your answer:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Keep your ANSWER within {answer_length} words.
4. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
5. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
6. Always acknowledge that the information provided is representing the company's view based on its report.
7. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
8. Start your ANSWER with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.

Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
Format your output EXACTLY like this:
{{
  "ANSWER": "[[YES]] or [[NO]] Here follows your explanation",
  "SOURCES": ["1", "216", "181-182", "174"]
}}
DO NOT include any non-JSON text, preamble, or explanation before or after the JSON output.

Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

# Re-uses report_ids and retrieved_chunks from the previous run cell; only the prompt changed.
# 5. Generate LLM answers
    # 1. Prepare prompts
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
    # 2. Run inference
model_responses = run_batched_inference(prompts, generate_text)
    # 3. Parse and finalize results
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance
evaluate_verdicts(validation_set, final_analysis)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 3.41 minutes

--- Step 3: Parsing all responses ---

--- LLM Evaluation: Evaluating performance on verdicts ---

 Dropped 0 of 130 queries due to missing verdicts.


{'accuracy': 0.7615384615384615,
 'precision': 0.7629160382101557,
 'recall': 0.7615384615384615,
 'f1_score': 0.7621808771424383}

In [None]:
### A 2) Separate response and format instructions
# Variant A 2: the prompt carries the answer guidelines under
# "### Response Instructions ###" and the JSON output rules under a separate
# "### Formatting Instructions ###" heading.
#
# Notes on the literal below:
# - It is a normal (non-raw) triple-quoted string, so each "\n" escape adds a
#   line break on top of the physical one that follows it.
# - The JSON example uses doubled braces ({{ }}) so that it is emitted as literal
#   braces when the template is formatted; single braces mark placeholders.
# NOTE(review): the prompt text contains typos ("sustainabiliy", "enforce to"),
# unbalanced quotes in instruction 7, and instruction 3 describes *environmental*
# greenwashing although the role is about social sustainability. Left unchanged
# here because the metrics recorded for this cell were presumably produced with
# exactly this wording — fix only together with a re-run.

PROMPT_TEMPLATE = ("""
You are a senior sustainabiliy analyst with expertise in the european reporting standards evaluating a company's disclosure on social sustainability.


You are presented with the following sources from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial for answering the question:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

### Response Instructions ###
Please enforce to the following guidelines in your ANSWER:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
4. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
5. Always acknowledge that the information provided is representing the company's view based on its report.
6. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
7. Start your ANSWER with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.
8. Keep your ANSWER within {answer_length} words.

### Formatting Instructions ###
- Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
- Your response **must** be returned as a **valid JSON object**.
- Only output the JSON object — no preamble, no markdown, no extra commentary.
- Use this exact format for your final output:
{{
  "ANSWER": "[[YES]] or [[NO]] Here follows your explanation",
  "SOURCES": ["1", "216", "181-182", "174"]
}}

Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

# Wrap the template string and declare its four placeholders explicitly.
# NOTE(review): disclosure_prompt is not passed to any call below; presumably
# prepare_prompts reads it as a notebook global — confirm.
disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
# verbose=True presumably enables the per-query mismatch listing — confirm.
evaluate_verdicts(validation_set, final_analysis, verbose=True)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 3.76 minutes

--- Step 3: Parsing all responses ---

--- LLM Evaluation: Evaluating performance on verdicts ---

 Mismatches:
  ContinentalAG_2024 | S1_A1 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
"The consolidation at group level of the assessment of the impacts,
risks and opportunities for own workforce includes Continental’s
entire workforce." (page 175)
It also highlights that vulnerable groups, including migrant workers, are considered within this scope (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not explicitly confirm that all materially impacted individuals in its own workforce are considered. Although the report mentions that the material impacts on own workforce are not directly related to changes resulting from sustainability (Source: 175), it does not provide clear evidence that all people in its own workforce 

{'accuracy': 0.7692307692307693,
 'precision': 0.7720178372352287,
 'recall': 0.7692307692307693,
 'f1_score': 0.7704374057315233}

In [None]:
### A 3) Role definition
# Variant A 3: differs from variant A 2 in the opening role sentence, which is
# reworded and split into a role statement plus an explicit task
# ("critically evaluate ... social sustainability disclosures").
#
# Notes on the literal below:
# - It is a normal (non-raw) triple-quoted string, so each "\n" escape adds a
#   line break on top of the physical one that follows it.
# - The JSON example uses doubled braces ({{ }}) so that it is emitted as literal
#   braces when the template is formatted; single braces mark placeholders.
# NOTE(review): the instruction list still contains "enforce to", unbalanced
# quotes in instruction 7 and the *environmental* greenwashing wording. Left
# unchanged because the recorded metrics presumably depend on this exact text.

PROMPT_TEMPLATE = ("""
You are a senior sustainability analyst with deep expertise in European reporting standards. Your role is to critically evaluate a company's social sustainability disclosures.

You are presented with the following sources from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial for answering the question:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

### Response Instructions ###
Please enforce to the following guidelines in your ANSWER:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
4. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
5. Always acknowledge that the information provided is representing the company's view based on its report.
6. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
7. Start your ANSWER with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.
8. Keep your ANSWER within {answer_length} words.

### Formatting Instructions ###
- Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
- Your response **must** be returned as a **valid JSON object**.
- Only output the JSON object — no preamble, no markdown, no extra commentary.
- Use this exact format for your final output:
{{
  "ANSWER": "[[YES]] or [[NO]] Here follows your explanation",
  "SOURCES": ["1", "216", "181-182", "174"]
}}

Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

# Wrap the template string and declare its four placeholders explicitly.
# NOTE(review): disclosure_prompt is not passed to any call below; presumably
# prepare_prompts reads it as a notebook global — confirm.
disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
# verbose=True presumably enables the per-query mismatch listing — confirm.
evaluate_verdicts(validation_set, final_analysis, verbose=True)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 3.88 minutes

--- Step 3: Parsing all responses ---

--- LLM Evaluation: Evaluating performance on verdicts ---

 Mismatches:
  ContinentalAG_2024 | S1_A1 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
"The consolidation at group level of the assessment of the impacts,
risks and opportunities for own workforce includes Continental’s
entire workforce." (page 175)
It also highlights that vulnerable groups, including migrant workers, are considered within this scope (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not explicitly confirm that all materially impacted individuals in its own workforce are considered.
  ContinentalAG_2024 | S1_A2 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
report explicitly mentions characteristics of the company's employees,  acknowledges vulnerable groups such as migrant workers (page 176) and women 

{'accuracy': 0.7461538461538462,
 'precision': 0.7476118652589241,
 'recall': 0.7461538461538462,
 'f1_score': 0.7468377079258215}

In [None]:
### A 4) Explicitly Own workforce
# Variant A 4: restricts the analysis to the company's own workforce in two
# places — the role sentence ("focusing exclusively on the company's own
# workforce") and an extra paragraph after the sources that tells the model to
# disregard contractors, suppliers and other external workforce categories.
# The sources are additionally introduced as "numbered sources extracted
# verbatim", and the guideline is framed as "crucial background context".
#
# Notes on the literal below:
# - It is a normal (non-raw) triple-quoted string, so each "\n" escape adds a
#   line break on top of the physical one that follows it.
# - The JSON example uses doubled braces ({{ }}) so that it is emitted as literal
#   braces when the template is formatted; single braces mark placeholders.
# NOTE(review): the prompt text still contains typos ("sustainabiliy",
# "enforce to") and unbalanced quotes in instruction 7. Left unchanged because
# the recorded metrics presumably depend on this exact text.

PROMPT_TEMPLATE = ("""
You are a senior sustainabiliy analyst with expertise in the european reporting standards evaluating a company's disclosure on social sustainability, focusing exclusively on the company's own workforce.

You are presented with the following numbered sources extracted verbatim from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Only consider sources that specifically discuss the company’s **own workforce**. Disregard any information related to contractors, suppliers, or other external workforce categories.

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial background context:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

### Response Instructions ###
Please enforce to the following guidelines in your ANSWER:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
4. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
5. Always acknowledge that the information provided is representing the company's view based on its report.
6. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
7. Start your ANSWER with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.
8. Keep your ANSWER within {answer_length} words.

### Formatting Instructions ###
- Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
- Your response **must** be returned as a **valid JSON object**.
- Only output the JSON object — no preamble, no markdown, no extra commentary.
- Use this exact format for your final output:
{{
  "ANSWER": "[[YES]] or [[NO]] Here follows your explanation",
  "SOURCES": ["1", "216", "181-182", "174"]
}}

Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

# Wrap the template string and declare its four placeholders explicitly.
# NOTE(review): disclosure_prompt is not passed to any call below; presumably
# prepare_prompts reads it as a notebook global — confirm.
disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
# verbose=True presumably enables the per-query mismatch listing — confirm.
evaluate_verdicts(validation_set, final_analysis, verbose=True)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 4.24 minutes

--- Step 3: Parsing all responses ---

--- LLM Evaluation: Evaluating performance on verdicts ---

 Mismatches:
  ContinentalAG_2024 | S1_A1 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
"The consolidation at group level of the assessment of the impacts,
risks and opportunities for own workforce includes Continental’s
entire workforce." (page 175)
It also highlights that vulnerable groups, including migrant workers, are considered within this scope (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not explicitly confirm that all materially impacted individuals in its own workforce are considered.
  ContinentalAG_2024 | S1_A2 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
report explicitly mentions characteristics of the company's employees,  acknowledges vulnerable groups such as migrant workers (page 176) and women 

{'accuracy': 0.7384615384615385,
 'precision': 0.7576548208127155,
 'recall': 0.7384615384615385,
 'f1_score': 0.7437331627807817}

In [None]:
### A 5) Explicitly Own workforce only with sources
# Variant A 5: keeps the own-workforce restriction only in the paragraph that
# follows the sources (disregard contractors, suppliers and other external
# workforce categories); the role sentence does not mention the restriction.
# The sources are introduced as "numbered sources extracted verbatim", and the
# guideline is framed as "crucial background context".
#
# Notes on the literal below:
# - It is a normal (non-raw) triple-quoted string, so each "\n" escape adds a
#   line break on top of the physical one that follows it.
# - The JSON example uses doubled braces ({{ }}) so that it is emitted as literal
#   braces when the template is formatted; single braces mark placeholders.
# NOTE(review): the prompt text still contains typos ("sustainabiliy",
# "enforce to") and unbalanced quotes in instruction 7. Left unchanged because
# the recorded metrics presumably depend on this exact text.

PROMPT_TEMPLATE = ("""
You are a senior sustainabiliy analyst with expertise in the european reporting standards evaluating a company's disclosure on social sustainability.

You are presented with the following numbered sources extracted verbatim from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Only consider sources that specifically discuss the company’s **own workforce**. Disregard any information related to contractors, suppliers, or other external workforce categories.

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial background context:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

### Response Instructions ###
Please enforce to the following guidelines in your ANSWER:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
4. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
5. Always acknowledge that the information provided is representing the company's view based on its report.
6. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
7. Start your ANSWER with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.
8. Keep your ANSWER within {answer_length} words.

### Formatting Instructions ###
- Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
- Your response **must** be returned as a **valid JSON object**.
- Only output the JSON object — no preamble, no markdown, no extra commentary.
- Use this exact format for your final output:
{{
  "ANSWER": "[[YES]] or [[NO]] Here follows your explanation",
  "SOURCES": ["1", "216", "181-182", "174"]
}}

Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

# Wrap the template string and declare its four placeholders explicitly.
# NOTE(review): disclosure_prompt is not passed to any call below; presumably
# prepare_prompts reads it as a notebook global — confirm.
disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
# verbose=True presumably enables the per-query mismatch listing — confirm.
evaluate_verdicts(validation_set, final_analysis, verbose=True)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 3.78 minutes

--- Step 3: Parsing all responses ---

--- LLM Evaluation: Evaluating performance on verdicts ---

 Mismatches:
  ContinentalAG_2024 | S1_A1 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
"The consolidation at group level of the assessment of the impacts,
risks and opportunities for own workforce includes Continental’s
entire workforce." (page 175)
It also highlights that vulnerable groups, including migrant workers, are considered within this scope (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not explicitly confirm that all materially impacted individuals in its own workforce are considered.
  ContinentalAG_2024 | S1_A2 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
report explicitly mentions characteristics of the company's employees,  acknowledges vulnerable groups such as migrant workers (page 176) and women 

{'accuracy': 0.7615384615384615,
 'precision': 0.7694943504467313,
 'recall': 0.7615384615384615,
 'f1_score': 0.7643789344388146}

## B) System Prompt 

Baseline: SYSTEM_PROMPT = "You are an AI assistant in the role of a Senior Equity Analyst with expertise in sustainability reporting that analyzes companys' annual reports."

In [None]:
# Best performing prompt template:
### A 2) Separate response and format instructions
# Re-declares the A 2 template and rebuilds disclosure_prompt from it; this cell
# runs no inference itself.
# NOTE(review): this cell shows no execution count while the system prompt
# cells below are numbered 15 and up — confirm it was executed in the same
# kernel session, otherwise those runs used whichever PROMPT_TEMPLATE /
# disclosure_prompt was live at the time.
#
# Notes on the literal below:
# - It is a normal (non-raw) triple-quoted string, so each "\n" escape adds a
#   line break on top of the physical one that follows it.
# - The JSON example uses doubled braces ({{ }}) so that it is emitted as literal
#   braces when the template is formatted; single braces mark placeholders.

PROMPT_TEMPLATE = ("""
You are a senior sustainabiliy analyst with expertise in the european reporting standards evaluating a company's disclosure on social sustainability.


You are presented with the following sources from the company's annual report:
--------------------- [BEGIN OF SOURCES]\n
{sources}\n
--------------------- [END OF SOURCES]\n

Given the sources information and no prior knowledge, your main task is to respond to the posed question encapsulated in "||".
Question: ||{query}||

Please consider the following additional explanation to the question encapsulated in "+++++" as crucial for answering the question:
+++++ [BEGIN OF EXPLANATION]
{guideline}
+++++ [END OF EXPLANATION]

### Response Instructions ###
Please enforce to the following guidelines in your ANSWER:
1. Your response must be precise, thorough, and grounded on specific extracts from the report to verify its authenticity.
2. If you are unsure, simply acknowledge the lack of knowledge, rather than fabricating an answer.
3. Be skeptical to the information disclosed in the report as there might be greenwashing (exaggerating the firm's environmental responsibility). Always answer in a critical tone.
4. Cheap talks are statements that are costless to make and may not necessarily reflect the true intentions or future actions of the company. Be critical for all cheap talks you discovered in the report.
5. Always acknowledge that the information provided is representing the company's view based on its report.
6. Scrutinize whether the report is grounded in quantifiable, concrete data or vague, unverifiable statements, and communicate your findings.
7. Start your ANSWER with a "[[YES]]"" or ""[[NO]]"" depending on whether you would answer the question with a yes or no. Always complement your judgement on yes or no with a short explanation that summarizes the sources in an informative way, i.e. provide details.
8. Keep your ANSWER within {answer_length} words.

### Formatting Instructions ###
- Format your answer in JSON format with the two keys: ANSWER (this should contain your answer string without sources), and SOURCES (this should be a list of the SOURCE numbers that were referenced in your answer).
- Your response **must** be returned as a **valid JSON object**.
- Only output the JSON object — no preamble, no markdown, no extra commentary.
- Use this exact format for your final output:
{{
  "ANSWER": "[[YES]] or [[NO]] Here follows your explanation",
  "SOURCES": ["1", "216", "181-182", "174"]
}}

Your FINAL_ANSWER in JSON (ensure there's no format error):
""")

# Wrap the template string and declare its four placeholders explicitly.
disclosure_prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["query", "sources", "guideline", "answer_length"]
)

In [15]:
### 1) Compliance & Standards Focus
# System prompt variant 1: names ESRS explicitly and spells out the analyst's
# task (compliance gaps, potential greenwashing, robustness of metrics).
# NOTE(review): SYSTEM_PROMPT is rebound here but not passed to any call in this
# cell; presumably a helper (prepare_prompts or generate_text) reads it as a
# notebook global — confirm, otherwise this variant has no effect.

SYSTEM_PROMPT = "You are an AI assistant acting as a Senior Equity Analyst specializing in sustainability reporting, with deep expertise in European Sustainability Reporting Standards (ESRS). Your role is to critically analyze company annual reports, identifying compliance gaps, potential greenwashing, and the robustness of reported sustainability metrics."

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
evaluate_verdicts(validation_set, final_analysis)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 8.10 minutes

--- Step 3: Parsing all responses ---

--- Verdict not found in SchneiderElectric_2024 S1_A7 ---
Full LLM response:
{
  "ANSWER": "NO",
  "SOURCES": [
    105,
    222,
    218,
    242,
    223,
    279,
    313,
    388,
    379,
    113
  ]
}

The company does not explicitly disclose whether it has developed an understanding of how the people in its own workforce with particular characteristics, contexts, or activities may be at greater risk of harm. While the report mentions various initiatives and programs aimed at promoting a responsible workplace and mitigating risks, it does not provide specific information on identifying and addressing potential risks faced by specific groups within its workforce.

The report mentions the importance of building a culture of respect and recognizing different forms of harassment (Source 224), but it d

{'accuracy': 0.6875,
 'precision': 0.7766516516516517,
 'recall': 0.6875,
 'f1_score': 0.6932571849668386}

In [16]:
### 2) ESRS expertise
# System prompt variant 2: states the analyst's expertise as
# "European Sustainability Reporting Standards".
# NOTE(review): SYSTEM_PROMPT is rebound here but not passed to any call in this
# cell; presumably a helper (prepare_prompts or generate_text) reads it as a
# notebook global — confirm, otherwise this variant has no effect.

SYSTEM_PROMPT = "You are an AI assistant in the role of a Senior Equity Analyst with expertise in European Sustainability Reporting Standards that analyzes companies' annual reports."

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
evaluate_verdicts(validation_set, final_analysis)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 8.31 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[NO]] The company does not provide a clear description of how it tracks the effectiveness of actions taken to mitigate material risks arising from its impacts and dependencies on its own workforce.",
  "SOURCES": [
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
    "273",
   

{'accuracy': 0.7307692307692307,
 'precision': 0.7634096626696292,
 'recall': 0.7307692307692307,
 'f1_score': 0.7375120705155955}

In [17]:
### 3) Social sustainability expertise
# System prompt variant 3: states the analyst's expertise as
# "social sustainability disclosures".
# NOTE(review): SYSTEM_PROMPT is rebound here but not passed to any call in this
# cell; presumably a helper (prepare_prompts or generate_text) reads it as a
# notebook global — confirm, otherwise this variant has no effect.

SYSTEM_PROMPT = "You are an AI assistant in the role of a Senior Equity Analyst with expertise in social sustainability disclosures that analyzes companies' annual reports."

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
evaluate_verdicts(validation_set, final_analysis)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 8.90 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[NO]] The company does not explicitly disclose what resources are allocated to the management of its material impacts on its own workforce.",
  "SOURCES": [
    273, 273, 119, 168, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 2

{'accuracy': 0.7615384615384615,
 'precision': 0.787806185643244,
 'recall': 0.7615384615384615,
 'f1_score': 0.7671365685189709}

In [18]:
### 4) Workforce and labour expertise
# System prompt variant 4: states the analyst's expertise as
# "workforce and labor practice reporting".
# NOTE(review): SYSTEM_PROMPT is rebound here but not passed to any call in this
# cell; presumably a helper (prepare_prompts or generate_text) reads it as a
# notebook global — confirm, otherwise this variant has no effect.

SYSTEM_PROMPT = "You are an AI assistant in the role of a Senior Equity Analyst with expertise in workforce and labor practice reporting that analyzes companies' annual reports."

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
evaluate_verdicts(validation_set, final_analysis)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 8.51 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[NO]] The company describes some initiatives and programs to mitigate material risks arising from its impacts and dependencies on its own workforce, but it does not provide a clear description of how it tracks the effectiveness of these actions.",
  "SOURCES": [
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,
    273,

{'accuracy': 0.7538461538461538,
 'precision': 0.7830901856763925,
 'recall': 0.7538461538461538,
 'f1_score': 0.7598335147106555}

In [19]:
### 5) Sustainability metrics
# System prompt variant 5: states the analyst's expertise as
# "sustainability performance metrics".
# NOTE(review): SYSTEM_PROMPT is rebound here but not passed to any call in this
# cell; presumably a helper (prepare_prompts or generate_text) reads it as a
# notebook global — confirm, otherwise this variant has no effect.

SYSTEM_PROMPT = "You are an AI assistant in the role of a Senior Equity Analyst with expertise in sustainability performance metrics that analyzes companies' annual reports."

# 5. Generate LLM answers
# 5.1 Prepare prompts: called with the report ids and the retrieved chunks;
#     returns the prompts plus two companion objects that are reused in 5.3.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference: the prompts are handed over together with generate_text
#     (defined elsewhere in the notebook).
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results: final_analysis is passed in and rebound to
#     the value parse_results returns.
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set.
# Last expression of the cell, so its return value is what the notebook displays.
evaluate_verdicts(validation_set, final_analysis)

--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 8.93 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[NO]] The company does not explicitly disclose what resources are allocated to the management of its material impacts on its own workforce.",
  "SOURCES": [
    273, 273, 119, 168, 278, 278, 278, 278, 206, 206, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 1

{'accuracy': 0.7615384615384615,
 'precision': 0.7825641025641026,
 'recall': 0.7615384615384615,
 'f1_score': 0.7666401231618623}

## C) Queries - Guidelines
### 1. Shorter, non-question queries

In [20]:
### 1. shorter query and all other information in the guidelines
QUERIES = {
    'S1_A1': 'Scope of disclosure for materially impacted workforce',
    'S1_A2': 'Types of employees subject to material impacts',
    'S1_A3': 'Types of non-employees in workforce subject to material impacts',
    'S1_A4': 'Activities resulting in material positive impacts on workforce',
    'S1_A5': 'Types of employees and non-employees affected by positive impacts',
    'S1_A6': 'Workforce impacts from climate transition plans',
    'S1_A7': 'Understanding of workforce groups at greater risk of harm',
    'S1_A8': 'Policies for managing material impacts, risks, and opportunities for own workforce',
    'S1_A9': 'Applicability of workforce-related policies to specific groups or entire workforce',
    'S1_A10': 'Actions to prevent or mitigate negative impacts on workforce',
    'S1_A11': 'Tracking and assessing effectiveness of actions for workforce impacts',
    'S1_A12': 'Initiatives for delivering positive impacts for own workforce',
    'S1_A13': 'Process to identify necessary actions for negative workforce impacts',
    'S1_A14': 'Actions to pursue material opportunities for own workforce',
    'S1_A15': 'Ensuring own practices do not cause negative workforce impacts',
    'S1_A16': 'Resource allocation for managing workforce impacts',
    'S1_B1': 'Material risks from dependencies on its own workforce',
    'S1_B2': 'Group-specific risks or opportunities for the workforce',
    'S1_B3': 'Actions to mitigate material risks for own workforce',
    'S1_B4': 'Tracking effectiveness of actions to mitigate workforce risks',
    'S1_C1': 'Operations at risk of forced or compulsory labour by type',
    'S1_C2': 'Operations at risk of forced or compulsory labour by country',
    'S1_C3': 'Operations at risk of child labour by type',
    'S1_C4': 'Operations at risk of child labour by country',
    'S1_C5': 'Human rights policy commitments for its own workforce',
    'S1_C6': 'General approach to respecting human rights of own workforce',
    'S1_C7': 'Alignment of workforce policies with international human rights instruments',
    'S1_C8': 'Policies addressing trafficking, forced labour, and child labour',
    'S1_C9': 'Global Framework Agreements for respecting workforce human rights',
    'S1_C10': 'Number of severe human rights incidents connected to workforce',
    'S1_C11': 'Amount of fines and compensation for workforce human rights incidents',
    'S1_D1': 'General approach to engaging with its own workforce',
    'S1_D2': 'How workforce perspectives inform decisions and activities',
    'S1_D3': 'Direct engagement vs. workers’ representatives for managing impacts',
    'S1_D4': 'Stage of decision-making for workforce engagement',
    'S1_D5': 'Type of engagement with its own workforce',
    'S1_D6': 'Frequency of engagement with its own workforce',
    'S1_D7': 'Function and senior role responsible for workforce engagement',
    'S1_D8': 'Assessing effectiveness of workforce engagement on decisions',
    'S1_D9': 'Gaining insight from vulnerable or marginalised groups in workforce',
    'S1_D10': 'Workforce engagement in setting workforce-related targets',
    'S1_D11': 'Workforce engagement in tracking performance against targets',
    'S1_D12': 'Workforce engagement in identifying lessons learned from performance',
    'S1_E1': 'Policies on eliminating discrimination and promoting equal opportunity',
    'S1_E2': 'Specific grounds of discrimination addressed in workforce policy',
    'S1_E3': 'Policy commitments to include or support at-risk groups in workforce',
    'S1_E4': 'Implementation of discrimination and inclusion policies for workforce',
    'S1_E5': 'Fines, penalties, and compensation for work-related discrimination',
    'S1_F1': 'General approach to providing remedy for human rights impacts on workforce',
    'S1_F2': 'Approach to remedy for negative impacts caused by the company',
    'S1_F3': 'Specific channels for workforce to raise concerns or needs',
    'S1_F4': 'Existence of a grievance or complaints mechanism for employees',
    'S1_F5': 'Processes for supporting channels to raise concerns',
    'S1_F6': 'Tracking and monitoring of issues raised by workforce',
    'S1_F7': 'Ensuring effectiveness of channels for raising concerns',
    'S1_F8': 'Assessing workforce awareness of channels for raising concerns',
    'S1_F9': 'Assessing workforce trust in channels for raising concerns',
    'S1_F10': 'Policies protecting against retaliation for raising concerns',
    'S1_F11': 'Actions taken to provide remedy for actual material impacts on workforce',
    'S1_G1': 'Social protection coverage for sickness for all employees',
    'S1_G2': 'Social protection coverage for unemployment for all employees',
    'S1_G3': 'Social protection coverage for employment injury and disability for all employees',
    'S1_G4': 'Social protection coverage for parental leave for all employees',
    'S1_G5': 'Social protection coverage for retirement for all employees',
    'S1_G6': 'Disclosure of countries with gaps in social protection coverage'
}

# Per-data-point answering guidelines for this prompt variant, keyed by the
# same S1_* codes used elsewhere in the notebook.
# Every entry follows the pattern 'Assess if ... Answer "YES" if ...': only the
# YES condition is spelled out, no explicit NO condition is given.
# NOTE(review): this cell does not show how the guidelines reach the prompt —
# presumably prepare_prompts reads the module-level GUIDELINES; confirm.
GUIDELINES = {
    # S1_A*: scope of disclosure, impacts, policies and actions for own workforce
    'S1_A1': 'Assess if the company explicitly confirms that ALL people in its own workforce who could be materially impacted are included in the scope of disclosure. Answer "YES" only if this is clearly stated.',
    'S1_A2': 'Assess if the company describes specific categories or types of EMPLOYEES (e.g., by contract type, job level) that are subject to material impacts. Answer "YES" if employee types are detailed.',
    'S1_A3': 'Assess if the company describes specific categories of NON-EMPLOYEES in its workforce (e.g., contractors, agency workers) that are subject to material impacts. Answer "YES" if non-employee types are detailed.',
    'S1_A4': 'Assess if the company describes activities, programs, or policies that lead to positive outcomes for its workforce, such as skill development, improved well-being, or benefits. Answer "YES" if such activities are described.',
    'S1_A5': 'Assess if the company identifies the specific groups of employees and non-employees who benefit from its positive impact activities. Answer "YES" if these groups are clearly identified.',
    'S1_A6': 'Assess if the report mentions impacts on the workforce specifically resulting from the company\'s transition plans for greener or climate-neutral operations, such as restructuring, job creation, or reskilling programs. Answer "YES" if these impacts are discussed.',
    'S1_A7': 'Assess if the company explains its understanding of how specific groups (e.g., by gender, age, migrant status, or those in hazardous roles) may face a higher risk of harm. Answer "YES" if this awareness and analysis are disclosed.',
    'S1_A8': 'Assess if the company discloses the existence of policies designed to manage its workforce-related impacts, risks, and opportunities. Answer "YES" if policies are mentioned.',
    'S1_A9': 'Assess if the company specifies whether its workforce policies apply to everyone or only to certain groups or locations. Answer "YES" if this distinction is made clear.',
    'S1_A10': 'Assess if the company details actions it has taken, is taking, or plans to take to prevent or reduce negative impacts. Answer "YES" if specific actions are described.',
    'S1_A11': 'Assess if the company describes HOW it tracks the effectiveness of its actions, using methods like audits, impact assessments, stakeholder feedback, or other metrics. Answer "YES" if a tracking process is mentioned.',
    'S1_A12': 'Assess if the company describes initiatives whose main goal is to create positive outcomes for its workforce. Answer "YES" if such initiatives are detailed.',
    'S1_A13': 'Assess if the company explains the process it follows to decide on the appropriate actions when an actual or potential negative impact is identified. Answer "YES" if this decision-making process is described.',
    'S1_A14': 'Assess if the company describes actions it has taken, is taking, or plans to take to capitalize on material opportunities related to its workforce. Answer "YES" if specific actions are described.',
    'S1_A15': 'Assess if the company explains HOW its own business practices (e.g., procurement, sales) are reviewed or designed to prevent causing negative impacts on its workforce. Answer "YES" if this process is described.',
    'S1_A16': 'Assess if the company provides information on the financial or other resources it allocates to manage its workforce impacts. Answer "YES" if resource allocation is mentioned.',
    # S1_B*: workforce-related risks and opportunities
    'S1_B1': 'Assess if the company identifies risks to its business that arise from its reliance on its workforce, such as risks from high turnover or skills shortages. Answer "YES" if such dependencies and risks are discussed.',
    'S1_B2': 'Assess if the company specifies that certain risks or opportunities apply only to specific groups within the workforce, rather than to everyone. Answer "YES" if this differentiation is made.',
    'S1_B3': 'Assess if the company describes actions taken to lessen material risks that arise from its workforce impacts and dependencies. Answer "YES" if mitigation actions are described.',
    'S1_B4': 'Assess if the company explains how it tracks the effectiveness of its risk mitigation actions for workforce-related issues. Answer "YES" if a tracking process is mentioned.',
    # S1_C*: forced labour, child labour and human rights
    'S1_C1': 'Assess if the company identifies specific types of operations (e.g., manufacturing, logistics) at significant risk of forced labour incidents. Answer "YES" if operation types are specified.',
    'S1_C2': 'Assess if the company identifies specific countries or geographic areas where its operations are at significant risk of forced labour incidents. Answer "YES" if countries/areas are specified.',
    'S1_C3': 'Assess if the company identifies specific types of operations (e.g., manufacturing, agriculture) at significant risk of child labour incidents. Answer "YES" if operation types are specified.',
    'S1_C4': 'Assess if the company identifies specific countries or geographic areas where its operations are at significant risk of child labour incidents. Answer "YES" if countries/areas are specified.',
    'S1_C5': 'Assess if the company describes its policy commitments on human rights as they apply to its own workforce, referencing standards like the UN Guiding Principles or ILO declarations. Answer "YES" if such commitments are detailed.',
    'S1_C6': 'Assess if the company describes its overall approach to respecting the human and labour rights of its own workforce. Answer "YES" if an approach is described.',
    'S1_C7': 'Assess if the company explicitly states how its workforce policies align with international instruments like the UN Guiding Principles on Business and Human Rights. Answer "YES" if this alignment is discussed.',
    'S1_C8': 'Assess if the company’s policies are stated to explicitly address human trafficking, forced labour, and child labour. Answer "YES" if these issues are explicitly covered.',
    'S1_C9': 'Assess if the company explains the role of any Global Framework Agreements or similar agreements with worker representatives concerning human rights. Answer "YES" if such agreements are discussed.',
    'S1_C10': 'Assess if the company provides the number of severe human rights incidents (e.g., forced labour, child labour) involving its workforce during the period. Answer "YES" if a number (even zero) is provided.',
    'S1_C11': 'Assess if the company discloses the monetary amount of fines, penalties, or compensation paid related to severe human rights incidents in its workforce. Answer "YES" if a monetary amount is provided.',
    # S1_D*: engagement with the workforce
    'S1_D1': 'Assess if the company describes its general approach or strategy for engaging with its workforce. Answer "YES" if an approach is described.',
    'S1_D2': 'Assess if the company explains whether and how the feedback and perspectives gathered from its workforce are used to inform its decisions or activities. Answer "YES" if this link is described.',
    'S1_D3': 'Assess if the company specifies whether its engagement is directly with workers or through their representatives (e.g., unions, works councils). Answer "YES" if this distinction is made.',
    'S1_D4': 'Assess if the company clarifies at what stage of a project or decision-making process (e.g., planning, implementation, review) it engages with its workforce. Answer "YES" if the stage is specified.',
    'S1_D5': 'Assess if the company describes the methods or types of engagement it uses (e.g., surveys, meetings, collective bargaining). Answer "YES" if engagement types are mentioned.',
    'S1_D6': 'Assess if the company discloses how often it engages with its workforce (e.g., annually, quarterly, continuously). Answer "YES" if a frequency is stated.',
    'S1_D7': 'Assess if the company names the function (e.g., HR department) and the most senior role responsible for ensuring workforce engagement happens. Answer "YES" if both are specified.',
    'S1_D8': 'Assess if the company explains how it evaluates the effectiveness of its workforce engagement processes. Answer "YES" if an assessment method is described.',
    'S1_D9': 'Assess if the company describes specific steps taken to ensure the perspectives of vulnerable or marginalized workers are heard. Answer "YES" if such steps are detailed.',
    'S1_D10': 'Assess if the company states whether it engaged with its workforce or their representatives when creating its workforce-related targets. Answer "YES" if this engagement is mentioned.',
    'S1_D11': 'Assess if the company states whether it engaged with its workforce or their representatives while tracking performance against its targets. Answer "YES" if this engagement is mentioned.',
    'S1_D12': 'Assess if the company states whether it engaged with its workforce or their representatives to review past performance and identify improvements. Answer "YES" if this engagement is mentioned.',
    # S1_E*: discrimination, equal opportunity and inclusion
    'S1_E1': 'Assess if the company discloses having policies focused on anti-discrimination, equal opportunity, diversity, and inclusion. Answer "YES" if such policies are mentioned.',
    'S1_E2': 'Assess if the company’s anti-discrimination policy explicitly names specific grounds such as race, gender, sexual orientation, disability, age, religion, etc. Answer "YES" if specific grounds are listed.',
    'S1_E3': 'Assess if the company describes policy commitments aimed at including or supporting groups at risk of vulnerability within its workforce. Answer "YES" if such commitments are mentioned.',
    'S1_E4': 'Assess if the company explains HOW its anti-discrimination and inclusion policies are put into practice (e.g., through training, recruitment processes). Answer "YES" if implementation methods are described.',
    'S1_E5': 'Assess if the company discloses information on fines, penalties, or compensation paid due to work-related discrimination or harassment cases. Answer "YES" if this is mentioned.',
    # S1_F*: remedy and channels for raising concerns
    'S1_F1': 'Assess if the company describes its general approach or framework for providing or enabling remedy when human rights impacts occur. Answer "YES" if an approach is described.',
    'S1_F2': 'Assess if the company describes its process for providing remedy for negative impacts it directly caused or contributed to. Answer "YES" if this process is detailed.',
    'S1_F3': 'Assess if the company describes the specific channels its workforce can use to raise concerns (e.g., hotlines, grievance mechanisms, unions). Answer "YES" if channels are described.',
    'S1_F4': 'Assess if the company states that it has a formal grievance or complaints handling mechanism for its employees. Answer "YES" if one is mentioned.',
    'S1_F5': 'Assess if the company explains the processes it uses to support and ensure the availability of these channels for its workforce. Answer "YES" if support processes are described.',
    'S1_F6': 'Assess if the company describes how it tracks and monitors issues from the moment they are raised until they are resolved. Answer "YES" if a tracking process is mentioned.',
    'S1_F7': 'Assess if the company explains how it ensures its channels are effective, for example, by involving the users of the channels in the review process. Answer "YES" if effectiveness checks are described.',
    'S1_F8': 'Assess if the company explains HOW it evaluates whether its workforce is aware of the available channels to raise concerns. Answer "YES" if an assessment process is described.',
    'S1_F9': 'Assess if the company explains HOW it evaluates whether its workforce trusts that the channels are safe and effective. Answer "YES" if an assessment process is described.',
    'S1_F10': 'Assess if the company states it has policies to protect individuals from retaliation after raising a concern. Answer "YES" if anti-retaliation policies are mentioned.',
    'S1_F11': 'Assess if the company provides examples or descriptions of actions it has taken to provide remedy for specific material impacts that have occurred. Answer "YES" if such actions are described.',
    # S1_G*: social protection coverage
    'S1_G1': 'Assess if the company states that all its employees are covered by social protection for sickness. Answer "YES" if universal coverage is confirmed.',
    'S1_G2': 'Assess if the company states that all its employees are covered by social protection for unemployment. Answer "YES" if universal coverage is confirmed.',
    'S1_G3': 'Assess if the company states that all its employees are covered by social protection for employment injury and acquired disability. Answer "YES" if universal coverage is confirmed.',
    'S1_G4': 'Assess if the company states that all its employees are covered by social protection for parental leave. Answer "YES" if universal coverage is confirmed.',
    'S1_G5': 'Assess if the company states that all its employees are covered by social protection for retirement. Answer "YES" if universal coverage is confirmed.',
    'S1_G6': 'Assess if the company discloses the specific countries where gaps in social protection coverage exist for its employees, if not all are covered. Answer "YES" if countries with gaps are named.'
}

In [None]:
# End-to-end run for the current QUERIES / GUIDELINES variant:
#   per report : parse PDF -> chunk -> vectorstore -> retrieve chunks per query
#   once       : build prompts -> batched LLM inference -> parse -> evaluate
# The cell resets its accumulators first, so re-running it starts from scratch.
retrieved_chunks = {}  # Merged output of retrieve_chunks() across all reports
report_ids = []        # Report IDs in processing order, passed to prepare_prompts()

# Only the company name is read per report, so iterate over that column
# directly instead of building a full row Series with iterrows().
for company_name in sample['company_withAccessInfo']:
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    print(f"\n Processing: {report_id}")

    # 1. Parse the document (the second return value of parse_pdf is not needed here)
    path = f"./sample_reports/{report_id}.pdf"
    pages, _ = parse_pdf(path=path)

    # 2. Chunk the text
    # Named chunk_metadata so it is not confused with the prompt metadata
    # returned by prepare_prompts() further down.
    chunks, chunk_metadata = chunk_text(pages, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

    # 3. Generate and store vector representations
    # perf_counter() is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    db_path = f"./faiss_db_Baseline2/{report_id}"
    vectorstore = get_vectorstore(chunks, chunk_metadata, db_path, embedding_model=embeddings_qwen)
    print(f"Step 3: Generated vectorstore — Computation time: {(time.perf_counter() - start_time) / 60:.2f} minutes")

    # 4. Retrieve the relevant chunks
    start_time = time.perf_counter()
    result = retrieve_chunks(vectorstore, queries=QUERIES, report_id=report_id, top_k=TOP_K)
    retrieved_chunks.update(result)
    report_ids.append(report_id)
    print(f"Step 4: Retrieved chunks — Computation time: {(time.perf_counter() - start_time) / 60:.2f} minutes")

# 5. Generate LLM answers (runs once, after all reports have been processed)
# 5.1 Prepare prompts
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5.2 Run inference
model_responses = run_batched_inference(prompts, generate_text)
# 5.3 Parse and finalize results
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# 6. Evaluate the performance against the hand-coded validation set.
# Left as the last expression so the returned metrics are displayed as the cell output.
evaluate_verdicts(validation_set, final_analysis)


 Processing: SchneiderElectric_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes

 Processing: ContinentalAG_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes
--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 6.87 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[YES]] The company explains how it ensures its channels are effective, but with some limitations.",
  "SOURCES": [
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    108,
    10

{'accuracy': 0.6614173228346457,
 'precision': 0.6554773966191288,
 'recall': 0.6614173228346457,
 'f1_score': 0.6580829612098102}

### 2. Guidelines with "Answer 'YES' if ... Answer 'NO' if ..."

In [22]:
### 2. Prompt variant with explicit YES and NO conditions in the guidelines
# QUERIES maps each S1_* data-point code to one yes/no question (65 entries:
# A1-A16, B1-B4, C1-C11, D1-D12, E1-E5, F1-F11, G1-G6).
# The pipeline cell passes this dict as `queries=` to retrieve_chunks().
# NOTE(review): the same question text is presumably also inserted into the LLM
# prompt by prepare_prompts — confirm against that function.
QUERIES = {
    'S1_A1': 'Does the company disclose whether all people in its own workforce who could be materially impacted are included in the scope of its disclosure?',
    'S1_A2': 'Does the company describe the types of employees in its own workforce that are subject to material impacts?',
    'S1_A3': 'Does the company describe the types of non-employees in its own workforce that are subject to material impacts?',
    'S1_A4': 'Does the company describe the activities that result in material positive impacts on its own workforce?',
    'S1_A5': 'Does the company identify the types of employees and non-employees positively affected by activities that result in material positive impacts on its own workforce?',
    'S1_A6': 'Does the company report material impacts on its workforce resulting from its climate transition plans?',
    'S1_A7': 'Does the company disclose whether it has developed an understanding of how people in its own workforce may be at greater risk of harm?',
    'S1_A8': 'Does the company disclose policies to manage its material impacts, risks, and opportunities related to its own workforce?',
    'S1_A9': 'Does the company specify whether its workforce-related policies apply to specific groups or the entire workforce?',
    'S1_A10': 'Does the company describe actions taken, planned or underway to prevent or mitigate material negative impacts on its own workforce?',
    'S1_A11': 'Does the company describe how it tracks and assesses the effectiveness of actions for its own workforce?',
    'S1_A12': 'Does the company describe actions or initiatives that have the primary purpose of delivering positive impacts for its own workforce?',
    'S1_A13': 'Does the company describe the processes it uses to identify what action is needed in response to negative impacts on its own workforce?',
    'S1_A14': 'Does the company describe actions planned or underway to pursue material opportunities related to its own workforce?',
    'S1_A15': 'Does the company disclose whether and how it ensures that its own practices do not cause or contribute to material negative impacts on its own workforce?',
    'S1_A16': 'Does the company disclose what resources are allocated to the management of its material impacts on its own workforce?',
    'S1_B1': 'Does the company disclose any material risks arising from its dependencies on its own workforce?',
    'S1_B2': 'Does the company disclose whether any material risks or opportunities related to its workforce apply specifically to certain groups?',
    'S1_B3': 'Does the company describe actions planned or underway to mitigate material risks arising from its impacts and dependencies on its own workforce?',
    'S1_B4': 'Does the company describe how it tracks the effectiveness of actions taken to mitigate material risks for its own workforce?',
    'S1_C1': 'Does the company disclose operations that are at significant risk of incidents of forced labour by type of operation?',
    'S1_C2': 'Does the company disclose operations that are at significant risk of incidents of forced labour by countries or geographic areas?',
    'S1_C3': 'Does the company disclose operations that are at significant risk of incidents of child labour by type of operation?',
    'S1_C4': 'Does the company disclose operations that are at significant risk of incidents of child labour by countries or geographic areas?',
    'S1_C5': 'Does the company describe its human rights policy commitments relevant to its own workforce?',
    'S1_C6': 'Does the company describe its general approach to respecting the human rights of people in its own workforce?',
    'S1_C7': 'Does the company disclose whether and how its policies for its own workforce align with international human rights instruments?',
    'S1_C8': 'Does the company state whether its policies for its own workforce explicitly address trafficking, forced labour, or child labour?',
    'S1_C9': 'Does the company explain the existence and role of any Global Framework Agreements related to the human rights of its own workforce?',
    'S1_C10': 'Does the company disclose the number of severe human rights incidents connected to its workforce?',
    'S1_C11': 'Does the company disclose the total amount of fines, penalties, and compensation paid related to severe human rights incidents?',
    'S1_D1': 'Does the company disclose its general approach to engaging with people in its own workforce?',
    'S1_D2': 'Does the company disclose whether and how the perspectives of its own workforce inform its decisions or activities?',
    'S1_D3': 'Does the company disclose whether engagement to inform decisions occurs directly with its own workforce or with workers’ representatives?',
    'S1_D4': 'Does the company explain the stage(s) at which engagement with its own workforce occurs?',
    'S1_D5': 'Does the company disclose the type of engagement it conducts with its own workforce?',
    'S1_D6': 'Does the company disclose the frequency of engagement with its own workforce?',
    'S1_D7': 'Does the company disclose the function and the most senior role responsible for workforce engagement?',
    'S1_D8': 'Does the company assess the effectiveness of its engagement with its own workforce?',
    'S1_D9': 'Does the company disclose the steps it takes to gain insight into the perspectives of vulnerable or marginalised people in its own workforce?',
    'S1_D10': 'Does the company disclose whether and how it engaged with its workforce when setting workforce-related targets?',
    'S1_D11': 'Does the company disclose whether and how it engaged with its workforce in tracking performance against workforce-related targets?',
    'S1_D12': 'Does the company disclose whether and how it engaged with its workforce in identifying lessons learned from performance?',
    'S1_E1': 'Does the company report whether it has specific policies aimed at eliminating discrimination and promoting equal opportunities?',
    'S1_E2': 'Does the company report whether its policy on discrimination among its workforce specifically addresses grounds of discrimination?',
    'S1_E3': 'Does the company report whether it has policy commitments that aim to include or support people from groups at particular risk of vulnerability?',
    'S1_E4': 'Does the company report whether and how its discrimination and inclusion policies for its workforce are implemented?',
    'S1_E5': 'Does the company disclose information about fines or compensation as a result of work-related discrimination in its workforce?',
    'S1_F1': 'Does the company disclose its general approach to providing or enabling remedy for human rights impacts on its own workforce?',
    'S1_F2': 'Does the company describe its approach for providing remedy for material negative impacts it has caused on its own workforce?',
    'S1_F3': 'Does the company describe specific channels for its own workforce to raise concerns or needs?',
    'S1_F4': 'Does the company report having a grievance or complaints handling mechanism for employee matters?',
    'S1_F5': 'Does the company describe the processes through which it supports the availability of channels for its own workforce to raise concerns?',
    'S1_F6': 'Does the company describe how it tracks and monitors issues raised by its own workforce?',
    'S1_F7': 'Does the company describe how it ensures the effectiveness of the channels for its workforce to raise concerns?',
    'S1_F8': 'Does the company disclose whether and how it assesses that its own workforce is aware of the channels for raising concerns?',
    'S1_F9': 'Does the company disclose whether and how it assesses that its own workforce trusts the channels for raising concerns?',
    'S1_F10': 'Does the company disclose whether it has policies in place to protect individuals against retaliation for raising concerns?',
    'S1_F11': 'Does the company describe actions it has taken to provide remedy for actual material impacts on its own workforce?',
    'S1_G1': 'Does the company disclose whether all its employees are covered by social protection for sickness?',
    'S1_G2': 'Does the company disclose whether all its employees are covered by social protection for unemployment?',
    'S1_G3': 'Does the company disclose whether all its employees are covered by social protection for employment injury and disability?',
    'S1_G4': 'Does the company disclose whether all its employees are covered by social protection for parental leave?',
    'S1_G5': 'Does the company disclose whether all its employees are covered by social protection for retirement?',
    'S1_G6': 'If not all employees are covered by social protection, does the company disclose the countries where gaps in social protection exist?'
}

GUIDELINES = {
    'S1_A1': 'Focus on whether the company explicitly confirms that all materially impacted individuals in its own workforce are considered. Answer "YES" if this is explicitly stated. Answer "NO" if the inclusion of all materially impacted individuals is not clearly mentioned.',
    'S1_A2': 'Focus on whether the company identifies which types of employees are affected by material impacts. Answer "YES" if employee types (e.g., by contract type, job level) are explicitly described. Answer "NO" if such types are not specified.',
    'S1_A3': 'Focus on whether the company identifies non-employees (e.g., self-employed individuals, contractors, or agency workers) who are materially affected by its operations. Answer "YES" if such categories are clearly mentioned. Answer "NO" if no non-employee categories are described.',
    'S1_A4': 'Focus on the disclosure of activities that result in material positive impacts. Answer "YES" if such activities are described. Answer "NO" otherwise.',
    'S1_A5': 'Focus on whether the company identifies the specific types of employees and non-employees positively affected by its activities. Answer "YES" if such groups are identified. Answer "NO" otherwise.',
    'S1_A6': 'Focus on workforce-related impacts, risks and opportunities that arise from the company\'s transition plans for greener and climate-neutral operations. Answer "YES" if such impacts are described. Answer "NO" otherwise.',
    'S1_A7': 'Focus on whether the company explicitly states that it has developed an understanding of increased risk for specific groups in its workforce (e.g., by age, gender, migrant status, or job context). Answer "YES" if this understanding is disclosed. Answer "NO" if no such understanding is mentioned.',
    'S1_A8': 'Focus on the disclosure of policies to manage material impacts, risks, and opportunities. Answer "YES" if such policies are disclosed. Answer "NO" otherwise.',
    'S1_A9': 'Focus on whether the company specifies if its workforce-related policies apply to the entire workforce or only to specific groups. Answer "YES" if this is specified. Answer "NO" otherwise.',
    'S1_A10': 'Focus on the disclosure of actions taken, planned, or underway to prevent or mitigate negative impacts. Answer "YES" if such actions are described. Answer "NO" otherwise.',
    'S1_A11': 'Focus on whether the company explains how it monitors and evaluates the effectiveness of its actions using methods like audits, impact assessments, or stakeholder feedback. Answer "YES" if a process for assessing effectiveness is described. Answer "NO" otherwise.',
    'S1_A12': 'Focus on the disclosure of actions or initiatives with the primary purpose of delivering positive impacts. Answer "YES" if such actions are described. Answer "NO" otherwise.',
    'S1_A13': 'Focus on the disclosure of processes used to identify necessary actions in response to negative impacts. Answer "YES" if such processes are described. Answer "NO" otherwise.',
    'S1_A14': 'Focus on the disclosure of actions planned or underway to pursue material opportunities. Answer "YES" if such actions are described. Answer "NO" otherwise.',
    'S1_A15': 'Focus on whether the company explains how it ensures its own practices do not cause or contribute to negative impacts. Answer "YES" if this is explained. Answer "NO" otherwise.',
    'S1_A16': 'Focus on the disclosure of resources allocated to manage workforce impacts. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_B1': 'Focus on whether the company reports risks resulting from its dependence on its workforce, such as high employee turnover or lack of skills. Answer "YES" if such risks are disclosed. Answer "NO" otherwise.',
    'S1_B2': 'Focus on whether the company identifies if material risks or opportunities apply to specific groups rather than the entire workforce. Answer "YES" if group-specific risks or opportunities are mentioned. Answer "NO" otherwise.',
    'S1_B3': 'Focus on the disclosure of actions planned or underway to mitigate material risks. Answer "YES" if such actions are described. Answer "NO" otherwise.',
    'S1_B4': 'Focus on whether the company describes how it tracks the effectiveness of actions taken to mitigate material risks. Answer "YES" if this is described. Answer "NO" otherwise.',
    'S1_C1': 'Focus on whether the company discloses operations at significant risk of forced labour by type of operation (e.g., manufacturing plant). Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_C2': 'Focus on whether the company discloses operations at significant risk of forced labour by countries or geographic areas. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_C3': 'Focus on whether the company discloses operations at significant risk of child labour by type of operation. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_C4': 'Focus on whether the company discloses operations at significant risk of child labour by countries or geographic areas. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_C5': 'Focus on whether the company discloses its human rights policy commitments for its own workforce, referencing standards like the UN Guiding Principles or ILO. Answer "YES" if such commitments are disclosed. Answer "NO" otherwise.',
    'S1_C6': 'Focus on the disclosure of the company\'s general approach to respecting the human rights of its workforce. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_C7': 'Focus on whether the company discloses the alignment of its policies with international human rights instruments. Answer "YES" if alignment is discussed. Answer "NO" otherwise.',
    'S1_C8': 'Focus on whether the company states its policies explicitly address trafficking, forced labour, and child labour. Answer "YES" if they are explicitly addressed. Answer "NO" otherwise.',
    'S1_C9': 'Focus on whether the company explains the existence and role of any Global Framework Agreements. Answer "YES" if this is explained. Answer "NO" otherwise.',
    'S1_C10': 'Focus on whether the company discloses the number of severe human rights incidents during the reporting period. Answer "YES" if a number (including zero) is provided. Answer "NO" otherwise.',
    'S1_C11': 'Focus on whether the company discloses the total amount of fines, penalties, and compensation paid related to human rights incidents. Answer "YES" if an amount is disclosed. Answer "NO" otherwise.',
    'S1_D1': 'Focus on the disclosure of the company\'s general approach to engaging with its workforce. Answer "YES" if an approach is described. Answer "NO" otherwise.',
    'S1_D2': 'Focus on whether the company discloses how workforce perspectives inform its decisions. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_D3': 'Focus on whether the company discloses if engagement is direct with the workforce or with workers’ representatives. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_D4': 'Focus on whether the company explains the stage(s) at which engagement occurs. Answer "YES" if stages are explained. Answer "NO" otherwise.',
    'S1_D5': 'Focus on the disclosure of the type of engagement conducted. Answer "YES" if types are disclosed. Answer "NO" otherwise.',
    'S1_D6': 'Focus on the disclosure of the frequency of engagement. Answer "YES" if a frequency is disclosed. Answer "NO" otherwise.',
    'S1_D7': 'Focus on whether the company discloses the function and the most senior role responsible for engagement. Answer "YES" if both are disclosed. Answer "NO" otherwise.',
    'S1_D8': 'Focus on whether the company assesses the effectiveness of its engagement with its workforce. Answer "YES" if an assessment is described. Answer "NO" otherwise.',
    'S1_D9': 'Focus on whether the company discloses steps to gain insight from vulnerable or marginalised people in its workforce. Answer "YES" if steps are disclosed. Answer "NO" otherwise.',
    'S1_D10': 'Focus on whether the company discloses engagement with its workforce when setting targets. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_D11': 'Focus on whether the company discloses engagement with its workforce in tracking performance against targets. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_D12': 'Focus on whether the company discloses engagement with its workforce in identifying lessons learned from performance. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_E1': 'Focus on whether the company reports having policies aimed at eliminating discrimination and promoting equal opportunities. Answer "YES" if such policies are reported. Answer "NO" otherwise.',
    'S1_E2': 'Focus on whether the company\'s discrimination policy specifically addresses grounds of discrimination (e.g., race, gender, age). Answer "YES" if specific grounds are addressed. Answer "NO" otherwise.',
    'S1_E3': 'Focus on whether the company reports policy commitments to include or support people from groups at risk of vulnerability. Answer "YES" if such commitments are reported. Answer "NO" otherwise.',
    'S1_E4': 'Focus on whether the company reports how its discrimination and inclusion policies are implemented. Answer "YES" if implementation is described. Answer "NO" otherwise.',
    'S1_E5': 'Focus on the disclosure of information about fines, penalties, or compensation related to work-related discrimination. Answer "YES" if this is disclosed. Answer "NO" otherwise.',
    'S1_F1': 'Focus on the disclosure of the company\'s general approach to providing or enabling remedy for human rights impacts. Answer "YES" if an approach is described. Answer "NO" otherwise.',
    'S1_F2': 'Focus on whether the company describes its approach for providing remedy for material negative impacts it has caused. Answer "YES" if this is described. Answer "NO" otherwise.',
    'S1_F3': 'Focus on the disclosure of specific channels for its workforce to raise concerns (e.g., hotlines, unions). Answer "YES" if channels are described. Answer "NO" otherwise.',
    'S1_F4': 'Focus on whether the company reports having a grievance or complaints handling mechanism for employees. Answer "YES" if this is reported. Answer "NO" otherwise.',
    'S1_F5': 'Focus on the disclosure of processes through which the company supports the availability of channels to raise concerns. Answer "YES" if these processes are described. Answer "NO" otherwise.',
    'S1_F6': 'Focus on whether the company describes how it tracks and monitors issues raised by its workforce. Answer "YES" if this is described. Answer "NO" otherwise.',
    'S1_F7': 'Focus on whether the company describes how it ensures the effectiveness of its channels for raising concerns. Answer "YES" if this is described. Answer "NO" otherwise.',
    'S1_F8': 'Focus on whether the company discloses if and how it assesses workforce awareness of channels for raising concerns. Answer "YES" if an assessment is described. Answer "NO" otherwise.',
    'S1_F9': 'Focus on whether the company discloses if and how it assesses workforce trust in the channels for raising concerns. Answer "YES" if an assessment is described. Answer "NO" otherwise.',
    'S1_F10': 'Focus on the disclosure of policies that protect individuals against retaliation for raising concerns. Answer "YES" if such policies are disclosed. Answer "NO" otherwise.',
    'S1_F11': 'Focus on the disclosure of actions taken to provide remedy for actual material impacts on its workforce. Answer "YES" if actions are described. Answer "NO" otherwise.',
    'S1_G1': 'Focus on whether the company discloses that all its employees are covered by social protection for sickness. Answer "YES" if universal coverage is confirmed. Answer "NO" otherwise.',
    'S1_G2': 'Focus on whether the company discloses that all its employees are covered by social protection for unemployment. Answer "YES" if universal coverage is confirmed. Answer "NO" otherwise.',
    'S1_G3': 'Focus on whether the company discloses that all its employees are covered by social protection for employment injury and disability. Answer "YES" if universal coverage is confirmed. Answer "NO" otherwise.',
    'S1_G4': 'Focus on whether the company discloses that all its employees are covered by social protection for parental leave. Answer "YES" if universal coverage is confirmed. Answer "NO" otherwise.',
    'S1_G5': 'Focus on whether the company discloses that all its employees are covered by social protection for retirement. Answer "YES" if universal coverage is confirmed. Answer "NO" otherwise.',
    'S1_G6': 'Focus on whether the company discloses the countries where gaps in social protection exist, if not all employees are covered. Answer "YES" if countries with gaps are disclosed. Answer "NO" otherwise.'
}

In [None]:
# Accumulators shared by every report in the sample.
retrieved_chunks = dict()   # merged with each report's retrieve_chunks() result
report_ids = list()         # report IDs in processing order, handed to prepare_prompts()

# ---------------------------------------------------------------------------
# Per-report stage: parse -> chunk -> build vector store -> retrieve
# ---------------------------------------------------------------------------
for idx, row in sample.iterrows():
    company_name = row['company_withAccessInfo']
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    print(f"\n Processing: {report_id}")

    # Step 1 - parse the report PDF; the second return value is not used here.
    path = f"./sample_reports/{report_id}.pdf"
    pages, _ = parse_pdf(path=path)

    # Step 2 - split the parsed pages into chunks.
    # NOTE: `metadata` here belongs to the chunks; the same name is rebound
    # after the loop to the second return value of prepare_prompts().
    chunks, metadata = chunk_text(
        pages,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    # Step 3 - get the vector store for this report (timed).
    db_path = f"./faiss_db_Baseline2/{report_id}"
    start_time = time.time()
    vectorstore = get_vectorstore(
        chunks,
        metadata,
        db_path,
        embedding_model=embeddings_qwen,
    )
    print(f"Step 3: Generated vectorstore — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # Step 4 - retrieve the chunks for every query (timed) and record the report.
    start_time = time.time()
    result = retrieve_chunks(
        vectorstore,
        queries=QUERIES,
        report_id=report_id,
        top_k=TOP_K,
    )
    retrieved_chunks.update(result)
    report_ids.append(report_id)
    print(f"Step 4: Retrieved chunks — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

# ---------------------------------------------------------------------------
# Step 5 - LLM stage, run once over all reports (outside the loop)
# ---------------------------------------------------------------------------
# 5.1 Build the prompts from the retrieved chunks of all collected reports.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks,
)

# 5.2 Run inference over all prompts.
model_responses = run_batched_inference(prompts, generate_text)

# 5.3 Parse the model responses into the final analysis.
final_analysis = parse_results(model_responses, metadata, final_analysis)

# Compare against the validation set; as the last expression of the cell,
# the returned value is displayed as the cell output.
evaluate_verdicts(validation_set, final_analysis)


 Processing: SchneiderElectric_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes

 Processing: ContinentalAG_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes
--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 6.85 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[NO]] The company does not disclose the function and the most senior role responsible for workforce engagement.",
  "SOURCES": [
    "182",
    "182",
    "182",
    "182",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216",
    "216

{'accuracy': 0.676923076923077,
 'precision': 0.7194862063932272,
 'recall': 0.676923076923077,
 'f1_score': 0.6854730175484892}

### 3. Focus on the company's own workforce

In [24]:
# Experiment 3: retrieval queries, keyed by indicator ID.
# Every query is phrased so that it explicitly refers to the company's *own*
# workforce / operations / employees.
QUERIES = {
    # --- S1_A: material impacts, policies and actions ---
    "S1_A1": "Does the company disclose the scope of disclosure for its own workforce?",
    "S1_A2": "Does the company describe the types of employees within its own workforce subject to material impacts?",
    "S1_A3": "Does the company describe the types of non-employees within its own workforce subject to material impacts?",
    "S1_A4": "Does the company describe activities resulting in material positive impacts on its own workforce?",
    "S1_A5": "Does the company identify the types of employees and non-employees within its own workforce affected by positive impacts?",
    "S1_A6": "Does the company report material impacts on its own workforce from climate transition plans?",
    "S1_A7": "Does the company disclose its understanding of how people in its own workforce may be at greater risk of harm?",
    "S1_A8": "Does the company disclose policies for managing impacts, risks, and opportunities related to its own workforce?",
    "S1_A9": "Does the company specify if its own workforce-related policies apply to specific groups or the entire workforce?",
    "S1_A10": "Does the company describe actions to prevent or mitigate material negative impacts on its own workforce?",
    "S1_A11": "Does the company describe how it tracks the effectiveness of actions for its own workforce?",
    "S1_A12": "Does the company describe initiatives with the primary purpose of delivering positive impacts for its own workforce?",
    "S1_A13": "Does the company describe processes to identify necessary actions in response to negative impacts on its own workforce?",
    "S1_A14": "Does the company describe actions to pursue material opportunities related to its own workforce?",
    "S1_A15": "Does the company disclose how it ensures its own practices do not cause negative impacts on its own workforce?",
    "S1_A16": "Does the company disclose resources allocated to manage impacts on its own workforce?",

    # --- S1_B: material risks and opportunities ---
    "S1_B1": "Does the company disclose material risks from dependencies on its own workforce?",
    "S1_B2": "Does the company disclose if material risks or opportunities for its own workforce apply to specific groups?",
    "S1_B3": "Does the company describe actions to mitigate material risks related to its own workforce?",
    "S1_B4": "Does the company describe how it tracks the effectiveness of actions to mitigate risks for its own workforce?",

    # --- S1_C: forced labour, child labour and human rights ---
    "S1_C1": "Does the company disclose its own operations at risk of forced labour by type of operation?",
    "S1_C2": "Does the company disclose its own operations at risk of forced labour by country?",
    "S1_C3": "Does the company disclose its own operations at risk of child labour by type of operation?",
    "S1_C4": "Does the company disclose its own operations at risk of child labour by country?",
    "S1_C5": "Does the company describe its human rights policy commitments for its own workforce?",
    "S1_C6": "Does the company describe its general approach to respecting the human rights of its own workforce?",
    "S1_C7": "Does the company disclose how its policies for its own workforce align with international human rights instruments?",
    "S1_C8": "Does the company state if its policies for its own workforce explicitly address trafficking, forced labour, or child labour?",
    "S1_C9": "Does the company explain Global Framework Agreements for respecting the human rights of its own workforce?",
    "S1_C10": "Does the company disclose the number of severe human rights incidents connected to its own workforce?",
    "S1_C11": "Does the company disclose the total amount of fines and compensation for human rights incidents in its own workforce?",

    # --- S1_D: engagement with the workforce ---
    "S1_D1": "Does the company disclose its general approach to engaging with its own workforce?",
    "S1_D2": "Does the company disclose how the perspectives of its own workforce inform its decisions?",
    "S1_D3": "Does the company disclose if engagement occurs directly with its own workforce or with workers’ representatives?",
    "S1_D4": "Does the company explain the stage(s) at which engagement with its own workforce occurs?",
    "S1_D5": "Does the company disclose the type of engagement it conducts with its own workforce?",
    "S1_D6": "Does the company disclose the frequency of engagement with its own workforce?",
    "S1_D7": "Does the company disclose the function and senior role responsible for engagement with its own workforce?",
    "S1_D8": "Does the company assess the effectiveness of its engagement with its own workforce?",
    "S1_D9": "Does the company disclose steps to gain insight from vulnerable or marginalised people in its own workforce?",
    "S1_D10": "Does the company disclose engagement with its own workforce when setting workforce-related targets?",
    "S1_D11": "Does the company disclose engagement with its own workforce when tracking performance against targets?",
    "S1_D12": "Does the company disclose engagement with its own workforce when identifying lessons learned from performance?",

    # --- S1_E: discrimination and inclusion ---
    "S1_E1": "Does the company report having policies to eliminate discrimination and promote equal opportunities for its own workforce?",
    "S1_E2": "Does the company report if its discrimination policy for its own workforce addresses specific grounds of discrimination?",
    "S1_E3": "Does the company report policy commitments to support at-risk groups within its own workforce?",
    "S1_E4": "Does the company report how its discrimination and inclusion policies for its own workforce are implemented?",
    "S1_E5": "Does the company disclose information on fines or compensation from discrimination cases in its own workforce?",

    # --- S1_F: remedy and channels for raising concerns ---
    "S1_F1": "Does the company disclose its approach to providing remedy for human rights impacts on its own workforce?",
    "S1_F2": "Does the company describe its approach to providing remedy for negative impacts it caused on its own workforce?",
    "S1_F3": "Does the company describe specific channels for its own workforce to raise concerns?",
    "S1_F4": "Does the company report having a grievance or complaints mechanism for its own employees?",
    "S1_F5": "Does the company describe processes for supporting channels for its own workforce to raise concerns?",
    "S1_F6": "Does the company describe how it tracks and monitors issues raised by its own workforce?",
    "S1_F7": "Does the company describe how it ensures the effectiveness of channels for its own workforce?",
    "S1_F8": "Does the company disclose how it assesses if its own workforce is aware of channels for raising concerns?",
    "S1_F9": "Does the company disclose how it assesses if its own workforce trusts the channels for raising concerns?",
    "S1_F10": "Does the company disclose policies to protect individuals in its own workforce from retaliation?",
    "S1_F11": "Does the company describe actions taken to provide remedy for actual impacts on its own workforce?",

    # --- S1_G: social protection coverage ---
    "S1_G1": "Does the company disclose if all employees in its own workforce are covered by social protection for sickness?",
    "S1_G2": "Does the company disclose if all employees in its own workforce are covered by social protection for unemployment?",
    "S1_G3": "Does the company disclose if all employees in its own workforce are covered by social protection for injury and disability?",
    "S1_G4": "Does the company disclose if all employees in its own workforce are covered by social protection for parental leave?",
    "S1_G5": "Does the company disclose if all employees in its own workforce are covered by social protection for retirement?",
    "S1_G6": "If not all of its own employees are covered, does the company disclose countries where social protection gaps exist?",
}

GUIDELINES = {
    'S1_A1': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explicitly confirms that all materially impacted individuals in its own workforce are considered. Answer "YES" only if this is clearly stated.',
    'S1_A2': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes specific categories or types of EMPLOYEES (e.g., by contract type, job level) that are subject to material impacts. Answer "YES" if employee types are detailed.',
    'S1_A3': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes specific categories of NON-EMPLOYEES (e.g., contractors, agency workers) that are subject to material impacts. Answer "YES" if non-employee types are detailed.',
    'S1_A4': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes activities, programs, or policies that lead to positive outcomes for its workforce. Answer "YES" if such activities are described.',
    'S1_A5': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company identifies the specific groups of employees and non-employees who benefit from its positive impact activities. Answer "YES" if these groups are clearly identified.',
    'S1_A6': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the report mentions impacts on the workforce resulting from the company\'s climate transition plans (e.g., restructuring, job creation, reskilling). Answer "YES" if these impacts are discussed.',
    'S1_A7': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains its understanding of how specific groups (e.g., by gender, age, migrant status) may face a higher risk of harm. Answer "YES" if this awareness and analysis are disclosed.',
    'S1_A8': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company discloses the existence of policies designed to manage its workforce-related impacts, risks, and opportunities. Answer "YES" if policies are mentioned.',
    'S1_A9': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company specifies whether its workforce policies apply to everyone or only to certain groups or locations. Answer "YES" if this distinction is made clear.',
    'S1_A10': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company details actions it has taken, is taking, or plans to take to prevent or reduce negative impacts. Answer "YES" if specific actions are described.',
    'S1_A11': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes HOW it tracks the effectiveness of its actions, using methods like audits, impact assessments, or other metrics. Answer "YES" if a tracking process is mentioned.',
    'S1_A12': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes initiatives whose main goal is to create positive outcomes for its workforce. Answer "YES" if such initiatives are detailed.',
    'S1_A13': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains the process it follows to decide on appropriate actions when a negative impact is identified. Answer "YES" if this decision-making process is described.',
    'S1_A14': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes actions it has taken, is taking, or plans to take to capitalize on material opportunities. Answer "YES" if specific actions are described.',
    'S1_A15': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains HOW its own business practices (e.g., procurement, sales) are reviewed or designed to prevent causing negative impacts on its workforce. Answer "YES" if this process is described.',
    'S1_A16': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company provides information on the financial or other resources it allocates to manage its workforce impacts. Answer "YES" if resource allocation is mentioned.',
    'S1_B1': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company identifies risks to its business that arise from its reliance on its workforce, such as risks from high turnover or skills shortages. Answer "YES" if such dependencies and risks are discussed.',
    'S1_B2': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company specifies that certain risks or opportunities apply only to specific groups within the workforce, rather than to everyone. Answer "YES" if this differentiation is made.',
    'S1_B3': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes actions taken to lessen material risks that arise from its workforce impacts and dependencies. Answer "YES" if mitigation actions are described.',
    'S1_B4': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains how it tracks the effectiveness of its risk mitigation actions for workforce-related issues. Answer "YES" if a tracking process is mentioned.',
    'S1_C1': 'Focus strictly on the company\'s own operations. Disregard the value chain. Assess if the company identifies specific types of operations (e.g., manufacturing, logistics) at significant risk of forced labour incidents. Answer "YES" if operation types are specified.',
    'S1_C2': 'Focus strictly on the company\'s own operations. Disregard the value chain. Assess if the company identifies specific countries or geographic areas where its operations are at significant risk of forced labour incidents. Answer "YES" if countries/areas are specified.',
    'S1_C3': 'Focus strictly on the company\'s own operations. Disregard the value chain. Assess if the company identifies specific types of operations (e.g., manufacturing, agriculture) at significant risk of child labour incidents. Answer "YES" if operation types are specified.',
    'S1_C4': 'Focus strictly on the company\'s own operations. Disregard the value chain. Assess if the company identifies specific countries or geographic areas where its operations are at significant risk of child labour incidents. Answer "YES" if countries/areas are specified.',
    'S1_C5': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes its policy commitments on human rights as they apply to its own workforce. Answer "YES" if such commitments are detailed.',
    'S1_C6': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes its overall approach to respecting the human and labour rights of its own workforce. Answer "YES" if an approach is described.',
    'S1_C7': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explicitly states how its workforce policies align with international instruments like the UN Guiding Principles. Answer "YES" if this alignment is discussed.',
    'S1_C8': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company’s policies are stated to explicitly address human trafficking, forced labour, and child labour. Answer "YES" if these issues are explicitly covered.',
    'S1_C9': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains the role of any Global Framework Agreements or similar agreements with worker representatives concerning human rights. Answer "YES" if such agreements are discussed.',
    'S1_C10': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company provides the number of severe human rights incidents involving its workforce. Answer "YES" if a number (even zero) is provided.',
    'S1_C11': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company discloses the monetary amount of fines, penalties, or compensation paid related to severe human rights incidents. Answer "YES" if a monetary amount is provided.',
    'S1_D1': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes its general approach or strategy for engaging with its workforce. Answer "YES" if an approach is described.',
    'S1_D2': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains whether and how feedback from its workforce is used to inform its decisions. Answer "YES" if this link is described.',
    'S1_D3': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company specifies whether its engagement is directly with workers or through their representatives. Answer "YES" if this distinction is made.',
    'S1_D4': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company clarifies at what stage of a project or decision-making process it engages with its workforce. Answer "YES" if the stage is specified.',
    'S1_D5': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes the methods or types of engagement it uses (e.g., surveys, meetings). Answer "YES" if engagement types are mentioned.',
    'S1_D6': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company discloses how often it engages with its workforce. Answer "YES" if a frequency is stated.',
    'S1_D7': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company names the function and the most senior role responsible for ensuring workforce engagement. Answer "YES" if both are specified.',
    'S1_D8': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains how it evaluates the effectiveness of its workforce engagement processes. Answer "YES" if an assessment method is described.',
    'S1_D9': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes specific steps taken to ensure the perspectives of vulnerable or marginalized workers are heard. Answer "YES" if such steps are detailed.',
    'S1_D10': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company states whether it engaged with its workforce or their representatives when creating its targets. Answer "YES" if this engagement is mentioned.',
    'S1_D11': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company states whether it engaged with its workforce while tracking performance against its targets. Answer "YES" if this engagement is mentioned.',
    'S1_D12': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company states whether it engaged with its workforce to review past performance. Answer "YES" if this engagement is mentioned.',
    'S1_E1': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company discloses having policies focused on anti-discrimination, equal opportunity, and diversity. Answer "YES" if such policies are mentioned.',
    'S1_E2': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the anti-discrimination policy explicitly names specific grounds such as race, gender, sexual orientation, disability, etc. Answer "YES" if specific grounds are listed.',
    'S1_E3': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes policy commitments aimed at including or supporting groups at risk of vulnerability. Answer "YES" if such commitments are mentioned.',
    'S1_E4': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains HOW its anti-discrimination and inclusion policies are put into practice. Answer "YES" if implementation methods are described.',
    'S1_E5': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company discloses information on fines or compensation paid due to work-related discrimination. Answer "YES" if this is mentioned.',
    'S1_F1': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes its general approach for providing or enabling remedy for human rights impacts. Answer "YES" if an approach is described.',
    'S1_F2': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes its process for providing remedy for negative impacts it directly caused. Answer "YES" if this process is detailed.',
    'S1_F3': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes the specific channels its workforce can use to raise concerns. Answer "YES" if channels are described.',
    'S1_F4': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company states that it has a formal grievance or complaints mechanism. Answer "YES" if one is mentioned.',
    'S1_F5': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains the processes it uses to support and ensure the availability of these channels. Answer "YES" if support processes are described.',
    'S1_F6': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company describes how it tracks and monitors issues from the moment they are raised until they are resolved. Answer "YES" if a tracking process is mentioned.',
    'S1_F7': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains how it ensures its channels are effective. Answer "YES" if effectiveness checks are described.',
    'S1_F8': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains HOW it evaluates whether its workforce is aware of the available channels to raise concerns. Answer "YES" if an assessment process is described.',
    'S1_F9': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company explains HOW it evaluates whether its workforce trusts that the channels are safe and effective. Answer "YES" if an assessment process is described.',
    'S1_F10': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company states it has policies to protect individuals from retaliation after raising a concern. Answer "YES" if anti-retaliation policies are mentioned.',
    'S1_F11': 'Focus strictly on the company\'s own workforce. Disregard any information about workers in the value chain or suppliers. Assess if the company provides examples or descriptions of actions it has taken to provide remedy for specific material impacts. Answer "YES" if such actions are described.',
    'S1_G1': 'Focus strictly on the company\'s own employees. Disregard any information about workers in the value chain or suppliers. Assess if the company states that all its employees are covered by social protection for sickness. Answer "YES" if universal coverage is confirmed.',
    'S1_G2': 'Focus strictly on the company\'s own employees. Disregard any information about workers in the value chain or suppliers. Assess if the company states that all its employees are covered by social protection for unemployment. Answer "YES" if universal coverage is confirmed.',
    'S1_G3': 'Focus strictly on the company\'s own employees. Disregard any information about workers in the value chain or suppliers. Assess if the company states that all its employees are covered by social protection for employment injury and disability. Answer "YES" if universal coverage is confirmed.',
    'S1_G4': 'Focus strictly on the company\'s own employees. Disregard any information about workers in the value chain or suppliers. Assess if the company states that all its employees are covered by social protection for parental leave. Answer "YES" if universal coverage is confirmed.',
    'S1_G5': 'Focus strictly on the company\'s own employees. Disregard any information about workers in the value chain or suppliers. Assess if the company states that all its employees are covered by social protection for retirement. Answer "YES" if universal coverage is confirmed.',
    'S1_G6': 'Focus strictly on the company\'s own employees. Disregard any information about workers in the value chain or suppliers. Assess if the company discloses the specific countries where gaps in social protection coverage exist, if not all are covered. Answer "YES" if countries with gaps are named.'
}

In [None]:
# Full evaluation run: for every report in `sample`, parse -> chunk -> embed -> retrieve;
# afterwards build all prompts, run one batched LLM pass, parse the answers and score them.
retrieved_chunks = {}  # Accumulates the retrieve_chunks() results of all reports (merged via dict.update)
report_ids = []        # Report IDs in processing order; handed to prepare_prompts() below

# Loop through each report in the sample DataFrame (the row index `idx` is not used)
for idx, row in sample.iterrows():
    company_name = row['company_withAccessInfo']
    # Report ID = prepare_filename(company) + "_2024" with all spaces removed;
    # it doubles as the PDF file stem and the vectorstore sub-folder name below.
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    print(f"\n Processing: {report_id}")

    # 1. Parse the document (the second return value of parse_pdf is discarded)
    path = f"./sample_reports/{report_id}.pdf"
    pages, _ = parse_pdf(path=path)
    
    # 2. Chunk the text (CHUNK_SIZE / CHUNK_OVERLAP are notebook-level constants)
    chunks, metadata = chunk_text(pages, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    
    # 3. Generate and store vector representations
    # NOTE(review): the near-zero timings in the recorded output suggest get_vectorstore
    # reuses an index already present at db_path — confirm in get_vectorstore.
    start_time = time.time()
    db_path = f"./faiss_db_Baseline2/{report_id}"
    vectorstore = get_vectorstore(chunks, metadata, db_path, embedding_model=embeddings_qwen)
    print(f"Step 3: Generated vectorstore — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # 4. Retrieve the relevant chunks for every entry of QUERIES (TOP_K passed as top_k)
    start_time = time.time()
    result = retrieve_chunks(vectorstore, queries=QUERIES, report_id=report_id, top_k=TOP_K)
    retrieved_chunks.update(result)
    report_ids.append(report_id)
    print(f"Step 4: Retrieved chunks — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

# 5. Generate LLM answers — runs ONCE after the loop, over all reports together
# 5a. Prepare prompts
# NOTE: `metadata` is rebound here; the chunk metadata of the last loop iteration is
# no longer reachable under this name.
# NOTE(review): prepare_prompts receives neither QUERIES nor GUIDELINES, so it presumably
# reads them from notebook globals — make sure the intended definitions were run last.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5b. Run inference (all prompts in one batched call; `generate_text` is defined elsewhere)
model_responses = run_batched_inference(prompts, generate_text)
# 5c. Parse and finalize results (rebinds `final_analysis` to the value returned by parse_results)
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set;
# as the cell's last expression, the returned metrics are displayed as the cell output.
evaluate_verdicts(validation_set, final_analysis)


 Processing: SchneiderElectric_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes

 Processing: ContinentalAG_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes
--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 8.19 minutes

--- Step 3: Parsing all responses ---
No JSON found. This was the LLM response: {
  "ANSWER": "[[YES]] The company discloses policies for managing impacts, risks, and opportunities related to its own workforce.",
  "SOURCES": [
    119,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222,
    222

{'accuracy': 0.6846153846153846,
 'precision': 0.7368047337278107,
 'recall': 0.6846153846153846,
 'f1_score': 0.6930657531301556}

In [26]:
# Re-score the same `final_analysis` with verbose=True; per the recorded output this
# additionally prints the individual mismatches (true vs. predicted verdict and analysis).
evaluate_verdicts(validation_set, final_analysis, verbose=True)


--- LLM Evaluation: Evaluating performance on verdicts ---

 Mismatches:
  ContinentalAG_2024 | S1_A1 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
"The consolidation at group level of the assessment of the impacts,
risks and opportunities for own workforce includes Continental’s
entire workforce." (page 175)
It also highlights that vulnerable groups, including migrant workers, are considered within this scope (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not explicitly confirm that all materially impacted individuals in its own workforce are considered.
  ContinentalAG_2024 | S1_A2 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
report explicitly mentions characteristics of the company's employees,  acknowledges vulnerable groups such as migrant workers (page 176) and women (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not describe specific categories or types of employees within its own workforce subject to material impacts.
  Co

{'accuracy': 0.6846153846153846,
 'precision': 0.7368047337278107,
 'recall': 0.6846153846153846,
 'f1_score': 0.6930657531301556}

### 4. Queries and guidelines improved based on the previous output

In [14]:
# Retrieval/assessment questions, keyed by disclosure-item ID ("S1_<group><number>").
# The dict is passed as `queries=` to retrieve_chunks() in the pipeline cell, so every
# wording change here affects which chunks are retrieved — re-run the pipeline cell
# after editing to refresh the recorded metrics.
# Keys must stay in sync with GUIDELINES (same 65 IDs).
QUERIES = {
    # --- A: material impacts, policies and actions ---
    'S1_A1': 'Does the company disclose whether all people in its own workforce who could be materially impacted are included in the scope of its disclosure?',
    'S1_A2': 'Does the company describe the types of employees in its own workforce that are subject to material impacts?',
    'S1_A3': 'Does the company describe the types of non-employees in its own workforce that are subject to material impacts?',
    'S1_A4': 'Does the company describe activities that result in material positive impacts on its own workforce?',
    'S1_A5': 'Does the company identify the types of employees and non-employees positively affected or potentially affected by activities that result in material positive impacts on its own workforce?',
    'S1_A6': 'Does the company report material impacts, risks and opportunities on its workforce resulting from transition plans for greener and climate-neutral operations?',
    'S1_A7': 'Has the company developed an understanding of how the people in its own workforce with particular characteristics, contexts, or activities may be at greater risk of harm?',
    'S1_A8': 'Does the company disclose policies to manage its material impacts, risks, and opportunities related to its own workforce?',
    'S1_A9': 'Does the company specify whether its workforce-related policies apply to specific groups or the entire workforce?',
    'S1_A10': 'Does the company describe actions taken, planned or underway to prevent or mitigate material negative impacts on its own workforce?',
    # Fixed garbled wording ("delivering outcomes positive impacts or mitigarte ...").
    'S1_A11': 'Does the company describe how it tracks and assesses the effectiveness of actions and initiatives in delivering positive impacts or mitigating negative impacts for its own workforce?',
    'S1_A12': 'Does the company describe actions or initiatives that have the primary purpose of delivering positive impacts for its own workforce?',
    'S1_A13': 'Does the company describe the processes it uses to identify what action is needed in response to actual or potential negative impacts on its own workforce?',
    'S1_A14': 'Does the company describe actions planned or underway to pursue material opportunities related to its own workforce?',
    'S1_A15': 'Does the company disclose whether and how it ensures that its own practices do not cause or contribute to material negative impacts on its own workforce?',
    'S1_A16': 'Does the company disclose what resources are allocated to the management of its material impacts on its own workforce?',
    # --- B: material risks and dependencies ---
    'S1_B1': 'Does the company disclose any material risks arising from its dependencies on its own workforce?',
    'S1_B2': 'Does the company disclose whether any material risks or opportunities arising from impacts and dependencies on people in its own workforce apply specifically to certain groups rather than to the entire workforce?',
    'S1_B3': 'Does the company describe actions planned or underway to mitigate material risks arising from its impacts and dependencies on its own workforce?',
    'S1_B4': 'Does the company describe how it tracks the effectiveness of actions taken to mitigate material risks arising from its impacts and dependencies on its own workforce?',
    # --- C: human rights ---
    'S1_C1': 'Does the company disclose operations that are at significant risk of incidents of forced labour or compulsory labour by type of operation (e.g., manufacturing plant)?',
    'S1_C2': 'Does the company disclose operations that are at significant risk of incidents of forced labour or compulsory labour by countries or geographic areas?',
    'S1_C3': 'Does the company disclose operations that are at significant risk of incidents of child labour by type of operation (e.g., manufacturing plant)?',
    'S1_C4': 'Does the company disclose operations that are at significant risk of incidents of child labour by countries or geographic areas?',
    'S1_C5': 'Does the company describe its human rights policy commitments relevant to its own workforce?',
    'S1_C6': 'Does the company describe its general approach to respect human rights, including labour rights, of people in its own workforce?',
    'S1_C7': 'Does the company disclose whether its policies with regard to its own workforce are aligned with relevant internationally recognised instruments, including the UN Guiding Principles on Business and Human Rights?',
    'S1_C8': 'Does the company state whether its policies related to its own workforce explicitly address trafficking in human beings, forced labour or compulsory labour, and child labour?',
    'S1_C9': 'Does the company explain the existence and role of any Global Framework Agreement or other agreements with workers’ representatives related to respecting the human rights of its own workforce?',
    'S1_C10': 'Does the company disclose the number of severe human rights incidents connected to its workforce during the reporting period?',
    'S1_C11': 'Does the company disclose the total amount of fines, penalties, and compensation paid related to the severe human rights incidents connected to its workforce?',
    # --- D: workforce engagement ---
    'S1_D1': 'Does the company disclose its general approach to engage with people in its own workforce?',
    'S1_D2': 'Does the company disclose whether and how the perspectives of its own workforce inform its decisions or activities aimed at managing actual and potential impacts on its workforce?',
    'S1_D3': 'Does the company disclose whether engagement to inform decisions or activities occurs directly with its own workforce or with workers’ representatives?',
    'S1_D4': 'Does the company explain the stage(s) at which engagement with its own workforce occurs?',
    'S1_D5': 'Does the company disclose the type of engagement it conducts with its own workforce?',
    'S1_D6': 'Does the company disclose the frequency of engagement with its own workforce?',
    'S1_D7': 'Does the company disclose the function and the most senior role responsible for ensuring that engagement with its own workforce takes place and informs the company’s decisions or activities?',
    # Fixed verb agreement ("Does the company assesses" -> "assess").
    'S1_D8': 'Does the company assess the effectiveness of its engagement with its own workforce to inform its decisions?',
    'S1_D9': 'Does the company disclose the steps it takes to gain insight into the perspectives of particularly vulnerable or marginalised people in its own workforce?',
    'S1_D10': 'Does the company disclose whether and how it engaged directly with its own workforce or workers’ representatives when setting workforce-related targets?',
    'S1_D11': 'Does the company disclose whether and how it engaged directly with its own workforce or workers’ representatives in tracking performance against workforce-related targets?',
    'S1_D12': 'Does the company disclose whether and how it engaged directly with its own workforce or workers’ representatives in identifying lessons learned or improvements based on performance against workforce-related targets?',
    # --- E: discrimination, diversity and inclusion ---
    'S1_E1': 'Does the company report whether it has specific policies aimed at eliminating discrimination, including harassment, and promoting equal opportunities, diversity and inclusion among its workforce?',
    'S1_E2': 'Does the company report whether its policy on discrimination among its workforce specifically addresses grounds of discrimination?',
    'S1_E3': 'Does the company report whether it has policy commitments that aim to include or support people from groups at particular risk of vulnerability in its workforce?',
    'S1_E4': 'Does the company report whether and how its discrimination and inclusion policies for its workforce are implemented?',
    'S1_E5': 'Does the company disclose information about reconciliation of fines, penalties, and compensation for damages as a result of work-related discrimination and harassment in its workforce?',
    # --- F: remedy and channels to raise concerns ---
    'S1_F1': 'Does the company disclose its general approach to providing and/or enabling remedy for human rights impacts on its own workforce?',
    'S1_F2': 'Does the company describe its approach and processes for providing or contributing to remedy for material negative impacts it has caused or contributed to on its own workforce?',
    'S1_F3': 'Does the company describe it having specific channels for its own workforce to raise concerns or needs?',
    'S1_F4': 'Does the company report having a grievance or complaints handling mechanism for employee matters?',
    'S1_F5': 'Does the company describe the processes through which it supports the availability of channels in the workplace of its own workforce to raise concerns or needs?',
    'S1_F6': 'Does the company describe how it tracks and monitors issues raised by its own workforce and how they are addressed?',
    'S1_F7': 'Does the company describe how it ensures the effectiveness of the channels for its workforce to raise concerns or needs?',
    'S1_F8': 'Does the company disclose whether and how it assesses that its own workforce is aware of the channels for raising concerns?',
    'S1_F9': 'Does the company disclose whether and how it assesses that its own workforce trusts the channels for raising concerns?',
    'S1_F10': 'Does the company disclose whether it has policies in place to protect individuals against retaliation for raising concerns?',
    'S1_F11': 'Does the company describe actions it has taken to provide or enable remedy for actual material impacts on its own workforce?',
    # --- G: social protection ---
    'S1_G1': 'Does the company disclose whether all its employees are covered by social protection for sickness?',
    'S1_G2': 'Does the company disclose whether all its employees are covered by social protection for unemployment starting from when they work for the company?',
    'S1_G3': 'Does the company disclose whether all its employees are covered by social protection for employment injury and acquired disability?',
    'S1_G4': 'Does the company disclose whether all its employees are covered by social protection for parental leave?',
    'S1_G5': 'Does the company disclose whether all its employees are covered by social protection for retirement?',
    'S1_G6': 'If not all employees are covered by social protection, does the company disclose the countries where gaps in social protection exist?',
}
GUIDELINES = {
    'S1_A1': 'Focus on whether the company discloses if all materially impacted individuals in its own workforce are considered or not. Answer "YES" if this is disclosed. Answer "NO" if the company does not explicitly state who included in the scope of its disclosure.',
    'S1_A2': 'Check whether the company describes types of employees that are affected by material impacts arising from its operations. Answer "YES" if employee types are described. Answer "NO" if such types are not specified.', 
    'S1_A3': 'Check whether the company describes types of non-employees who are materially affected by its operations. Answer "YES" if such types are described. Answer "NO" if no non-employee types are described.', 
    'S1_A4': 'No additional guidelines', 
    'S1_A5': 'No additional guidelines', 
    'S1_A6': 'Impacts include restructuring and employment loss as well as opportunities arising from job creation and reskilling or upskilling.', 
    'S1_A7': 'Focus on whether the company explicitly states that it has developed an understanding of increased risk of harm for specific groups in its own workforce. These include people with particular characteristics (e.g., young people, women, migrants), those working in specific contexts (e.g., poorly regulated labour markets), or performing certain activities (e.g., handling chemicals, zero-hours contracts).', 
    'S1_A8': 'No additional guidelines', 
    'S1_A9': 'No additional guidelines', 
    'S1_A10': 'Check whether the company reports on actions that are already implemented, in progress, or planned, which aim to prevent or reduce material negative impacts on its own workforce.', 
    'S1_A11': 'Focus on whether the company explains how it monitors and evaluates whether its actions and initiatives have effectively addressed material impacts on its own workforce. Tracking and assessment processes may include, but are not limited to, internal or external audits, verification systems, court decisions, impact assessments, measurement systems, stakeholder feedback, grievance mechanisms, benchmarking, or external performance ratings. State "YES" if any process for assessing effectiveness is described. State "NO" if the report does not include such information.', 
    'S1_A12': 'No additional guidelines', 
    'S1_A13': 'Answer "YES" if the company explicitly mentions processes to identify actions needed in response to actual or potential negative impacts on its own workforce. Answer "NO" if the company does not explicitly mention processes to identify needed actions.', 
    'S1_A14': 'No additional guidelines', 
    'S1_A15': 'Focus on whether the company explains how it ensures that its internal practices — including procurement, sales, or data use, where relevant — do not cause or contribute to material negative impacts on its own workforce. The disclosure may also include how the company handles tensions between preventing such impacts and other business pressures. State "YES" if the report explains whether and how the company addresses this. State "NO" if there is no such disclosure or the explanation is missing.', 
    'S1_A16': 'No additional guidelines', 
    'S1_B1': 'Focus on whether the company reports risks resulting from its dependence on workforce-related conditions. This may include risks of operational disruption due to high employee turnover or lack of skills and training development.', 
    'S1_B2': 'Check whether the company discloses whether any material risks or opportunities arising from impacts and dependencies on people in its own workforce apply to specific groups (e.g., age groups, workers in a particular country or site), instead of applying broadly to the whole workforce (e.g., general pay cut or training offered to all people in its own workforce). Answer "YES" if this is disclosed. Answer "NO" if the company does not explicitly state who included in the scope of its disclosure.', 
    'S1_B3': 'No additional guidelines', 
    'S1_B4': 'No additional guidelines', 
    'S1_C1': 'No additional guidelines', 
    'S1_C2': 'No additional guidelines', 
    'S1_C3': 'No additional guidelines', 
    'S1_C4': 'No additional guidelines', 
    'S1_C5': 'Focus on whether the company discloses its human rights policy commitments specifically related to its own workforce. This includes policies addressing respect for human rights and labour rights, as well as processes and mechanisms to monitor compliance with the UN Guiding Principles on Business and Human Rights, the ILO Declaration on Fundamental Principles and Rights at Work, and the OECD Guidelines for Multinational Enterprises.', 
    'S1_C6': 'No additional guidelines', 
    'S1_C7': 'No additional guidelines', 
    'S1_C8': 'No additional guidelines', 
    'S1_C9': 'No additional guidelines', 
    'S1_C10': 'Focus on whether the company discloses the total number of severe human rights incidents, such as forced labour, human trafficking, or child labour, linked to its workforce during the reporting period.', 
    'S1_C11': 'No additional guidelines', 
    'S1_D1': 'No additional guidelines', 
    'S1_D2': 'No additional guidelines', 
    'S1_D3': 'No additional guidelines', 
    'S1_D4': 'No additional guidelines', 
    'S1_D5': 'Focus on the type of engagement of the workforce when informing decisions or activities related to managing actual or potential impacts on its own workforce.', 
    'S1_D6': 'Focus on the frequency of engagement with its own workforce when informing decisions or activities related to managing actual or potential impacts on its workforce.', 
    'S1_D7': 'Focus on whether the company specifies the function and the most senior role within the undertaking that has operational responsibility for ensuring engagement with the workforce occurs and that the results are integrated into the company’s decisions or activities. ', 
    'S1_D8': 'No additional guidelines', 
    'S1_D9': 'Focus on whether the company reports any specific actions or procedures it undertakes to understand the perspectives of vulnerable or marginalised groups within its own workforce. These groups may include, but are not limited to, women, migrants, and people with disabilities. ', 
    'S1_D10': 'Focus on whether the company describes any direct engagement with its own workforce or workers’ representatives specifically during the process of setting workforce-related targets. ', 
    'S1_D11': 'Focus on whether the company reports involving its workforce or workers’ representatives in the monitoring or tracking of progress toward workforce-related targets.', 
    'S1_D12': 'Focus on whether the company reports involving its own workforce or workers’ representatives in assessing outcomes or lessons learned from past performance on workforce-related targets, and in identifying improvements.', 
    'S1_E1': 'No additional guidelines', 
    'S1_E2': 'Check whether the company specifically addresses any of the following grounds: racial and ethnic origin, colour, sex, sexual orientation, gender identity, disability, age, religion, political opinion, national extraction, social origin, or other forms of discrimination covered by EU or national law.', 
    'S1_E3': 'No additional guidelines', 
    'S1_E4': 'Check whether the company describes how policies are implemented to ensure discrimination is prevented, mitigated and acted upon once detected, as well as to advance diversity and inclusion in general.', 
    'S1_E5': 'No additional guidelines', 
    'S1_F1': 'Focus on whether the company describes any general approach, measures, or processes aimed at providing or enabling remedy for actual or potential human rights impacts affecting for its own workforce.', 
    'S1_F2': 'No additional guidelines', 
    'S1_F3': 'Focus on whether the company describes the channels available to its own workforce. Examples can be grievance mechanisms, hotlines, trade unions, works councils, or dialogue processes.', 
    'S1_F4': 'No additional guidelines', 'S1_F5': 'No additional guidelines', 
    'S1_F6': 'No additional guidelines', 
    'S1_F7': 'No additional guidelines', 
    'S1_F8': 'Focus on the disclosure of an assessment methods regarding workforce awareness of structures like grievance mechanisms or hotlines. ',
    'S1_F9': 'No additional guidelines', 
    'S1_F10': 'Focus on the existence of policies protecting individuals, including workers’ representatives, from retaliation when using channels to raise concerns. ', 
    'S1_F11': "Focus on the description of actions taken to provide or enable remedy for actual material impacts that have occurred on the company's own workforce.", 
    'S1_G1': 'No additional guidelines', 
    'S1_G2': 'No additional guidelines', 
    'S1_G3': 'No additional guidelines', 
    'S1_G4': 'No additional guidelines', 
    'S1_G5': 'No additional guidelines', 
    'S1_G6': 'No additional guidelines'}

In [16]:
# Full evaluation run with the QUERIES / GUIDELINES defined in the cell above:
# for every report in `sample`, parse -> chunk -> embed -> retrieve; afterwards build all
# prompts, run one batched LLM pass, parse the answers and score them (verbose).
retrieved_chunks = {}  # Accumulates the retrieve_chunks() results of all reports (merged via dict.update)
report_ids = []        # Report IDs in processing order; handed to prepare_prompts() below

# Loop through each report in the sample DataFrame (the row index `idx` is not used)
for idx, row in sample.iterrows():
    company_name = row['company_withAccessInfo']
    # Report ID = prepare_filename(company) + "_2024" with all spaces removed;
    # it doubles as the PDF file stem and the vectorstore sub-folder name below.
    report_id = f"{prepare_filename(company_name)}_2024".replace(" ", "")
    print(f"\n Processing: {report_id}")

    # 1. Parse the document (the second return value of parse_pdf is discarded)
    path = f"./sample_reports/{report_id}.pdf"
    pages, _ = parse_pdf(path=path)
    
    # 2. Chunk the text (CHUNK_SIZE / CHUNK_OVERLAP are notebook-level constants)
    chunks, metadata = chunk_text(pages, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    
    # 3. Generate and store vector representations
    # NOTE(review): the near-zero timings in the recorded output suggest get_vectorstore
    # reuses an index already present at db_path — confirm in get_vectorstore.
    start_time = time.time()
    db_path = f"./faiss_db_D1/{report_id}"
    vectorstore = get_vectorstore(chunks, metadata, db_path, embedding_model=embeddings_qwen)
    print(f"Step 3: Generated vectorstore — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

    # 4. Retrieve the relevant chunks for every entry of QUERIES (TOP_K passed as top_k)
    start_time = time.time()
    result = retrieve_chunks(vectorstore, queries=QUERIES, report_id=report_id, top_k=TOP_K)
    retrieved_chunks.update(result)
    report_ids.append(report_id)
    print(f"Step 4: Retrieved chunks — Computation time: {(time.time() - start_time) / 60:.2f} minutes")

# 5. Generate LLM answers — runs ONCE after the loop, over all reports together
# 5a. Prepare prompts
# NOTE: `metadata` is rebound here; the chunk metadata of the last loop iteration is
# no longer reachable under this name.
# NOTE(review): prepare_prompts receives neither QUERIES nor GUIDELINES, so it presumably
# reads them from notebook globals — make sure the intended definitions were run last.
prompts, metadata, final_analysis = prepare_prompts(
    report_list=report_ids,
    section_text_dict=retrieved_chunks
)
# 5b. Run inference (all prompts in one batched call; `generate_text` is defined elsewhere)
model_responses = run_batched_inference(prompts, generate_text)
# 5c. Parse and finalize results (rebinds `final_analysis` to the value returned by parse_results)
final_analysis = parse_results(
    model_responses,
    metadata,
    final_analysis
)

# Evaluate the performance against the hand-coded validation set; verbose=True also prints
# the mismatches, and the returned metrics are displayed as the cell output.
evaluate_verdicts(validation_set, final_analysis, verbose=True)


 Processing: SchneiderElectric_2024
Step 3: Generated vectorstore — Computation time: 0.01 minutes
Step 4: Retrieved chunks — Computation time: 0.04 minutes

 Processing: ContinentalAG_2024
Step 3: Generated vectorstore — Computation time: 0.00 minutes
Step 4: Retrieved chunks — Computation time: 0.03 minutes
--- Step 1: Preparing all prompts ---

--- Step 2: Sending 130 prompts to the pipeline ---
Generated LLM answers — Computation time: 4.10 minutes

--- Step 3: Parsing all responses ---

--- LLM Evaluation: Evaluating performance on verdicts ---

 Mismatches:
  ContinentalAG_2024 | S1_A1 
 TRUE VERDICT: YES, TRUE ANALYSIS   : [[YES]] 
"The consolidation at group level of the assessment of the impacts,
risks and opportunities for own workforce includes Continental’s
entire workforce." (page 175)
It also highlights that vulnerable groups, including migrant workers, are considered within this scope (page 176)
 PRED VERDICT: NO, PRED ANALYSIS   : [[NO]] The company does not explicitly

{'accuracy': 0.7538461538461538,
 'precision': 0.7538461538461538,
 'recall': 0.7538461538461538,
 'f1_score': 0.7538461538461538}