In [None]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import re
import json
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from sentence_transformers import SentenceTransformer
import faiss
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Read variables from a local .env file into the process environment, then
# authenticate against the Hugging Face Hub with the token found there.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)

Reading Files

Load the pre-processed report text from disk.

In [None]:
# Path of the pre-processed report text that the rest of the notebook works on.
file_name = "esg-report-v1/pdf_17_processed.txt"
# Pin the encoding: without it open() uses the platform default, which can
# mis-decode or fail on non-ASCII characters on some systems.
with open(file_name, 'r', encoding='utf-8') as file:
    pdf_report = file.read()

Converting tables from HTML to CSV format

In [None]:
def _fit_row_to_width(row, width):
    """Return ``row`` adjusted to ``width`` cells.

    Over-long rows lose their left-most cells of length <= 1 (one per surplus
    column; a row without such cells is left over-long). Short rows are
    padded on the right with empty strings.
    """
    surplus = len(row) - width
    if surplus <= 0:
        return row + [""] * (-surplus)
    fitted = []
    for cell in row:
        if surplus > 0 and len(cell) <= 1:
            surplus -= 1
            continue
        fitted.append(cell)
    return fitted


def _format_cell(cell):
    """Render one cell so that it cannot be confused with the ', ' separator."""
    if cell == "":
        return '""'
    # Drop only commas that sit between two digits (e.g. "1,234" -> "1234");
    # commas that belong to ordinary text are kept.
    cell = re.sub(r"(?<=\d),(?=\d)", "", cell)
    # Any remaining comma would break the column layout, so bracket the cell.
    if "," in cell:
        cell = f"[{cell}]"
    return cell


def html_to_csv(html):
    """Convert the first ``<table>`` found in ``html`` into CSV-like text.

    Parameters
    ----------
    html : str
        Markup containing an HTML table.

    Returns
    -------
    str
        One line per ``<tr>``, cells joined by ", " and each line terminated
        by a newline. The first row fixes the column count; later rows are
        trimmed or padded to it. Empty cells are written as ``""``, commas
        between digits are removed, and cells that still contain a comma are
        wrapped in square brackets. Returns an empty string when ``html``
        contains no table or the table has no rows.
    """
    table = BeautifulSoup(html, 'html.parser').find('table')
    if table is None:
        # Nothing to convert; previously this raised AttributeError.
        return ""

    rows = [
        [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
        for tr in table.find_all('tr')
    ]
    if not rows:
        return ""

    # The first row defines how many columns every other row must have.
    num_col = len(rows[0])
    lines = []
    for row in rows:
        cells = [_format_cell(cell) for cell in _fit_row_to_width(row, num_col)]
        lines.append(", ".join(cells) + "\n")
    return "".join(lines)

In [None]:
# Rebuild the report paragraph by paragraph, replacing every paragraph that
# holds an HTML table with its CSV rendering wrapped in a ```{table} fence.
# DOTALL lets the check match tables whose markup spans several lines inside
# one paragraph (paragraphs are separated by blank lines, not single newlines).
html_block_re = re.compile(r"<html>.+</html>", re.DOTALL)
converted_paragraphs = []
for para in pdf_report.split("\n\n"):
    if html_block_re.search(para):
        para = '```{table}\n' + html_to_csv(para) + '```'
    # Every paragraph, including the last, is followed by a blank line.
    converted_paragraphs.append(para + "\n\n")
pdf_report_tt = "".join(converted_paragraphs)

Dividing pdf into sections

In [None]:
# Matches each header section. A match starts at a '#', runs through at least
# one newline and ends on the last non-whitespace character before the next
# '#' (or the end of the text). (?s) lets '.' cross newlines.
# Note: text before the first '#' is not captured, and the lookahead is not
# anchored to line starts, so a '#' inside body text also ends a section.
pattern = re.compile(r'(?s)(#.*?\n.*?\S)(?=\s*#|$)')
sections = [match.group(1) for match in pattern.finditer(pdf_report_tt)]

Loading LLM Model for Summarization

In [None]:
# bitsandbytes settings shared by every model load in this notebook:
# 4-bit weights in the "nf4" format with double quantization enabled,
# and bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Checkpoint used for the summarization pass; the tokenizer and the quantized
# model are both loaded from the same name.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    device_map="auto",
)

Loading ESG Relevant Data Stored in JSON

In [None]:
# Indicator definitions keyed by indicator name. The encoding is pinned so
# the file is decoded the same way regardless of the platform default.
with open("esg_indicators.json", 'r', encoding='utf-8') as file:
    esg_indicators = json.load(file)

In [None]:
# Indicator names, in the order they appear in esg_indicators.
INDICATORS = list(esg_indicators)

In [None]:
# General ESG background that is interpolated into the prompts. The encoding
# is pinned so the file is decoded the same way regardless of the platform
# default.
with open('esg_context.json', 'r', encoding='utf-8') as file:
    ESG_CONTEXT = json.load(file)

In [None]:
# Distinct keywords across all indicators. Sorting gives a stable order, so
# the prompt text built from KEYWORDS is identical from one run to the next
# (the iteration order of a plain set of strings can vary between runs).
KEYWORDS = sorted({
    keyword
    for indicator in esg_indicators.values()
    for keyword in indicator['keywords']
})

Prompt creation and summarizing

TIP: If the summaries keep coming out strange for the list of contents (table of contents), add an instruction to the prompt telling the model to ignore it.

In [None]:
# Fixed part of the summarization prompt; the section text is appended to it
# when the per-section message is built.
# This is an f-string: ESG_CONTEXT, INDICATORS and KEYWORDS are interpolated
# once, when this cell runs, so re-run the cell after changing any of them.
# "{{table}}" is an escaped brace pair and renders as the literal "{table}".
base_msg = f"""You are an expert ESG reporting analyst specializing in the automotive industry. Your task is to generate a concise, retrieval-optimized summary for each section of an ESG report. Each section is introduced by markdown headers (e.g., '# Section Name') and contains detailed ESG-related information. Your summary must capture the key insights, metrics, and trends described in the section, while naturally incorporating relevant ESG terminology from {ESG_CONTEXT} and aligning with potential future changes in {INDICATORS} and {KEYWORDS}.

Note: Any table appearing in the text is enclosed by "```{{table}}...```" and is represented in CSV format.

Instructions:
1. Analyze the provided section carefully and extract its unique, most salient data points and insights.
2. Write a single, clear sentence that summarizes the section in an information-dense manner. Include numerical details, trends, or context where available.
3. Where applicable, naturally integrate relevant ESG keywords and indicator concepts from {KEYWORDS} and {INDICATORS} without simply listing them. The summary should read as a coherent description of the section’s content.
4. Ensure that each summary is distinct and tailored to its specific section, avoiding generic or repetitive phrasing across sections.
5. If the section contains no meaningful content, return an empty string.
6. The entire input (including whitespaces such as newlines) represents one section; return one summarization statement accordingly.

Output:
Return only the section header and its corresponding summary enclosed within "<summary>...</summary>" tags (without any markdown formatting).

Example:
Input: "## Energy and Emissions Reduction\n\nIn 2023, our facilities consumed 2.5 million MWh of energy (3% reduction YoY), with energy consumption per vehicle produced dropping to 1.8 MWh/unit. Scope 1-2 GHG emissions totaled 450,000 metric tons, while ZEV sales grew to 12% of total vehicles sold..."
Output: "<summary>This section outlines a 3% YoY reduction in energy consumption (2.5M MWh total and 1.8 MWh per vehicle), alongside 450k metric tons of GHG emissions and a 12% increase in ZEV sales, reflecting key energy, emissions, and ZEV performance trends.</summary>"

Here is the input section to summarize:\n
"""


In [None]:
# Tunables for the summarization pass.
MAX_SECTIONS = 50             # only the first MAX_SECTIONS sections are summarized; None = all
SUMMARY_MAX_NEW_TOKENS = 512  # upper bound per summary; generation still stops at end-of-turn

# The pad token only needs to be set once, not on every iteration.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

# summarized[i] is the summary of sections[i]; sections without a
# <summary>...</summary> answer get an empty string so the indices stay aligned.
summarized = []
for section in sections[:MAX_SECTIONS]:
    section_without_newline = re.sub("\n", " ", section)
    ip_msg = f'"""{section_without_newline}"""'
    msg = base_msg + ip_msg
    messages = [{'role': 'user', 'content': msg}]
    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user turn; return_dict=True also
    # yields the attention mask that generate() expects.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=SUMMARY_MAX_NEW_TOKENS,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    op = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # DOTALL so that a summary containing line breaks is still captured.
    op = re.findall(r'<summary>(.*?)</summary>', op, flags=re.DOTALL)
    op = ' '.join(op)
    summarized.append(op)

Creating separate indexes for semantic search and keyword search

In [None]:
# Embed every summary and store the vectors in a flat inner-product index.
# The vectors are L2-normalized first, so inner product equals cosine similarity.
vmodel = SentenceTransformer("BAAI/bge-m3")
embeddings = vmodel.encode(summarized, convert_to_numpy=True)
faiss.normalize_L2(embeddings)  # modifies `embeddings` in place

d = embeddings.shape[1]  # embedding dimensionality
index = faiss.IndexFlatIP(d)
index.add(embeddings)

In [None]:
# BM25 works on token lists: lower-case each summary and split it with NLTK.
tokenized_summaries = [
    word_tokenize(summary.lower())
    for summary in summarized
]
bm25 = BM25Okapi(tokenized_summaries)

Actual Searching, Reranking, and Output

TIP: You can tweak `alpha` to control the importance of semantic search relative to keyword search. The higher the `alpha`, the more weight is given to the semantic search result.

`semantic_query`: The query to be used for semantic search.

`keyword_query_list`: The list of keywords to be used for keyword search.

`rerank_k`: The top k results that will be chosen from the 2 vector dbs before reranking.

`top_k`: The ultimate top k results chosen after reranking.

In [None]:
def search(semantic_query, keyword_query_list, rerank_k=200, top_k=50, alpha=0.7):
    """Hybrid retrieval over the summarized sections.

    Scores every candidate as ``alpha * semantic + (1 - alpha) * bm25`` and
    returns the best ones together with the original section text.

    Parameters
    ----------
    semantic_query : str
        Query embedded with ``vmodel`` and searched in the FAISS ``index``.
    keyword_query_list : list of str
        Keywords or phrases scored with ``bm25``.
    rerank_k : int
        Maximum number of candidates taken from each retriever.
    top_k : int
        Maximum number of results returned after combining the scores.
    alpha : float
        Weight of the semantic score; ``1 - alpha`` is the BM25 weight.

    Returns
    -------
    list of (str, float)
        ``(sections[i], combined_score)`` pairs, best first.

    Notes
    -----
    NOTE(review): the BM25 scores are not normalized here, so they are on a
    different scale than the cosine scores; confirm that ``alpha`` weighs the
    two retrievers as intended.
    """
    # --- Semantic search ---
    query_embedding = vmodel.encode([semantic_query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    # Asking FAISS for more neighbours than it stores pads the result with
    # index -1, which would otherwise be read as sections[-1].
    n_candidates = min(rerank_k, index.ntotal)
    semantic_results = {}
    if n_candidates > 0:
        D, I = index.search(query_embedding, n_candidates)
        semantic_results = {
            int(doc_id): float(score)
            for doc_id, score in zip(I[0], D[0])
            if doc_id >= 0
        }

    # --- BM25 keyword search ---
    # The BM25 corpus was lower-cased and word-tokenized, so the query must be
    # processed the same way; raw phrases such as "GHG emissions" never match.
    query_tokens = [
        token
        for keyword in keyword_query_list
        for token in word_tokenize(keyword.lower())
    ]
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:rerank_k]
    bm25_results = {int(i): float(bm25_scores[i]) for i in bm25_top_indices}

    # --- Combine and rerank ---
    combined_results = {doc_id: alpha * score for doc_id, score in semantic_results.items()}
    for doc_id, score in bm25_results.items():
        combined_results[doc_id] = combined_results.get(doc_id, 0.0) + (1 - alpha) * score

    ranked_indices = sorted(combined_results, key=combined_results.get, reverse=True)[:top_k]

    final_results = [(sections[i], combined_results[i]) for i in ranked_indices]
    return final_results

Extraction of data. Indicator by indicator.

In [None]:

# The one-shot examples are stored in a JSON file, keyed by indicator name.
# The encoding is pinned so the file is decoded the same way regardless of
# the platform default.
with open("oneshots.json", 'r', encoding='utf-8') as file:
    examples = json.load(file)

In [None]:
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# The same checkpoint is already loaded earlier in the notebook for the
# summarization pass. Loading it again would briefly hold two copies in GPU
# memory, so reuse the existing objects when they match MODEL_NAME and only
# load when they are missing or belong to a different checkpoint.
if getattr(globals().get("model"), "name_or_path", None) != MODEL_NAME:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", quantization_config=quant_config)
if getattr(globals().get("tokenizer"), "name_or_path", None) != MODEL_NAME:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Tunables for the extraction pass.
RELEVANCE_THRESHOLD = 0.7         # minimum combined search score for a section to reach the LLM
EXTRACTION_MAX_NEW_TOKENS = 1024  # upper bound per answer; generation still stops at end-of-turn

# The pad token only needs to be set once, not on every iteration.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

# res[i] holds the extracted value for INDICATORS[i].
res = []
for cur_indicator in INDICATORS:
    semantic_query = cur_indicator
    keyword_query_list = esg_indicators[cur_indicator]['keywords']
    results = search(semantic_query, keyword_query_list, rerank_k = 200, top_k=50, alpha=0.7)
    text_input = ""
    for text, score in results:
        if score > RELEVANCE_THRESHOLD:
            text_input += text
            text_input += "\n\n"
    if not text_input:
        # No section passed the threshold: give the answer the prompt asks
        # for in that case instead of querying the model with empty text.
        res.append(f"No data available for {cur_indicator}")
        continue
    msg = f"""You are an expert ESG reporting analyst specializing in the automotive industry. Your task is to extract '{cur_indicator}' indicator given a text to extract from.

        Instructions:
        """
    if esg_indicators[cur_indicator]['data_type'] == "Quantitative":
        msg += f"""        1. The current indicator is a quantitative indicator. You must extract relevant numeric data or a short string that includes one of the possible units specified in the "unit" instruction.
        2. The allowed list of units are: {esg_indicators[cur_indicator]['unit']}. For any other units found, convert into one of the units in the list.
        """
    else:
        msg += f"""        1. The current indicator is a qualitative indicator. You must locate the sentence(s) that best describe the indicator within the text to extract.
        2. If the relevant information is spread across disjoint sentences, return them as a separate element of a list; otherwise, return a list with single sentence.
        """
    # The closing delimiter of the quoted text is ''' (the original had a
    # stray fourth quote, leaving the delimiters unbalanced).
    msg += f"""        3. The following key words: {esg_indicators[cur_indicator]['keywords']} are some words that you can watch out for in extraction, but consider the overall context to ensure accurate extraction.
    4. You must follow the additional instructions specified here: {esg_indicators[cur_indicator]['extraction_notes']}.
    5. The final output should be enclosed with <output> and </output> tags.
    6. If there are no data to be found, return <output>No data available for {cur_indicator}</output>.

    The following are some contexts that you may refer to when understanding the text to extract from:
    Specific to '{cur_indicator}': {esg_indicators[cur_indicator]['background']}
    General ESG background: {ESG_CONTEXT}

    Example:
    {examples[cur_indicator]}.

    Here is the text to extract '{cur_indicator}' from:
    '''
    {text_input}
    '''
    """
    messages = [{'role': 'user', 'content': msg}]
    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user turn; return_dict=True also
    # yields the attention mask that generate() expects.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=EXTRACTION_MAX_NEW_TOKENS,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    op = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # DOTALL so multi-line answers are captured. If the model omitted the
    # tags, keep its raw answer rather than failing with an IndexError.
    tagged = re.findall(r'<output>(.*?)</output>', op, flags=re.DOTALL)
    res.append(tagged[0].strip() if tagged else op.strip())

In [None]:
# Single-row table: one column per indicator, holding its extracted value.
df = pd.DataFrame(data=[res], columns=INDICATORS)
df