In [None]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, MPNetPreTrainedModel, MPNetModel
import torch
import re
import json
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from torch.utils.data import DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import OrderedDict

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)

In [7]:
# Quantization settings handed to the causal LM when it is loaded:
# 4-bit NF4 weights, double quantization, bfloat16 compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Checkpoint used for the ESG / Non-ESG judgement prompts.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# The tokenizer and the quantized model are loaded independently of each other.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# Directory that holds the pre-processed PDF text files.
# An empty string means "relative to the current working directory".
data_path = ""

# os.path.join is used instead of f"{data_path}/..." so that an empty
# data_path does not turn the file name into an absolute path at the
# filesystem root ("/pdf_0_processed.txt").
# The literal 0 is presumably the document index -- TODO confirm.
name = os.path.join(data_path, f"pdf_{0}_processed.txt")
with open(name, "r", encoding="utf-8") as file:
    content = file.read()

# One match per header section: the match starts at a "#", runs lazily through
# at least one newline, and ends at the last non-whitespace character before the
# next "#" (or before the end of the text). (?s) lets "." cross line breaks.
# Note that any "#" character starts/ends a section, not only one at line start.
pattern = re.compile(r'(?s)(#.*?\n.*?\S)(?=\s*#|$)')
subs = pattern.findall(content)

In [None]:
## The ESG-BERT Model to be Used.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Tensor of token embeddings. The caller in this file passes
            the encoder's ``last_hidden_state`` directly (not the full output
            object).
        attention_mask: Tensor with one entry per token; it is unsqueezed on the
            last axis and expanded to ``model_output``'s size, then used as a
            multiplicative weight.

    Returns:
        Tensor with axis 1 reduced: the mask-weighted sum of the embeddings
        divided by the mask sum (clamped to at least 1e-9 so that a fully
        masked row does not divide by zero).
    """
    mask = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
    weighted_sum = (model_output * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count

# ESGify is defined as a custom class because it needs a sentence-transformers-style mean pooling function and a custom classifier head.
class ESGify(MPNetPreTrainedModel):
    """MPNet encoder followed by mean pooling and a classifier head for ESG risk topics.

    The forward pass returns one sigmoid score per class (47 outputs) rather
    than raw logits.
    """

    def __init__(self, config):
        """Build the encoder and the classifier head.

        Args:
            config: Model configuration. ``id2label`` and ``label2id`` are read
                from it and exposed as attributes of the model.
        """
        super().__init__(config)
        # Encoder without its built-in pooling layer; pooling is done by
        # mean_pooling() in forward().
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.id2label = config.id2label
        self.label2id = config.label2id
        # The attribute name and the layer names below become part of the
        # parameter names (state-dict keys), so they are kept exactly as they
        # are. The head expects 768-wide pooled embeddings and emits 47 scores.
        self.classifier = torch.nn.Sequential(OrderedDict([
            ('norm', torch.nn.BatchNorm1d(768)),
            ('linear', torch.nn.Linear(768, 512)),
            ('act', torch.nn.ReLU()),
            ('batch_n', torch.nn.BatchNorm1d(512)),
            ('drop_class', torch.nn.Dropout(0.2)),
            ('class_l', torch.nn.Linear(512, 47)),
        ]))

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized texts.

        Args:
            input_ids: Token ids passed to the MPNet encoder.
            attention_mask: Attention mask passed to the encoder and reused as
                the weighting mask for mean pooling.

        Returns:
            Tensor of sigmoid-activated classifier outputs.
        """
        # Feed input to the MPNet encoder.
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)

        # Mean-pool the token embeddings and feed them to the classifier head.
        pooled = mean_pooling(outputs['last_hidden_state'], attention_mask)
        logits = self.classifier(pooled)

        # torch.sigmoid replaces the hand-written 1 / (1 + exp(-x)).
        return torch.sigmoid(logits)
# Tokenizer and weights both come from the same 'ai-lab/ESGify' checkpoint;
# the two loads do not depend on each other.
esg_tokenizer = AutoTokenizer.from_pretrained('ai-lab/ESGify')
esg_model = ESGify.from_pretrained('ai-lab/ESGify')

In [None]:
def esg_classify(sub_splitted):
    """Predict the single highest-scoring ESG topic label for each paragraph.

    Args:
        sub_splitted: Iterable of paragraph strings.

    Returns:
        List with one ``esg_model.id2label`` entry per paragraph, in input order.
    """
    res = []
    # Inference only: disable autograd so no graph is built (and kept alive)
    # for every paragraph.
    with torch.no_grad():
        for para in sub_splitted:
            to_model = esg_tokenizer.batch_encode_plus(
                [para],
                add_special_tokens=True,
                max_length=512,
                return_token_type_ids=False,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            # Keep the inputs on the same device as the classifier; this is a
            # no-op while the model stays on the CPU.
            to_model = to_model.to(esg_model.device)
            results = esg_model(**to_model)
            # The batch holds exactly one paragraph, so take row 0 of the top-1 index.
            top_index = torch.topk(results, k=1).indices[0, 0].item()
            res.append(esg_model.id2label[top_index])
    return res

In [None]:
## All categories predicted by ESG-BERT.
# Maps each topic label (key) to a "<Pillar> - <Topic>" display string (value).
# The second classification loop in this notebook indexes this dict with the
# labels returned by esg_classify(), so the keys must match those labels exactly.
# NOTE(review): a few values differ from their keys beyond the pillar prefix
# ('Forced Labour' -> 'Forced Labor', 'Hazardous Materials Management' ->
# 'Hazardous Material Management', 'Landscape Transformation' ->
# 'Land Transformation', 'Minimum Age and Child Labour' -> '... Child Labor')
# -- confirm these spellings are intentional.
topics = {
    'Air Pollution': 'Environmental - Air Pollution',
    'Animal Welfare': 'Environmental - Animal Welfare',
    'Biodiversity': 'Environmental - Biodiversity',
    'Climate Risks': 'Environmental - Climate Risks',
    'Communities Health and Safety': 'Social - Communities Health and Safety',
    'Corporate Governance': 'Governance - Corporate Governance',
    'Cultural Heritage': 'Social - Cultural Heritage',
    'Data Safety': 'Social - Data Safety',
    'Disclosure': 'Governance - Disclosure',
    'Discrimination': "Social - Discrimination",
    'Economic Crime': 'Governance - Economic Crime',
    'Emergencies (Environmental)': 'Environmental - Emergencies',
    'Emergencies (Social)': 'Social - Emergencies',
    'Employee Health and Safety': 'Social - Employee Health and Safety',
    'Energy Efficiency and Renewables': 'Environmental - Energy Efficiency and Renewables',
    'Environmental Management': 'Environmental - Environmental Management',
    'Forced Labour': 'Social - Forced Labor',
    'Freedom of Association and Right to Organise': 'Social - Freedom of Association and Right to Organise',
    'Greenhouse Gas Emissions': 'Environmental - Greenhouse Gas Emissions',
    'Hazardous Materials Management': 'Environmental - Hazardous Material Management',
    'Human Rights': 'Social - Human Rights',
    'Indigenous People': 'Social - Indigenous People',
    'Labor Relations Management': 'Social - Labor Relations Management',
    'Land Acquisition and Resettlement (E)': 'Environmental - Land Acquisition and Resettlement',
    'Land Acquisition and Resettlement (S)': 'Social - Land Acquisition and Resettlement',
    'Land Rehabilitation': 'Environmental - Land Rehabilitation',
    'Landscape Transformation': 'Environmental - Land Transformation',
    'Legal Proceedings & Law Violations': 'Governance - Legal Proceedings & Law Violations',
    'Minimum Age and Child Labour': 'Social - Minimum Age and Child Labor',
    'Natural Resources': 'Environmental - Natural Resources',
    'Not Relevant to ESG': 'Not Relevant to ESG - NA',
    'Physical Impacts': 'Environmental - Physical Impacts',
    'Planning Limitations': 'Environmental - Planning Limitations',
    'Product Safety and Quality': 'Social - Product Safety and Quality',
    'Responsible Investment & Greenwashing': 'Governance - Responsible Investment & Greenwashing',
    'Retrenchment': 'Social - Retrenchment',
    'Risk Management and Internal Control': 'Governance - Risk Management and Internal Control',
    'Soil and Groundwater Impact': 'Environmental - Soil and Groundwater Impact',
    'Strategy Implementation': 'Governance - Strategy Implementation',
    'Supply Chain (Economic / Governance)': 'Governance - Supply Chain (Economic / Governance)',
    'Supply Chain (Environmental)': 'Environmental - Supply Chain',
    'Supply Chain (Social)': 'Social - Supply Chain',
    'Surface Water Pollution': 'Environmental - Surface Water Pollution', 
    'Values and Ethics': 'Governance - Values and Ethics',
    'Waste Management': 'Environmental - Waste Management',
    'Wastewater Management': 'Environmental - Wastewater Management',
    'Water Consumption': 'Environmental - Water Consumption'
}

In [None]:
# Prompt prefix for the binary ESG / Non-ESG judgement. The per-paragraph
# "paragraph / context / topic" lines are appended to it by the loop that
# follows. The topic list is rendered as a plain comma-separated list of the
# `topics` keys; interpolating topics.keys() directly would put the Python
# repr "dict_keys([...])" into the prompt.
message_body = f"""You are provided with three inputs:
 - paragraph: The paragraph in focus.
 - context: The surrounding text that includes the paragraph itself (i.e., the actual portions of text around the paragraph, not a summary).
 - topic: The predicted topic for the paragraph.

The possible predicted topics are: {', '.join(topics)}

Task:
Using the provided context (which includes the paragraph itself) and the predicted topic, determine if the paragraph is related to ESG (Environmental, Social, Governance) issues.
 - Output "ESG" if the paragraph is related to ESG.
 - Output "Non-ESG" if the paragraph is not related to ESG.

Important:
 - The predicted topic is a hint; however, it may not always be accurate—especially if the paragraph contains HTML, LaTeX, or other non-standard formats.
 - Your output must be exactly either "ESG" or "Non-ESG" without any additional commentary, text, or extra whitespace.

Example 1:
'''
Input = 
paragraph: "The company has implemented a new system to monitor greenhouse gas emissions more efficiently."
context: "In this feature article, the company’s environmental initiatives are discussed in detail. The company has implemented a new system to monitor greenhouse gas emissions more efficiently."
topic: "Environmental - Greenhouse Gas Emissions"

Output = 
ESG
'''

Example 2:
'''
Input = 
paragraph: "The latest model of the smartphone offers a vibrant display and a sleek design."
context: "The review covers various aspects of the new smartphone. The latest model of the smartphone offers a vibrant display and a sleek design."
topic: "Not Relevant to ESG - NA"

Output = 
Non-ESG
'''

Example 3:
'''
Input = 
paragraph: "A community safety initiative was launched to improve local neighborhood security."
context: "The news report details several local programs. A community safety initiative was launched to improve local neighborhood security."
topic: "Social - Communities Health and Safety"

Output = 
ESG
'''

Now, here is the input that you need to classify:
"""

In [None]:
# Binary ESG / Non-ESG judgement for every paragraph of the first 10 sections.
# `op` collects [paragraph, label] pairs; label is "ESG", "Non-ESG", or None
# when the reply contains neither word.
op = []
pattern = re.compile(r"\bNon-ESG\b|\bESG\b")

# The tokenizer may come without a pad token; fall back to EOS once, up front,
# instead of re-checking for every paragraph.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

for sub in subs[:10]:
    sub_splitted = sub.split("\n\n")
    esg_topics = esg_classify(sub_splitted)
    for i, para in enumerate(sub_splitted):
        msg = message_body + "paragraph: " + para + "\n" + "context: " + sub + "\n" + "topic: " + esg_topics[i]
        messages = [{'role': 'user', 'content': msg}]
        # add_generation_prompt opens the assistant turn so the model answers
        # directly; return_dict yields input_ids AND attention_mask. Inputs go
        # to the model's own device rather than a hard-coded "cuda".
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        # The expected reply is a single word, so a small token budget is
        # enough and bounds the cost if the model fails to stop.
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            pad_token_id=tokenizer.pad_token_id,
        )
        # Decode only the newly generated tokens. Slicing off the prompt and
        # skipping special tokens replaces the header split and the
        # .strip("<|eot_id|>") call, which strips a *set of characters*, not
        # the token text.
        prompt_length = inputs["input_ids"].shape[-1]
        reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        # First label mentioned wins; None instead of an IndexError when the
        # model answered with something else.
        match = pattern.search(reply)
        res = match.group(0) if match else None
        op.append([para, res])

#### Below is another prompt that predicts E, S, G, or None, rather than ESG vs. Non-ESG.

In [None]:
# Prompt prefix for the per-pillar (E / S / G / N) judgement. It is built in two
# parts: an f-string that interpolates the topic list, and a plain string that
# holds the JSON examples (their braces would otherwise need escaping).
# The topic list is rendered as a plain comma-separated list of the `topics`
# values; interpolating topics.values() directly would put the Python repr
# "dict_values([...])" into the prompt.
message_body = f"""You are provided with three inputs:
 - paragraph: The paragraph in focus.
 - context: The surrounding text that includes the paragraph itself (i.e., the actual portions of text around the paragraph, not a summary).
 - topic: The predicted topic for the paragraph.

The possible predicted topics are: {', '.join(topics.values())}
"""
message_body +="""
Task:
Using the provided context (which includes the paragraph itself) and the predicted topics, determine if the paragraph is related to ESG (Environmental, Social, Governance) issues. For this task, evaluate each ESG factor separately:
 - "E" for Environmental
 - "S" for Social
 - "G" for Governance

If the paragraph is related to one or more ESG factors, mark the corresponding keys as true. If the paragraph is not related to any ESG factors, mark "N" (for Non-ESG) as true and the ESG keys as false.

Return your result as a JSON object exactly in the following format:
{"E": <boolean>, "S": <boolean>, "G": <boolean>, "N": <boolean>}

Important:
 - The predicted topics are only hints; they might not always be accurate—especially if the paragraph contains HTML, LaTeX, or other non-standard formats.
 - Your output must be exactly in the JSON format with the specified keys and boolean values, with no additional commentary, text, or extra whitespace.

Examples:

Example 1:
'''
Input = 
paragraph: "The company has implemented a new system to monitor greenhouse gas emissions more efficiently."
context: "In this feature article, the company’s environmental initiatives are discussed in detail. The company has implemented a new system to monitor greenhouse gas emissions more efficiently."
topic: "Environmental - Greenhouse Gas Emissions"

Output = 
{"E": true, "S": false, "G": false, "N": false}
'''

Example 2:
'''
Input = 
paragraph: "The latest model of the smartphone offers a vibrant display and a sleek design."
context: "The review covers various aspects of the new smartphone. The latest model of the smartphone offers a vibrant display and a sleek design."
topic: "Not Relevant to ESG - NA"

Output = 
{"E": false, "S": false, "G": false, "N": true}
'''

Example 3:
'''
Input = 
paragraph: "A community safety initiative was launched to improve local neighborhood security."
context: "The news report details several local programs. A community safety initiative was launched to improve local neighborhood security."
topic: "Social - Communities Health and Safety"

Output = 
{"E": false, "S": true, "G": false, "N": false}
'''

Now, here is the input that you need to classify:
"""

In [None]:
# Per-pillar (E / S / G / N) judgement for every paragraph of the first section.
# `op` collects [paragraph, topic, result] triples; result is the parsed JSON
# object, or None when the reply holds no parseable JSON object.
op = []

# The tokenizer may come without a pad token; fall back to EOS once, up front,
# instead of re-checking for every paragraph.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

for sub in subs[:1]:
    sub_splitted = sub.split("\n\n")
    esg_topics = esg_classify(sub_splitted)
    for i, para in enumerate(sub_splitted):
        # Fall back to the raw label if it has no entry in `topics`, rather
        # than aborting the run with a KeyError.
        topic = topics.get(esg_topics[i], esg_topics[i])
        msg = message_body + "paragraph: " + para + "\n" + "context: " + sub + "\n" + "topic: " + topic
        messages = [{'role': 'user', 'content': msg}]
        # add_generation_prompt opens the assistant turn so the model answers
        # directly; return_dict yields input_ids AND attention_mask. Inputs go
        # to the model's own device rather than a hard-coded "cuda".
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        # The expected reply is one small JSON object, so a modest token budget
        # is enough and bounds the cost if the model fails to stop.
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            pad_token_id=tokenizer.pad_token_id,
        )
        # Decode only the newly generated tokens. Slicing off the prompt and
        # skipping special tokens replaces the header split and the
        # .strip("<|eot_id|>") call, which strips a *set of characters*, not
        # the token text.
        prompt_length = inputs["input_ids"].shape[-1]
        reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        # Non-greedy match: take the first flat {...} object even if the model
        # emits more than one. json.loads tolerates whitespace, so no manual
        # whitespace stripping is needed.
        found = re.search(r"\{.*?\}", reply, re.DOTALL)
        try:
            res = json.loads(found.group(0)) if found else None
        except json.JSONDecodeError:
            res = None
        op.append([para, topic, res])