In [1]:
import glob
import pandas as pd
import os
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import concurrent.futures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory that holds the filtered input CSV files
folder_path = 'C:/Users/lenovo/Desktop/UCL/Final dissertation/code/filtered data'
# Build the glob pattern once, then collect every matching .csv path
_csv_pattern = os.path.join(folder_path, '*.csv')
csv_files = glob.glob(_csv_pattern)

In [3]:
def split_long_sentence(sentence, tokenizer, max_length=512):
    """Split ``sentence`` into pieces that fit within ``max_length`` tokens.

    The sentence is tokenized with ``tokenizer.tokenize``; if it already fits,
    it is returned unchanged as a one-element list. Otherwise the token list is
    cut into consecutive slices and each slice is turned back into text with
    ``tokenizer.convert_tokens_to_string``.

    Args:
        sentence: Text to split.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``. If it also provides
            ``num_special_tokens_to_add`` (as Hugging Face tokenizers do), that
            many tokens are reserved in every chunk.
        max_length: Maximum encoded length, special tokens included.

    Returns:
        A list with at least one string.
    """
    tokens = tokenizer.tokenize(sentence)

    # Encoding adds special tokens (e.g. [CLS]/[SEP] for BERT) on top of the
    # word pieces. Without reserving room for them, a "full" chunk would exceed
    # max_length once encoded and its last tokens would be truncated away.
    count_special = getattr(tokenizer, 'num_special_tokens_to_add', None)
    reserved = count_special(pair=False) if callable(count_special) else 0
    budget = max(1, max_length - reserved)  # never let the step drop to zero

    if len(tokens) <= budget:
        return [sentence]

    # NOTE: callers re-tokenize the decoded text, and that round trip is not
    # guaranteed to reproduce exactly these tokens (e.g. a chunk starting in
    # the middle of a word), so callers should keep truncation enabled.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + budget])
        for start in range(0, len(tokens), budget)
    ]

In [4]:
def loadFinbertESG(df, tokenizer, model, labels, device, batch_size=8):
    """Classify every row of ``df['Sentences']`` and store class probabilities.

    Each sentence is split with ``split_long_sentence``; all resulting chunks
    are run through ``model`` in batches, softmax is applied to the logits, and
    the probabilities of the chunks belonging to one row are averaged. The
    averages are written to the columns ``Env``, ``Soc``, ``Gov`` and ``None``.

    Args:
        df: DataFrame with a ``Sentences`` column. It is modified in place.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        model: Callable returning an object with a ``logits`` attribute.
        labels: Class names in the order of the model's output columns; must
            contain 'Environmental', 'Social', 'Governance' and 'None'.
            (Presumably this has to match the model's own label order --
            verify against the model configuration.)
        device: Device the input tensors are moved to before the forward pass.
        batch_size: Number of chunks per forward pass.

    Returns:
        The same ``df`` object with the four score columns added.

    Raises:
        KeyError: If ``labels`` lacks one of the four expected class names.
    """
    # (output column, class name) pairs, in the order the columns are created.
    score_columns = (
        ('Env', 'Environmental'),
        ('Soc', 'Social'),
        ('Gov', 'Governance'),
        ('None', 'None'),
    )

    # Split each sentence exactly once and remember how many chunks it
    # produced, so the sentences do not have to be tokenized a second time
    # when the per-chunk results are mapped back onto rows.
    all_sen = []
    chunk_counts = []
    for sen in df['Sentences']:
        chunks = split_long_sentence(sen, tokenizer, 512)
        all_sen.extend(chunks)
        chunk_counts.append(len(chunks))

    if not all_sen:
        # Nothing to classify; torch.cat would raise on an empty list.
        for column, _ in score_columns:
            df[column] = float('nan')
        return df

    results_list = []
    for start in range(0, len(all_sen), batch_size):
        batch = all_sen[start:start + batch_size]
        # padding=True pads only to the longest item of the batch instead of
        # always padding to 512 tokens; padded positions are masked out.
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, max_length=512, padding=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            probs = F.softmax(outputs.logits, dim=-1)
        results_list.append(probs.cpu())  # Move results back to CPU to save GPU memory

    all_probs = torch.cat(results_list, dim=0)

    # Average the chunk probabilities of each row; rows are consumed in the
    # same order in which their chunks were appended above.
    row_means = []
    offset = 0
    for count in chunk_counts:
        row_means.append(torch.mean(all_probs[offset:offset + count], dim=0))
        offset += count
    per_row = torch.stack(row_means)  # one row of class probabilities per df row

    # Assign whole columns positionally instead of writing cell by cell with
    # df.loc, which is slow and label-based (wrong for non-unique indexes).
    by_label = {
        label: column.tolist()
        for label, column in zip(labels, per_row.unbind(dim=1))
    }
    for column, label in score_columns:
        df[column] = by_label[label]
    return df

In [5]:
def process_file(file, tokenizer, model, labels, device):
    """Score one CSV file and write the result into ``folder_path``.

    The CSV is read, its ``'0'`` column is renamed to ``'Sentences'``, and
    files with 10 rows or fewer are skipped without writing anything. Larger
    files are passed to ``loadFinbertESG`` and saved, without the index, under
    their original base name inside ``folder_path``.

    Args:
        file: Path of the CSV file to process.
        tokenizer, model, labels, device: Forwarded unchanged to
            ``loadFinbertESG``.
    """
    sentences_df = pd.read_csv(file).rename(columns={'0': 'Sentences'})

    # Guard clause: small files are left untouched.
    if len(sentences_df) <= 10:
        return

    scored_df = loadFinbertESG(sentences_df, tokenizer, model, labels, device)
    destination = os.path.join(folder_path, os.path.basename(file))
    scored_df.to_csv(destination, index=False)

In [6]:
def main():
    """Load the FinBERT-ESG model and score every file in ``csv_files``.

    The model and tokenizer are loaded once and shared by a pool of four
    worker threads, each of which runs ``process_file`` on one CSV file.
    A failure in one file is reported together with the file name and does
    not stop the remaining files from being processed.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
    finbert.to(device)
    # Make inference mode explicit (dropout off) rather than relying on the
    # state the loader happens to return the model in.
    finbert.eval()
    tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
    # Presumably the order of the model's output classes -- verify against the
    # model configuration if the checkpoint changes.
    labels = ['None', 'Environmental', 'Social', 'Governance']

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Remember which file each future belongs to so that failures can be
        # attributed to a concrete input file.
        future_to_file = {
            executor.submit(process_file, file, tokenizer, finbert, labels, device): file
            for file in csv_files
        }
        for future in concurrent.futures.as_completed(future_to_file):
            file = future_to_file[future]
            try:
                future.result()
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"An error occurred while processing {file}: {e}")


if __name__ == "__main__":
    main()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
