In [1]:
import glob
import pandas as pd
import os
# tested in transformers==4.18.0 
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import torch.nn.functional as F
import concurrent.futures
import multiprocessing as mp
import threading

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print the number of CPUs reported by the system.
print(mp.cpu_count())

20


In [3]:
# Folder holding the filtered sentence CSVs. Set the FINBERT_ESG_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original machine-specific path is used.
folder_path = os.environ.get(
    'FINBERT_ESG_DATA_DIR',
    'C:/Users/lenovo/Desktop/UCL/Final dissertation/code/filtered data',
)
# Collect every .csv directly inside the folder. glob returns matches in
# arbitrary order, so sort them to get the same processing order on every run.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

In [4]:
def split_long_sentence(sentence, tokenizer, max_length=512):
    """Split a sentence into pieces that each fit within ``max_length`` once encoded.

    The sentence is tokenized once. If it already fits, it is returned
    unchanged as a one-element list; otherwise the token sequence is cut into
    consecutive slices and each slice is converted back to text.

    ``max_length`` is treated as the total encoded length, so room is reserved
    for the special tokens the tokenizer adds when encoding a single sequence.
    Without that reservation a full-size chunk would be truncated again by the
    encoder and its last tokens silently lost.

    Args:
        sentence: Text to split.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``. If it also provides
            ``num_special_tokens_to_add``, that many positions are reserved
            out of ``max_length``; otherwise nothing is reserved.
        max_length: Maximum encoded length (content plus special tokens) of
            each returned piece.

    Returns:
        list[str]: ``[sentence]`` when it fits, otherwise the chunk strings in
        their original order.

    Raises:
        ValueError: If ``max_length`` leaves no room for any content token.
    """
    # Ask the tokenizer how many special tokens it adds to a single sequence;
    # duck-typed tokenizers without this method keep the full budget.
    count_special = getattr(tokenizer, 'num_special_tokens_to_add', None)
    reserved = count_special(pair=False) if callable(count_special) else 0

    budget = max_length - reserved
    if budget <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for content tokens "
            f"({reserved} positions are reserved for special tokens)"
        )

    tokens = tokenizer.tokenize(sentence)
    if len(tokens) <= budget:
        return [sentence]

    # NOTE(review): a slice boundary can fall inside a word split into
    # sub-word tokens, so re-tokenizing a chunk may not reproduce the exact
    # same token count - callers should still encode with truncation enabled.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + budget])
        for start in range(0, len(tokens), budget)
    ]

In [5]:
# Load the pre-trained sequence classifier and its tokenizer from the
# 'yiyanghkust/finbert-esg' checkpoint; the classifier is configured for 4 labels.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
# Class names that are paired positionally with the model's output scores.
# NOTE(review): assumes this order matches the checkpoint's id2label mapping - confirm.
labels = ['None', 'Environmental', 'Social', 'Governance']

In [6]:
def loadFinbertESG(df: pd.DataFrame) -> pd.DataFrame:
    """Score every row of ``df['Sentences']`` with the FinBERT-ESG classifier.

    Each sentence is split into chunks that fit the model's 512-token window,
    all chunks are classified in one batch, and the softmax probabilities are
    averaged over the chunks so that the whole sentence contributes to the
    score (for a sentence that fits in one chunk this is simply that chunk's
    probabilities). The results are written to the columns ``Env``, ``Soc``,
    ``Gov`` and ``None``.

    Rows whose sentence is not a string (for example a missing value read from
    CSV) receive NaN in all four columns instead of aborting the whole frame.

    Args:
        df: Frame with a ``Sentences`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the four score columns added or replaced.

    Raises:
        KeyError: If ``df`` is non-empty and has no ``Sentences`` column.
    """
    # Nothing to score; leave the frame exactly as it was passed in.
    if df.empty:
        return df

    # One list of scores per class, filled in row order and assigned as whole
    # columns at the end (positional, so a non-unique index cannot misplace values).
    scores = {label: [] for label in labels}
    missing = float('nan')

    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        for sen in df['Sentences']:
            if not isinstance(sen, str):
                for label in labels:
                    scores[label].append(missing)
                continue

            chunks = split_long_sentence(sen, tokenizer, 512)
            # Pad only up to the longest chunk of this sentence; padded
            # positions are masked out, so padding everything to 512 tokens
            # would only add wasted computation.
            inputs = tokenizer(chunks, return_tensors='pt', truncation=True,
                               max_length=512, padding=True)
            outputs = finbert(**inputs)
            # Per-chunk class probabilities, averaged over all chunks.
            probs = F.softmax(outputs.logits, dim=-1).mean(dim=0)
            for label, prob in zip(labels, probs):
                scores[label].append(prob.item())

    df['Env'] = scores['Environmental']
    df['Soc'] = scores['Social']
    df['Gov'] = scores['Governance']
    df['None'] = scores['None']
    return df


In [7]:
def process_file(file):
    """Score one CSV file with FinBERT-ESG and write it back to ``folder_path``.

    The file is read, its ``'0'`` column is renamed to ``'Sentences'``, and,
    provided it has more than 10 rows, the ESG scores are added and the frame
    is saved under the file's own base name inside ``folder_path``. Files with
    10 rows or fewer are left untouched.

    Args:
        file: Path of the CSV file to process.
    """
    frame = pd.read_csv(file).rename(columns={'0': 'Sentences'})

    # Small files are skipped entirely: nothing is scored and nothing is written.
    if len(frame) <= 10:
        return

    # Adds the score columns to `frame` in place.
    loadFinbertESG(frame)

    target_path = os.path.join(folder_path, os.path.basename(file))
    frame.to_csv(target_path, index=False)

In [8]:
# Process the files on a pool of 4 worker threads (the original note reports
# roughly a 30% speed-up over running them one after another).
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Remember which file each future belongs to so failures can be traced.
    file_by_future = {executor.submit(process_file, file): file for file in csv_files}
    futures = list(file_by_future)
    for future in concurrent.futures.as_completed(futures):
        try:
            # Re-raises any exception that occurred inside the worker.
            future.result()
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"An error occurred while processing {file_by_future[future]}: {e}")