<a href="https://colab.research.google.com/github/Hyungsoo-Lim-87/CBSA_Analytics_Challenge/blob/main/CBSA_Analytics_Challenge_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# load the FinBERT fine-tuned ESG 9-category model and the corresponding tokernizer
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg-9-categories', num_labels = 9)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg-9-categories')

finbert_sent = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer_sent = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# load model on GPU/CPU
import torch
# load model on GPU if available
if torch.cuda.is_available():       
    device = torch.device("cuda")
    # put the model on GPU
    finbert.to(device)
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Model loaded on:', torch.cuda.get_device_name(0))

# load model on CPU if GPU is not available
else:
    device = torch.device("cpu")
    # put the model on CPU
    finbert.to(device)
    print('No GPU available, model loaded on CPU.')
    
import pandas as pd
import time
from datetime import datetime
import pytz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
There are 1 GPU(s) available.
Model loaded on: Tesla T4


In [None]:
# import sample data
data = pd.read_excel('FINBERT_label_sample.xlsx')

# 9 ESG categories (as returned by the classifier) mapped to short labels
label_map = {'Climate Change': 'CC',
             'Natural Capital': 'NC',
             'Pollution & Waste': 'PW',
             'Human Capital': 'HC',
             'Product Liability': 'PL',
             'Community Relations': 'CR',
             'Corporate Governance': 'CG',
             'Business Ethics & Values': 'BE',
             'Non-ESG': 'N'}

# use pipeline in transformers to assemble the steps of finbert prediction.
# pipeline() takes a device ordinal: 0 is the first GPU, -1 means CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Both pipelines are built on GPU *and* CPU (the sentiment pipeline used to be
# missing on CPU, which made the inference loop fail with a NameError), and
# both truncate to 512 tokens so long texts behave the same on either device.
nlp = pipeline("text-classification", model = finbert, tokenizer = tokenizer,
               device = pipeline_device, max_length = 512, truncation = True)
nlp_sent = pipeline("sentiment-analysis", model = finbert_sent, tokenizer = tokenizer_sent,
                    device = pipeline_device, max_length = 512, truncation = True)

In [None]:
# preview the first rows of the loaded sample data
data.head()

Unnamed: 0,unq_id,translated
0,709443,The weather in Hong Kong is still unstable. It...
1,396564,"The weather in Hong Kong is volatile, and the ..."
2,630076,"The weather in Hong Kong is volatile, and the ..."
3,709212,The weather in Hong Kong is affected by an act...
4,398644,The weather in Hong Kong is affected by the tr...


In [None]:
%%time
start_time = time.time()
prt_part = 10

# define the output structure
header = ['No.', 'Translated', 'FinBERT sent', 'FinBERT sent score', 'FinBERT label', 'FinBERT score']
result = []

# loop through the contents
for i, row in data.iterrows():

  # content number
  id = row['unq_id']

  # translated

  translated = row['translated']

  # use the nlp function to process the translated, get finbert label and score
  finbert_output = nlp(translated)[0]
  finbert_sent = nlp_sent(translated)[0]['label']
  finbert_sent_score = nlp_sent(translated)[0]['score']
  # label and score
  finbert_label = label_map[finbert_output['label']]
  finbert_score = finbert_output['score']
  
  # combine all the output above into one row of observation
  row = [id, translated, finbert_sent, finbert_sent_score, finbert_label, finbert_score]
  result.append(row)
  
  if (i+1)%prt_part == 0:
      iter_time = time.time()
      current_time = datetime.now(pytz.timezone('Asia/Hong_Kong')).strftime("%H:%M:%S")
      prt_time = round((iter_time - start_time) * (data.shape[0] + 1 - i)) / i
      print("Row Number:", (i + 1), ", Remaining Time :", prt_time//3600, "hour", prt_time%3600//60, "min.", round(prt_time%3600%60), "sec.,  Currnet Time :", current_time)

# consolidate the output into a dataframe
result_df_label = pd.DataFrame(result, columns = header)
#result_df_label = pd.DataFrame(result)



Row Number: 1000 , Remaining Time : 0.0 hour 35.0 min. 26 sec.,  Currnet Time : 16:33:06
Row Number: 2000 , Remaining Time : 0.0 hour 33.0 min. 48 sec.,  Currnet Time : 16:34:59
Row Number: 3000 , Remaining Time : 0.0 hour 31.0 min. 55 sec.,  Currnet Time : 16:36:52
Row Number: 4000 , Remaining Time : 0.0 hour 29.0 min. 48 sec.,  Currnet Time : 16:38:41
Row Number: 5000 , Remaining Time : 0.0 hour 27.0 min. 39 sec.,  Currnet Time : 16:40:27
Row Number: 6000 , Remaining Time : 0.0 hour 26.0 min. 16 sec.,  Currnet Time : 16:42:29
Row Number: 7000 , Remaining Time : 0.0 hour 24.0 min. 19 sec.,  Currnet Time : 16:44:19
Row Number: 8000 , Remaining Time : 0.0 hour 22.0 min. 10 sec.,  Currnet Time : 16:46:00
Row Number: 9000 , Remaining Time : 0.0 hour 19.0 min. 59 sec.,  Currnet Time : 16:47:34
Row Number: 10000 , Remaining Time : 0.0 hour 17.0 min. 59 sec.,  Currnet Time : 16:49:12
Row Number: 11000 , Remaining Time : 0.0 hour 16.0 min. 7 sec.,  Currnet Time : 16:50:55
Row Number: 12000 , 

In [None]:
# display the consolidated sentiment / ESG-label results
result_df_label

Unnamed: 0,No.,Translated,FinBERT sent,FinBERT sent score,FinBERT label,FinBERT score
0,709443,The weather in Hong Kong is still unstable. It...,Positive,0.665770,CC,0.539686
1,396564,"The weather in Hong Kong is volatile, and the ...",Neutral,0.999932,CR,0.699923
2,630076,"The weather in Hong Kong is volatile, and the ...",Neutral,0.999946,CR,0.828538
3,709212,The weather in Hong Kong is affected by an act...,Neutral,0.982191,NC,0.562376
4,398644,The weather in Hong Kong is affected by the tr...,Neutral,0.999791,NC,0.597167
...,...,...,...,...,...,...
19995,722051,A heat wave hits Hong Kong｜The Observatory rec...,Neutral,0.998701,CC,0.779109
19996,721938,Heat wave hits Hong Kong｜The Observatory recor...,Neutral,0.999953,CC,0.514309
19997,721943,Heat wave hits Hong Kong｜The Observatory recor...,Neutral,0.999931,CC,0.631999
19998,721975,Heat wave hits Hong Kong｜The Observatory recor...,Neutral,0.999834,CC,0.752070


In [None]:
# save the output in Excel (.xlsx) format
output_path = 'FinBERT_result.xlsx'
result_df_label.to_excel(output_path, index = False)

# download the output to the local computer; google.colab only exists inside
# Colab, so elsewhere just report where the file was written instead of failing
try:
    from google.colab import files
except ImportError:
    print('Not running in Google Colab; the results were saved to', output_path)
else:
    files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>