## Google Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd 'drive/My Drive/master-thesis/'

#!pip3 install finetune
!pip install --upgrade --user pandas==1.3 # needed to use pickle
!pip install scikit-multilearn

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/master-thesis


In [None]:
# Finetune installation
!git clone -b master https://github.com/IndicoDataSolutions/finetune && cd finetune
!python3 setup.py develop              
!pip3 install tensorflow-gpu --upgrade 
!python3 -m spacy download en          

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
from tqdm.auto import tqdm  
import torch
import warnings
import tensorflow

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, make_scorer
from skmultilearn.model_selection import IterativeStratification
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
np.random.seed(0)

In [3]:
pd.set_option("display.max_rows", 5)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

## 1. Load data

In [4]:
# Load dataset
df = pd.read_pickle('reports_tokenized.p')
df

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17,CSR_Filename,CSR_Text,CSR_Text_clean,CSR_Text_tokenized
0,0,888.L,888 Holdings PLC,GI000A0F6407,888,United Kingdom,2020,FY0,https://corporate.888.com/wp-content/uploads/2021/04/2020-Annual-Report.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False,0_888.L_2020.pdf,888 HOLDINGS PLC\n\nANNUAL REPORT & ACCOUNTS 2020\n\nA YEAR OF \nSTRONG GROWTH \n\n888 IS ONE ...,HOLDINGS PLC ANNUAL REPORT & ACCOUNTS 2020 A YEAR OF STRONG GROWTH 888 IS ONE OF THE WORLDS LEA...,holding plc annual report account year strong growth one world leading online betting gaming com...
1,1,A.N,Agilent Technologies Inc,US00846U1016,A,United States of America,2020,FY0,https://www.agilent.com/about/companyinfo/sustainability/Agilent-Report-CSR-2020.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False,1_A.N_2020.pdf,Delivering on \nour Promises\n\n2020 Corporate Social Responsibility Report\n\n1\n\nLetter fro...,Delivering on our Promises 2020 Corporate Social Responsibility Report 1 Letter from the Presid...,delivering promise corporate social responsibility report letter president stakeholder engagemen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8154,12676,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2016,FY-4,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2016.pdf,False,False,True,True,True,True,True,True,False,,False,True,False,,False,True,False,12676_ZBH.N_2016.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2016 T A B L E ...,e n v r n e n l c l g v e r n n c e sustainability report b l e f c n e n company profile corpor...
8155,12677,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2015,FY-5,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2015.pdf,False,False,True,True,False,True,True,False,False,,False,True,False,,False,False,False,12677_ZBH.N_2015.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2015 Investing ...,e n v r n e n l c l g v e r n n c e sustainability report investing future world b l e f c n e n...


In [5]:
# Store labels
labels = ['SDG_1', 'SDG_2', 'SDG_3', 'SDG_4', 'SDG_5', 'SDG_6', 'SDG_7', 'SDG_8', 'SDG_9', 'SDG_11', 'SDG_12', 'SDG_13', 'SDG_15', 'SDG_16', 'SDG_17']
Y = df[labels].to_numpy().astype(int)

In [6]:
# Split into training and test data (stratified for multi-label, adapted to work with df column)

def iterative_train_test_split(X, y, test_size):
    stratifier = IterativeStratification(n_splits=2, order=2, sample_distribution_per_fold=[test_size, 1.0-test_size])
    train_indexes, test_indexes = next(stratifier.split(X, y))

    X_train, y_train = X.iloc[train_indexes], y[train_indexes, :]
    X_test, y_test = X.iloc[test_indexes], y[test_indexes, :]

    return X_train, y_train, X_test, y_test

X_train, Y_train, X_test, Y_test = iterative_train_test_split(df['CSR_Text_clean'], Y, test_size=0.2)

In [7]:
# Transform labels to list of lists (needed for finetune multilabel classifier)

# Y_train
df_temp = pd.DataFrame(data=Y_train,   
             index=range(np.shape(Y_train)[0]),    
             columns=labels) 
df_temp = df_temp.astype(bool)
Y_train_new = []
for row in df_temp.apply(lambda x: ','.join(x.index[x]), axis=1):
  Y_train_new.append(row.split(','))

# Y_test
df_temp = pd.DataFrame(data=Y_test,   
             index=range(np.shape(Y_test)[0]),    
             columns=labels) 
df_temp = df_temp.astype(bool)
Y_test_new = []
for row in df_temp.apply(lambda x: ','.join(x.index[x]), axis=1):
  Y_test_new.append(row.split(','))

## 2. Train model with finetune API

In [8]:
from finetune import MultiLabelClassifier
from finetune.base_models import BERT, BERTLarge, GPT2, GPT2Medium, GPT2Large, TextCNN, TCN, RoBERTa, DistilBERT
model = MultiLabelClassifier(base_model=RoBERTa, low_memory_mode=True, chunk_long_sequences=False, class_weights='linear', n_epochs=3)

INFO:finetune:Saving tensorboard output to /tmp/Finetune7mmsjklw


In [9]:
model.fit(X_train, Y_train_new) 

INFO:finetune: Visible GPUs: {GPU:/physical_device:GPU:0}
Epoch 3/3: 100%|██████████| 6254/6254 [38:26<00:00,  2.71it/s]


In [10]:
predictions = model.predict(X_test)

Inference: 1887it [17:35,  1.79it/s]


In [11]:
print(classification_report(model.input_pipeline.label_encoder.transform(Y_test_new), model.input_pipeline.label_encoder.transform(predictions)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.65      0.58      0.61       206
           2       0.62      0.57      0.59       199
           3       0.91      0.74      0.82      1552
           4       0.77      0.59      0.67      1146
           5       0.68      0.72      0.70       563
           6       0.79      0.79      0.79      1248
           7       0.59      0.56      0.58       199
           8       0.53      0.59      0.56        27
           9       0.87      0.74      0.80      1500
          10       0.85      0.65      0.74      1441
          11       0.80      0.82      0.81      1307
          12       0.88      0.70      0.78      1421
          13       0.83      0.66      0.73      1318
          14       0.78      0.80      0.79      1220
          15       0.27      0.62      0.38        40

   micro avg       0.81      0.71      0.76     13387
   macro avg       0.68   