In [1]:
import os

# Move from the notebook folder to the project root so that the relative paths used below
# ("data/...", "models/...") and the `src` package resolve.
# Guarded so that re-running this cell (or starting the kernel in the project root) does not
# climb one directory too far.
if not os.path.isdir("src"):
    os.chdir("..")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import joblib
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

# Evaluation
from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix,
    make_scorer
)

# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Other
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from skmultilearn.model_selection import IterativeStratification

# Classifiers
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from src.models.evaluation import evaluation
from src.data.iterative_train_test_split import iterative_train_test_split

In [3]:
# Notebook-wide pandas display settings: very few rows, every column, cell text cut at 100 chars
for option_name, option_value in {
    "display.max_rows": 5,
    "display.max_columns": None,
    "display.max_colwidth": 100,
}.items():
    pd.set_option(option_name, option_value)

## 1. Load and transform data

In [4]:
# Load data: the tokenized CSR reports, including the SDG_* label columns and the
# preprocessed text in CSR_Text_tokenized that is used as model input below.
# NOTE(review): read_pickle unpickles arbitrary Python objects -- only load files produced by this project.
df = pd.read_pickle("data/processed/reports_tokenized.p")
df

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17,CSR_Filename,CSR_Text,CSR_Text_clean,CSR_Text_tokenized
0,0,888.L,888 Holdings PLC,GI000A0F6407,888,United Kingdom,2020,FY0,https://corporate.888.com/wp-content/uploads/2021/04/2020-Annual-Report.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False,0_888.L_2020.pdf,888 HOLDINGS PLC\n\nANNUAL REPORT & ACCOUNTS 2020\n\nA YEAR OF \nSTRONG GROWTH \n\n888 IS ONE ...,888 ORG ANNUAL REPORT & ACCOUNTS 2020 A YEAR OF STRONG GROWTH ORG IS ONE OF THE WORLDS LEADING O...,org annual report account year strong growth org one world leading online betting gaming company...
1,1,A.N,Agilent Technologies Inc,US00846U1016,A,United States of America,2020,FY0,https://www.agilent.com/about/companyinfo/sustainability/Agilent-Report-CSR-2020.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False,1_A.N_2020.pdf,Delivering on \nour Promises\n\n2020 Corporate Social Responsibility Report\n\n1\n\nLetter fro...,Delivering on our Promises 2020 Corporate ORG 1 Letter from the President Stakeholder engagement...,delivering promise corporate org letter president stakeholder engagement environment table conte...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8101,8101,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2016,FY-4,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2016.pdf,False,False,True,True,True,True,True,True,False,,False,True,False,,False,True,False,12676_ZBH.N_2016.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2016 T A B L E O...,sustainability report company profile corporate overview purpose mission value sustainability co...
8102,8102,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2015,FY-5,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2015.pdf,False,False,True,True,False,True,True,False,False,,False,True,False,,False,False,False,12677_ZBH.N_2015.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2015 Investing n...,sustainability report investing future world letter ceo stakeholder company profile corporate ov...


In [5]:
# Build the label matrix from the 13 SDG columns that are kept:
# SDGs 2 and 9 are excluded (too few aligned examples), as are SDGs 10 and 14 (no mapping).
labels = [
    "SDG_1", "SDG_3", "SDG_4", "SDG_5", "SDG_6", "SDG_7", "SDG_8",
    "SDG_11", "SDG_12", "SDG_13", "SDG_15", "SDG_16", "SDG_17",
]
# One integer column per entry of `labels`, in the same order
label_values = df[labels].to_numpy()
Y = label_values.astype(int)

In [6]:
# Multi-label stratified split into 70% training, 15% validation and 15% test data
# (iterative_train_test_split is the project's version, adapted to work with a df column).
np.random.seed(7)  # presumably consumed by the splitter; keeps the partition repeatable -- TODO confirm

# Step 1: set 30% of the reports aside as a holdout
text_column = df['CSR_Text_tokenized']
X_train, Y_train, X_holdout, Y_holdout = iterative_train_test_split(text_column, Y, test_size=0.3)

# Step 2: cut the holdout in half -> validation and test
X_val, Y_val, X_test, Y_test = iterative_train_test_split(X_holdout, Y_holdout, test_size=0.5)

## 2. Predict on test set

In [7]:
# Load model: the fitted estimator stored under models/tfidf_svm.p.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code -- only load trusted artifacts.
# NOTE(review): later cells read grid_svm.named_steps["tfidf"], so despite the grid_ prefix this
# is presumably a fitted Pipeline rather than a GridSearchCV object -- confirm.
grid_svm = joblib.load('models/tfidf_svm.p')

In [8]:
# Predict labels for the held-out test texts. Later cells index the result as
# predictions[i][k]: i is the position within X_test, k the label column
# (assumed to follow the order of `labels`, like Y_test -- confirm against training).
predictions = grid_svm.predict(X_test)

## 3. Evaluate

In [57]:
# Get correct predictions: positions (within the test split) of the reports for which
# every label was predicted exactly as annotated.
idx_correct = []
# enumerate supplies the positional counter i (row in predictions / Y_test / Xtr),
# while idx is the original df index label used to look up the company metadata.
for i, idx in enumerate(X_test.index):
    if (predictions[i] == Y_test[i]).all():
        # Scalar .loc[row, column] lookups avoid building the whole row (incl. the report text)
        company = df.loc[idx, 'Company_Name']
        year = df.loc[idx, 'Financial_Period_Absolute']
        print(f"{i} - {company} - {year}")
        idx_correct.append(i)

0 - Alcoa Corp - 2020
2 - Amerisourcebergen Corp - 2020
3 - Abacus Property Group - 2020
6 - Adobe Inc - 2020
7 - Autodesk Inc - 2021
8 - Agnico Eagle Mines Ltd - 2020
17 - Amgen Inc - 2020
28 - AusNet Services Ltd - 2021
29 - ATS Automation Tooling Systems Inc - 2021
30 - Atento SA - 2019
31 - Aviva PLC - 2020
35 - Autozone Inc - 2020
37 - Barratt Developments P L C - 2020
41 - Biffa PLC - 2021
49 - Brewin Dolphin Holdings PLC - 2020
50 - Burlington Stores Inc - 2021
60 - Centamin PLC - 2020
64 - Church & Dwight Co Inc - 2020
67 - Cigna Corp - 2020
68 - Circassia Group PLC - 2020
71 - CenterPoint Energy Inc - 2019
73 - CommScope Holding Company Inc - 2020
76 - CRH PLC - 2020
80 - ConvaTec Group PLC - 2020
86 - Diversified Energy Company PLC - 2020
90 - Dunelm Group PLC - 2020
99 - Energy Fuels Inc - 2020
103 - Enable Midstream Partners LP - 2019
109 - Eaton Corporation PLC - 2020
113 - Expedia Group Inc - 2020
116 - First Hawaiian Inc - 2020
117 - Flowserve Corp - 2020
119 - First Qua

653 - Kinross Gold Corp - 2017
654 - Link Administration Holdings Ltd - 2017
655 - London Stock Exchange Group PLC - 2017
657 - Marks and Spencer Group PLC - 2018
659 - Mothercare PLC - 2018
662 - ON Semiconductor Corp - 2017
663 - Principal Financial Group Inc - 2017
664 - Pentair PLC - 2017
665 - Qantas Airways Ltd - 2017
667 - Redde Northgate PLC - 2018
668 - Relx PLC - 2017
669 - Rolls-Royce Holdings PLC - 2017
670 - South32 Ltd - 2018
673 - Shaftesbury PLC - 2017
674 - Senior PLC - 2017
675 - Savills PLC - 2017
676 - Synthomer PLC - 2017
677 - Tate & Lyle PLC - 2018
678 - Transcontinental Inc - 2017
679 - Toromont Industries Ltd - 2017
681 - Vipshop Holdings Ltd - 2017
685 - Ampol Ltd - 2016
686 - ALS Ltd - 2017
691 - Capital & Counties Properties PLC - 2016
693 - Canadian Natural Resources Ltd - 2016
695 - Delta Air Lines Inc - 2015
702 - First Solar Inc - 2016
705 - Gentex Corp - 2016
707 - Hays PLC - 2016
709 - HSBC Holdings PLC - 2016
711 - Kinross Gold Corp - 2016
712 - KLA C

In [59]:
# Get incorrect predictions (false positives) for SDG 1 (no poverty):
# the model predicts SDG 1 although the report is not labelled with it.
sdg_1_col = labels.index("SDG_1")  # label column of SDG 1, derived from `labels` instead of a magic number
idx_incorrect_fp_1 = []
# i = position within the test split, idx = original df index label
for i, idx in enumerate(X_test.index):
    if predictions[i][sdg_1_col] == 1 and Y_test[i][sdg_1_col] == 0:
        company = df.loc[idx, 'Company_Name']
        year = df.loc[idx, 'Financial_Period_Absolute']
        print(f"{i} - {company} - {year}")
        idx_incorrect_fp_1.append(i)

42 - Biogen Inc - 2020
231 - Portland General Electric Co - 2020
320 - Zions Bancorporation NA - 2020
444 - National Bank of Canada - 2019
499 - United Community Banks Inc - 2019
521 - Bank of Georgia Group PLC - 2018
533 - CNO Financial Group Inc - 2018
924 - Annaly Capital Management Inc - 2019
1095 - China Mobile Ltd - 2015
1096 - China Mobile Ltd - 2011
1185 - Duke Energy Corp - 2019
1186 - Duke Energy Corp - 2018
1229 - Laurentian Bank of Canada - 2014
1282 - Sun Life Financial Inc - 2017
1305 - Webster Financial Corp - 2018


In [60]:
# Get incorrect predictions (false negatives) for SDG 1 (no poverty):
# the report is labelled with SDG 1 but the model does not predict it.
sdg_1_col = labels.index("SDG_1")  # label column of SDG 1, derived from `labels` instead of a magic number
idx_incorrect_fn_1 = []
# i = position within the test split, idx = original df index label
for i, idx in enumerate(X_test.index):
    if predictions[i][sdg_1_col] == 0 and Y_test[i][sdg_1_col] == 1:
        company = df.loc[idx, 'Company_Name']
        year = df.loc[idx, 'Financial_Period_Absolute']
        print(f"{i} - {company} - {year}")
        idx_incorrect_fn_1.append(i)

32 - Avalonbay Communities Inc - 2020
70 - Centrica PLC - 2020
72 - Americold Realty Trust - 2020
84 - California Water Service Group - 2020
124 - General Electric Co - 2020
126 - Galliford Try Holdings PLC - 2020
135 - Howard Bancorp Inc - 2020
188 - Mears Group PLC - 2019
240 - Rogers Communications Inc - 2020
245 - RMR Group Inc - 2020
249 - Sunrun Inc - 2020
275 - Synchrony Financial - 2020
293 - UGI Corp - 2020
294 - Unilever PLC - 2020
347 - BCE Inc - 2019
350 - Banco Espirito Santo SA em Liquidacao - 2013
362 - Cogeco Inc - 2019
374 - Canadian Western Bank - 2019
405 - Howard Bancorp Inc - 2019
422 - Jefferies Financial Group Inc - 2019
616 - AGL Energy Ltd - 2017
621 - Barratt Developments P L C - 2017
656 - Medtronic PLC - 2017
688 - Barratt Developments P L C - 2016
741 - Barratt Developments P L C - 2015
745 - Comerica Inc - 2015
783 - Insurance Australia Group Ltd - 2014
826 - X5 Retail Group NV - 2012
840 - Stagecoach Group PLC - 2011
848 - X5 Retail Group NV - 2010
849 - 

In [61]:
# Get incorrect predictions (false positives) for SDG 5 (gender equality):
# the model predicts SDG 5 although the report is not labelled with it.
sdg_5_col = labels.index("SDG_5")  # label column of SDG 5, derived from `labels` instead of a magic number
idx_incorrect_fp_5 = []
# i = position within the test split, idx = original df index label
for i, idx in enumerate(X_test.index):
    if predictions[i][sdg_5_col] == 1 and Y_test[i][sdg_5_col] == 0:
        company = df.loc[idx, 'Company_Name']
        year = df.loc[idx, 'Financial_Period_Absolute']
        print(f"{i} - {company} - {year}")
        idx_incorrect_fp_5.append(i)

4 - American Campus Communities Inc - 2020
24 - Ares Management Corp - 2020
45 - Bushveld Minerals Ltd - 2019
48 - Bathurst Resources Ltd - 2020
51 - N Brown Group PLC - 2021
56 - Canadian Apartment Properties Real Estate Investment Trust - 2020
58 - Carnival PLC - 2020
61 - Citizens Financial Group Inc - 2020
69 - Continental Resources Inc - 2020
79 - Centerspace - 2020
92 - Duke Realty Corp - 2020
100 - Eleco PLC - 2020
106 - Ensign Energy Services Inc - 2020
135 - Howard Bancorp Inc - 2020
162 - J B Hunt Transport Services Inc - 2020
225 - Primary Health Properties PLC - 2020
227 - Planet Fitness Inc - 2020
231 - Portland General Electric Co - 2020
233 - Perpetual Ltd - 2020
245 - RMR Group Inc - 2020
252 - SBA Communications Corp - 2019
257 - Shanta Gold Ltd - 2020
274 - Southwest Gas Holdings Inc - 2019
286 - Targa Resources Corp - 2019
320 - Zions Bancorporation NA - 2020
321 - Zentalis Pharmaceuticals Inc - 2020
336 - Apollo Global Management Inc - 2019
361 - Canfor Corp - 2019


In [62]:
# Get incorrect predictions (false negatives) for SDG 5 (gender equality):
# the report is labelled with SDG 5 but the model does not predict it.
sdg_5_col = labels.index("SDG_5")  # label column of SDG 5, derived from `labels` instead of a magic number
idx_incorrect_fn_5 = []
# i = position within the test split, idx = original df index label
for i, idx in enumerate(X_test.index):
    if predictions[i][sdg_5_col] == 0 and Y_test[i][sdg_5_col] == 1:
        company = df.loc[idx, 'Company_Name']
        year = df.loc[idx, 'Financial_Period_Absolute']
        print(f"{i} - {company} - {year}")
        idx_incorrect_fn_5.append(i)

1 - American Assets Trust Inc - 2020
43 - Black Hills Corp - 2020
130 - Canada Goose Holdings Inc - 2021
166 - KB Home - 2020
172 - Kennedy-Wilson Holdings Inc - 2020
173 - Lithia Motors Inc - 2020
189 - MGE Energy Inc - 2019
191 - Mount Gibson Iron Ltd - 2020
226 - PHX Energy Services Corp - 2020
235 - Public Storage - 2020
238 - Quebecor Inc - 2020
241 - Renewable Energy Group Inc - 2020
271 - Steppe Gold Ltd - 2020
287 - Renewables Infrastructure Group Ltd - 2020
288 - Triton International Ltd - 2020
314 - Whitestone REIT - 2020
334 - Austin Engineering Ltd - 2019
343 - Acuity Brands Inc - 2019
350 - Banco Espirito Santo SA em Liquidacao - 2013
351 - Booking Holdings Inc - 2019
367 - Canadian National Railway Co - 2019
378 - Domino's Pizza Enterprises Ltd - 2019
382 - DWF Group PLC - 2020
399 - Golar LNG Ltd - 2019
419 - Intertape Polymer Group Inc - 2019
427 - Kennedy-Wilson Holdings Inc - 2019
431 - Lancashire Holdings Ltd - 2019
451 - Orora Ltd - 2019
470 - Restaurant Group PLC -

## 4. Plot most important words
- 0: SDG 1 - No poverty
- 1: SDG 3 - Good health and well-being
- 2: SDG 4 - Quality education
- 3: SDG 5 - Gender equality
- 4: SDG 6 - Clean water and sanitation
- 5: SDG 7 - Affordable and clean energy
- 6: SDG 8 - Decent work and economic growth
- 7: SDG 11 - Sustainable cities and communities
- 8: SDG 12 - Responsible consumption and production
- 9: SDG 13 - Climate action
- 10: SDG 15 - Life on land
- 11: SDG 16 - Peace, justice and strong institutions
- 12: SDG 17 - Partnerships for the goals

In [66]:
# Raise the row limit (set to 5 at the top of the notebook) so that the feature tables below,
# which hold up to top_n=25 rows by default, are displayed without truncation.
pd.set_option("display.max_rows", 25)

In [37]:
# Functions to display most important words

# Names: vocabulary terms of the fitted tf-idf step, aligned with the columns of Xtr.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old accessor only on older installations.
tfidf_step = grid_svm.named_steps["tfidf"]
if hasattr(tfidf_step, "get_feature_names_out"):
    features = tfidf_step.get_feature_names_out()
else:
    features = tfidf_step.get_feature_names()
# Values: tf-idf matrix of the test documents (rows follow the order of X_test)
Xtr = tfidf_step.transform(X_test)

# Function that takes a single row of the tf-idf matrix and returns the n highest scoring words
def top_tfidf_feats(row, features, top_n=25):
    """Return the `top_n` highest-scoring features of one dense score vector.

    Parameters
    ----------
    row : 1-D array of scores, aligned position-by-position with `features`.
    features : sequence of feature names, indexable by integer position.
    top_n : maximum number of features to return.

    Returns
    -------
    pandas.DataFrame with the columns 'feature' and 'tfidf', ordered by descending score.
    """
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (instead of assigning them afterwards) also works
    # when there is nothing to report, e.g. top_n=0 or an empty vocabulary.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

# Function to convert a single row into dense format
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """Return the top features of the document at position `row_id` of the sparse matrix `Xtr`."""
    # ravel() always yields a 1-D vector; squeeze() would collapse a one-column matrix to 0-d
    row = Xtr[row_id].toarray().ravel()
    return top_tfidf_feats(row, features, top_n)

# Function to get mean scores across a set of CSRs
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the features with the highest mean tf-idf score across a set of documents.

    Parameters
    ----------
    Xtr : sparse tf-idf matrix (documents x features).
    features : sequence of feature names aligned with the columns of `Xtr`.
    grp_ids : optional row selection (list or NumPy array); None means all rows.
    min_tfidf : scores below this value are set to 0 before averaging.
    top_n : maximum number of features to return.

    Returns
    -------
    pandas.DataFrame with the columns 'feature' and 'tfidf'; empty when no document is selected.
    """
    no_documents = pd.DataFrame(columns=['feature', 'tfidf'])
    # Compare against None instead of relying on truthiness: `if grp_ids:` raises for NumPy
    # index arrays and silently fell back to the whole matrix for an empty selection.
    if grp_ids is None:
        selected = Xtr
    elif np.size(grp_ids) == 0:
        return no_documents
    else:
        selected = Xtr[grp_ids]
    D = selected.toarray()
    if D.shape[0] == 0:
        # The mean over zero documents is undefined; return an empty table rather than NaN scores
        return no_documents
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [69]:
# Correct predictions: terms with the highest mean tf-idf across the test reports whose labels
# were all predicted correctly (scores below the default min_tfidf=0.1 are zeroed before averaging).
# NOTE(review): "org" dominates the table -- it looks like the entity placeholder visible in
# CSR_Text_clean / CSR_Text_tokenized; consider treating it as a stop word (confirm with preprocessing).
top_mean_feats(Xtr[idx_correct], features)

Unnamed: 0,feature,tfidf
0,org,0.420058
1,employee,0.02257
2,fy,0.021354
3,year,0.015684
4,financial,0.00907
5,customer,0.008034
6,mine,0.007641
7,associate,0.007374
8,client,0.006657
9,report,0.00626


In [67]:
# False Positives for SDG 1: terms with the highest mean tf-idf across the test reports
# that were predicted as SDG 1 (no poverty) without being labelled with it.
top_mean_feats(Xtr[idx_incorrect_fp_1], features)

Unnamed: 0,feature,tfidf
0,org,0.432121
1,duke energy,0.064864
2,duke,0.049808
3,webster,0.044265
4,bank,0.039541
5,client,0.038691
6,employee,0.038253
7,energy,0.032062
8,customer,0.030322
9,loan,0.029808


In [68]:
# False Negatives for SDG 1: terms with the highest mean tf-idf across the test reports
# that are labelled with SDG 1 (no poverty) but were not predicted as such.
top_mean_feats(Xtr[idx_incorrect_fn_1], features)

Unnamed: 0,feature,tfidf
0,org,0.409299
1,employee,0.038482
2,client,0.023921
3,boq,0.017014
4,fy,0.016072
5,synchrony,0.015274
6,customer,0.014488
7,varians,0.01444
8,radiotherapy,0.010673
9,associate,0.00987


In [70]:
# False Positives for SDG 5: terms with the highest mean tf-idf across the test reports
# that were predicted as SDG 5 (gender equality) without being labelled with it.
top_mean_feats(Xtr[idx_incorrect_fp_5], features)

Unnamed: 0,feature,tfidf
0,org,0.382615
1,cid,0.023769
2,cid cid,0.023376
3,employee,0.021027
4,simon,0.015047
5,ni,0.010764
6,year,0.010564
7,fy,0.009475
8,associate,0.009411
9,financial,0.00927


In [71]:
# False Negatives for SDG 5: terms with the highest mean tf-idf across the test reports
# that are labelled with SDG 5 (gender equality) but were not predicted as such.
top_mean_feats(Xtr[idx_incorrect_fn_5], features)

Unnamed: 0,feature,tfidf
0,org,0.387689
1,year,0.03659
2,employee,0.025569
3,financial,0.023316
4,kennedy wilson,0.020993
5,asset,0.012586
6,associate,0.011879
7,client,0.011072
8,kennedy,0.010673
9,mountie,0.01031
