In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): with no category/module filter this hides all warnings, including
# any pandas emits about assignments to derived frames later in the notebook —
# consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
# Do not truncate long cell contents when a frame is displayed.
pd.set_option('display.max_colwidth', None)
import numpy as np
import random

In [None]:
# Location of the annotated training data (CSV).
DATA_URL = 'https://www.dropbox.com/scl/fi/88ntlykbuntymbyk6ccaz/training_data_companydata.csv?rlkey=4z4mopwiqv4dw27kmvo3cz1k6&dl=1'

# Read the raw annotations and list the available columns.
df = pd.read_csv(DATA_URL)
df.columns

Index(['id', 'text', 'phrase', 'role in text', 'expertA_level0',
       'expertA_level1', 'expertB_level2', 'expertB_level0', 'expertB_level1',
       'expertB_level2.1', 'label_letter', 'label', 'label1', 'label2'],
      dtype='object')

In [None]:
# Keep only the `text`, `phrase` and `label1` columns.
# The explicit .copy() makes the result an independent frame: later cells add and
# overwrite columns on `df`, and without the copy those writes target a derived
# selection of the loaded frame (the pattern behind SettingWithCopyWarning).
df = df[['text', 'phrase', 'label1']].copy()

In [None]:
# Number of rows per `label1` value, most frequent first.
label_counts = df['label1'].value_counts()
label_counts

NP     12532
INV     1013
UC       693
EMP      204
SOC       35
CUS       31
Name: label1, dtype: int64

In [None]:
# Binary target: 0 for rows labelled 'NP' or 'UC', 1 for every other label.
is_np_or_uc = df['label1'].isin(['NP', 'UC'])
df['performance'] = (~is_np_or_uc).astype('int64')

# Class balance of the binary target.
df['performance'].value_counts()

0    13225
1     1283
Name: performance, dtype: int64

# Rule-based model as the benchmark

In [None]:
# Keyword families used to adjust the labels. Every entry is matched as a
# case-sensitive substring of `text`, so stems such as 'financ' or 'communit'
# cover several word forms.
kw_cus = [
    'customer', 'client', 'consumer', 'user', 'policyholder', 'beneficiar',
]
kw_inv = [
    'investor', 'financ', 'shareholder', 'stockholder', 'owners', 'investment',
    'credit rating', 'return on', 'interest rate', 'net income', 'profit',
    'sales', 'revenue', 'earnings',
]
kw_emp = [
    'employee', 'worker', 'staff', 'manager',
]
kw_soc = [
    'society', 'societal', 'social responsib', 'social performance', 'communit',
    'energy environment', 'natural environment', 'ecolog', 'water', 'waste',
    ' pollu', 'emission', ' gas',
]

# Reduced keyword families (trailing underscore) used by the rule-based search.
kw_cus_ = [
    'customer', 'client', 'consumer',
]
kw_inv_ = [
    'investor', 'financ', 'shareholder', 'stockholder', 'owners', 'investment',
    'return on', 'net income', 'profit', 'revenue', 'earnings',
]
kw_emp_ = [
    'employee', 'worker', 'manager',
]
kw_soc_ = [
    'society', 'societal', 'social responsib', 'social performance', 'communit',
    'natural environment', 'ecolog',
]

# Phrases that veto a keyword match: a text containing any of them is reset to 0.
stopwords = [
    'community college',
    'financial institution', 'financial statement', 'financial reporting',
    'financial markets', 'financial measures', 'financial services',
    'financing activit', 'financial information', 'financial compan',
    'financial firm', 'financial entiti', 'financial regulat', 'financing',
    'investment compan', 'investment firm', 'investment partner',
    'investment industr', 'investment regulat', 'investment law',
    'investment activit', 'investigat',
    ' Financ', ' Invest',
]

In [None]:
def _contains_any(texts, needles):
  """Return a boolean array marking the texts that contain at least one needle.

  Needles are matched as literal, case-sensitive substrings; missing texts
  never match.
  """
  hits = np.zeros(len(texts), dtype=bool)
  for needle in needles:
    hits |= texts.str.contains(needle, regex=False, na=False).to_numpy(dtype=bool)
  return hits


# Adjust the labels with the keyword rules.
#   class column = (expert label is the class OR a class keyword occurs) AND no stopword
#   performance  = (previous value OR any class keyword occurs)          AND no stopword
# The stopword mask does not depend on the class or keyword, so it is computed
# once and applied last; this gives the same result as re-applying the stopwords
# after every single keyword while avoiding the repeated full-column scans.
# NOTE(review): the stopword veto also resets rows whose expert label was
# positive — confirm that overriding the annotation is intended.
has_stopword = _contains_any(df['text'], stopwords)

for p, kw in zip(['CUS', 'INV', 'EMP', 'SOC'], [kw_cus, kw_inv, kw_emp, kw_soc]):
  has_keyword = _contains_any(df['text'], kw)
  labelled = (df['label1'] == p).to_numpy()
  df[p] = ((labelled | has_keyword) & ~has_stopword).astype('int')
  df.loc[has_keyword, 'performance'] = 1

df.loc[has_stopword, 'performance'] = 0

In [None]:
# Collapse duplicate texts: one row per distinct `text`, and a flag is 1 if it
# was 1 on any of the rows sharing that text.
flag_columns = ['performance', 'CUS', 'INV', 'EMP', 'SOC']
rows_by_text = df[['text'] + flag_columns].groupby('text')
df = rows_by_text.max().reset_index()

In [None]:
# Spot-check a few adjusted rows; the fixed seed keeps the preview identical
# across kernel restarts.
df.sample(5, random_state=42)

Unnamed: 0,text,performance,CUS,INV,EMP,SOC
416,"The increase was primarily due to revenue growth, partially offset by an increase in operating costs to support revenue growth and business initiatives at Platts, including Asia expansion initiatives, an increase in compensation costs due to annual merit increases and increased headcount, higher technology costs, an increase in the bad debt provision in the current year and one-time costs related to the discontinuation of a product line at Platts.",1,0,1,0,0
5511,These initiatives have increased compliance costs and regulatory risks and may lead to financial and reputational damage in the event of a compliance violation.,1,0,1,0,0
6357,"because our debt issuances generate a measurable income stream for each lender, the income approach was deemed to be an appropriate methodology for valuing the private placement long-term debt",0,0,0,0,0
657,"Accrued compensation and benefits at December 31, 2020 increased $442 million from December 31, 2019, primarily due to higher 2020 incentive compensation accruals",1,0,1,1,0
2401,"GSE securities and agency MBS: GSE securities consist of debt obligations issued by HUD, the FHLB, and other agencies, as well as securities collateralized by loans that are guaranteed by the SBA, and thus, are backed by the full faith and credit of the U.S. government.",0,0,0,0,0


In [None]:
# Share of texts flagged for each class. A text may carry several flags, so the
# class shares can add up to more than the share of `performance` texts.
class_columns = ['CUS', 'INV', 'EMP', 'SOC']
class_shares = {column: df[column].mean() for column in class_columns}

tot = 0
for column, share in class_shares.items():
  print(column, share)
  tot += share

print('Sum:', tot)
print('performance', df['performance'].mean())

CUS 0.09963436928702012
INV 0.3452163315051798
EMP 0.04616087751371115
SOC 0.009750152346130409
Sum: 0.5007617306520414
performance 0.4379951249238269


In [None]:
# Persist the keyword-adjusted labels (without the index column).
ADJUSTED_CSV = 'training_data_companydata_adjusted_by_keywords.csv'
df.to_csv(ADJUSTED_CSV, index=False)

# Rule-based search

In [None]:
# Keyword-search predictions: a prediction column (trailing underscore) is 1
# when the text contains any keyword of the reduced family, and `performance_`
# is 1 when any family matched.
search_families = {
  'CUS_': kw_cus_,
  'INV_': kw_inv_,
  'EMP_': kw_emp_,
  'SOC_': kw_soc_,
}

df['performance_'] = 0
for prediction_column, family in search_families.items():
  df[prediction_column] = 0
  for keyword in family:
    matches = df['text'].str.contains(keyword)
    df.loc[matches, prediction_column] = 1
    df.loc[matches, 'performance_'] = 1

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the keyword search: each column `c` is the reference and `c_` is the
# corresponding keyword-search prediction.
classes = ['CUS', 'INV', 'EMP', 'SOC', 'performance']
reference_and_prediction = [(df[name], df[name + '_']) for name in classes]

accuracy_scores = [accuracy_score(truth, guess) for truth, guess in reference_and_prediction]
recall_scores = [recall_score(truth, guess) for truth, guess in reference_and_prediction]
precision_scores = [precision_score(truth, guess) for truth, guess in reference_and_prediction]
f1_scores = [f1_score(truth, guess) for truth, guess in reference_and_prediction]
# NOTE: this is the number of predicted positives (sum of the prediction
# column), not the number of positive rows in the reference column.
N = [guess.sum() for truth, guess in reference_and_prediction]

pd.DataFrame({
    'Class': classes,
    'Accuracy': accuracy_scores,
    'Recall': recall_scores,
    'Precision': precision_scores,
    'F1': f1_scores,
    'Support': N
})

Unnamed: 0,Class,Accuracy,Recall,Precision,F1,Support
0,CUS,0.982328,0.883792,0.935275,0.908805,618
1,INV,0.837904,0.711827,0.796937,0.751981,2024
2,EMP,0.981718,0.627063,0.964467,0.76,197
3,SOC,0.99284,0.296875,0.904762,0.447059,21
4,performance,0.824345,0.744348,0.836591,0.787778,2558
