## XGBoost model on 10Ks
Summary: Using XGBoost (not blended with any other models) to build a simple relevance classifier that indicates whether an excerpt in the SASB dataset is or is not an HCM-related disclosure. Success is moderate - when maximizing for recall, we get about 93% recall and about 64% precision.  

**Next steps:**
1. Refine classifier to get higher precision with same level or better recall
2. Or, create a cascade of weak models
3. Once relevance meets our recall and precision threshold, take positive cases and build multiclass classifier on those
4. Collect and clean new 10-Ks; apply classifier to this new dataset
5. Visualize the share of excerpts that are positive for relevance for each industry, including those that do not have HCM materiality in 2018 standards

In [None]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import os
import re

In [None]:
# Import json file from SASB
# NOTE(review): absolute, machine-specific location. After os.chdir, every
# relative file name read or written below is resolved against this directory.
path = "/Users/ishashah/Documents/DFG/dfg-humanrights0/from-sasb"
os.chdir(path)

In [None]:
# Load the SASB training excerpts into a DataFrame and preview the first rows.
# NOTE(review): the name `json` would shadow the standard-library module of the
# same name if that module were ever imported in this notebook.
json = pd.read_json("di_hc_rel_train.json")
json.head()

In [None]:
# Load the disclosure-topic lookup table and lower-case its column names
toplabel = pd.read_csv("disclosure_topic.csv")
toplabel.columns = [column.lower() for column in toplabel.columns]
toplabel.head()

# Which topics mention "labor" in their name (case-insensitive)?
topic_mentions_labor = toplabel["disclosure_topic_name"].str.contains("labor", case=False)
labor = toplabel[topic_mentions_labor]
labor

# Store the same test as a flag column on the lookup table
toplabel["disclosure_islabor"] = topic_mentions_labor
toplabel.head()

# Attach topic name and labor flag to every excerpt; the left join keeps
# excerpts whose topic id has no match in the lookup
json = json.merge(toplabel, how="left", on="disclosure_topic_id")
json.columns
json.head()

In [None]:
# Keep all output
# With ast_node_interactivity set to "all", every bare expression in a cell is
# displayed, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Check excerpts more closely
# Widen the column display so up to 500 characters of each excerpt are shown
pd.options.display.max_colwidth = 500
json["excerpt"].head()

# Are there any duplicates in text, in excerpt_id?
# Compare the full shape with the shape after de-duplicating on each column
json.shape
json.drop_duplicates("excerpt").shape
json.drop_duplicates("excerpt_id").shape
# So, there are duplicates in text but not in excerpt_id

# What proportion are quality_assessment, relevance_assessment categories?
# (the groupbys below show non-null counts per column, not proportions)
json.columns
json.groupby("relevance_assessment").agg("count")
json.groupby("source_document").agg("count")
# Comes from more than one source document - keep all, even the ones that are
# not 10-Ks

In [None]:
# Investigate why some excerpts carry a labor disclosure topic yet have a
# "No Disclosure" relevance assessment
json["disclosure_topic_name"].value_counts()
json["relevance_assessment"].value_counts()

# Row masks reused throughout this cell
labor_topic = json["disclosure_islabor"]
assessed_no_disclosure = json["relevance_assessment"] == "No Disclosure"
assessed_relevant = json["relevance_assessment"] == "Relevant"

json[assessed_no_disclosure & labor_topic]["excerpt"].tail()
json[assessed_relevant & labor_topic]["excerpt"].tail()
# Reading these, only excerpts that are both marked Relevant and tagged with a
# labor topic should be used - labor-tagged excerpts assessed as No Disclosure
# look like irrelevant entries

# Flag the excerpts that are labor-tagged AND assessed as Relevant
json["relevant_islabor"] = labor_topic & assessed_relevant
json["relevant_islabor"].value_counts()
# Amounts to about 10% of dataset

# Sub-topic breakdown within the flagged excerpts
json[json["relevant_islabor"]]["disclosure_topic_name"].value_counts()
# 75% are labor practices, around 20% are labor relations, 5% labor conditions
json[labor_topic]["disclosure_topic_name"].value_counts()
# Breakdown is about the same when including all labor-tagged excerpts, not
# just those that are relevant


In [None]:
# Import libraries to handle text

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import normalize


In [None]:
# Preprocess 10K text similarly to proxy statements

# Cleaning function
stopset = set(stopwords.words("english"))
stemmer = PorterStemmer()

def clean_text(in_text):
    """Normalise a raw excerpt into a space-joined string of stemmed tokens.

    Steps: turn line breaks into spaces, lowercase, drop every character that
    is not an ASCII letter or a space, tokenize, drop English stopwords and
    Porter-stem what is left.

    Args:
        in_text: raw excerpt text. Anything that is not a string (for example
            a missing value) is treated as an empty excerpt.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing survives).
    """
    if not isinstance(in_text, str):
        return ""

    # Remove line breaks: both real whitespace runs (newlines, tabs) and
    # escaped "\n" sequences left in the text become one space. Without this
    # the character filter below deletes the newline and glues the words on
    # either side of it together.
    text = re.sub(r'\\+n|\s+', ' ', in_text)

    # Lowercase, then strip non-letters. The class is spelled a-z explicitly:
    # the range A-z also covers the punctuation between 'Z' and 'a'.
    text = word_tokenize(re.sub('[^a-z ]+', '', text.lower()))

    # Remove stopwords, remove numbers and punctuation, stem
    text = [stemmer.stem(w) for w in text if w.isalpha() and w not in stopset]

    # Return joined version
    return " ".join(text)


In [None]:
# Progress tracker

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

# Register tqdm with pandas; this is what makes `progress_apply` (used when
# cleaning the excerpts) available on Series/DataFrames.
tqdm_notebook.pandas()


In [None]:
# Apply cleaning function to json file text
# progress_apply behaves like apply but shows a progress bar while
# clean_text runs over every excerpt.
json["clean_text"] = json["excerpt"].progress_apply(clean_text)

json.head()

In [None]:
# Export csv of cleaned dataset
# to_csv writes the DataFrame index as the first, unnamed column by default.
json.to_csv("json_clean.csv")

In [None]:
# Import csv of cleaned dataset
# The file was written with its index as the first column; read that column
# back as the index so it does not reappear as a spurious "Unnamed: 0" column.
json = pd.read_csv("json_clean.csv", index_col=0)

In [None]:
# Check value counts for HCM flag
# (class balance of the relevance_assessment label used as the model target)
json["relevance_assessment"].value_counts()

In [None]:
# Import libraries to model, create word embeddings

from sklearn.model_selection import train_test_split, GridSearchCV, \
StratifiedKFold, cross_val_predict, \
StratifiedShuffleSplit
from sklearn.feature_selection import chi2
from sklearn.metrics import roc_curve, \
precision_recall_curve, auc, make_scorer, \
recall_score, accuracy_score, precision_score, \
confusion_matrix, classification_report, roc_auc_score

from gensim.models import Word2Vec, KeyedVectors
import gensim 

In [None]:
# Create list of cleaned words in each excerpt
# An excerpt whose cleaned text is empty can come back from the CSV round trip
# as NaN, which word_tokenize cannot handle, so treat missing text as "".
# Tokenizing straight into lists also means an empty excerpt yields no tokens
# instead of a single empty-string token.
sent = [word_tokenize(text) for text in json["clean_text"].fillna("")]

# Keep the comma-joined token string as a column for inspection
json["cleantext_list"] = [','.join(tokens) for tokens in sent]

# Train on corpus
# Skip-gram (sg=1), 300 dimensions, context window of 3, words seen < 5 times
# are ignored.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 names it
# `vector_size` - adjust if the installed version rejects it.
model = Word2Vec(sent, min_count=5, size= 300,workers=3, window =3, sg = 1)

# Check vector size
model.vector_size

In [None]:
#  Save trained word embeddings
# Written in the plain-text word2vec format (binary=False) to model.txt
model.wv.save_word2vec_format('model.txt', binary=False)

In [None]:
# Load trained word embeddings
# Rebinds `model` from the trained Word2Vec object to the vectors loaded from
# model.txt; later cells look words up with model[w].
model = KeyedVectors.load_word2vec_format('model.txt', binary=False)

In [None]:
# Vectorize using trained word embeddings
import numpy as np

def sent_vectorizer(sent, model):
    """Return the mean embedding of the in-vocabulary words of a sentence.

    Args:
        sent: iterable of word tokens.
        model: embedding lookup; ``model[w]`` must return the vector for ``w``
            and raise ``KeyError`` for a word it does not know.

    Returns:
        numpy array holding the element-wise mean of the vectors of all known
        words. When no word is known (or ``sent`` is empty) a NaN-filled
        vector of length ``model.vector_size`` is returned, so every row keeps
        the same width and the missing embedding is explicit; if ``model`` has
        no ``vector_size`` attribute the result is an empty array.
    """
    total = None
    count = 0
    for w in sent:
        try:
            vec = model[w]
        except KeyError:
            # Out-of-vocabulary word: leave it out of the average
            continue
        total = vec if total is None else np.add(total, vec)
        count += 1

    if count == 0:
        # Nothing to average - avoid dividing by zero words
        return np.full(getattr(model, "vector_size", 0), np.nan)

    return np.asarray(total) / count


# One averaged embedding per excerpt, in the same order as `sent`
V = [sent_vectorizer(sentence, model) for sentence in sent]
    
    

In [None]:
# Join back to dataset

# One row of embedding values per excerpt, indexed by excerpt_id
json2 = pd.DataFrame(V, index = json['excerpt_id'])
# Match that index against the excerpt_id column to attach the label; the
# embedding columns come first, followed by excerpt_id and relevance_assessment.
# NOTE(review): relies on excerpt_id being unique in `json` - confirm.
json2 = json2.merge(right=json[["excerpt_id", "relevance_assessment"]], 
         left_index=True, right_on="excerpt_id")


In [None]:
# Save vectorized dataset
# to_csv writes the DataFrame index as the first, unnamed column by default.
json2.to_csv("json2_clean.csv")

In [None]:
# Load vectorized dataset
# The file was written with its index as the first column. Read it back as the
# index: otherwise it becomes an "Unnamed: 0" data column, and positional
# slicing of the first 300 columns would pick up that row index as a feature
# and drop the last embedding dimension.
json2 = pd.read_csv("json2_clean.csv", index_col=0)

In [None]:
# Import XGboost, Keras for modeling
import xgboost as xgb
from sklearn.metrics import mean_squared_error, recall_score, precision_score


In [None]:
# Split X and y, convert to DMatrix
# Embedding columns are the ones labelled with a plain integer (ints before
# the CSV round trip, digit strings after it). Selecting them by label instead
# of by position keeps any saved-index column ("Unnamed: 0"), excerpt_id and
# the label itself out of the feature matrix.
feature_cols = [col for col in json2.columns if str(col).isdigit()]
X = json2[feature_cols]

# Binary target: 1 where the assessment is "Relevant", 0 otherwise
y = pd.get_dummies(json2['relevance_assessment'])["Relevant"]

# Full dataset as a DMatrix, used later to score every excerpt
data_dmatrix = xgb.DMatrix(data=X,label=y)


In [None]:
# Train/test split

# Hold out a stratified 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=8
)

# Report the shapes of the train and test partitions
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)


In [None]:
# Instantiate, fit and predict with a baseline XGBoost classifier

# Instantiate
# binary:logistic is XGBoost's objective for two-class classification;
# reg:logistic is its regression counterpart.
xg_class = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

# Fit on the training split only, then predict hard labels for the test split
xg_class.fit(X_train,y_train)

preds = xg_class.predict(X_test)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=10, n_jobs=0, num_parallel_tree=1,
       objective='reg:logistic', random_state=0, reg_alpha=10,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

  if diff:


In [None]:
# Check accuracy without cross validation
print("Accuracy on test\n" , accuracy_score(y_test,preds))
print("Recall on test\n" , recall_score(y_test,preds))
print("Precision on test\n" , precision_score(y_test,preds))

# Baseline is 77% accuracy

Accuracy on test
 0.7690256907416384
Recall on test
 0.6955810147299509
Precision on test
 0.7634730538922155


In [None]:
# Make train and test DMatrices

# Wrap each split, with its labels, for xgb.train / xgb.cv
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

# Sanity check: the DMatrix row/column counts should match the pandas shapes
X_test.shape
y_test.shape
dtest.num_row()
dtest.num_col()


In [None]:
# Use gridsearch

# Starting parameters; the tuning cells below overwrite entries in place
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # binary:logistic is the two-class classification objective
    'objective':'binary:logistic',
    'eval_metric':"auc"
}

# Upper bound on boosting rounds; early stopping ends training much sooner
num_boost_round = 999

# First check to see how optimal number of boosting rounds works
# NOTE(review): early stopping is driven by the held-out test set here, so the
# test AUC reported below is optimistic - consider a separate validation split.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

# eval_metric is "auc", so best_score is an AUC (not an accuracy)
print("Best AUC: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

# 0.85 with 31 rounds <- baseline + optimal number of boosting rounds

In [None]:
# Tuning max_depth / min_child_weight
# Candidate grid: max_depth 3..7 crossed with min_child_weight 5, 7, 9
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3,8)
    for min_child_weight in range(5,10,2)
]

# Define initial best params and AUC
max_auc = 0
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # The shared params dict is updated in place for this candidate
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run 5-fold CV with early stopping on the mean test AUC
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'auc'},
        early_stopping_rounds=10
    )
    # Update best AUC
    mean_auc = cv_results['test-auc-mean'].max()
    # argmax is a 0-based row position, so the number of rounds is one more
    boost_rounds = cv_results['test-auc-mean'].argmax() + 1
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))



In [None]:
# Assign best min child weight and max depth to parameter grid
# (the search loop leaves params at the last candidate tried, not the best one)
params['max_depth'] = best_params[0]
params['min_child_weight'] = best_params[1]

# #0.8461913999999998

In [None]:
# Tune subsample and colsample_bytree
# Candidate grid: each of subsample / colsample_bytree takes 0.8 or 1.0
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(8,12,2)]
    for colsample in [i/10. for i in range(8,12,2)]
]
max_auc = 0
best_params = None
# Walk the grid from the largest values down
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # The shared params dict is updated in place for this candidate
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run 5-fold CV with early stopping on the mean test AUC
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'auc'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_auc = cv_results['test-auc-mean'].max()
    # argmax is a 0-based row position, so the number of rounds is one more
    boost_rounds = cv_results['test-auc-mean'].argmax() + 1
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (subsample,colsample)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

In [None]:
# Assign best subsample and col sample by tree to parameter grid
# (the search loop leaves params at the last candidate tried, not the best one)
params['subsample'] = best_params[0]
params['colsample_bytree'] = best_params[1]

In [None]:
# Tune eta (learning rate)
%time

max_auc = 0
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['auc'],
        early_stopping_rounds=10
      )
    # Update best score
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds\n".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = eta
print("Best params: {}, AUC: {}".format(best_params, max_auc))

In [None]:
# Store chosen eta (actually running grid search took too long)
# Hard-coded rather than taken from best_params, so this cell works even when
# the eta search above was skipped.
params['eta'] = 0.05

In [None]:
# Train model with optimal parameters
# First pass: train with early stopping to find the best number of rounds.
# NOTE(review): early stopping is evaluated on dtest, so the test split
# influences the chosen round count - confirm this is acceptable.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

# Rebinds the notebook-level num_boost_round (previously 999) to the best count
num_boost_round = model.best_iteration + 1

# Second pass: retrain for exactly that many rounds, without early stopping
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

In [None]:
# Save model
# Persist the retrained booster to xgb_model.model in the working directory
best_model.save_model("xgb_model.model")

In [None]:
# Predict on test dataset
# best_model.predict returns one score per test row; put it next to the true
# label, keeping the original row label in the 'index' column.
test_pred = best_model.predict(dtest)
test_pred = pd.DataFrame(test_pred)
y_testdf = pd.DataFrame(y_test).reset_index()
test_pred = pd.concat([y_testdf, test_pred], axis = 1)
test_pred.head()

# Choose 0.5 as threshold, standard 
test_pred['pred'] = test_pred[0].apply(lambda x: 0 if x < 0.5 else 1)
# Restore the original row labels so the excerpt text can be joined on index
test_pred = test_pred.set_index('index')
test_pred = pd.merge(test_pred, json[["excerpt", "relevance_assessment"]], 
                     left_index = True, right_index = True)
pd.set_option('max_colwidth', 350)
test_pred.head()

# sklearn metrics take (y_true, y_pred) in that order; passing them the other
# way round swaps recall with precision.
print("On test:")
print("Accuracy score is:\n", accuracy_score(y_test, test_pred['pred']))
print("AUC score is:\n", roc_auc_score(y_test, test_pred['pred']))
print("Recall score is:\n", recall_score(y_test, test_pred['pred']))
print("Precision score is:\n", precision_score(y_test, test_pred['pred']))



In [None]:
# Lower the decision threshold to 0.2 to get recall up to 0.8 at least
test_pred['pred'] = [0 if score < 0.2 else 1 for score in test_pred[0]]
print("On test with lower threshold:")
for heading, scorer in (
    ("Final recall score is:\n", recall_score),
    ("Final precision score is:\n", precision_score),
    ("Final accuracy score is:\n", accuracy_score),
    ("Final AUC score is:\n", roc_auc_score),
):
    print(heading, scorer(y_test, test_pred['pred']))

on test:
final recall score is:
 0.9274413529732679
final precision score is:
 0.6376594148537135
final accuracy score is:
 0.7336403296170625
final AUC score is:
 0.7530795949340827


In [None]:
# Score every excerpt with the best model
scores = best_model.predict(data_dmatrix)

# Put the scores (column 0) next to the original dataset, aligned on row index
all_pred = pd.concat([json, pd.DataFrame(scores)], axis = 1)

# Same 0.2 threshold as used on the test split
all_pred['pred'] = [0 if score < 0.2 else 1 for score in all_pred[0]]

In [None]:
# Create Paul exhibit
# Preview the industry / dimension / ticker columns. The _x and _y suffixes
# come from the earlier merge with the topic lookup, where both tables carry
# an industry_id column.
json[['industry_id_x', 'industry_id_y', 'sustainability_dimension', 
     'company_ticker']].head()

# Excerpt counts per industry and per disclosure topic
json['industry_id_x'].value_counts()
json['disclosure_topic_name'].value_counts()

In [None]:
# Import and merge on industry names
# Only the load happens here; the merge onto the predictions is done in the
# munging cell.
os.chdir(path)
inds = pd.read_csv("industry.csv")

In [None]:
# Munge for Paul exhibit
# Sum of the 0/1 predictions, i.e. the number of predicted positives, per
# industry and true relevance label
predex = all_pred.groupby(['industry_id_x','relevance_assessment']).agg({'pred' : 'sum'})
# Within each industry, divide each label's count by the industry total:
# share_disc is that label's share of the industry's predicted positives.
# NOTE(review): this is not the share of all excerpts in the industry that are
# predicted positive - confirm which quantity the exhibit should show.
predex['share_disc'] = predex.groupby(['industry_id_x'])['pred'].transform(lambda x: x/x.sum())
# Attach industry names; industries without a match in `inds` are dropped by
# the default inner join
predex = pd.merge(predex.reset_index(), inds, left_on = 'industry_id_x', right_on = 'INDUSTRY_ID')

# Visualize
# Bar chart of share_disc per industry name, "Relevant" rows only.
# NOTE(review): color = 'SECTOR_ID' passes a column name where a colour spec is
# expected - presumably meant to colour bars by sector; confirm it renders.
predex[(predex["relevance_assessment"] == "Relevant")].sort_values('industry_id_x').plot.bar('INDUSTRY_NAME', 'share_disc', rot = 90, color = 'SECTOR_ID')

In [None]:
# Export predictions
# NOTE(review): only the distinct industry id / name pairs are written, with no
# prediction columns - presumably an empty template to fill in; confirm.
predex[['industry_id_x', 'INDUSTRY_NAME']].drop_duplicates().to_csv("hcm_materiality_empty.csv")