In [219]:
from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, make_scorer, accuracy_score

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

# Gradient Boosting (one-vs-rest)

In [225]:
# Load the Q&A data set; 'Date' is parsed so the .dt accessors below work.
data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
data['EarningTag2'] = data['EarningTag2'].str.strip()

# Lag feature: the EarningTag2 of the previous row inside the same
# (Company, Participants, Date, EventName, EventType) group. The first row of
# every group has no predecessor, so its Lag1 is NaN.
data['Lag1'] = data.groupby(["Company", "Participants", "Date", "EventName", "EventType"])['EarningTag2'].shift(1)

# Calendar features derived from the parsed date.
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

# Keep earnings calls only, together with the columns needed downstream.
nn_data = data.loc[
    data['EventType'] == "Earnings call",
    ['Company', 'Participants', 'Month', 'Year', 'AnalystName',
     'AnalystCompany', 'EventName', 'Lag1', 'EarningTag2'],
].copy()

# Quarter is whatever precedes the first "Q" in the event name.
nn_data['Quarter'] = nn_data['EventName'].str.split("Q").str[0]

# EventName has served its purpose; keep the modelling columns in a fixed order.
nn_data = nn_data[['Company', "Participants", "AnalystName", "AnalystCompany",
                   "Month", "Year", "Quarter", "Lag1", "EarningTag2"]].copy()

# One-hot-encode the categorical columns. Each dummy frame is built exactly
# once (the data and the column names both come from the same frames), in the
# order listed here.
one_hot_prefixes = [
    ('Company', 'C'),
    ('Participants', 'P'),
    ('AnalystName', 'AN'),
    ('AnalystCompany', 'AC'),
    ('Month', 'M'),
    ('Quarter', 'Q'),
]
dummies = pd.concat(
    [pd.get_dummies(nn_data[col], prefix=prefix, prefix_sep="_")
     for col, prefix in one_hot_prefixes],
    axis=1,
)

# Final layout: Year, Lag1, EarningTag2 followed by every dummy column, with
# spaces stripped from the dummy column names.
nn_data_encoded = pd.concat([nn_data[["Year", "Lag1", "EarningTag2"]], dummies], axis=1)
new_cols = [col.replace(" ", "") for col in dummies.columns]
nn_data_encoded.columns = ["Year", "Lag1", "EarningTag2"] + new_cols

In [224]:
# Final cleaning: drop incomplete rows (e.g. the NaN Lag1 of each group's first
# row) without mutating the input in place.
encoded = nn_data_encoded.dropna().reset_index(drop=True)

# Binarize the lag and target columns with ONE binarizer fitted on the union
# of both columns. Fitting on Lag1 alone would leave any tag that only ever
# appears as a target outside the fitted classes.
binarizer = LabelBinarizer().fit(pd.concat([encoded['Lag1'], encoded['EarningTag2']]))
lag = pd.DataFrame(binarizer.transform(encoded['Lag1']),
                   columns=["lag_{}".format(c) for c in binarizer.classes_])
y = pd.DataFrame(binarizer.transform(encoded['EarningTag2']),
                 columns=["y_{}".format(c) for c in binarizer.classes_])

# If this cell has already been run, the lag_/y_ columns are present on the
# input frame; remove them first so a re-run does not create duplicates.
encoded = encoded.drop(columns=lag.columns.tolist() + y.columns.tolist(), errors="ignore")
nn_data_encoded = pd.concat([encoded, lag, y], axis=1)

# Split into train and test: 2018 is held out, every other year is training.
train = nn_data_encoded.loc[nn_data_encoded['Year'] != 2018]
test = nn_data_encoded.loc[nn_data_encoded['Year'] == 2018]

# Features are everything except the raw columns and the binarized targets.
non_feature_cols = ["Year", "Lag1", "EarningTag2"] + y.columns.tolist()
X_train = train.drop(non_feature_cols, axis=1)
X_test = test.drop(non_feature_cols, axis=1)
y_train = train[y.columns].values
y_test = test[y.columns].values

# One binary gradient-boosting model per tag; the seed is fixed so repeated
# runs give the same scores.
classifier = OneVsRestClassifier(GradientBoostingClassifier(random_state=0))
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

n_classes = y_score.shape[1]

lw = 2
fpr = dict()
tpr = dict()
roc_auc = dict()

# Per-class ROC curve and area.
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) decision into one ROC curve.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: interpolate every per-class curve onto the union of all
# false-positive rates, then average the true-positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces the scipy.interp alias, which SciPy deprecated and
    # later removed from its top-level namespace.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Report: one row per class (integer keys) followed by the micro/macro rows.
print ("{:<3}|{:<30}|{:<3}".format(" ", 'Key','Label'))
for i, (k, v) in enumerate(roc_auc.items()):
    print("-"*40)
    if isinstance(k, int):
        # Integer keys index binarizer.classes_, the same order used to build
        # the y_ columns; take the class name directly instead of splitting
        # the column name on 'y_'.
        key = str(binarizer.classes_[k]).title().replace(" ", "")
    else:
        key = k
    print ("{:<3}|{:<30}|{:.3}".format(i, key, v))

   |Key                           |Label
----------------------------------------
0  |Awm                           |0.761
----------------------------------------
1  |AccountingAndTaxes            |0.741
----------------------------------------
2  |BalanceSheet                  |0.656
----------------------------------------
3  |Cb                            |0.812
----------------------------------------
4  |Ccb                           |0.621
----------------------------------------
5  |Cib                           |0.828
----------------------------------------
6  |Capital                       |0.688
----------------------------------------
7  |CreditCosts                   |0.623
----------------------------------------
8  |Expenses                      |0.628
----------------------------------------
9  |Legal                         |0.96
----------------------------------------
10 |MacroeconomicUpdate           |0.67
----------------------------------------
11 |OtherTopics   

# Multinomial NB

In [157]:
#All columns to string
nn_data_str = nn_data.copy()
nn_data_str['Year'] = nn_data_str['Year'].apply(str)
nn_data_str['Quarter'] = nn_data_str['Quarter'].apply(str)
nn_data_str['Month'] = nn_data_str['Month'].apply(str)

#Remove spaces
nn_data_str = nn_data_str.apply(lambda x: x.str.replace(" ", ""), axis=1)

#Train-Test Split
train = nn_data_str.loc[nn_data_str['Year']!="2018"]
test = nn_data_str.loc[nn_data_str['Year']=="2018"]

#X-y split
X_train = train.drop('EarningTag2', axis=1).values
X_train_str = [' '.join(x) for x in X_train]
y_train = train['EarningTag2'].values

X_test = test.drop("EarningTag2", axis=1).values
X_test_str = [' '.join(x) for x in X_test]
y_test = test['EarningTag2'].values

tfidf_vec = TfidfVectorizer(lowercase=False).fit(X_train_str)
X_train_tfidf = tfidf_vec.transform(X_train_str)
X_test_tfidf = tfidf_vec.transform(X_test_str)

#Encode test
encoder = LabelEncoder().fit(y_train)
y_train_str = encoder.transform(y_train)
y_test_str = encoder.transform(y_test)

model = MultinomialNB().fit(X_train_tfidf, y_train_str)  
preds = model.predict_proba(X_test_tfidf)
accuracy_score(y_test_str, np.argmax(preds, axis=1))

0.27314814814814814