In [2]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF, PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

In [3]:
#Set for your computer
# Default: the "data" folder that sits next to the current working directory
# (i.e. <parent of cwd>/data/). Built with os.path so it also works where the
# path separator is not "/". The trailing separator is kept because the path
# is later concatenated directly with a file name.
data_directory = os.path.join(os.path.dirname(os.getcwd()), 'data') + os.sep

In [7]:
# EventNumber values that are held out as the test split when the indexed
# data is divided into train and test.
test_set = [
    173, 74, 20, 101, 83, 1, 38, 39, 72, 50,
    21, 164, 57, 169, 8, 63, 102, 34, 80, 192,
    139, 88, 112, 116, 61, 46, 51, 165, 135, 89,
    108, 7, 25, 15, 125, 93, 130, 71,
]

def historicTagPct(var):
    """Build historical tag-share features for every (var value, Date) pair.

    For each distinct value of ``orig_data[var]`` and each date on which that
    value occurs, compute the share of every tag among the rows that have the
    same value on strictly earlier dates. The first date of a value has no
    history, so all of its shares are 0.

    Reads the notebook-level globals ``orig_data`` (must contain the columns
    ``var``, 'Date' and 'Tag') and ``tag_cols`` (the tag names to report).

    Parameters
    ----------
    var : str
        Name of the ``orig_data`` column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per (var value, Date), ordered by value and then date, with
        columns ``[var, 'Date']`` followed by ``'{var}Hist_{tag}'`` for each
        tag in ``tag_cols``.
    """
    hist_tag_cols = ['{}Hist_{}'.format(var, t) for t in tag_cols]
    rows = []

    # Group by the scalar column name rather than a one-element list: with a
    # list, recent pandas yields 1-tuples as group keys, which would end up in
    # the output key column and break the later merge on it.
    for value, value_rows in orig_data.groupby(var):
        # Tag counts over all dates already visited for this value.
        running = dict.fromkeys(tag_cols, 0)

        for date, date_rows in value_rows.groupby('Date'):
            total = sum(running.values())
            if total:
                shares = [running[t] / total for t in tag_cols]
            else:
                # No (tagged) history yet -> all shares are zero.
                shares = [0.0] * len(tag_cols)
            rows.append([value, date] + shares)

            # Fold this date's rows into the history only after emitting the
            # row, so a date never sees its own tags.
            for tag, n in date_rows.groupby('Tag').size().items():
                running[tag] = running.get(tag, 0) + int(n)

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.DataFrame(rows, columns=[var, 'Date'] + hist_tag_cols)

In [None]:
#Read in data
orig_data = pd.read_csv(data_directory + 'qaData.csv', parse_dates=['Date'])
orig_data['EarningTag2'] = orig_data['EarningTag2'].str.strip()

#Add Year, Month and calendar Quarter from Date
orig_data['Year'] = orig_data['Date'].dt.year
orig_data['Month'] = orig_data['Date'].dt.month
# Months 1-3 map to 1, 4-6 to 2, 7-9 to 3, 10-12 to 4. The earlier chained
# conditional tested `x < 9` for the third quarter, which pushed September
# into quarter 4.
orig_data['Quarter'] = (orig_data['Month'] - 1) // 3 + 1

# Normalise the text identifier columns: title-case them and strip all spaces
# so that differently formatted spellings of the same name compare equal.
for text_col in ['Company', 'EventType', 'Participants', 'AnalystName', 'AnalystCompany']:
    orig_data[text_col] = orig_data[text_col].str.title().str.replace(" ", "", regex=False)
# 'Tag' is the normalised form of 'EarningTag2'; the source column is kept.
orig_data['Tag'] = orig_data['EarningTag2'].str.title().str.replace(" ", "", regex=False)

# Keep only rows that have an analyst name.
orig_data = orig_data.loc[~orig_data['AnalystName'].isna()].copy()

# Distinct tag values, in order of first appearance.
tag_cols = orig_data['Tag'].unique().tolist()

#orig_data['MonthSin'] = np.sin((orig_data['Month']-1)*(2.*np.pi/12))
#orig_data['MonthCos'] = np.cos((orig_data['Month']-1)*(2.*np.pi/12))

#orig_data['QuarterSin'] = np.sin((orig_data['Quarter'])*(2.*np.pi/12))
#orig_data['QuarterCos'] = np.cos((orig_data['Quarter'])*(2.*np.pi/12))

#Pivot tag: one row per key combination below, one column per Tag holding
#the number of source rows with that tag (0 where there are none)
event_analyst_keys = ['Company', 'Participants', 'AnalystName', 'AnalystCompany', 'Month', 'Year', 'Quarter', 'EventType', 'Date']
pivot_data = orig_data.pivot_table(index=event_analyst_keys, columns='Tag', aggfunc='size', fill_value=0)
pivot_data = pivot_data.reset_index()

#Melt data back to long form: one row per (key combination, Tag) with its count in NumQ
pivot_melt_data = pivot_data.melt(id_vars=event_analyst_keys, var_name='Tag', value_name='NumQ')

#One-hot encode the categorical columns and place the indicators next to the long data
dummy_sources = [('Company', 'C'), ('AnalystName', 'A'), ('EventType', 'ET'), ('Tag', 'T')]
dummy_frames = [
    pd.get_dummies(pivot_melt_data[source_col], prefix=dummy_prefix, prefix_sep="_")
    for source_col, dummy_prefix in dummy_sources
]
pivot_melt_data = pd.concat([pivot_melt_data] + dummy_frames, axis=1).reset_index(drop=True)

#Analysts Present Data
# Start from the distinct (event, analyst) combinations ...
analyst_rows = orig_data[['Company', 'Participants', 'AnalystName', 'AnalystCompany', 'Month', 'Year', 'Quarter', 'EventType', 'Date']]
analyst_rows = analyst_rows.drop_duplicates().reset_index(drop=True)

# ... add one AP_<name> indicator column per analyst, then drop the analyst text columns ...
analyst_flags = pd.get_dummies(analyst_rows['AnalystName'], prefix='AP', prefix_sep="_")
event_analyst_data = pd.concat([analyst_rows, analyst_flags], axis=1)
event_analyst_data = event_analyst_data.drop(['AnalystName', 'AnalystCompany'], axis=1)

# ... and sum the indicators per event so each event gets a single row.
event_analyst_data = (
    event_analyst_data
    .groupby(['Company', 'Participants', 'Year', 'Month', 'Quarter', 'EventType', 'Date'])
    .sum()
    .reset_index()
)

# Join the per-event analyst indicators onto the long tag data.
all_features_data = pivot_melt_data.merge(
    event_analyst_data,
    on=['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType', 'Date'],
)

#Participants Present Data
participant_keys = ['Company', 'Month', 'Quarter', 'Year', 'EventType']

# Distinct participant strings per key combination, split on commas into one
# column per position (the split columns are labelled 0, 1, 2, ...).
event_part_raw_data = orig_data[['Company', 'Participants', 'Month', 'Quarter', 'Year', 'EventType']].drop_duplicates().reset_index(drop=True)
split_participants = event_part_raw_data['Participants'].str.split(',', expand=True)
event_part_raw_data = pd.concat([event_part_raw_data, split_participants], axis=1).drop('Participants', axis=1)

# Long form: one row per (key combination, participant); rows with missing values are dropped.
event_part_melt_data = event_part_raw_data.melt(id_vars=participant_keys, value_name='Participant')
event_part_melt_data = event_part_melt_data.dropna().reset_index(drop=True)

# One P_<name> indicator per participant, summed per key combination.
participant_flags = pd.get_dummies(event_part_melt_data['Participant'], prefix='P', prefix_sep="_")
event_part_ohe_data = pd.concat([event_part_melt_data, participant_flags], axis=1).drop(['Participant', 'variable'], axis=1)
event_part_data = event_part_ohe_data.groupby(participant_keys).sum().reset_index()

# NOTE(review): this join does not include 'Date' or 'Participants', so rows
# that share company / month / year / quarter / event type receive the same
# participant indicators -- confirm that is intended.
all_features_data = all_features_data.merge(event_part_data, on=['Company', 'Month', 'Year', 'Quarter', 'EventType'])

# For each grouping column, compute the historical tag-share features and join
# them onto the feature table by (grouping column, Date).
for hist_key in ('AnalystName', 'Company', 'Quarter'):
    hist_features = historicTagPct(hist_key)
    all_features_data = all_features_data.merge(hist_features, on=[hist_key, 'Date'])

#Index Data
# Number the events 0, 1, 2, ... in the order groupby yields them and tag
# every row of an event with that number.
event_keys = ['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType', 'Date']
groups = [
    event_rows.assign(EventNumber=event_number)
    for event_number, (_, event_rows) in enumerate(all_features_data.groupby(event_keys))
]
indexed_data = pd.concat(groups)

#Merge
# Drop the raw identifier columns and turn the question count into a 0/1 target.
indexed_data = (
    indexed_data
    .drop(['Company', 'AnalystName', 'AnalystCompany', 'Participants', 'Tag', 'EventType', 'Date'], axis=1)
    .reset_index(drop=True)
)
indexed_data['NumQ'] = indexed_data['NumQ'].astype(bool).astype(int)

# Events whose number is listed in test_set form the test split; all others train.
in_test = indexed_data['EventNumber'].isin(test_set)
train = indexed_data.loc[~in_test].copy().reset_index(drop=True)
test = indexed_data.loc[in_test].copy().reset_index(drop=True)

# Features are every column except the target and the event number.
non_feature_cols = ['NumQ', 'EventNumber']
X_train = train.drop(non_feature_cols, axis=1)
y_train = train['NumQ'].values
X_test = test.drop(non_feature_cols, axis=1)
y_test = test['NumQ'].values

cols = X_train.columns

In [None]:
max_comp = 100
RANDOM_STATE = 0  # fixed seed so the sweep gives the same numbers on every run

# Slot 0 of each array holds the baseline fitted on the raw features (no NMF);
# slot k (1 <= k < max_comp) holds the result for a k-component NMF projection.
# An argmax of 0 in the summary below therefore means "no NMF was best".
scores = np.zeros(max_comp)
acc = np.zeros(max_comp)
scores_gbc = np.zeros(max_comp)
acc_gbc = np.zeros(max_comp)
scores_rf = np.zeros(max_comp)
acc_rf = np.zeros(max_comp)


def _auc_and_accuracy(clf, X_fit, y_fit, X_eval, y_eval):
    """Fit ``clf`` on the fit split and score it on the eval split.

    Returns a ``(roc_auc, accuracy)`` tuple. ROC AUC uses the positive-class
    probabilities; accuracy uses those probabilities rounded to 0/1.
    """
    proba = clf.fit(X_fit, y_fit).predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, proba), accuracy_score(y_eval, proba.round())


def _score_all_models(slot, X_fit, X_eval):
    """Fit a fresh model of each kind and store its scores at index ``slot``."""
    # warm_start is not set: every model here is built from scratch and fitted
    # once, so the flag had no effect.
    model_specs = [
        (LogisticRegression(), scores, acc),
        (GradientBoostingClassifier(random_state=RANDOM_STATE), scores_gbc, acc_gbc),
        (RandomForestClassifier(random_state=RANDOM_STATE), scores_rf, acc_rf),
    ]
    for clf, auc_by_comp, acc_by_comp in model_specs:
        auc_by_comp[slot], acc_by_comp[slot] = _auc_and_accuracy(clf, X_fit, y_train, X_eval, y_test)


# Baseline: raw features.
_score_all_models(0, X_train, X_test)

# NOTE(review): the component count is compared on the test split, so the best
# score printed below is an optimistic estimate -- consider a validation split.
for comp in range(1, max_comp):
    model = NMF(n_components=comp, random_state=RANDOM_STATE)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    _score_all_models(comp, X_train_W, X_test_W)


print('logit ROC:', scores.max(), scores.argmax())
print('logit ACC', acc.max(), acc.argmax())
print('GBC ROC', scores_gbc.max(), scores_gbc.argmax())
print('GBC ACC', acc_gbc.max(), acc_gbc.argmax())
print('RF ROC', scores_rf.max(), scores_rf.argmax())
print('RF ACC', acc_rf.max(), acc_rf.argmax())

In [None]:
# Greedy, one-parameter-at-a-time tuning of GradientBoostingClassifier on the
# raw features: each stage searches a single parameter with 5-fold CV while
# every previously tuned parameter stays fixed at its best value.
RANDOM_STATE = 0  # fixed seed so repeated runs pick the same parameters

gbc_param_stages = [
    {'learning_rate': 10.0**np.arange(-3, 0, 1)},
    {'min_samples_split': np.arange(2, 10, 2, dtype=int)},
    # None (= all features) replaces 'auto': for this classifier 'auto' was
    # just another spelling of 'sqrt', and recent scikit-learn rejects it.
    {'max_features': [None, 'sqrt', 'log2']},
    {'max_depth': np.arange(1, 10, 1, dtype=int)},
    {'min_samples_leaf': np.arange(1, 11, 2, dtype=int)},
]

tuned_gbc_params = {}
for stage_grid in gbc_param_stages:
    # scoring='roc_auc' ranks by predicted scores/probabilities.
    # make_scorer(roc_auc_score) would instead score the hard 0/1 output of
    # predict(), which collapses the ROC curve to a single point.
    # warm_start is not set: GridSearchCV fits fresh clones, so it had no effect.
    grid = GridSearchCV(GradientBoostingClassifier(random_state=RANDOM_STATE, **tuned_gbc_params),
                        cv=5,
                        param_grid=stage_grid,
                        return_train_score=False,
                        scoring='roc_auc')
    grid.fit(X_train, y_train)
    print(grid.best_score_)
    tuned_gbc_params.update(grid.best_params_)

best_learning_rate = tuned_gbc_params['learning_rate']
best_min_samples_split = tuned_gbc_params['min_samples_split']
best_max_features = tuned_gbc_params['max_features']
best_max_depth = tuned_gbc_params['max_depth']
best_min_samples_leaf = tuned_gbc_params['min_samples_leaf']

In [None]:
# Same greedy, one-parameter-at-a-time tuning of GradientBoostingClassifier,
# but on the 44-component NMF projection of the training features.
RANDOM_STATE = 0  # fixed seed so repeated runs pick the same parameters

model = NMF(n_components=44, random_state=RANDOM_STATE).fit(X_train)
X_train_W = model.transform(X_train)

gbc_param_stages = [
    {'learning_rate': 10.0**np.arange(-3, 0, 1)},
    {'min_samples_split': np.arange(2, 10, 2, dtype=int)},
    # None (= all features) replaces 'auto': for this classifier 'auto' was
    # just another spelling of 'sqrt', and recent scikit-learn rejects it.
    {'max_features': [None, 'sqrt', 'log2']},
    {'max_depth': np.arange(1, 10, 1, dtype=int)},
    {'min_samples_leaf': np.arange(1, 11, 2, dtype=int)},
]

tuned_gbc_params = {}
for stage_grid in gbc_param_stages:
    # scoring='roc_auc' ranks by predicted scores/probabilities.
    # make_scorer(roc_auc_score) would instead score the hard 0/1 output of
    # predict(), which collapses the ROC curve to a single point.
    # warm_start is not set: GridSearchCV fits fresh clones, so it had no effect.
    grid = GridSearchCV(GradientBoostingClassifier(random_state=RANDOM_STATE, **tuned_gbc_params),
                        cv=5,
                        param_grid=stage_grid,
                        return_train_score=False,
                        scoring='roc_auc')
    grid.fit(X_train_W, y_train)
    print(grid.best_score_)
    tuned_gbc_params.update(grid.best_params_)

best_learning_rate = tuned_gbc_params['learning_rate']
best_min_samples_split = tuned_gbc_params['min_samples_split']
best_max_features = tuned_gbc_params['max_features']
best_max_depth = tuned_gbc_params['max_depth']
best_min_samples_leaf = tuned_gbc_params['min_samples_leaf']

In [None]:
# Final check: project the features onto 49 NMF components, fit a gradient
# boosting classifier on the training projection and report test ROC AUC.
RANDOM_STATE = 0  # fixed seed so the reported AUC is reproducible

# NOTE(review): 49 components here vs. 44 in the NMF tuning cell, and the
# classifier uses default hyper-parameters rather than the tuned best_* values
# -- confirm both are intentional.
model = NMF(n_components=49, random_state=RANDOM_STATE).fit(X_train)
X_train_W = model.transform(X_train)
X_test_W = model.transform(X_test)

estimator = GradientBoostingClassifier(random_state=RANDOM_STATE).fit(X_train_W, y_train)
roc_auc_score(y_test, estimator.predict_proba(X_test_W)[:,1])