In [1]:
import os
import sys

import random
random.seed(1006)

from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, Lasso
from sklearn.metrics import make_scorer, mean_squared_error

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

In [2]:
#Set for your computer
data_directory = '/'.join(os.getcwd().split("/")[:-1]) + '/data/'

In [2]:
test_set = [173,  74,  20, 101,  83,   1,  38,  39,  72,  50,  21, 164,  57,
       169,   8,  63, 102,  34,  80, 192, 139,  88, 112, 116,  61,  46,
        51, 165, 135,  89, 108,   7,  25,  15, 125,  93, 130,  71]

In [20]:
#Read in data
orig_data = pd.read_csv(data_directory+'qaData.csv', parse_dates=['Date'])
orig_data['EarningTag2'] = orig_data['EarningTag2'].str.strip()

#Add Year and Month, Quarter from Data
orig_data['Year'] = orig_data['Date'].dt.year
orig_data['Month'] = orig_data['Date'].dt.month
orig_data['Quarter'] = orig_data['Month'].apply(lambda x: 1 if x < 4 else 2 if x < 7 else 3 if x < 9 else 4)
orig_data['Company'] = orig_data['Company'].str.title().str.replace(" ", "")
orig_data['EventType'] = orig_data['EventType'].str.title().str.replace(" ", "")
orig_data['Participants'] = orig_data['Participants'].str.title().str.replace(" ", "")
orig_data['AnalystName'] = orig_data['AnalystName'].str.title().str.replace(" ", "")
orig_data['AnalystCompany'] = orig_data['AnalystCompany'].str.title().str.replace(" ", "")
orig_data['EarningTag2'] = orig_data['EarningTag2'].str.title().str.replace(" ", "")

#Pivot tag
pivot_data = (pd.pivot_table(orig_data, index=['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType'], columns='EarningTag2', aggfunc='size', fill_value=0)).reset_index()

#Melt data
pivot_melt_data = pd.melt(pivot_data, id_vars=['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType'], var_name='Tag', value_name='NumQ')
#One-hot encode
pivot_melt_data = pd.concat([pivot_melt_data, 
                             pd.get_dummies(pivot_melt_data['Company'], prefix='C', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['Participants'], prefix='P', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['EventType'], prefix='ET', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['Tag'], prefix='T', prefix_sep="_")], axis=1)
pivot_melt_data = pivot_melt_data.reset_index(drop=True)

#Analysts Present Data
event_analyst_data = orig_data[['Company', 'Participants', 'AnalystName', 'AnalystCompany', 'Month', 'Year', 'Quarter', 'EventType']].drop_duplicates().reset_index(drop=True)
event_analyst_data = pd.concat([event_analyst_data, 
                                pd.get_dummies(event_analyst_data['AnalystName'], prefix='AP', prefix_sep="_"),
                                pd.get_dummies(event_analyst_data['AnalystCompany'], prefix='ACP', prefix_sep="_")], axis=1).drop(['AnalystName', 'AnalystCompany'], axis=1)
event_analyst_data = event_analyst_data.groupby(['Company', 'Participants', 'Year', 'Month', 'Quarter', 'EventType']).sum().reset_index()

all_features_data = pd.merge(pivot_melt_data, event_analyst_data, on=['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType'])

#Index Data
groups = []
for i, (name, group) in enumerate(all_features_data.groupby(['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType'])):
    g2 = group.copy()
    g2['EventNumber'] = i
    groups.append(g2)
    
indexed_data = pd.concat(groups)

#Merge
indexed_data = indexed_data.drop(['Company', 'Participants', 'Tag', 'EventType'], axis=1)
indexed_data = indexed_data.reset_index(drop=True)

train, test = indexed_data.loc[~indexed_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True), \
                indexed_data.loc[indexed_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ','EventNumber'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ', 'EventNumber'], axis=1), test['NumQ'].values

In [65]:
scores = np.zeros(50)
scores_ridge = np.zeros(50)
scores_lasso = np.zeros(50)
scores_br = np.zeros(50)
scores_gbc = np.zeros(50)
scores_rf = np.zeros(50)

estimator = LinearRegression().fit(X_train, y_train)
scores[0] = mean_squared_error(y_test, estimator.predict(X_test).round())
    
estimator_ridge = Ridge().fit(X_train, y_train)
scores_ridge[0] = mean_squared_error(y_test, estimator_ridge.predict(X_test).round())
    
estimator_lasso = Lasso().fit(X_train, y_train)
scores_lasso[0] = mean_squared_error(y_test, estimator_lasso.predict(X_test).round())    
    
estimator_br = BayesianRidge().fit(X_train, y_train)
scores_br[0] = mean_squared_error(y_test, estimator_br.predict(X_test).round())
    
estimator_gbc = GradientBoostingRegressor(warm_start=True).fit(X_train, y_train)
scores_gbc[0] = mean_squared_error(y_test, estimator_gbc.predict(X_test).round())
    
estimator_rf = RandomForestRegressor(warm_start=True).fit(X_train, y_train)
scores_rf[0] = mean_squared_error(y_test, estimator_rf.predict(X_test).round())

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator = LinearRegression().fit(X_train_W, y_train)
    scores[comp] = mean_squared_error(y_test, estimator.predict(X_test_W).round())
    
    estimator_ridge = Ridge().fit(X_train_W, y_train)
    scores_ridge[comp] = mean_squared_error(y_test, estimator_ridge.predict(X_test_W).round())
    
    estimator_lasso = Lasso().fit(X_train_W, y_train)
    scores_lasso[comp] = mean_squared_error(y_test, estimator_lasso.predict(X_test_W).round())
    
    estimator_br = BayesianRidge().fit(X_train_W, y_train)
    scores_br[comp] = mean_squared_error(y_test, estimator_br.predict(X_test_W).round())
    
    estimator_gbc = GradientBoostingRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_gbc[comp] = mean_squared_error(y_test, estimator_gbc.predict(X_test_W).round())
    
    estimator_rf = RandomForestRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_rf[comp] = mean_squared_error(y_test, estimator_rf.predict(X_test_W).round())


In [66]:
print('lm', scores.min(), scores.argmin())
print('ridge', scores_ridge.min(), scores_ridge.argmin())
print('lasso', scores_lasso.min(), scores_lasso.argmin())
print('br', scores_br.min(), scores_br.argmin())
print('gbc', scores_gbc.min(), scores_gbc.argmin())
print('rf', scores_rf.min(), scores_rf.argmin())

lm 5.298872180451128
ridge 5.355263157894737
lasso 6.795112781954887
br 5.2593984962406015
gbc 3.992481203007519
rf 3.922932330827068


In [None]:
model = NMF(n_components=44).fit(X_train)
X_train_W = model.transform(X_train)

param_grid = {'loss':['ls', 'huber', 'lad', 'quantile'],
              'learning_rate':10.0**np.arange(-3,0,1),
              'min_samples_split':np.arange(2,10,2),
              'min_samples_leaf':np.arange(1,15,2),
              'max_depth': np.arange(1,10,1)}

grid = GridSearchCV(GradientBoostingRegressor(warm_start=True), cv=5, param_grid=param_grid, return_train_score=False, scoring='neg_mean_squared_error')
grid.fit(X_train_W, y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [72]:
X_test_W = model.transform(X_test)

estimator = GradientBoostingRegressor(learning_rate=0.1, loss='ls', max_depth=4, min_samples_leaf=5, min_samples_split=2, warm_start=True).fit(X_train_W, y_train)
mean_squared_error(y_test, estimator.predict(X_test_W).round())

3.8496240601503757

In [79]:
#import pickle as pkl
#with open('models/bestTagNumberModel.p', 'wb') as f:
#    pkl.dump(estimator, f, protocol=pkl.HIGHEST_PROTOCOL)

In [84]:
pass_data = pd.concat([indexed_data, pd.Series(estimator.predict(model.transform(indexed_data.drop(['NumQ', 'EventNumber'], axis=1))).round())], axis=1)[['EventNumber',0]+pd.get_dummies(pivot_melt_data['Tag'], prefix='T', prefix_sep="_").columns.tolist()]
pass_data.columns = ['EventNumber', 'PredQ'] + pd.get_dummies(pivot_melt_data['Tag'], prefix='T', prefix_sep="_").columns.tolist()
pass_data.to_csv('data/tagCntModel.csv', index=False)