In [5]:
# Dependencies
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rnd
rnd.seed(42)
from dateutil import parser
from pandas.tseries.offsets import BDay
from itertools import chain
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Metrics AND FUNCTIONS
# Standardize the data:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

## CLASSIFIERS LIST

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

# Results
res = pd.DataFrame()

# Prepare the news table with the necessary feats
# Taking out 'd1':['mean'],'d2':['mean'],
f = {'firstMentionSentence':['median','std'],
     'sentimentNeutral':['mean','std'], 
     'noveltyCount12H':['sum'],'noveltyCount24H':['sum'],'noveltyCount3D':['sum'],'noveltyCount5D':['sum'],'noveltyCount7D':['sum'],
     'relevance':['median'],  
     'companyCount':['median'], 
     'sentimentNegative':['std'],
     'sentimentWordCount':['median']}

def pre_processing(mkt, nws):

    ## MKT will have a value of 1 if the return is positive and 0 otherwise. 
    mkt["returnsOpenNextMktres10"] = mkt["returnsOpenNextMktres10"] > 0 # .clip(-1, 1)
    mkt.rename(columns={'returnsOpenNextMktres10':'target'}, inplace=True)

## CONSOLIDATE TIME TO THE NEXT BUSINESS DAY
    
        # i.e: 2007-01-01 22:00:01+00:00 -> 2007-01-02 
        #      2007-01-01 21:59:59+00:00 -> 2007-01-01
    if mkt.time.dtype != 'datetime64[ns, UTC]':
        mkt.time = mkt.time.apply(lambda x: parser.parse(x))
        nws.time = nws.time.apply(lambda x: parser.parse(x))
    nws['time'] = (nws['time'] - np.timedelta64(22,'h')).dt.ceil('1D') #.dt.date 
    mkt['time'] = mkt['time'].dt.floor('1D')
    # Verify if business day, if not, roll to the next B day
    offset = BDay()
    nws.time = nws.time.apply(lambda x: offset.rollforward(x))

    
    ## TRIM
    drop_mkt_feats = ['assetName','open','returnsClosePrevRaw1','returnsOpenPrevRaw1',
                 'returnsClosePrevRaw10','returnsOpenPrevRaw10', 
                 'returnsClosePrevMktres1']
    mkt.drop(drop_mkt_feats, axis=1, inplace=True)
    
    drop_nws_feats = ['sourceTimestamp', 'firstCreated', 'sourceId', 'headline',
    'takeSequence', 'provider', 'subjects', 'audiences','bodySize',
    'headlineTag', 'marketCommentary', 'assetName',
    'urgency', 'sentenceCount', 'wordCount', 'sentimentClass', 'sentimentPositive', 'volumeCounts12H',
    'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
    'volumeCounts7D']
    
    nws.drop(drop_nws_feats, axis=1, inplace=True)

    return mkt, nws
    
        
def expand_assets(nws):
    nws['assetCodes'] = nws['assetCodes'].str.findall(f"'([\w\./]+)'")   
    assetCodes_expanded = list(chain(*nws['assetCodes']))
    assetCodes_index = nws.index.repeat( nws['assetCodes'].apply(len) )
    df = pd.DataFrame({'idx': assetCodes_index, 'assetCode': assetCodes_expanded})
    # Create expandaded news (will repeat every assetCodes' row)
    nws_expanded = pd.merge(df, nws, left_on='idx', right_index=True)
    nws_expanded.drop(['idx','assetCodes'], axis=1, inplace=True)

    return nws_expanded

def split_dataset(features, data):
    X = StandardScaler().fit_transform(data.loc[:, features].values)
    y = np.array(data.target.values).reshape(X.shape[0],1)

    training_size = np.floor(X.shape[0]*0.75).astype(int)
    
    X_train = X[:training_size]
    y_train = y[:training_size]
    X_test = X[training_size:]
    y_test = y[training_size:]
    # Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first:
    np.random.seed(42)
    rnd_idx = np.random.permutation(training_size)
    X_train = X_train[rnd_idx]
    y_train = y_train[rnd_idx]
    return X_train, X_test, y_train, y_test

def make_random_predictions(predictions_df):
    predictions_df.confidenceValue = 2.0 * np.random.rand(len(predictions_df)) - 1.0    
    
# Results
def get_res(res, clf_time, clf_name, X_train, y_test, y_pred, p):
    return pd.concat([res,
               pd.DataFrame({'data_size':str(X_train.shape),
              'ETA': clf_time,
              'Acc': accuracy_score(y_test, y_pred),
              'Precision': precision_score(y_test, y_pred),
              'Recall': recall_score(y_test, y_pred),
              'F1': f1_score(y_test, y_pred),
              'MSE': mean_squared_error(y_test, y_pred*1),
              'AUC': roc_auc_score(y_test, y_pred), 
              'Params':p}, index=[clf_name])])

def prep_res(r):
    # Remove the ETA column as well as the Params and data size
    r.drop(['ETA','Params','data_size'], inplace=True, axis=1)
    r = r.stack().reset_index()
    # Join the two indexes together and convert it to a df
    # Merge 
    r['metric'] = r.apply(lambda row: row.level_0+' '+row.level_1, axis=1)
    #res.drop(['level_0','level_1'], inplace=True, axis=1)
    #res.set_index('metric',inplace=True)
    r.columns = ['clf','metric','all','mix']
    return r

def get_baseline(res, X_train, y_train, X_test, y_test, title):
    ## BASELINE  
    # Logistic Regression  95s
    start = time()
    log_clf = LogisticRegression(random_state=42)
    log_clf.fit(X_train, y_train)
    y_pred = log_clf.predict(X_test)
    res = get_res(res, time()-start, 'LogReg', X_train, y_test, y_pred, title)

    # SGD
    start = time()
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train)
    y_pred = sgd_clf.predict(X_test)
    res = get_res(res, time()-start, 'SGD', X_train, y_test, y_pred, title)

    # Decision Tree
    start = time()
    tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
    tree_clf.fit(X_train, y_train)
    # tree_clf.predict_proba(X_test)
    tree_clf.predict(X_test)
    res = get_res(res, time()-start, 'DecTree', X_train, y_test, y_pred, title)

#     # Xtra Trees
#     start = time()
#     xtree_clf = ExtraTreesClassifier(random_state=42)
#     xtree_clf.fit(X_train, y_train)
#     y_pred = xtree_clf.predict(X_test)
#     res = get_res(res, time()-start, 'XTrees', X_train, y_test, y_pred, title)

#     # Random Forest
#     start = time()
#     forest_clf = RandomForestClassifier(random_state=42)
#     forest_clf.fit(X_train, y_train)
#     y_pred = forest_clf.predict(X_test)
#     res = get_res(res, time()-start, 'RandFor', X_train, y_test, y_pred, title)
    
    return res

In [2]:
# First let's import the module and create an environment.
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()
(market_train_df, news_train_df) = env.get_training_data()
# You can only iterate through a result from `get_prediction_days()` once
# so be careful not to lose it once you start iterating.
# days = env.get_prediction_days()

# # make_random_predictions(predictions_template_df)
# # env.predict(predictions_template_df)
# (market_obs_df, news_obs_df, predictions_template_df) = next(days)



Loading the data... This could take a minute.
Done!


In [3]:
# Resample
# end date = 2016-12-30 22:00:00+00:00
# mkt = market_train_df.loc[(market_train_df.time>='2007-01-01') & (market_train_df.time<='2009-12-31')]
# nws = news_train_df.loc[(news_train_df.time>='2007-01-01') & (news_train_df.time<='2009-12-31') ]
# mkt = market_train_df.loc[(market_train_df.time>='2010-01-01') & (market_train_df.time<='2012-12-31')]
# nws = news_train_df.loc[(news_train_df.time>='2010-01-01') & (news_train_df.time<='2012-12-31') ]
# mkt = market_train_df.loc[(market_train_df.time>='2013-01-01') & (market_train_df.time<='2016-12-31')]
# nws = news_train_df.loc[(news_train_df.time>='2013-01-01') & (news_train_df.time<='2016-12-31') ]

#print(mkt.shape, nws.shape)

# 77 s
start = time()
mkt, nws = pre_processing(market_train_df, news_train_df)
# print(time()-start, 'seconds')
del market_train_df, news_train_df
## Break down the set of assets 10s
# start = time()
nws = expand_assets(nws)
# print('Exploding Assets',time()-start, 'seconds', nws.shape)

## FEATURE SELECTION
# 18 s
# start = time()
nws = nws.groupby(['time','assetCode']).agg(f)
#print('Aggregating functions',time()-start, 'seconds', nws.shape)
# Correct the labels
col_name = ['_'.join(title) if isinstance(title, tuple) else title for title in nws.columns ]
assert len(col_name) == len(nws.columns)
nws.columns = col_name

# print(time()-start, 'seconds')

# start = time()


#print('Feature Selection',time()-start)

# 15min 2007 -> 2010, 35 min 2011-> ..., 
# Regroup the noveltyCount variables into max, min, median and std. 
#start = time()
current_col = [col for col in filter(lambda x: x.startswith('noveltyCount'), nws.columns)]
# nws['novelty_max'] = nws[current_col].apply(max, axis=1)
nws['novelty_median'] = nws[current_col].apply(np.median, axis=1)
nws['novelty_std'] = nws[current_col].apply(np.std, axis=1)
nws.drop(current_col, axis=1, inplace=True)


# current_col = [col for col in filter(lambda x: x.startswith('volumeCounts'), nws.columns)]
# nws['volumeCounts_max'] = nws[current_col].apply(max, axis=1)
# nws['volumeCounts_min'] = nws[current_col].apply(min, axis=1)
# nws.drop(current_col, axis=1, inplace=True)
#print('Feature Selection',time()-start)



# MERGING
# 16 s
# start = time()
# Join market and news info by time and asset code
#     Merge with market data
data = pd.merge(mkt, nws,  how='outer', left_on=['time','assetCode'], right_on = ['time','assetCode'])
print('Data size prior to clearing all the Nans', data.size)

# 2s
## FEATURE ENGINEERING 
# data['dow'] = data.time.dt.dayofweek
# data['mnth'] = data.time.dt.month   # Not a good feature


## Instance selection: 
# assert data.loc[(data.target.isnull()) & (data.sentimentClass_median.isnull()),:].shape[0] == 0
# Set all Nans to 0


data = data.loc[(~data.target.isnull()) & (~data.firstMentionSentence_median.isnull())].fillna(0)
data.drop(['time','assetCode'], inplace=True, axis=1)


# Reset Index
data.reset_index(inplace=True)
data.drop('index', axis=1, inplace=True)

## RESULTS
true_positives, true_negatives = data.target.value_counts()/data.shape[0]
print('Preprocessing Completed:',time()-start, 'seconds', data.shape, '\nDistribution: \n', true_positives, true_negatives)


del nws, mkt
    
data.head()

Data size prior to clearing all the Nans 127113762
Preprocessing Completed: 1009.3060801029205 seconds (1170863, 17) 
Distribution: 
 0.5065460263070914 0.49345397369290855


Unnamed: 0,volume,close,returnsOpenPrevMktres1,returnsClosePrevMktres10,returnsOpenPrevMktres10,target,universe,firstMentionSentence_median,firstMentionSentence_std,sentimentNeutral_mean,sentimentNeutral_std,relevance_median,companyCount_median,sentimentNegative_std,sentimentWordCount_median,novelty_median,novelty_std
0,2606900.0,32.19,0.0,0.0,0.0,True,1.0,9.0,11.313708,0.568264,0.480041,0.533333,6.5,0.093375,418.5,0.0,0.0
1,2051600.0,11.12,0.0,0.0,0.0,True,0.0,1.0,0.288675,0.412758,0.284623,1.0,2.0,0.220601,26.0,22.0,10.0
2,1208600.0,18.02,0.0,0.0,0.0,False,1.0,6.0,0.0,0.861472,0.0,0.447214,2.0,0.0,60.0,0.0,0.0
3,401800.0,52.46,0.0,0.0,0.0,True,1.0,0.0,0.0,0.260806,0.140365,1.0,2.0,0.102406,16.0,0.0,0.0
4,1636100.0,24.52,0.0,0.0,0.0,True,1.0,1.0,8.01041,0.323777,0.124412,1.0,1.5,0.305538,61.0,9.0,0.0


Dimensionality reduction

In [4]:
# My_PCA
# feats1_2007 = ['returnsClosePrevRaw10', 'dow', 'sentimentWordCount_median', 'returnsClosePrevMktres1', 'companyCount_mean', 'sentimentWordCount_std', 'universe', 'sentimentNegative_std', 'volumeCounts_max', 'firstMentionSentence_median', 'volumeCounts_min', 'sentimentNeutral_std', 'novelty_median', 'sentimentPositive_std', 'novelty_max', 'sentimentNegative_mean']
# feats2_2011 = ['returnsOpenPrevMktres10', 'sentenceCount_std', 'sentimentWordCount_std', 'returnsOpenPrevMktres1', 'novelty_max', 'volume', 'universe', 'companyCount_median', 'returnsClosePrevRaw1', 'sentimentNeutral_mean', 'sentimentPositive_mean', 'close', 'firstMentionSentence_median', 'relevance_std', 'relevance_median', 'novelty_std', 'sentimentClass_median', 'wordCount_median', 'urgency_mean', 'sentimentPositive_std', 'returnsClosePrevMktres1']
# featsRF_2007 = ['wordCount_std', 'companyCount_mean', 'sentenceCount_std','companyCount_median', 'novelty_median', 'bodySize_median','bodySize_std', 'wordCount_median', 'sentenceCount_median','assetCode', 'open', 'relevance_std', 'sentimentNegative_mean','sentimentWordCount_median', 'sentimentNeutral_mean','sentimentPositive_mean', 'volumeCounts_std', 'relevance_median','volume', 'volumeCounts_max', 'volumeCounts_median','firstMentionSentence_median', 'novelty_std', 'urgency_mean','returnsOpenPrevRaw10']
# featsRF_2011 = ['returnsOpenPrevMktres10', 'novelty_max', 'relevance_std','novelty_min', 'num_sub_mean', 'num_audi_mean', 'mnth', 'novelty_std','firstMentionSentence_median', 'companyCount_median','returnsClosePrevMktres10', 'open', 'firstMentionSentence_std','returnsClosePrevMktres1']

# LDA
features = data.columns.difference(['target'])
X_train, X_test, y_train, y_test = split_dataset(features, data)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)





(878147, 16) (292716, 16) (878147, 1) (292716, 1)


Dimensionality Reduction

In [None]:
# RAndom forest importances
start = time()
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
importances = forest_clf.feature_importances_
indices = np.argsort(importances)[::-1]
# for f in range(X_train.shape[1]):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
fig = plt.figure(figsize=(15, 7))
# Plot the feature importances of the forest
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="r", align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()

In [None]:
ind = list(indices).index(28)
print(np.c_[features[indices[:ind]],importances[:ind]])


In [None]:
# Get Random forest importance feature set performance results 
forestFeats = features[indices[:ind]]
clf = 'ForesFeat_Q2'
X_train, X_test, y_train, y_test = split_dataset(forestFeats, data)
res = get_baseline(pd.DataFrame(), X_train, y_train, X_test, y_test, clf )
res

# Get Baseline 
start = time()
res = get_baseline(res, X_train, y_train, X_test, y_test, 'Baseline')
print('Baseline',time()-start)

# Get Feature importances from all features
start = time() # 827 s - 1335 s - 
res = prep_res(res)
for f in features: 

    nwf = features.difference([f])
    X_train, X_test, y_train, y_test = split_dataset(nwf, data)
    res_f = get_baseline(pd.DataFrame(), X_train, y_train, X_test, y_test, '_'+f )
    res_f = prep_res(res_f)
    res_f.drop(['metric','clf'], axis=1, inplace=True)
    res_f.rename(columns={'all':f}, inplace=True)
    res = pd.merge(res, res_f, on='mix', how='outer')
    res[f+'_X'] = (res['all']-res[f]) / res['all']


print('FeatureImportances:',time()-start)

res.set_index('mix', inplace=True)
res.drop(['clf','metric'], axis=1, inplace=True)
my_res = res.loc[:,res.columns.str.endswith('_X')].T

n_cols = ['LogReg Acc', 'LogReg Precision', 'LogReg Recall', 'LogReg F1',
       'LogReg MSE', 'LogReg AUC', 'SGD Acc', 'SGD Precision',
       'SGD Recall', 'SGD F1', 'SGD MSE', 'SGD AUC', 'DecTree Acc',
       'DecTree Precision', 'DecTree Recall', 'DecTree F1', 'DecTree MSE',
       'DecTree AUC', 'XTrees Acc', 'XTrees Precision', 'XTrees Recall',
       'XTrees F1', 'XTrees MSE', 'XTrees AUC', 'RandFor Acc',
       'RandFor Precision', 'RandFor Recall', 'RandFor F1', 'RandFor MSE',
       'RandFor AUC']

my_res = res.loc[:,res.columns.str.endswith('_X')].T

my_res

*save my_res*

In [None]:
best_feats = pd.DataFrame()
mt = ['SGD','DecTree', 'XTrees', 'RandFor','LogReg']
quarter = 'Q3'
for m in mt:

    topFeats = my_res.loc[(my_res[m +' Acc']>0)&(my_res[m+' F1']>0)&(my_res[m+' AUC']>0),my_res.columns.str.startswith(m)].sort_values([m+' Acc',m+' AUC',m+' F1'], ascending=False).index.values
    
    tf = pd.DataFrame(topFeats, columns = [m])
    tf.reset_index(inplace=True)
    tf.set_index(m, inplace=True)
    tf.index.name = 'features'
    tf.rename(columns={'index':m+'_'+quarter}, inplace=True)
    
    best_feats = pd.merge(best_feats, tf, right_index=True, left_index=True, how='outer')
    

# Get rid of the _X
best_feats.reset_index(inplace=True)
best_feats['features'] = best_feats.features.apply(lambda x: x[:-2])
best_feats.set_index('features', inplace=True)

bClfs = ['SGD_Q3','DecTree_Q3','XTrees_Q3','RandFor_Q3','LogReg_Q3']

In [None]:
#Let's test the features found by XTrees_Q1
clf = bClfs[4]
new_feats = best_feats.loc[~best_feats[clf].isnull(),[clf]].index.values
X_train, X_test, y_train, y_test = split_dataset(new_feats, data)
res = get_baseline(pd.DataFrame(), X_train, y_train, X_test, y_test, clf )
res

*register res*

## Get Baseline

In [6]:
res = get_baseline(pd.DataFrame(), X_train, y_train, X_test, y_test, 'Baseline')


In [7]:
res

Unnamed: 0,data_size,ETA,Acc,Precision,Recall,F1,MSE,AUC,Params
LogReg,"(878147, 16)",5.783929,0.52579,0.518775,0.704626,0.597584,0.47421,0.525898,Baseline
SGD,"(878147, 16)",1.169805,0.514646,0.510514,0.696785,0.58928,0.485354,0.514756,Baseline
DecTree,"(878147, 16)",3.064497,0.514646,0.510514,0.696785,0.58928,0.485354,0.514756,Baseline
XTrees,"(878147, 16)",31.513141,0.508042,0.509634,0.409588,0.454166,0.491958,0.507982,Baseline
RandFor,"(878147, 16)",76.81016,0.510245,0.51238,0.411714,0.456564,0.489755,0.510186,Baseline


## GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# SGD
my_clf = SGDClassifier(random_state=42)
param_grid = [{'loss':['hinge','log','modified_huber',], 'penalty' : ['l1','elasticnet'], 'l1_ratio':[0,0.25,0.5,0.75,1],'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1]}]
# 2007
#alpha=0.01, l1_ratio=0.25, loss='log',penalty='elasticnet'

clf_name = 'SGD'
start = time()
grid_search = GridSearchCV(my_clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
clf_time = time()-start
y_pred = grid_search.predict(X_test)
# Same thing: y_pred2 = grid_search.best_estimator_.predict(X_test)
# Results
res = get_res(res, clf_time, clf_name, X_train, y_test, y_pred, str(grid_search.best_params_))  #grid_search.best_params_
print(res.loc[clf_name].Params)

# DecTrees
my_clf = DecisionTreeClassifier(random_state=42)
param_grid = {'max_leaf_nodes': list(range(91, 99)),'min_samples_split': [1.0, 2], 'min_weight_fraction_leaf':[0,0.01], 'min_samples_leaf':[7,9,11]}
# 2008
#'max_leaf_nodes': 94, 'min_samples_leaf': 7, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0
#2009
#{'max_leaf_nodes': 93, 'min_samples_leaf': 7, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.01}
#param_grid = {'max_leaf_nodes': [91,93], 'min_samples_leaf': [5,7], 'min_samples_split': [2,3], 'min_weight_fraction_leaf': [0.01, 0.1]}
clf_name = 'DT'
start = time()
grid_search = GridSearchCV(my_clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
clf_time = time()-start
y_pred = grid_search.predict(X_test)
# Same thing: y_pred2 = grid_search.best_estimator_.predict(X_test)
# Results
res = get_res(res, clf_time, clf_name, X_train, y_test, y_pred, str(grid_search.best_params_))  #grid_search.best_params_

my_clf = LogisticRegression(random_state=42)
param_grid = [{'C': [0.001,0.01, 0.1, 1, 2], 'penalty':['l1','l2']}]
# 2007
#C=0.01, penalty='l1'
clf_name = 'LR'
start = time()
grid_search = GridSearchCV(my_clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
clf_time = time()-start
y_pred = grid_search.predict(X_test)
# Same thing: y_pred2 = grid_search.best_estimator_.predict(X_test)
# Results
res = get_res(res, clf_time, clf_name, X_train, y_test, y_pred, str(grid_search.best_params_))  #grid_search.best_params_

my_clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators':list(range(43,45,47)),'min_samples_split':[11],'max_leaf_nodes':[93,95,97],'min_samples_leaf':[1,2]}
# 2007
#max_leaf_nodes=91, min_samples_leaf=1,min_samples_split=11,n_estimators=41
# 2008
#max_leaf_nodes=91, min_samples_leaf=1,min_samples_split=11, n_estimators=49
# 2009
# max_leaf_nodes= 95, min_samples_leaf= 1,min_samples_split= 11, n_estimators= 49 & 45
clf_name = 'RF'
start = time()
grid_search = GridSearchCV(my_clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
clf_time = time()-start
y_pred = grid_search.predict(X_test)
# Same thing: y_pred2 = grid_search.best_estimator_.predict(X_test)
# Results
res = get_res(res, clf_time, clf_name, X_train, y_test, y_pred, str(grid_search.best_params_))  #grid_search.best_params_

my_clf = ExtraTreesClassifier(random_state=42)
param_grid = {'n_estimators':[47,49],'min_samples_split':[9,11],'max_leaf_nodes':[97,99],'min_samples_leaf':[2,3]}

# 2008
#'max_leaf_nodes': 95, 'min_samples_leaf': 3, 'min_samples_split': 11, 'n_estimators': 49
# 2009
# max_leaf_nodes=99, min_samples_leaf=3, min_samples_split=9,n_estimators=49
clf_name = 'XT'
start = time()
grid_search = GridSearchCV(my_clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
clf_time = time()-start
y_pred = grid_search.predict(X_test)
# Same thing: y_pred2 = grid_search.best_estimator_.predict(X_test)
# Results
res = get_res(res, clf_time, clf_name, X_train, y_test, y_pred, str(grid_search.best_params_))  #grid_search.best_params_
res

In [None]:
print(res.loc[(res.index=='SGD')&(res.Params!='Baseline'),:].Params.values, 
res.loc['DT'].Params,
res.loc['LR'].Params,
res.loc['RF'].Params,
res.loc['XT'].Params, sep='\n')

In [None]:
# Top classifiers:
my_clf1 = RandomForestClassifier(random_state=42,max_leaf_nodes= 97,min_samples_leaf= 2,min_samples_split= 11, n_estimators=43)
my_clf2 = LogisticRegression(random_state=42)
my_clf3 = DecisionTreeClassifier(max_leaf_nodes= 98,min_samples_leaf= 11,min_samples_split= 2, min_weight_fraction_leaf=43)


---
ENSEMBLE

In [None]:
from sklearn.ensemble import VotingClassifier
# Random Forest
start = time()


voting_clf = VotingClassifier(
    estimators=[('XTree', my_xTree), ('rf1', my_RF1), ('rf2', my_RF2)],
    voting='soft')
voting_clf.fit(X_train, y_train)
clf_time = time()-start

In [None]:
for clf in (my_xTree,my_RF1, my_RF2, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
res = get_res(res, clf_time, 'SVote:XT,RF1,RF2', X_train, y_test, y_pred)

res

Bagging and Pasting

In [None]:
#
start = time()
from sklearn.ensemble import BaggingClassifier
n_estimators=500
bag_clf = BaggingClassifier(
    RandomForestClassifier(random_state=42, max_leaf_nodes= 95, min_samples_leaf= 1,min_samples_split= 11, n_estimators= 49),
    n_estimators=n_estimators, n_jobs=-1, random_state=40) # , oob_score=True, bootstrap=True
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
clf_time = time()-start
res = get_res(res, clf_time, 'PRf1', X_train, y_test, y_pred)
print(clf_time) #, bag_clf.oob_score_)
res

Boosting Stacking

In [None]:
# Default base estimator
#DecisionTreeClassifier(max_depth=1)

from sklearn.ensemble import AdaBoostClassifier
start = time()
ada_clf = AdaBoostClassifier(
    RandomForestClassifier(random_state=42), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
clf_time = time()-start
res = get_res(res, clf_time, 'aBRf-', X_train, y_test, y_pred)
print('Scores:',ada_clf.score(X_train, y_train), ada_clf.score(X_test, y_test),'...overfitting?')
res

Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=49)

start = time()
#gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, learning_rate=1.0, random_state=42)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

print(bst_n_estimators)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)
y_pred = gbrt_best.predict(X_test)
y_pred_b = y_pred>=0.5

res = get_res(res, time()-start, 'GBoostBest', X_train, y_test, y_pred_b)
res

In [None]:
res