In [33]:
import os
import glob
import pickle as pkl
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.metrics import r2_score, \
    explained_variance_score, normalized_mutual_info_score, mutual_info_score, make_scorer, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin

from skll.metrics import spearman, pearson

from xgboost import XGBRegressor, XGBClassifier


from hyperopt import Trials, fmin, tpe, hp, STATUS_OK

from mlxtend.preprocessing import shuffle_arrays_unison

from pylab import rcParams
rcParams['figure.figsize'] = 8, 8

# Import the Data
Load the feature matrix `X` and the target `Y` from the pickle file created in `preproccess.ipynb`.

In [4]:
# Unpickle the (X, Y) pair; the file name suggests outliers were already removed upstream.
# NOTE: pickle can execute arbitrary code while loading -- only open files you trust.
with open("../pkl/data/data_outliers_removed", "rb") as pickle_file:
    X, Y = pkl.load(pickle_file)

## Modelling with XGB

### Setup

In [13]:
# One shared 10-fold splitter: shuffling with a fixed seed means every model
# evaluated with it sees the same train/test partitions.
splitter = KFold(
    n_splits=10,
    shuffle=True,
    random_state=7,
)

In [14]:
# Scorers reported for every model, keyed by the column name used in the results table.
# Order matters: it becomes the column order of a freshly created results table.
# NOTE(review): mutual_info_score treats its inputs as discrete labels -- confirm that
# applying it to continuous predictions is intended.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('r2', r2_score),
        ('SRC', spearman),
        ('PCC', pearson),
        ('MI', mutual_info_score),
        ('MAE', mean_absolute_error),
    )
}

In [15]:
# Continue the running model-comparison table if one was saved earlier;
# otherwise start an empty table with one column per scorer.
results_path = '../reports/model_results.csv'
try:
    overall_results = pd.read_csv(results_path, index_col=0)
except FileNotFoundError:
    overall_results = pd.DataFrame(columns=scoring.keys())

### Regressor only

In [19]:
# Hyper-parameter grid for the regressor. Earlier exploratory grids, kept for the record:
#   1. max_depth in [7, 9, 11]
#   2. max_depth=9, learning_rate in np.logspace(-2, 0, 10)
#   3. max_depth=9, n_estimators=100, colsample_bytree in [1, 0.9, 0.7]
# The current grid only varies the number of trees.
param_grid = {
    'max_depth': [9],
    'n_estimators': [50, 100, 150],
    'colsample_bytree': [0.9],
}

# With several scorers, `refit` names the one that selects best_estimator_ / best_score_.
refit = 'r2'

search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=param_grid,
    scoring=('neg_mean_squared_error', 'r2', 'explained_variance'),
    refit=refit,
    cv=splitter,
)
search.fit(X, Y)



GridSearchCV(cv=KFold(n_splits=10, random_state=7, shuffle=True),
       error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [9], 'n_estimators': [50, 100, 150], 'colsample_bytree': [0.9]},
       pre_dispatch='2*n_jobs', refit='r2', return_train_score='warn',
       scoring=('neg_mean_squared_error', 'r2', 'explained_variance'),
       verbose=0)

In [20]:
search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [21]:
search.best_score_

0.8330994080091952

In [22]:
# Binary target for the classifier: 1 where the response is non-zero, 0 where it is zero.
Y_binary = (Y != 0).astype(int)

In [28]:
# Tree-depth search for the zero / non-zero classifier.
# Accuracy picks the winner; recall is tracked alongside it in cv_results_.
param_grid = {
    'max_depth': [7, 9, 11],
}

search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid,
    scoring=('accuracy', 'recall'),
    refit='accuracy',
    cv=splitter,
)
search.fit(X, Y_binary)

GridSearchCV(cv=KFold(n_splits=10, random_state=7, shuffle=True),
       error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [7, 9, 11]}, pre_dispatch='2*n_jobs',
       refit='accuracy', return_train_score='warn',
       scoring=('accuracy', 'recall'), verbose=0)

In [29]:
search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [30]:
search.cv_results_



{'mean_fit_time': array([0.15150874, 0.15080857, 0.15600901]),
 'std_fit_time': array([0.01032782, 0.01349   , 0.01606946]),
 'mean_score_time': array([0.00340011, 0.0032002 , 0.00350015]),
 'std_score_time': array([0.00048991, 0.00039999, 0.00050008]),
 'param_max_depth': masked_array(data=[7, 9, 11],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 7}, {'max_depth': 9}, {'max_depth': 11}],
 'split0_test_accuracy': array([0.97368421, 0.97368421, 0.97368421]),
 'split1_test_accuracy': array([1., 1., 1.]),
 'split2_test_accuracy': array([0.97368421, 0.97368421, 0.97368421]),
 'split3_test_accuracy': array([0.97297297, 0.97297297, 0.97297297]),
 'split4_test_accuracy': array([0.97297297, 0.97297297, 0.97297297]),
 'split5_test_accuracy': array([0.97297297, 0.97297297, 0.97297297]),
 'split6_test_accuracy': array([0.91891892, 0.91891892, 0.91891892]),
 'split7_test_accuracy': array([0.97297297, 0.97297297, 0.97297297])

In [26]:
search.best_score_

0.9705093833780161

In [35]:
class XGBCombined(BaseEstimator,RegressorMixin):
    """Two-stage estimator: an XGBoost regressor gated by an XGBoost classifier.

    ``fit`` trains a regressor on the raw target and a classifier on the
    indicator ``y != 0``. ``predict`` multiplies the regressor output by the
    classifier's predicted label, so rows classified as zero are predicted
    as exactly zero.

    Parameters
    ----------
    max_depth_reg : int or None, default=None
        ``max_depth`` forwarded to ``XGBRegressor``. ``None`` leaves the
        library default in place.
    max_depth_clas : int or None, default=None
        ``max_depth`` forwarded to ``XGBClassifier``. ``None`` leaves the
        library default in place.

    Attributes
    ----------
    reg : XGBRegressor
        Regressor fitted by ``fit``.
    clas : XGBClassifier
        Zero / non-zero classifier fitted by ``fit``.
    """

    def __init__(self, max_depth_reg=None, max_depth_clas=None):
        # Only store the arguments here so that get_params / clone, which
        # cross_validate and GridSearchCV rely on, can rebuild the estimator.
        self.max_depth_reg = max_depth_reg
        self.max_depth_clas = max_depth_clas

    @staticmethod
    def _depth_kwargs(max_depth):
        """Return constructor kwargs for ``max_depth``, omitting it when unset."""
        # Older xgboost releases declare an integer default for max_depth and are
        # not documented to accept None, so an unset depth is simply not passed.
        return {} if max_depth is None else {'max_depth': max_depth}

    def fit(self,X,y):
        """Fit the regressor on ``y`` and the classifier on ``y != 0``.

        Parameters
        ----------
        X : array-like
            Training features, passed unchanged to both underlying models.
        y : array-like
            Numeric training target.

        Returns
        -------
        self : XGBCombined
            The fitted estimator.
        """
        self.reg = XGBRegressor(**self._depth_kwargs(self.max_depth_reg))
        self.clas = XGBClassifier(**self._depth_kwargs(self.max_depth_clas))
        self.reg.fit(X,y)
        # np.asarray makes the element-wise comparison work for plain lists too;
        # on a list, `y != 0` would collapse to a single bool with no .astype.
        y_binary = (np.asarray(y) != 0).astype(int)
        self.clas.fit(X,y_binary)
        return self

    def predict(self, X):
        """Return the regressor prediction multiplied by the classifier's predicted label.

        Parameters
        ----------
        X : array-like
            Features to predict for.

        Returns
        -------
        pred : numpy.ndarray
            Element-wise product of the two models' predictions.
        """
        pred_reg = self.reg.predict(X)
        pred_clas = self.clas.predict(X)
        pred = np.multiply(pred_reg,pred_clas)
        return pred

In [42]:
# Cross-validate the combined model using the depths selected by the two grid
# searches (regressor: 9, classifier: 7), on the same folds and scorers as before.
combined_model = XGBCombined(max_depth_reg=9, max_depth_clas=7)
xgb_combined_results = cross_validate(
    combined_model,
    X,
    Y,
    cv=splitter,
    scoring=scoring,
)

In [43]:
# Store the fold-averaged test score of each metric (2 decimals) in the
# 'XGB Combined' row of the comparison table, then display the table.
for score_name in scoring:
    fold_scores = xgb_combined_results['test_' + score_name]
    overall_results.loc['XGB Combined', score_name] = np.round(np.mean(fold_scores), 2)
overall_results

Unnamed: 0,r2,SRC,PCC,MI,MAE
Dummy Mean,-0.02,0.0,-0.0,-0.0,1.94
Dummy Median All,-0.32,0.0,-0.0,-0.0,1.68
Dummy Median Nonzero,-0.08,0.0,-0.0,-0.0,1.77
"Perfect Clasif., Mean Regr.",0.13,0.73,0.41,0.53,1.53
Lasso,0.45,0.61,0.7,3.07,1.23
Bounded Lasso,0.55,0.64,0.75,2.87,1.08
Bounded Lasso + LogReg,0.64,0.8,0.82,2.66,0.86
FFNN,0.66,0.7,0.83,3.58,0.96
XGB Combined,0.84,0.9,0.92,3.04,0.4


In [44]:
# Persist the updated comparison table to the CSV that the setup cell reads from.
overall_results.to_csv(path_or_buf='../reports/model_results.csv')