# TDS Project: Running lazy predict on the 1yr, 3yr and 5yr comorbid data

## This is at the ICD A00 to Z99 10yr window binary resolution

#### Load packages

In [151]:
import os
import numpy as np 
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, recall_score, precision_score

from lazypredict.Supervised import LazyClassifier 

import xgboost as xgb

In [42]:
os.chdir("/rds/general/project/hda_students_data/live/Group9/General/Data")

#### Read in the CSV

In [43]:
data_10yr = pd.read_csv("hes_10yr_A00Z99_bin_FAMD.csv")

#### Preview the data

In [44]:
data_10yr.shape

(15356, 1292)

In [45]:
data_10yr.head()

Unnamed: 0.1,Unnamed: 0,eid,X21003.0.0,X31.0.0,casecont,triplet_id,A02,A04,A06,A08,...,Z93,Z94,Z95,Z96,Z97,Z98,Z99,X0,X1,X2
0,1,1000015,65,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,-0.33,-0.03,-0.06
1,2,1000027,66,1,0,57,0,0,0,0,...,0,0,0,0,0,0,0,-0.33,-0.05,-0.05
2,3,1000039,69,1,0,8,0,0,0,0,...,0,0,0,0,0,0,0,-0.34,-0.05,-0.07
3,4,1000040,50,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,-0.33,-0.05,-0.06
4,5,1000053,46,1,0,273,0,0,0,0,...,0,0,0,0,0,0,0,-0.32,-0.06,-0.05


## Remove unnecessary columns!

In [46]:
data_10yr = data_10yr.drop(["X21003.0.0", "X31.0.0", "triplet_id","Unnamed: 0"], axis = 1)

In [47]:
data_10yr = data_10yr.set_index("eid")

## 10 Year

#### Create features (X) and target (Y) 

In [48]:
X_10yr = data_10yr
Y_10yr = data_10yr["casecont"]

#### Train test split

In [49]:
X_train_10yr, X_test_10yr, Y_train_10yr, Y_test_10yr = train_test_split(X_10yr, Y_10yr, test_size = 0.25, random_state = 289)

#### Balance the training data 

In [50]:
X_train_10yr = X_train_10yr.append(X_train_10yr[X_train_10yr["casecont"] == 1])
Y_train_10yr = Y_train_10yr.append(Y_train_10yr[Y_train_10yr == 1])

X_train_10yr = X_train_10yr.drop(labels = "casecont", axis = 1)
X_test_10yr = X_test_10yr.drop(labels = "casecont", axis = 1)

## Lazy predict

#### 10 year time window

In [29]:
clf = LazyClassifier(verbose = 0, custom_metric = None)

In [30]:
models, predictions = clf.fit(X_train_10yr, X_test_10yr, Y_train_10yr, Y_test_10yr)

100%|██████████| 29/29 [38:24<00:00, 79.48s/it]


In [31]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.79,0.78,0.78,0.79,3.25
XGBClassifier,0.78,0.77,0.77,0.79,10.93
AdaBoostClassifier,0.77,0.76,0.76,0.77,12.99
RandomForestClassifier,0.79,0.76,0.76,0.78,19.53
BaggingClassifier,0.77,0.75,0.75,0.77,11.89
LogisticRegression,0.74,0.71,0.71,0.74,4.15
DecisionTreeClassifier,0.73,0.7,0.7,0.73,2.91
ExtraTreesClassifier,0.75,0.7,0.7,0.74,41.81
LinearSVC,0.71,0.68,0.68,0.71,66.6
SGDClassifier,0.7,0.67,0.67,0.7,8.86


## XGBoost

In [157]:
model = xgb.XGBClassifier()

In [158]:
model.fit(X_train_10yr, Y_train_10yr)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [159]:
model.score(X_test_10yr, Y_test_10yr)

0.7835373795259182

#### Adjust max_depth parameter, starting with 3 and incrementing by 1

In [133]:
params = {
    "max_depth" : [3, 4, 5, 6, 7, 8, 9, 10]
}

In [134]:
model = xgb.XGBClassifier()
grid = GridSearchCV(estimator = model, param_grid = params, cv = 10, n_jobs = 20, verbose = 3)

In [135]:
grid.fit(X_train_10yr, Y_train_10yr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   3 out of  10 | elapsed:  3.0min remaining:  7.0min
[Parallel(n_jobs=20)]: Done   7 out of  10 | elapsed:  3.4min remaining:  1.5min
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:  3.4min finished


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None

In [136]:
print(grid.best_estimator_)
print(grid.best_params_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
{'max_depth': 5}


In [137]:
score = grid.best_estimator_.score(X_train_10yr, Y_train_10yr)
print(f"Training score: {score}")

score = grid.best_estimator_.score(X_test_10yr, Y_test_10yr)
print(f"Test score: {score}")

Training score: 0.853588454254555
Test score: 0.7835373795259182


In [78]:
grid.cv_results_

{'mean_fit_time': array([224.04391742, 315.29138455, 388.48249807, 486.95465341,
        557.25365486, 609.65080128, 615.42437406, 562.41762214]),
 'std_fit_time': array([12.31904463,  8.53022038, 18.70713556, 11.36638558, 10.09275636,
         7.91731629,  7.03922732,  4.96658733]),
 'mean_score_time': array([1.29556036, 1.59916172, 1.71170607, 1.53338604, 1.502425  ,
        0.71853242, 0.66794562, 0.26432519]),
 'std_score_time': array([0.5062905 , 0.29363112, 0.24186868, 0.61186157, 0.47586766,
        0.09815936, 0.20012607, 0.05191629]),
 'param_max_depth': masked_array(data=[3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 3},
  {'max_depth': 4},
  {'max_depth': 5},
  {'max_depth': 6},
  {'max_depth': 7},
  {'max_depth': 8},
  {'max_depth': 9},
  {'max_depth': 10}],
 'split0_test_score': array([0.78746327, 0.79497225, 0.81488737, 0.81456089, 0.81945

#### Settle on a max_depth of 4, now time to run a gridsearchCV on the remaining important parameters

#### We will run a gridsearch on learning_rate, subsample, colsample_bytree and gamma!

In [140]:
params = {
    "learning_rate" : [0.1, 0.05, 0.01],
    "subsample" : [0.8, 0.9, 1],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    "gamma" : [0, 1, 5],
    "max_depth" : [4]
}

In [141]:
model = xgb.XGBClassifier()
grid = GridSearchCV(estimator = model, param_grid = params, cv = 2, n_jobs = 20, verbose = 3)

In [142]:
grid.fit(X_train_10yr, Y_train_10yr)

Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  88 tasks      | elapsed: 20.9min
[Parallel(n_jobs=20)]: Done 248 tasks      | elapsed: 56.8min
[Parallel(n_jobs=20)]: Done 324 out of 324 | elapsed: 71.9min finished


GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [143]:
print(grid.best_estimator_)
print(grid.best_params_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)
{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'subsample': 0.8}


In [144]:
score = grid.best_estimator_.score(X_train_10yr, Y_train_10yr)
print(f"Training score: {score}")

score = grid.best_estimator_.score(X_test_10yr, Y_test_10yr)
print(f"Test score: {score}")

Training score: 0.804218637758767
Test score: 0.7772857514977859


#### Finally, let's get out all of the metrics for the predictive accuracy of the final model 

## Final metrics for baseline XGBoost model

In [160]:
preds = model.predict(X_test_10yr)
roc_auc = round(roc_auc_score(Y_test_10yr, preds), 2)
accuracy = round(accuracy_score(Y_test_10yr, preds), 2)
f1 = round(f1_score(Y_test_10yr, preds), 2)
recall = round(recall_score(Y_test_10yr, preds), 2)
precision = round(precision_score(Y_test_10yr, preds), 2)

print(confusion_matrix(Y_test_10yr, preds))
print(f"ROC/AUC score => {roc_auc}")
print(f"Accuracy score => {accuracy}")
print(f"F1 score => {f1}")
print(f"Recall score => {recall}")
print(f"precision score => {precision}")

[[2024  492]
 [ 339  984]]
ROC/AUC score => 0.77
Accuracy score => 0.78
F1 score => 0.7
Recall score => 0.74
precision score => 0.67


## Final metrics for tuned XGBoost model

In [161]:
preds = grid.best_estimator_.predict(X_test_10yr)
roc_auc = round(roc_auc_score(Y_test_10yr, preds), 2)
accuracy = round(accuracy_score(Y_test_10yr, preds), 2)
f1 = round(f1_score(Y_test_10yr, preds), 2)
recall = round(recall_score(Y_test_10yr, preds), 2)
precision = round(precision_score(Y_test_10yr, preds), 2)

print(confusion_matrix(Y_test_10yr, preds))
print(f"ROC/AUC score => {roc_auc}")
print(f"Accuracy score => {accuracy}")
print(f"F1 score => {f1}")
print(f"Recall score => {recall}")
print(f"precision score => {precision}")

[[1963  553]
 [ 302 1021]]
ROC/AUC score => 0.78
Accuracy score => 0.78
F1 score => 0.7
Recall score => 0.77
precision score => 0.65


## Increase tree max depth to 6 (Best performance all things considered...)

In [201]:
model = xgb.XGBClassifier(colsample_bytree = 0.6, gamma = 0, learning_rate = 0.1, max_depth = 6, subsample = 0.8)

In [202]:
model.fit(X_train_10yr, Y_train_10yr)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [203]:
preds = model.predict(X_test_10yr)
roc_auc = round(roc_auc_score(Y_test_10yr, preds), 2)
accuracy = round(accuracy_score(Y_test_10yr, preds), 2)
f1 = round(f1_score(Y_test_10yr, preds), 2)
recall = round(recall_score(Y_test_10yr, preds), 2)
precision = round(precision_score(Y_test_10yr, preds), 2)

print(confusion_matrix(Y_test_10yr, preds))
print(f"ROC/AUC score => {roc_auc}")
print(f"Accuracy score => {accuracy}")
print(f"F1 score => {f1}")
print(f"Recall score => {recall}")
print(f"precision score => {precision}")

[[2004  512]
 [ 311 1012]]
ROC/AUC score => 0.78
Accuracy score => 0.79
F1 score => 0.71
Recall score => 0.76
precision score => 0.66
