In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import joblib

Import Data, split into train and test, scale features

In [2]:
hb_train_data = pd.read_csv('data/train.csv')
hb_test_data = pd.read_csv('data/test.csv')

hb_train_data_labels = hb_train_data['label']
hb_train_data = hb_train_data.drop('label', axis=1)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(hb_train_data, hb_train_data_labels, test_size=0.2, random_state=42)

scaler = sklearn.preprocessing.StandardScaler().fit(X_train)

# Scale Data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

hb_train_data

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27
0,0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.000000,1.374992,...,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,0.907542,0.329147,0.359412,1.497970,-0.313010,1.095531,-0.557525,-1.588230,2.173076,0.812581,...,-1.138930,-0.000819,0.000000,0.302220,0.833048,0.985700,0.978098,0.779732,0.992356,0.798343
2,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.000000,0.851737,...,1.128848,0.900461,0.000000,0.909753,1.108330,0.985692,0.951331,0.803252,0.865924,0.780118
3,1.344385,-0.876626,0.935913,1.992050,0.882454,1.786066,-1.646778,-0.942383,0.000000,2.423265,...,-0.678379,-1.360356,0.000000,0.946652,1.028704,0.998656,0.728281,0.869200,1.026736,0.957904
4,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.000000,0.800872,...,-0.373566,0.113041,0.000000,0.755856,1.361057,0.986610,0.838085,1.133295,0.872245,0.808487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.620218,-0.823058,0.157442,1.442791,0.767518,2.327923,0.344575,-0.072010,0.000000,1.762271,...,-2.329035,1.499211,0.000000,1.733151,1.019404,0.991311,1.722911,0.900219,1.075457,1.077989
49996,0.686833,-1.711317,-1.417914,1.311205,-0.298519,0.517214,-1.012040,-1.190188,0.000000,0.570343,...,-1.039824,0.303930,3.101961,0.715210,0.735663,1.112954,0.706047,0.276972,0.600938,0.591874
49997,0.728559,-1.834037,-1.397939,0.344134,-0.390320,0.951158,-0.165394,-1.400851,0.000000,1.260420,...,2.004978,0.829432,0.000000,0.636411,0.983770,0.996603,0.929728,0.862690,0.934698,0.954439
49998,1.477981,-0.818188,-1.546087,0.835522,-1.267488,0.785623,0.013838,0.079991,2.173076,0.677361,...,-0.616750,0.572508,0.000000,0.526324,0.893976,0.977501,0.814417,0.756959,0.958430,0.842852


In [3]:
def computeF1(precision, recall):
    return (2 * precision * recall) / (precision + recall)

### Decision Tree

In [4]:
DT_parameters = {'max_depth':(None, 1, 3, 5, 10, 20, 30), 'min_samples_split':(2, 5, 10, 20, 50), 'min_samples_leaf': (1, 2, 5, 10, 15, 25)}
DT_cv = sklearn.tree.DecisionTreeClassifier(random_state = 42)
DT_grid = GridSearchCV(estimator = DT_cv, param_grid = DT_parameters, n_jobs = -1)
DT_grid.fit(X_train, y_train)
DT_grid.best_estimator_.get_params()
predictions = DT_grid.best_estimator_.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix

tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Params": DT_grid.best_estimator_.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : DT_grid.best_estimator_.score(X_train, y_train), 
                  "Test Accuracy" : DT_grid.best_estimator_.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}
model_data

{'Params': {'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': 10,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 25,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'monotonic_cst': None,
  'random_state': 42,
  'splitter': 'best'},
 'True Positive': np.int64(3601),
 'False Positive': np.int64(1565),
 'False Negative': np.int64(1681),
 'True Negative': np.int64(3153),
 'Training Accuracy': 0.73405,
 'Test Accuracy': 0.6754,
 'Accuracy': np.float64(0.6754),
 'Precision': np.float64(0.6970576848625629),
 'Recall': np.float64(0.6817493373722074),
 'F1 Score': np.float64(0.6893185298621746),
 'ROC/AUC Score': np.float64(0.6750204931880113)}

### Ada Boosted Decision Tree

In [5]:
dt_ada_params = {'n_estimators':[10, 25, 50]}
dt_ada = AdaBoostClassifier(estimator = DT_grid.best_estimator_, random_state=42)
dt_ada_grid = GridSearchCV(dt_ada, dt_ada_params)
dt_ada_grid.fit(X_train, y_train)
y_pred = dt_ada_grid.best_estimator_.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
cf_matrix

tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": dt_ada_grid.best_estimator_.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : dt_ada_grid.best_estimator_.score(X_train, y_train), 
                  "Test Accuracy" : dt_ada_grid.best_estimator_.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)),
                 "ROC/AUC Score": roc_auc_score(y_test, y_pred)}

model_data

{'Parameters': {'algorithm': 'deprecated',
  'estimator__ccp_alpha': 0.0,
  'estimator__class_weight': None,
  'estimator__criterion': 'gini',
  'estimator__max_depth': 10,
  'estimator__max_features': None,
  'estimator__max_leaf_nodes': None,
  'estimator__min_impurity_decrease': 0.0,
  'estimator__min_samples_leaf': 25,
  'estimator__min_samples_split': 2,
  'estimator__min_weight_fraction_leaf': 0.0,
  'estimator__monotonic_cst': None,
  'estimator__random_state': 42,
  'estimator__splitter': 'best',
  'estimator': DecisionTreeClassifier(max_depth=10, min_samples_leaf=25, random_state=42),
  'learning_rate': 1.0,
  'n_estimators': 10,
  'random_state': 42},
 'True Positive': np.int64(3682),
 'False Positive': np.int64(1609),
 'False Negative': np.int64(1600),
 'True Negative': np.int64(3109),
 'Training Accuracy': 0.801725,
 'Test Accuracy': 0.6791,
 'Accuracy': np.float64(0.6791),
 'Precision': np.float64(0.6958986958986959),
 'Recall': np.float64(0.6970844377129876),
 'F1 Score':

### Random Forest

In [6]:
N_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {N_CORES}")
RF_Classifier = sklearn.ensemble.RandomForestClassifier(max_depth = 10, 
                                                        min_samples_split = 10, 
                                                        n_estimators = 100, 
                                                        min_samples_leaf = 25, 
                                                        max_features = None,
                                                        n_jobs = -1,
                                                        random_state = 42)

RF_Classifier.fit(X_train, y_train)
predictions = RF_Classifier.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix

tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": RF_Classifier.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : RF_Classifier.score(X_train, y_train), 
                  "Test Accuracy" : RF_Classifier.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}
model_data

Number of physical cores: 10


{'Parameters': {'bootstrap': True,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': 10,
  'max_features': None,
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 25,
  'min_samples_split': 10,
  'min_weight_fraction_leaf': 0.0,
  'monotonic_cst': None,
  'n_estimators': 100,
  'n_jobs': -1,
  'oob_score': False,
  'random_state': 42,
  'verbose': 0,
  'warm_start': False},
 'True Positive': np.int64(3834),
 'False Positive': np.int64(1449),
 'False Negative': np.int64(1448),
 'True Negative': np.int64(3269),
 'Training Accuracy': 0.767575,
 'Test Accuracy': 0.7103,
 'Accuracy': np.float64(0.7103),
 'Precision': np.float64(0.7257240204429302),
 'Recall': np.float64(0.7258614161302537),
 'F1 Score': np.float64(0.725792711784193),
 'ROC/AUC Score': np.float64(0.7093698772045928)}

### Random Forest with Boosting (Hist Boosting Classifier)

In [7]:
RF_hist_boost = HistGradientBoostingClassifier(max_depth = 10, min_samples_leaf = 25, random_state = 42).fit(X_train, y_train)
RF_hist_boost.score(X_test, y_test)
predictions = RF_hist_boost.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix
tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": RF_hist_boost.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : RF_hist_boost.score(X_train, y_train), 
                  "Test Accuracy" : RF_hist_boost.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}


test_pred_probs = RF_hist_boost.predict_proba(hb_test_data)[:, 1]
test_pred = pd.Series(test_pred_probs, index = np.arange(0, len(test_pred_probs)))
frame = pd.DataFrame({'Id': test_pred.index, 'Predicted': test_pred.values})
np.savetxt(fname='stest.csv', X=frame, header='Id,Predicted', delimiter=',', comments='')
test_pred_probs

model_data

{'Parameters': {'categorical_features': 'from_dtype',
  'class_weight': None,
  'early_stopping': 'auto',
  'interaction_cst': None,
  'l2_regularization': 0.0,
  'learning_rate': 0.1,
  'loss': 'log_loss',
  'max_bins': 255,
  'max_depth': 10,
  'max_features': 1.0,
  'max_iter': 100,
  'max_leaf_nodes': 31,
  'min_samples_leaf': 25,
  'monotonic_cst': None,
  'n_iter_no_change': 10,
  'random_state': 42,
  'scoring': 'loss',
  'tol': 1e-07,
  'validation_fraction': 0.1,
  'verbose': 0,
  'warm_start': False},
 'True Positive': np.int64(3877),
 'False Positive': np.int64(1425),
 'False Negative': np.int64(1405),
 'True Negative': np.int64(3293),
 'Training Accuracy': 0.76665,
 'Test Accuracy': 0.717,
 'Accuracy': np.float64(0.717),
 'Precision': np.float64(0.7312334967936628),
 'Recall': np.float64(0.7340022718667172),
 'F1 Score': np.float64(0.7326152683295539),
 'ROC/AUC Score': np.float64(0.7159837556874917)}

### Gradient Boosting Classifier

In [8]:
RF_grad_boost = GradientBoostingClassifier(max_depth = 10, min_samples_leaf = 25, min_samples_split = 10, random_state = 42).fit(X_train, y_train)
RF_grad_boost.score(X_test, y_test)
predictions = RF_grad_boost.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix
tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": RF_grad_boost.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : RF_grad_boost.score(X_train, y_train), 
                  "Test Accuracy" : RF_grad_boost.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}
model_data

{'Parameters': {'ccp_alpha': 0.0,
  'criterion': 'friedman_mse',
  'init': None,
  'learning_rate': 0.1,
  'loss': 'log_loss',
  'max_depth': 10,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 25,
  'min_samples_split': 10,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 100,
  'n_iter_no_change': None,
  'random_state': 42,
  'subsample': 1.0,
  'tol': 0.0001,
  'validation_fraction': 0.1,
  'verbose': 0,
  'warm_start': False},
 'True Positive': np.int64(3924),
 'False Positive': np.int64(1456),
 'False Negative': np.int64(1358),
 'True Negative': np.int64(3262),
 'Training Accuracy': 0.937575,
 'Test Accuracy': 0.7186,
 'Accuracy': np.float64(0.7186),
 'Precision': np.float64(0.729368029739777),
 'Recall': np.float64(0.7429004165088982),
 'F1 Score': np.float64(0.7360720315137873),
 'ROC/AUC Score': np.float64(0.7171475376313037)}

### Logistic Regression

In [9]:
logi_reg_clf = LogisticRegression(random_state=42, n_jobs = -1).fit(X_train_scaled, y_train)

logi_reg_clf.score(X_test_scaled, y_test)
predictions = logi_reg_clf.predict(X_test_scaled)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix
tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": logi_reg_clf.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : logi_reg_clf.score(X_train_scaled, y_train), 
                  "Test Accuracy" : logi_reg_clf.score(X_test_scaled, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}

model_data


{'Parameters': {'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 100,
  'multi_class': 'deprecated',
  'n_jobs': -1,
  'penalty': 'l2',
  'random_state': 42,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'True Positive': np.int64(3889),
 'False Positive': np.int64(2270),
 'False Negative': np.int64(1393),
 'True Negative': np.int64(2448),
 'Training Accuracy': 0.641725,
 'Test Accuracy': 0.6337,
 'Accuracy': np.float64(0.6337),
 'Precision': np.float64(0.6314336742977756),
 'Recall': np.float64(0.7362741385838697),
 'F1 Score': np.float64(0.6798356786994144),
 'ROC/AUC Score': np.float64(0.6275690319879924)}

### Voting Classifier

In [10]:
VC_no_logi = VotingClassifier(estimators = [
        ('RF_grad_boost', RF_grad_boost), 
        ('RF_hist_boost', RF_hist_boost), 
        ('dt_ada', dt_ada_grid.best_estimator_), 
        ('RF', RF_Classifier)
    ], n_jobs = -1, voting='soft')
VC_no_logi.fit(X_train, y_train)
predictions = VC_no_logi.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix
tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": VC_no_logi.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : VC_no_logi.score(X_train, y_train), 
                  "Test Accuracy" : VC_no_logi.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}
model_data

#Exports to VC_no_logi.csv
test_pred_probs = VC_no_logi.predict_proba(hb_test_data)[:, 1]
test_pred = pd.Series(test_pred_probs, index = np.arange(0, len(test_pred_probs)))
frame = pd.DataFrame({'Id': test_pred.index, 'Predicted': test_pred.values})
np.savetxt(fname='VC_no_logi.csv', X=frame, header='Id,Predicted', delimiter=',', comments='')
#test_pred_probs

In [11]:
model_data

{'Parameters': {'estimators': [('RF_grad_boost',
    GradientBoostingClassifier(max_depth=10, min_samples_leaf=25,
                               min_samples_split=10, random_state=42)),
   ('RF_hist_boost',
    HistGradientBoostingClassifier(max_depth=10, min_samples_leaf=25,
                                   random_state=42)),
   ('dt_ada',
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=10,
                                                        min_samples_leaf=25,
                                                        random_state=42),
                       n_estimators=10, random_state=42)),
   ('RF',
    RandomForestClassifier(max_depth=10, max_features=None, min_samples_leaf=25,
                           min_samples_split=10, n_jobs=-1, random_state=42))],
  'flatten_transform': True,
  'n_jobs': -1,
  'verbose': False,
  'voting': 'soft',
  'weights': None,
  'RF_grad_boost': GradientBoostingClassifier(max_depth=10, min_samples_leaf=25,
                  

### Stacking

In [12]:
SC = StackingClassifier(estimators = [
        ('RF_grad_boost', RF_grad_boost), 
        ('RF_hist_boost', RF_hist_boost), 
        ('dt_ada', dt_ada_grid.best_estimator_), 
        ('RF', RF_Classifier)
    ], n_jobs = -1)
SC.fit(X_train, y_train)
predictions = SC.predict(X_test)
cf_matrix = sklearn.metrics.confusion_matrix(y_test, predictions)
cf_matrix
tn, fp, fn, tp = cf_matrix.ravel()
model_data = {"Parameters": SC.get_params(), "True Positive": tp, "False Positive": fp, "False Negative": fn, "True Negative": tn,
                 "Training Accuracy" : SC.score(X_train, y_train), 
                  "Test Accuracy" : SC.score(X_test, y_test), 
                 "Accuracy" : (tp + tn) / (tp + tn + fn + fp), "Precision" : (tp) / (tp + fp), "Recall" : (tp) / (tp + fn), 
                  "F1 Score" : computeF1((tp) / (tp + fp), (tp) / (tp + fn)), 
                 "ROC/AUC Score": roc_auc_score(y_test, predictions)}
model_data

#Exports to SC.csv
test_pred_probs = SC.predict_proba(hb_test_data)[:, 1]
test_pred = pd.Series(test_pred_probs, index = np.arange(0, len(test_pred_probs)))
frame = pd.DataFrame({'Id': test_pred.index, 'Predicted': test_pred.values})
np.savetxt(fname='SC.csv', X=frame, header='Id,Predicted', delimiter=',', comments='')
model_data

{'Parameters': {'cv': None,
  'estimators': [('RF_grad_boost',
    GradientBoostingClassifier(max_depth=10, min_samples_leaf=25,
                               min_samples_split=10, random_state=42)),
   ('RF_hist_boost',
    HistGradientBoostingClassifier(max_depth=10, min_samples_leaf=25,
                                   random_state=42)),
   ('dt_ada',
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=10,
                                                        min_samples_leaf=25,
                                                        random_state=42),
                       n_estimators=10, random_state=42)),
   ('RF',
    RandomForestClassifier(max_depth=10, max_features=None, min_samples_leaf=25,
                           min_samples_split=10, n_jobs=-1, random_state=42))],
  'final_estimator': None,
  'n_jobs': -1,
  'passthrough': False,
  'stack_method': 'auto',
  'verbose': 0,
  'RF_grad_boost': GradientBoostingClassifier(max_depth=10, min_samples_leaf=25,