### **Imports** ###

In [27]:
import pandas as pd

import xgboost as xgb


from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import precision_score,accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

### **Training and Test Sets** ###

In [28]:
training_set = pd.read_csv('../dataset/standardized_training.csv')
X_train = training_set.drop('general_two_year', axis=1)
y_train = training_set['general_two_year'].values

test_set = pd.read_csv('../dataset/standardized_testing.csv')
X_test = test_set.drop('general_two_year', axis=1)
y_test = test_set['general_two_year'].values


## **CART** 

#### **Hyperparameter Tuning**

In [29]:
#hyperparameter tuning 
cart = DecisionTreeClassifier(random_state=42)
cart_param_grid = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [50, 100, 300],
    'min_samples_leaf': [5, 50, 100],
    'max_leaf_nodes': [2, 3, 4, 5]
}

cart_grid_search = GridSearchCV(estimator=cart, param_grid=cart_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
cart_grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [30]:
#best cart:
best_cart = DecisionTreeClassifier(**cart_grid_search.best_params_)
best_cart.fit(X_train, y_train)
best_cart_pred = best_cart.predict(X_test)
best_cart_score = precision_score(y_test, best_cart_pred)
print("CART")
print("Best Hyperparameters:", cart_grid_search.best_params_)
print("accuracy:\t",accuracy_score(y_test, best_cart_pred))
print("roc auc:\t",roc_auc_score(y_test, best_cart_pred))
print("precision:\t", best_cart_score)



CART
Best Hyperparameters: {'max_depth': 2, 'max_leaf_nodes': 2, 'min_samples_leaf': 5, 'min_samples_split': 50}
accuracy:	 0.6061381074168798
roc auc:	 0.5666239590006406
precision:	 0.5853658536585366


## **EBM** ##

#### **Hyperparameter Tuning**

In [31]:
#hyperparameter tuning 
ebm = ExplainableBoostingClassifier(random_state=42, n_jobs=-1)
ebm_param_grid = {
    'learning_rate': [0.01,0.1],
    'max_bins': [128, 256],
    'max_interaction_bins': [16, 32],
    'interactions': [10,20],
    'min_samples_leaf': [2,10]
}

ebm_grid_search = GridSearchCV(estimator=ebm, param_grid=ebm_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
ebm_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", ebm_grid_search.best_params_)



Fitting 5 folds for each of 32 candidates, totalling 160 fits


Best Hyperparameters: {'interactions': 20, 'learning_rate': 0.01, 'max_bins': 128, 'max_interaction_bins': 32, 'min_samples_leaf': 2}


In [32]:
#best ebm:
best_ebm = ExplainableBoostingClassifier(**ebm_grid_search.best_params_)
best_ebm.fit(X_train, y_train)
best_ebm_pred = best_ebm.predict(X_test)
best_ebm_score = precision_score(y_test, best_ebm_pred)



print("EBM")
print("Best Hyperparameters:", ebm_grid_search.best_params_)
print("accuracy:\t",accuracy_score(y_test, best_ebm_pred))
print("roc auc:\t",roc_auc_score(y_test, best_ebm_pred))
print("precision:\t", best_ebm_score)



EBM
Best Hyperparameters: {'interactions': 20, 'learning_rate': 0.01, 'max_bins': 128, 'max_interaction_bins': 32, 'min_samples_leaf': 2}
accuracy:	 0.6547314578005116
roc auc:	 0.6429906043134743
precision:	 0.6064516129032258


## **Linear SVM** ##

#### **Hyperparameter Tuning**

In [33]:
#hyperparameter tuning 
lsvm = LinearSVC(random_state=42)
lsvm_param_grid = {
    'C': [0.1, 1.0, 10],
    'intercept_scaling': [0.1, 1, 10],
    'loss': ['hinge', 'squared_hinge'],
}


lsvm_grid_search = GridSearchCV(estimator=lsvm, param_grid=lsvm_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
lsvm_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", lsvm_grid_search.best_params_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'C': 0.1, 'intercept_scaling': 0.1, 'loss': 'hinge'}




In [34]:
#best lsvm:
best_lsvm = LinearSVC(**lsvm_grid_search.best_params_)
best_lsvm.fit(X_train, y_train)
best_lsvm_pred = best_lsvm.predict(X_test)
best_lsvm_score = precision_score(y_test, best_lsvm_pred)



print("LSVM")
print("Best Hyperparameters:", lsvm_grid_search.best_params_)
print("accuracy:\t",accuracy_score(y_test, best_lsvm_pred))
print("roc auc:\t",roc_auc_score(y_test, best_lsvm_pred))
print("precision:\t", best_lsvm_score)


LSVM
Best Hyperparameters: {'C': 0.1, 'intercept_scaling': 0.1, 'loss': 'hinge'}
accuracy:	 0.670076726342711
roc auc:	 0.6630498612000855
precision:	 0.6167664670658682




## **XGBoost** ##

#### **Hyperparameter Tuning**

In [35]:
#hyperparameter tuning 
xgboost = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_param_grid = {
    #'n_estimators': [100, 250, 500], 
    'n_estimators': [40, 50, 60], 
    'max_depth': [1, 2, 3],      
    'min_child_weight': [20, 30, 40],   
    'colsample_bytree': [0.4, 0.5, 0.6],
    'gamma': [0, 1, 2]       
}


xgb_grid_search = GridSearchCV(estimator=xgboost, param_grid=xgb_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
xgb_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", xgb_grid_search.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Hyperparameters: {'colsample_bytree': 0.4, 'gamma': 1, 'max_depth': 1, 'min_child_weight': 40, 'n_estimators': 40}


In [36]:
#best xgb:
best_xgb = xgb.XGBClassifier(**xgb_grid_search.best_params_)
best_xgb.fit(X_train, y_train)
best_xgb_pred = best_xgb.predict(X_test)
best_xgb_score = precision_score(y_test, best_xgb_pred)



print("XGBoost")
print("Best Hyperparameters:", xgb_grid_search.best_params_)
print("accuracy:\t",accuracy_score(y_test, best_xgb_pred))
print("roc auc:\t",roc_auc_score(y_test, best_xgb_pred))
print("precision:\t", best_xgb_score)


XGBoost
Best Hyperparameters: {'colsample_bytree': 0.4, 'gamma': 1, 'max_depth': 1, 'min_child_weight': 40, 'n_estimators': 40}
accuracy:	 0.6419437340153452
roc auc:	 0.6347159940209268
precision:	 0.5833333333333334


# **EBM has a good balance between performance and interpretability**