### **Imports** ###

In [9]:
import pandas as pd
import xgboost as xgb

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import tensorflow as tf
from keras import backend as K

from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt


### **Training and Test Sets** ###

In [10]:
training_set = pd.read_csv('../dataset/standardized_training.csv')
X_train = training_set.drop('general_two_year', axis=1)
y_train = training_set['general_two_year']

test_set = pd.read_csv('../dataset/standardized_testing.csv')
X_test = test_set.drop('general_two_year', axis=1)
y_test = test_set['general_two_year'].values


## **CART** 

In [11]:
#basic cart:
cart = DecisionTreeClassifier(random_state=42)
cart.fit(X_train, y_train)

cart_pred = cart.predict(X_test)
cart_score = f1_score(y_test, cart_pred)
print(cart_score)


0.5029585798816568


#### **Hyperparameter Tuning**

In [12]:
#hyperparameter tuning 

cart_param_grid = {
    'max_depth': [1, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [10, 20, 40, 50],
    'max_leaf_nodes': [2, 10, 15]
}

cart_grid_search = GridSearchCV(estimator=cart, param_grid=cart_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
cart_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", cart_grid_search.best_params_)


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Hyperparameters: {'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_leaf': 40, 'min_samples_split': 2}


In [50]:
#best cart:
best_cart = DecisionTreeClassifier(**cart_grid_search.best_params_)
best_cart.fit(X_train, y_train)
best_cart_pred = best_cart.predict(X_test)
best_cart_f1 = f1_score(y_test, best_cart_pred)
print("f1 score:", best_cart_f1)



f1 score: 0.5705705705705705


## **EBM** ##

In [39]:
#basic ebm:
ebm = ExplainableBoostingClassifier(random_state=42, n_jobs=-1)
ebm.fit(X_train, y_train)

ebm_pred = ebm.predict(X_test)
ebm_f1 = f1_score(y_test, ebm_pred)
print("f1 score:", ebm_f1)


f1 score: 0.5741324921135647


#### **Hyperparameter Tuning**

In [82]:
#hyperparameter tuning 

ebm_param_grid = {
    'learning_rate': [0.1],
    #'max_bins': [128, 256],
    #'max_interaction_bins': [16, 32],
    'interactions': [10,20],
    'min_samples_leaf': [2,10]
}

ebm_grid_search = GridSearchCV(estimator=ebm, param_grid=ebm_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
ebm_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", ebm_grid_search.best_params_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


Best Hyperparameters: {'interactions': 20, 'learning_rate': 0.1, 'min_samples_leaf': 2}


In [83]:
#best ebm:
best_ebm = ExplainableBoostingClassifier(**ebm_grid_search.best_params_)
best_ebm.fit(X_train, y_train)
best_ebm_pred = best_ebm.predict(X_test)
best_ebm_f1 = f1_score(y_test, best_ebm_pred)
print("f1 score:", best_ebm_f1)



f1 score: 0.5932721712538226


## **Linear SVM** ##

In [98]:
#basic linear svm:
lsvm = LinearSVC(random_state=42, max_iter=100000)
lsvm.fit(X_train, y_train)

y_pred = lsvm.predict(X_test)
lsvm_f1 = f1_score(y_test, y_pred)
print("f1 score:", lsvm_f1)




f1 score: 0.5975609756097562


#### **Hyperparameter Tuning**

In [88]:
#hyperparameter tuning 

lsvm_param_grid = {
    'C': [0.1, 1.0, 10],
    'intercept_scaling': [0.1, 1, 10],
    'loss': ['hinge', 'squared_hinge'],
}


lsvm_grid_search = GridSearchCV(estimator=lsvm, param_grid=lsvm_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
lsvm_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", lsvm_grid_search.best_params_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'C': 0.1, 'intercept_scaling': 0.1, 'loss': 'hinge'}




In [94]:
#best lsvm:
best_lsvm = LinearSVC(**lsvm_grid_search.best_params_, max_iter=10000)
best_lsvm.fit(X_train, y_train)
best_lsvm_pred = best_lsvm.predict(X_test)
best_lsvm_f1 = f1_score(y_test, best_lsvm_pred)
print("f1 score:", best_lsvm_f1)


f1 score: 0.6149253731343284




## **XGBoost** ##

In [100]:
#basic xgboost:
xgboost = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)
xgb_f1 = f1_score(y_test, y_pred)
print("f1 score:", xgb_f1)


f1 score: 0.5062500000000001


#### **Hyperparameter Tuning**

In [101]:
#hyperparameter tuning 

xgb_param_grid = {
    #'n_estimators': [100, 250, 500], 
    'n_estimators': [40, 50, 60], 
    'max_depth': [1, 2, 3],      
    'min_child_weight': [20, 30, 40],   
    'colsample_bytree': [0.4, 0.5, 0.6],
    'gamma': [0, 1, 2]       
}


xgb_grid_search = GridSearchCV(estimator=xgboost, param_grid=xgb_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
xgb_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", xgb_grid_search.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Hyperparameters: {'colsample_bytree': 0.4, 'gamma': 1, 'max_depth': 1, 'min_child_weight': 40, 'n_estimators': 40}


In [103]:
#best xgb:
best_xgb = xgb.XGBClassifier(**xgb_grid_search.best_params_)
best_xgb.fit(X_train, y_train)
best_xgb_pred = best_xgb.predict(X_test)
best_xgb_f1 = f1_score(y_test, best_xgb_pred)
print("f1 score:", best_xgb_f1)


f1 score: 0.5833333333333334


## **Neural Network** ##

In [117]:

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = K.cast(y_true, 'bool')
        y_pred = K.cast(K.round(y_pred), 'bool')

        # Cast boolean tensors to float
        true_positives = K.cast(K.sum(K.cast(y_true & y_pred, 'float32')), 'float32')
        false_positives = K.cast(K.sum(K.cast(~y_true & y_pred, 'float32')), 'float32')
        false_negatives = K.cast(K.sum(K.cast(y_true & ~y_pred, 'float32')), 'float32')

        self.true_positives.assign_add(true_positives)
        self.false_positives.assign_add(false_positives)
        self.false_negatives.assign_add(false_negatives)

    def result(self):
        precision = self.true_positives / (self.true_positives + self.false_positives + K.epsilon())
        recall = self.true_positives / (self.true_positives + self.false_negatives + K.epsilon())
        f1 = 2 * ((precision * recall) / (precision + recall + K.epsilon()))
        return f1

    def reset_state(self):
        self.true_positives.assign(0)
        self.false_positives.assign(0)
        self.false_negatives.assign(0)


In [118]:


neural_net = Sequential()
neural_net.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
neural_net.add(Dense(64, activation='relu'))
neural_net.add(Dense(1, activation='sigmoid')) #for binary features

neural_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=[F1Score()])
history = neural_net.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

loss, nn_f1 = neural_net.evaluate(X_test, y_test)
print("f1 score:", nn_f1)


f1 score: 0.4066980481147766


In [119]:
def create_model(optimizer, activation):
    model = Sequential()
    model.add(Dense(128, activation=activation, input_shape=(X_train.shape[1],)))
    model.add(Dense(64, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[F1Score()])
    return model



In [126]:

from sklearn.model_selection import train_test_split
X_tune, X_tunetest, y_tune, y_tunetest = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


#hyperparameter grid
optimizers = ['adam', 'adagrad', 'sgd']
activations = ['relu', 'tanh']
batch_sizes = [32, 64]
epochs = [10, 50]

best_nn_f1 = 0
best_nn_params = {}

for optimizer in optimizers:
    for activation in activations:
        for sizes in batch_sizes:
            for epoch in epochs:
                # Create and train the model
                nn = create_model(optimizer=optimizer, activation=activation)
                nn.fit(X_tune, y_tune, epochs=epoch, batch_size=sizes, verbose=0)
                
                # Evaluate the model
                loss, nn_f1 = nn.evaluate(X_tunetest, y_tunetest, verbose=0)
                
                # Compare and store the best parameters
                if nn_f1 > best_nn_f1:
                    best_nn_f1 = nn_f1
                    best_nn_params = {'optimizer': optimizer, 'activation': activation, 
                                      'batch_size': sizes, 'epoch': epoch}

print(f"Best Parameters: {best_nn_params}")

Best Parameters: {'optimizer': 'adam', 'activation': 'relu', 'batch_size': 32, 'epoch': 10}


In [127]:
#best nn:
best_nn = create_model(optimizer=best_nn_params['optimizer'], activation=best_nn_params['activation'])
best_nn.fit(X_train, y_train, epochs=epoch, batch_size=sizes, verbose=0)

best_nn_pred = best_nn.predict(X_test)
best_nn_pred = (best_nn_pred > 0.5).astype(int).ravel()
best_nn_f1 = f1_score(y_test, best_nn_pred)
print("f1 score:", best_nn_f1)

f1 score: 0.5244956772334294


# **LINEAR SVM WORKED THE BEST**

This was expected, based on the published study. 