### **Imports** ###

In [12]:
import pandas as pd
import xgboost as xgb

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from imblearn.over_sampling import SMOTE



### **Training and Test Sets** ###

In [13]:
training_set = pd.read_csv('train_with_misclass.csv')
X_train = training_set.loc[:,:'five_year']
y_train = training_set['general_two_year'].values

test_set = pd.read_csv('dataset/standardized_testing.csv')
X_test = test_set.loc[:,:'five_year']
y_test = test_set['general_two_year'].values


In [8]:
#smote = SMOTE(random_state=42)
#X_smote, y_smote = smote.fit_resample(X_train, y_train)

## **CART** 

In [21]:
#basic cart:
cart = DecisionTreeClassifier(random_state=42)
cart.fit(X_train, y_train)

cart_pred = cart.predict(X_test)
cart_accuracy = accuracy_score(y_test, cart_pred)
print("Accuracy:", cart_accuracy)


Accuracy: 0.5447570332480819


#### **Hyperparameter Tuning**

In [22]:
#hyperparameter tuning 

cart_param_grid = {
    'max_depth': [1, 5, 10, 100],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [10, 20, 40, 50, 100],
    'max_leaf_nodes': [2, 10, 15, 20]
}

cart_grid_search = GridSearchCV(estimator=cart, param_grid=cart_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
cart_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", cart_grid_search.best_params_)


Fitting 5 folds for each of 320 candidates, totalling 1600 fits
Best Hyperparameters: {'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_leaf': 50, 'min_samples_split': 2}


In [23]:
#best cart:

best_cart = DecisionTreeClassifier(**cart_grid_search.best_params_)
best_cart.fit(X_train, y_train)
best_cart_pred = best_cart.predict(X_test)
best_cart_accuracy = accuracy_score(y_test, best_cart_pred)
print("Accuracy with Best Hyperparameters:", best_cart_accuracy)



Accuracy with Best Hyperparameters: 0.6368286445012787


In [24]:
best_cart_train = best_cart.predict(X_train)
print(accuracy_score(y_train, best_cart_train))

0.6380525304292121


## **EBM** ##

In [25]:
#basic ebm:
ebm = ExplainableBoostingClassifier(random_state=42, n_jobs=-1)
ebm.fit(X_train, y_train)

ebm_pred = ebm.predict(X_test)
ebm_accuracy = accuracy_score(y_test, ebm_pred)
print("Accuracy:", ebm_accuracy)


Accuracy: 0.6470588235294118


#### **Hyperparameter Tuning**

In [26]:
#hyperparameter tuning 

ebm_param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_bins': [256, 512],
    'max_interaction_bins': [16, 32, 64],
    'interactions': [0, 2, 5],
    'min_samples_leaf': [1, 10],
    'early_stopping_rounds': [10, 50]
}

ebm_grid_search = GridSearchCV(estimator=ebm, param_grid=ebm_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
ebm_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", ebm_grid_search.best_params_)


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Hyperparameters: {'early_stopping_rounds': 10, 'interactions': 5, 'learning_rate': 0.01, 'max_bins': 512, 'max_interaction_bins': 16, 'min_samples_leaf': 10}


In [29]:
#best ebm:
best_ebm = ExplainableBoostingClassifier(**ebm_grid_search.best_params_)
best_ebm.fit(X_train, y_train)
best_ebm_pred = best_ebm.predict(X_test)
best_ebm_accuracy = accuracy_score(y_test, best_ebm_pred)
print("Accuracy with Best Hyperparameters:", best_ebm_accuracy)



Accuracy with Best Hyperparameters: 0.6445012787723785


In [30]:
best_ebm_train = best_ebm.predict(X_train)
print(accuracy_score(y_train, best_ebm_train))

0.6758488148622678


## **Linear SVM** ##

In [31]:
#basic linear svm:
lsvm = LinearSVC(C=1.0, random_state=42)
lsvm.fit(X_train, y_train)

y_pred = lsvm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Accuracy: 0.670076726342711


#### **Hyperparameter Tuning**

In [32]:
#hyperparameter tuning 

lsvm_param_grid = {
    'C': [0.1, 1, 10],
    'intercept_scaling': [0.1, 1, 10],
    'loss': ['hinge', 'squared_hinge']
}


lsvm_grid_search = GridSearchCV(estimator=lsvm, param_grid=lsvm_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
lsvm_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", lsvm_grid_search.best_params_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits




Best Hyperparameters: {'C': 1, 'intercept_scaling': 1, 'loss': 'squared_hinge'}




In [33]:
#best lsvm:
best_lsvm = LinearSVC(**lsvm_grid_search.best_params_)
best_lsvm.fit(X_train, y_train)
best_lsvm_pred = best_lsvm.predict(X_test)
best_lsvm_accuracy = accuracy_score(y_test, best_lsvm_pred)
print("Accuracy with Best Hyperparameters:", best_lsvm_accuracy)




Accuracy with Best Hyperparameters: 0.6624040920716112




In [34]:
best_lsvm_train = best_lsvm.predict(X_train)
print(accuracy_score(y_train, best_lsvm_train))

0.6386931454196029


## **XGBoost** ##

In [36]:
#basic xgboost:
xgboost = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5959079283887468


#### **Hyperparameter Tuning**

In [37]:
#hyperparameter tuning 

xgb_param_grid = {
    'n_estimators': [100, 250, 500], 
    'max_depth': [2, 5, 10],      
    'min_child_weight': [1, 5, 10],  
    'subsample': [0.4, 0.6, 0.8],   
    'colsample_bytree': [0.5, 0.75, 1.0],
    'gamma': [0, 1, 10]       
}


xgb_grid_search = GridSearchCV(estimator=xgboost, param_grid=xgb_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')
xgb_grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", xgb_grid_search.best_params_)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Hyperparameters: {'colsample_bytree': 1.0, 'gamma': 10, 'max_depth': 2, 'min_child_weight': 10, 'n_estimators': 250, 'subsample': 0.4}


In [38]:
#best xgb:
best_xgb = xgb.XGBClassifier(**xgb_grid_search.best_params_)
best_xgb.fit(X_train, y_train)
best_xgb_pred = best_xgb.predict(X_test)
best_xgb_accuracy = accuracy_score(y_test, best_xgb_pred)
print("Accuracy with Best Hyperparameters:", best_xgb_accuracy)


Accuracy with Best Hyperparameters: 0.6521739130434783


In [39]:
best_xgb_train = best_xgb.predict(X_train)
print(accuracy_score(y_train, best_xgb_train))

0.649583600256246


## **Neural Network** ##

In [40]:
neural_net = Sequential()
neural_net.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
neural_net.add(Dense(64, activation='relu'))
neural_net.add(Dense(1, activation='sigmoid')) #for binary features

neural_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = neural_net.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

loss, accuracy = neural_net.evaluate(X_test, y_test)
print("Accuracy:", accuracy)






Accuracy: 0.5754475593566895


In [41]:
def create_model(optimizer='adam', activation='relu', neurons=128):
    model = Sequential()
    model.add(Dense(neurons, activation=activation, input_shape=(X_train.shape[1],)))
    model.add(Dense(64, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [42]:
#hyperparameter grid
optimizers = ['adam', 'adagrad', 'sgd']
activations = ['relu', 'tanh']
neuron_numbers = [128, 256, 512]
batch_sizes = [32, 64]
epochs = [10, 50]

best_nn_accuracy = 0
best_nn_params = {}

for optimizer in optimizers:
    for activation in activations:
        for neurons in neuron_numbers:
            for sizes in batch_sizes:
                for epoch in epochs:
                    # Create and train the model
                    nn = create_model(optimizer=optimizer, activation=activation, neurons=neurons)
                    nn.fit(X_train, y_train, epochs=epoch, batch_size=sizes, verbose=0)
                    
                    # Evaluate the model
                    loss, accuracy = nn.evaluate(X_test, y_test, verbose=0)
                    
                    # Compare and store the best parameters
                    if accuracy > best_nn_accuracy:
                        best_nn_accuracy = accuracy
                        best_nn_params = {'optimizer': optimizer, 'activation': activation, 'neurons': neurons,
                                          'batch_size': sizes, 'epoch': epoch}

print(f"Best Parameters: {best_nn_params}")


Best Parameters: {'optimizer': 'sgd', 'activation': 'tanh', 'neurons': 256, 'batch_size': 64, 'epoch': 50}


In [43]:
#best nn:
best_nn = create_model(optimizer=best_nn_params['optimizer'], activation=best_nn_params['activation'], neurons=best_nn_params['neurons'])
best_nn.fit(X_train, y_train, epochs=epoch, batch_size=sizes, verbose=0)

best_nn_pred = best_nn.predict(X_test)
best_nn_pred = (best_nn_pred > 0.5).astype(int).ravel()
best_nn_accuracy = accuracy_score(y_test, best_nn_pred)
print("Accuracy with Best Hyperparameters:", best_nn_accuracy)

Accuracy with Best Hyperparameters: 0.6726342710997443


In [44]:
best_nn_train = best_nn.predict(X_train)
best_nn_train = (best_nn_train > 0.5).astype(int).ravel()
print(accuracy_score(y_train, best_nn_train))

0.6483023702754644


# **count misclassifications of the training set**

In [45]:
 
misclassification_count = 0
for i in range(len(y_train)):
    
    # Compare predictions with true label for each model
    if best_cart_train[i] != y_train[i]:
        misclassification_count += 1
    if best_ebm_train[i] != y_train[i]:
        misclassification_count += 1
    if best_lsvm_train[i] != y_train[i]:
        misclassification_count += 1
    if best_xgb_train[i] != y_train[i]:
        misclassification_count += 1
    if best_nn_train[i] != y_train[i]:
        misclassification_count += 1

print(misclassification_count)

2731
