In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectKBest, mutual_info_classif
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, make_scorer, average_precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier

In [13]:
# Load the prepared dataset. Adjust the CSV file name and the target column
# name ('Outcome') to match your own data.
df = pd.read_csv("final_for_ml.csv")
X = df.drop('Outcome', axis=1)
y = df['Outcome']
# Pin the split seed so that Restart & Run All yields the same train/test
# partition, keeping the metrics printed below comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# The cells below operate on plain NumPy arrays rather than pandas objects.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

Preliminary Model Runs

Models to compare: Decision Tree, Random Forest, SVM, KNN, neural networks, and XGBoost.

In [14]:
def metrics(y_test, y_pred, y_scores):
    """Score one classifier's predictions against the true labels.

    Args:
        y_test: Ground-truth labels.
        y_pred: Hard class predictions; used for precision, recall and F1.
        y_scores: Continuous scores for the positive class; used for the
            average-precision (AUC-PR) figure.

    Returns:
        A 4-tuple ``(precision, recall, f1, auc_pr)``.
    """
    # The first three metrics are computed from hard labels, in output order.
    label_scorers = (precision_score, recall_score, f1_score)
    label_based = tuple(scorer(y_test, y_pred) for scorer in label_scorers)
    # Average precision is the only metric that consumes the raw scores.
    return (*label_based, average_precision_score(y_test, y_scores))

In [15]:
# Baseline decision tree with default hyperparameters. The seed is fixed so the
# fitted tree, and therefore the printed metrics, are reproducible on re-run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
# Column 1 of predict_proba holds the score for the second class in
# dt.classes_ (the positive class for 0/1 labels).
y_scores_dt = dt.predict_proba(X_test)[:, 1]

precision, recall, f1, auc_pr = metrics(y_test, y_pred_dt, y_scores_dt)
print(f"Decision Tree - Precision: {precision}, Recall: {recall}, F1 Score: {f1}, AUC-PR: {auc_pr}")

Decision Tree - Precision: 0.5578947368421052, Recall: 0.5602536997885835, F1 Score: 0.559071729957806, AUC-PR: 0.5513685605576214


In [16]:
# Baseline random forest with default hyperparameters. Bootstrapping and
# feature subsampling are random, so the seed is fixed to make the printed
# metrics reproducible on re-run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Column 1 of predict_proba holds the score for the second class in
# rf.classes_ (the positive class for 0/1 labels).
y_scores_rf = rf.predict_proba(X_test)[:, 1]

precision_rf, recall_rf, f1_rf, auc_pr_rf = metrics(y_test, y_pred_rf, y_scores_rf)
print(f"Random Forest - Precision: {precision_rf}, Recall: {recall_rf}, F1 Score: {f1_rf}, AUC-PR: {auc_pr_rf}")

Random Forest - Precision: 0.5734513274336284, Recall: 0.6849894291754757, F1 Score: 0.6242774566473989, AUC-PR: 0.5859312185986763


In [19]:
# Baseline SVM. The SVC is wrapped in CalibratedClassifierCV so that
# predict_proba is available for the AUC-PR computation below.
svm = CalibratedClassifierCV(SVC()).fit(X_train, y_train)

# Positive-class scores first, then the hard labels; the two calls are independent.
y_scores_svm = svm.predict_proba(X_test)[:, 1]
y_pred_svm = svm.predict(X_test)

precision_svm, recall_svm, f1_svm, auc_pr_svm = metrics(
    y_test, y_pred_svm, y_scores_svm
)
print(f"SVM - Precision: {precision_svm}, Recall: {recall_svm}, F1 Score: {f1_svm}, AUC-PR: {auc_pr_svm}")

AUC-ROC: 0.5299648347445471
SVM - Precision: 0.5462304409672831, Recall: 0.8118393234672304, F1 Score: 0.6530612244897961, AUC-PR: 0.5666947402900635


In [25]:
# Baseline k-nearest-neighbours classifier with default hyperparameters.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Positive-class scores first, then the hard labels; the two calls are independent.
y_scores_knn = knn.predict_proba(X_test)[:, 1]
y_pred_knn = knn.predict(X_test)

precision_knn, recall_knn, f1_knn, auc_pr_knn = metrics(
    y_test, y_pred_knn, y_scores_knn
)
print(f"KNN - Precision: {precision_knn}, Recall: {recall_knn}, F1 Score: {f1_knn}, AUC-PR: {auc_pr_knn}")

KNN - Precision: 0.5573333333333333, Recall: 0.4418604651162791, F1 Score: 0.4929245283018868, AUC-PR: 0.5583585014802174


In [26]:
# Baseline XGBoost classifier with default hyperparameters.
xgb = XGBClassifier().fit(X_train, y_train)

# Positive-class scores first, then the hard labels; the two calls are independent.
y_scores_xgb = xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb.predict(X_test)

precision_xgb, recall_xgb, f1_xgb, auc_pr_xgb = metrics(
    y_test, y_pred_xgb, y_scores_xgb
)
print(f"XGBoost - Precision: {precision_xgb}, Recall: {recall_xgb}, F1 Score: {f1_xgb}, AUC-PR: {auc_pr_xgb}")

XGBoost - Precision: 0.5708502024291497, Recall: 0.5961945031712473, F1 Score: 0.5832471561530507, AUC-PR: 0.5628932547022664


Hyperparameter Tuning

In [None]:
# Score candidates by average precision (AUC-PR). The built-in scoring name is
# used instead of make_scorer(..., needs_proba=True): that keyword was
# deprecated in scikit-learn 1.4 and later removed, while the name works on
# every version. For estimators without decision_function (as here) it scores
# from predict_proba, exactly like the hand-made scorer did.
auc_pr_scorer = "average_precision"
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}
# Fixed seed: with max_features below the full feature count the tree picks
# random feature subsets, which would otherwise make the search results vary
# between runs.
tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best AUC-PR score:", grid_search.best_score_)

In [None]:
# Score candidates by average precision (AUC-PR). The built-in scoring name is
# used instead of make_scorer(..., needs_proba=True): that keyword was
# deprecated in scikit-learn 1.4 and later removed, while the name works on
# every version and scores from predict_proba for this estimator.
auc_pr_scorer = "average_precision"
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    # 'auto' was only an alias of 'sqrt' for classifiers (so every candidate was
    # evaluated twice) and newer scikit-learn releases reject it outright.
    # 'log2' gives the search a genuinely different second option.
    'max_features': ['sqrt', 'log2']
}

# Fixed seed so the search results are reproducible between runs.
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best AUC-PR score for Random Forest:", grid_search_rf.best_score_)

In [None]:
# Score candidates by average precision (AUC-PR). The built-in scoring name is
# used instead of make_scorer(..., needs_proba=True): that keyword was
# deprecated in scikit-learn 1.4 and later removed, while the name works on
# every version. CalibratedClassifierCV has no decision_function, so the score
# is computed from predict_proba exactly as before.
auc_pr_scorer = "average_precision"

svm = CalibratedClassifierCV(SVC(), cv=3)

# The wrapped-model argument of CalibratedClassifierCV was renamed from
# `base_estimator` to `estimator` (scikit-learn 1.2). Resolve the right prefix
# at runtime so the nested SVC parameters are addressable on either side of
# the rename instead of raising "Invalid parameter".
svc_param = "estimator" if "estimator" in svm.get_params(deep=False) else "base_estimator"
param_grid_svm = {
    f'{svc_param}__C': [0.001, 0.1, 1, 10],
    f'{svc_param}__kernel': ['linear', 'rbf', 'poly'],
    f'{svc_param}__gamma': ['scale', 'auto']
}

grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

print("Best parameters for SVM:", grid_search_svm.best_params_)
print("Best AUC-PR score for SVM:", grid_search_svm.best_score_)

In [None]:
# Score candidates by average precision (AUC-PR). The built-in scoring name is
# used instead of make_scorer(..., needs_proba=True): that keyword was
# deprecated in scikit-learn 1.4 and later removed, while the name works on
# every version and scores from predict_proba for this estimator.
auc_pr_scorer = "average_precision"

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

print("Best parameters for KNN:", grid_search_knn.best_params_)
print("Best AUC-PR score for KNN:", grid_search_knn.best_score_)

In [None]:
# Score candidates by average precision (AUC-PR). The built-in scoring name is
# used instead of make_scorer(..., needs_proba=True): that keyword was
# deprecated in scikit-learn 1.4 and later removed, while the name works on
# every version and scores from predict_proba for this estimator.
auc_pr_scorer = "average_precision"

param_grid_xgb = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2, 0.5, 1],
    # None leaves max_depth at the library default.
    'max_depth': [None, 3, 5, 10],
    'subsample': [0.3, 0.5, 0.8, 1.0]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best AUC-PR score for XGBoost:", grid_search_xgb.best_score_)