In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectKBest, mutual_info_classif
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, make_scorer, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder

In [2]:
# change the names accordingly
df = pd.read_csv("SDSS_processed.csv")
X = df.drop('Target', axis=1)
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Preliminary Model Running 

Decision Tree, Random Forest, SVM, KNN, neural networks, and XGBoost

In [3]:
#edits
def metrics(y_test, y_pred, y_scores):
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    n_classes = len(np.unique(y_test))
    precision_recall_auc = []
    for i in range(n_classes):
        y_test_binary = (y_test == i).astype(int)
        y_scores_binary = y_scores[:, i]
        precision_curve, recall_curve, _ = precision_recall_curve(y_test_binary, y_scores_binary)
        auc_score = auc(recall_curve, precision_curve)
        precision_recall_auc.append(auc_score)

    average_precision_recall_auc = np.mean(precision_recall_auc)

    return precision, recall, f1, average_precision_recall_auc

In [4]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
y_scores_dt = dt.predict_proba(X_test)

precision, recall, f1, auc_pr = metrics(y_test, y_pred_dt, y_scores_dt)
print(f"Decision Tree - Precision: {precision}, Recall: {recall}, F1 Score: {f1}, AUC-PR: {auc_pr}")

Decision Tree - Precision: 0.967944666928406, Recall: 0.9720493452501122, F1 Score: 0.9699779611482736, AUC-PR: 0.9717822999760989


In [5]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_scores_rf = rf.predict_proba(X_test)

precision_rf, recall_rf, f1_rf, auc_pr_rf = metrics(y_test, y_pred_rf, y_scores_rf)
print(f"Random Forest - Precision: {precision_rf}, Recall: {recall_rf}, F1 Score: {f1_rf}, AUC-PR: {auc_pr_rf}")

Random Forest - Precision: 0.9891065594938458, Recall: 0.9745692403011145, F1 Score: 0.9816129388400942, AUC-PR: 0.9936922353451703


In [8]:
svm = CalibratedClassifierCV(SVC())
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
y_scores_svm = svm.predict_proba(X_test)

precision_svm, recall_svm, f1_svm, auc_pr_svm = metrics(y_test, y_pred_svm, y_scores_svm)
print(f"SVM - Precision: {precision_svm}, Recall: {recall_svm}, F1 Score: {f1_svm}, AUC-PR: {auc_pr_svm}")

SVM - Precision: 0.9538519042757213, Recall: 0.9082172946197614, F1 Score: 0.9283977923722504, AUC-PR: 0.9641834616967772


In [6]:
X_train_knn = X_train.to_numpy()
X_test_knn = X_test.to_numpy()
y_train_knn = y_train.to_numpy()
y_test_knn = y_test.to_numpy()

knn = KNeighborsClassifier()
knn.fit(X_train_knn, y_train_knn)
y_pred_knn = knn.predict(X_test_knn)
y_scores_knn = knn.predict_proba(X_test_knn)

precision_knn, recall_knn, f1_knn, auc_pr_knn = metrics(y_test_knn, y_pred_knn, y_scores_knn)
print(f"KNN - Precision: {precision_knn}, Recall: {recall_knn}, F1 Score: {f1_knn}, AUC-PR: {auc_pr_knn}")

KNN - Precision: 0.979110133335238, Recall: 0.9649239199766816, F1 Score: 0.9717985479006469, AUC-PR: 0.9864768971157232


In [7]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
y_scores_xgb = xgb.predict_proba(X_test)

precision_xgb, recall_xgb, f1_xgb, auc_pr_xgb = metrics(y_test, y_pred_xgb, y_scores_xgb)
print(f"XGBoost - Precision: {precision_xgb}, Recall: {recall_xgb}, F1 Score: {f1_xgb}, AUC-PR: {auc_pr_xgb}")

XGBoost - Precision: 0.9899651318155703, Recall: 0.9787873290994016, F1 Score: 0.9842442590575344, AUC-PR: 0.9949193613258424


Hyperparameter Tuning

In [None]:
auc_pr_scorer = make_scorer(average_precision_score, needs_proba=True)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}
tree = DecisionTreeClassifier()
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best AUC-PR score:", grid_search.best_score_)

In [11]:
auc_pr_scorer = make_scorer(average_precision_score, needs_proba=True)
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier()
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='f1_weighted', verbose=20)
grid_search_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best F1-weighted score for Random Forest:", grid_search_rf.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5; 1/144] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 1/5; 1/144] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.992 total time=   0.7s
[CV 2/5; 1/144] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 2/5; 1/144] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.992 total time=   0.6s
[CV 3/5; 1/144] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 3/5; 1/144] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.992 total time=   0.6s
[CV 4/5; 1/144] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10
[CV 4/5; 1/144] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.994 total time=   0.6s
[CV 5/5; 1/144] START max_depth=None, min_samples_lea

In [None]:
auc_pr_scorer = make_scorer(average_precision_score, needs_proba=True)
param_grid_svm = {
    'base_estimator__C': [0.001, 0.1, 1, 10],
    'base_estimator__kernel': ['linear', 'rbf', 'poly'],
    'base_estimator__gamma': ['scale', 'auto']
}

svm = CalibratedClassifierCV(SVC(), cv=3)
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

print("Best parameters for SVM:", grid_search_svm.best_params_)
print("Best AUC-PR score for SVM:", grid_search_svm.best_score_)

In [None]:
auc_pr_scorer = make_scorer(average_precision_score, needs_proba=True)

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring=auc_pr_scorer, verbose=1, n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

print("Best parameters for KNN:", grid_search_knn.best_params_)
print("Best AUC-PR score for KNN:", grid_search_knn.best_score_)

In [12]:
param_grid_xgb = {
    'n_estimators': [50, 100, 300],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'max_depth': [None, 1, 5, 10]
}

xgb = XGBClassifier()
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='f1_weighted', verbose=20)
grid_search_xgb.fit(X_train, y_train)

print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best AUC-PR score for XGBoost:", grid_search_xgb.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5; 1/48] START learning_rate=0.01, max_depth=None, n_estimators=50........
[CV 1/5; 1/48] END learning_rate=0.01, max_depth=None, n_estimators=50;, score=0.992 total time=   1.5s
[CV 2/5; 1/48] START learning_rate=0.01, max_depth=None, n_estimators=50........
[CV 2/5; 1/48] END learning_rate=0.01, max_depth=None, n_estimators=50;, score=0.992 total time=   1.5s
[CV 3/5; 1/48] START learning_rate=0.01, max_depth=None, n_estimators=50........
[CV 3/5; 1/48] END learning_rate=0.01, max_depth=None, n_estimators=50;, score=0.992 total time=   1.4s
[CV 4/5; 1/48] START learning_rate=0.01, max_depth=None, n_estimators=50........
[CV 4/5; 1/48] END learning_rate=0.01, max_depth=None, n_estimators=50;, score=0.993 total time=   1.5s
[CV 5/5; 1/48] START learning_rate=0.01, max_depth=None, n_estimators=50........
[CV 5/5; 1/48] END learning_rate=0.01, max_depth=None, n_estimators=50;, score=0.992 total time=   1.5s
[CV 1/5; 2/48