### Data

In [2]:
import pandas as pd

# Feature tables for the train and test splits, read from CSV.
# (Presumably PCA-reduced features, judging by the file names — TODO confirm.)
train_features_csv = '../../data/pca_class_train_df.csv'
test_features_csv = '../../data/pca_class_test_df.csv'

X_train, X_test = pd.read_csv(train_features_csv), pd.read_csv(test_features_csv)

In [3]:
import numpy as np

# Target arrays for the train and test splits, stored as .npy files.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# run arbitrary code if the files come from an untrusted source. Confirm these
# files are produced inside the project, or drop the flag if the arrays are not
# object-dtype.
train_labels_npy = '../../data/prep_class_train_y.npy'
test_labels_npy = '../../data/prep_class_test_y.npy'

y_train = np.load(train_labels_npy, allow_pickle=True)
y_test = np.load(test_labels_npy, allow_pickle=True)

### Cross-Validation (K = 10)
#### Naive Bayes

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

# Gaussian Naive Bayes with default settings, evaluated by 10-fold
# cross-validation on the training split using the 'roc_auc_ovr' scorer.
nb_clf = GaussianNB()
cv_results = cross_validate(
    nb_clf,
    X_train,
    y_train,
    cv=10,
    scoring='roc_auc_ovr',
)

# One line per fold, in the order cross_validate returned the scores.
print("Cross-Validation Results: ")
for fold_idx, fold_auc in enumerate(cv_results['test_score']):
    print(f"Split {fold_idx}: {fold_auc}")

Cross-Validation Results: 
Split 0: 0.5413636964281939
Split 1: 0.540927726862037
Split 2: 0.5413214533619449
Split 3: 0.5402645316151716
Split 4: 0.5421297354821258
Split 5: 0.5403403650754706
Split 6: 0.5413576779299764
Split 7: 0.5384764013726594
Split 8: 0.5435787518587176
Split 9: 0.5409998940725943


#### Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# L1-penalised logistic regression (C=0.5, saga solver, fixed seed), evaluated
# by 10-fold cross-validation on the training split with the 'roc_auc_ovr' scorer.
# NOTE(review): max_iter is left at the library default; check for convergence
# warnings when re-running this cell.
lr_clf = LogisticRegression(
    penalty='l1',
    C=0.5,
    solver='saga',
    random_state=42,
)
cv_results = cross_validate(
    lr_clf,
    X_train,
    y_train,
    cv=10,
    scoring='roc_auc_ovr',
)

# One line per fold, in the order cross_validate returned the scores.
print("Cross-Validation Results: ")
for fold_idx, fold_auc in enumerate(cv_results['test_score']):
    print(f"Split {fold_idx}: {fold_auc}")

Cross-Validation Results: 
Split 0: 0.5392128201828734
Split 1: 0.5403987746366437
Split 2: 0.5398123519916622
Split 3: 0.5416922759381646
Split 4: 0.5432581041780364
Split 5: 0.5411533782354635
Split 6: 0.5447392052737194
Split 7: 0.5391265756003321
Split 8: 0.544480374768354
Split 9: 0.5415664117378597


#### K-Nearest Neighbours

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# 15-nearest-neighbours classifier with distance-weighted votes, evaluated by
# 10-fold cross-validation on the training split with the 'roc_auc_ovr' scorer.
knn_clf = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance',
)
cv_results = cross_validate(
    knn_clf,
    X_train,
    y_train,
    cv=10,
    scoring='roc_auc_ovr',
)

# One line per fold, in the order cross_validate returned the scores.
print("Cross-Validation Results: ")
for fold_idx, fold_auc in enumerate(cv_results['test_score']):
    print(f"Split {fold_idx}: {fold_auc}")

Cross-Validation Results: 
Split 0: 0.932425294621886
Split 1: 0.9332457925513035
Split 2: 0.9340682595537286
Split 3: 0.934183067178038
Split 4: 0.9328961374969567
Split 5: 0.9341383993690154
Split 6: 0.9325893154900253
Split 7: 0.9334094656413248
Split 8: 0.933964697711354
Split 9: 0.9337095340418958


#### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Random forest (150 trees, depth capped at 25, fixed seed), evaluated by
# 10-fold cross-validation on the training split with the 'roc_auc_ovr' scorer.
rf_clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=25,
    random_state=42,
)
cv_results = cross_validate(
    rf_clf,
    X_train,
    y_train,
    cv=10,
    scoring='roc_auc_ovr',
)

# One line per fold, in the order cross_validate returned the scores.
print("Cross-Validation Results: ")
for fold_idx, fold_auc in enumerate(cv_results['test_score']):
    print(f"Split {fold_idx}: {fold_auc}")

Cross-Validation Results: 
Split 0: 0.9349587850119839
Split 1: 0.9358971148949161
Split 2: 0.936442787465342
Split 3: 0.9363167071582132
Split 4: 0.9344636418213375
Split 5: 0.9355388619906295
Split 6: 0.9350345462123316
Split 7: 0.9347981676651227
Split 8: 0.9366022264940312
Split 9: 0.9368055526511785


#### Multilayer Perceptron

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

# Multilayer perceptron with hidden_layer_sizes=512 and a fixed seed, evaluated
# by 10-fold cross-validation on the training split with the 'roc_auc_ovr' scorer.
mlp_clf = MLPClassifier(
    hidden_layer_sizes=512,
    random_state=42,
)
cv_results = cross_validate(
    mlp_clf,
    X_train,
    y_train,
    cv=10,
    scoring='roc_auc_ovr',
)

# One line per fold, in the order cross_validate returned the scores.
print("Cross-Validation Results: ")
for fold_idx, fold_auc in enumerate(cv_results['test_score']):
    print(f"Split {fold_idx}: {fold_auc}")



Cross-Validation Results: 
Split 0: 0.7236546119228913
Split 1: 0.7247941313963037
Split 2: 0.7274510628386941
Split 3: 0.7232412075034788
Split 4: 0.7241007882185003
Split 5: 0.7237215547004727
Split 6: 0.7221964517070979
Split 7: 0.7187309230263887
Split 8: 0.7229197002554162
Split 9: 0.7240168256707262


### Statistical Testing
#### Between Best Models

In [6]:
from scipy.stats import ttest_rel

# Per-fold scores transcribed from the K-Nearest Neighbours and Random Forest
# cross-validation outputs earlier in this notebook. Fold order is kept so that
# element k of each list refers to the same split.
knn_cv = [
    0.932425294621886,
    0.9332457925513035,
    0.9340682595537286,
    0.934183067178038,
    0.9328961374969567,
    0.9341383993690154,
    0.9325893154900253,
    0.9334094656413248,
    0.933964697711354,
    0.9337095340418958,
]
rf_cv = [
    0.9349587850119839,
    0.9358971148949161,
    0.936442787465342,
    0.9363167071582132,
    0.9344636418213375,
    0.9355388619906295,
    0.9350345462123316,
    0.9347981676651227,
    0.9366022264940312,
    0.9368055526511785,
]

# Paired (related-samples) t-test on the fold-wise scores of the two models.
# NOTE(review): cross-validation folds share training data, so the fold scores
# are not independent and this p-value may be over-confident — consider a
# corrected resampled t-test before drawing conclusions.
paired_result = ttest_rel(knn_cv, rf_cv)
t_stat, p_value = paired_result

print(f"t-statistic: {t_stat}")
print(f"p_value: {p_value}")

t-statistic: -11.983821530365955
p_value: 7.788878642697853e-07


#### Baseline and Best Model

In [2]:
from scipy.stats import ttest_rel

# Per-fold Random Forest scores transcribed from the cross-validation output
# earlier in this notebook.
rf_cv = [
    0.9349587850119839,
    0.9358971148949161,
    0.936442787465342,
    0.9363167071582132,
    0.9344636418213375,
    0.9355388619906295,
    0.9350345462123316,
    0.9347981676651227,
    0.9366022264940312,
    0.9368055526511785,
]
# Baseline: a constant score of 0.5 for every fold. This is typed in by hand;
# no baseline model is fitted in the cells visible here.
dummy_cv = [0.5] * len(rf_cv)

# Paired t-test of the Random Forest fold scores against the constant baseline.
# NOTE(review): fold scores from cross-validation are not independent, so treat
# the p-value as indicative rather than exact.
paired_result = ttest_rel(rf_cv, dummy_cv)
t_stat, p_value = paired_result

print(f"t-statistic: {t_stat}")
print(f"p_value: {p_value}")

t-statistic: 1638.2761023792937
p_value: 5.989368134616179e-26
