# Selected Features

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### ML models ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.ml_training import train_evaluate_single

### Other configurations ###
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
# #import warnings library
# import warnings
# # ignore all warnings
# warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(working_memory=1024*10) 



## SMOTE

In [2]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df_train = pd.read_csv(data_location + "train_selected_smote.csv")
df_train = optimize_dtypes(df_train)
# df_train.head()

df_test = pd.read_csv(data_location + "test_selected_from_smote.csv")
df_test = optimize_dtypes(df_test)


# Separate target variable from feature variables
X_train = df_train.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_train = df_train['HadHeartAttack']

# Separate target variable from feature variables
X_test = df_test.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_test = df_test['HadHeartAttack']

In [3]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.843124,0.737379,0.206164,0.61808,0.309194,0.834022,3.390107


In [4]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 333567, number of negative: 333567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 289
[LightGBM] [Info] Number of data points in the train set: 667134, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.840953,0.747643,0.208237,0.642374,0.314517,0.841097,2.721852


In [10]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=-1, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.838182,0.712396,0.190809,0.57049,0.285971,0.796992,33.59313


In [9]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.927048,0.64077,0.345455,0.317802,0.331052,0.766219,0.087307


In [6]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.815233,0.740651,0.184108,0.656511,0.287571,0.827891,268.631729


In [7]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [8]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.756147,0.754241,0.156774,0.752091,0.259463,0.835067,1.026906


In [11]:
selected_smote_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
selected_smote_results = selected_smote_results.round(3)
selected_smote_results.to_csv('../Data/results_selected_smote.csv', index=False)
selected_smote_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.843,0.737,0.206,0.618,0.309,0.834,3.39
0,LGBM,0.841,0.748,0.208,0.642,0.315,0.841,2.722
0,RF,0.838,0.712,0.191,0.57,0.286,0.797,33.593
0,kNN,0.927,0.641,0.345,0.318,0.331,0.766,0.087
0,QDA,0.756,0.754,0.157,0.752,0.259,0.835,1.027
0,MLP,0.815,0.741,0.184,0.657,0.288,0.828,268.632


## Borderline SMOTE

In [2]:
df_train = pd.read_csv(data_location + "train_selected_borderline_smote.csv")
df_train = optimize_dtypes(df_train)
# df_train.head()

df_test = pd.read_csv(data_location + "test_selected_from_borderline_smote.csv")
df_test = optimize_dtypes(df_test)


# Separate target variable from feature variables
X_train = df_train.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_train = df_train['HadHeartAttack']

# Separate target variable from feature variables
X_test = df_test.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_test = df_test['HadHeartAttack']   

In [3]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.857048,0.741578,0.223159,0.61131,0.326961,0.845309,4.050668


In [4]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 333567, number of negative: 333567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047904 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 289
[LightGBM] [Info] Number of data points in the train set: 667134, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.855351,0.75331,0.22607,0.638192,0.333872,0.851655,2.835012


In [5]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=4, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.855396,0.710949,0.207432,0.547989,0.300946,0.804079,25.569519


In [6]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.837865,0.747316,0.205154,0.645161,0.311314,0.843019,149.620687


In [7]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [8]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.77335,0.761209,0.166659,0.747511,0.272552,0.842196,0.892408


In [9]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.924345,0.645699,0.333133,0.331342,0.332235,0.762984,0.070448


In [10]:
selected_borderline_smote_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
selected_borderline_smote_results = selected_borderline_smote_results.round(3)
selected_borderline_smote_results.to_csv('../Data/results_selected_borderline_smote.csv', index=False)
selected_borderline_smote_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.857,0.742,0.223,0.611,0.327,0.845,4.051
0,LGBM,0.855,0.753,0.226,0.638,0.334,0.852,2.835
0,RF,0.855,0.711,0.207,0.548,0.301,0.804,25.57
0,kNN,0.924,0.646,0.333,0.331,0.332,0.763,0.07
0,QDA,0.773,0.761,0.167,0.748,0.273,0.842,0.892
0,MLP,0.838,0.747,0.205,0.645,0.311,0.843,149.621


## KMeans SMOTE

In [11]:
df_train = pd.read_csv(data_location + "train_selected_kmeans_smote.csv")
df_train = optimize_dtypes(df_train)
# df_train.head()

df_test = pd.read_csv(data_location + "test_selected_from_kmeans_smote.csv")
df_test = optimize_dtypes(df_test)


# Separate target variable from feature variables
X_train = df_train.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_train = df_train['HadHeartAttack']

# Separate target variable from feature variables
X_test = df_test.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_test = df_test['HadHeartAttack']   

In [12]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.899201,0.670545,0.257904,0.412585,0.317402,0.860814,2.968155


In [13]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 333569, number of negative: 333567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 667136, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500001 -> initscore=0.000006
[LightGBM] [Info] Start training from score 0.000006


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.89781,0.679632,0.260189,0.433493,0.325192,0.863586,4.837011


In [14]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=-1, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.897347,0.661048,0.247131,0.394464,0.303881,0.829896,19.728571


In [15]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.942543,0.594528,0.486098,0.201912,0.285312,0.754712,2.501714


In [16]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.895933,0.649725,0.236008,0.371963,0.288784,0.851224,288.256062


In [17]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [18]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.813593,0.736133,0.181252,0.648746,0.283341,0.828633,0.590945


In [19]:
selected_kmeans_smote_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
selected_kmeans_smote_results = selected_kmeans_smote_results.round(3)
selected_kmeans_smote_results.to_csv('../Data/results_selected_kmeans_smote.csv', index=False)
selected_kmeans_smote_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.899,0.671,0.258,0.413,0.317,0.861,2.968
0,LGBM,0.898,0.68,0.26,0.433,0.325,0.864,4.837
0,RF,0.897,0.661,0.247,0.394,0.304,0.83,19.729
0,kNN,0.943,0.595,0.486,0.202,0.285,0.755,2.502
0,QDA,0.814,0.736,0.181,0.649,0.283,0.829,0.591
0,MLP,0.896,0.65,0.236,0.372,0.289,0.851,288.256


## Adasyn

In [20]:
df_train = pd.read_csv(data_location + "train_selected_adasyn.csv")
df_train = optimize_dtypes(df_train)
# df_train.head()

df_test = pd.read_csv(data_location + "test_selected_from_adasyn.csv")
df_test = optimize_dtypes(df_test)


# Separate target variable from feature variables
X_train = df_train.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_train = df_train['HadHeartAttack']

# Separate target variable from feature variables
X_test = df_test.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_test = df_test['HadHeartAttack']   

In [21]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.840466,0.74411,0.206337,0.635404,0.311515,0.836878,3.06159


In [22]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 329817, number of negative: 333567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 289
[LightGBM] [Info] Number of data points in the train set: 663384, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497174 -> initscore=-0.011306
[LightGBM] [Info] Start training from score -0.011306


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.840161,0.753024,0.209614,0.654719,0.317558,0.843019,2.373799


In [23]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=-1, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.835184,0.714456,0.189087,0.578256,0.284985,0.798961,21.637068


In [24]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.802735,0.747967,0.178448,0.686181,0.283237,0.828839,111.053496


In [25]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [26]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.751555,0.756392,0.155554,0.761848,0.258356,0.835234,0.817558


In [28]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.923768,0.654282,0.335943,0.350259,0.342952,0.770312,0.071926


In [29]:
selected_adasyn_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
selected_adasyn_results = selected_adasyn_results.round(3)
selected_adasyn_results.to_csv('../Data/results_selected_adasyn.csv', index=False)
selected_adasyn_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.84,0.744,0.206,0.635,0.312,0.837,3.062
0,LGBM,0.84,0.753,0.21,0.655,0.318,0.843,2.374
0,RF,0.835,0.714,0.189,0.578,0.285,0.799,21.637
0,kNN,0.924,0.654,0.336,0.35,0.343,0.77,0.072
0,QDA,0.752,0.756,0.156,0.762,0.258,0.835,0.818
0,MLP,0.803,0.748,0.178,0.686,0.283,0.829,111.053
