# All features

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### ML models ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.ml_training import train_evaluate_single

### Other configurations ###
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
# #import warnings library
# import warnings
# # ignore all warnings
# warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(working_memory=1024*10) 



In [2]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df_train = pd.read_csv(data_location + "train_filled_mapped.csv")
df_train = optimize_dtypes(df_train)
# df_train.head()

df_test = pd.read_csv(data_location + "test_filled_mapped.csv")
df_test = optimize_dtypes(df_test)


# Separate target variable from feature variables
X_train = df_train.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_train = df_train['HadHeartAttack']

# Separate target variable from feature variables
X_test = df_test.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_test = df_test['HadHeartAttack']

In [3]:
original_dtypes = X_train.dtypes

X_train = (X_train - X_train.min(axis=0)) / (X_train.max(axis=0)-X_train.min(axis=0))              # min max scale
# X_train_scaled = (X_train - X_train.mean())/X_train.std() # If we use StandardScaler, the feature names will be lost, so we do it mannually.

# x_scaled.hist(figsize=(16, 20), bins=30, edgecolor="black") # plot to show features after scaling
# plt.subplots_adjust()

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_train[column] = X_train[column].astype(dtype)


original_dtypes = X_test.dtypes

# Apply Min-Max scaling based on training set statistics
X_test= (X_test - X_train.min(axis=0)) / (X_train.max(axis=0) - X_train.min(axis=0))

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_test[column] = X_test[column].astype(dtype)

In [4]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.939806,0.594107,0.43617,0.204102,0.278079,0.805187,3.735427


In [5]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 20086, number of negative: 333567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 746
[LightGBM] [Info] Number of data points in the train set: 353653, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056796 -> initscore=-2.809821
[LightGBM] [Info] Start training from score -2.809821


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.944047,0.659606,0.511271,0.33871,0.407474,0.868264,2.189588


In [6]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=-1, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.943369,0.501774,0.857143,0.003584,0.007139,0.855698,18.9726


In [7]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.943199,0.5,0.0,0.0,0.0,0.702928,31.83778


In [10]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [11]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.374454,0.655199,0.081286,0.971924,0.150025,0.778193,1.419627


In [12]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.943199,0.5,0.0,0.0,0.0,0.507879,0.0701


In [13]:
all_features_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
all_features_results = all_features_results.round(3)
all_features_results.to_csv('../Data/results_all_features.csv', index=False)
all_features_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.94,0.594,0.436,0.204,0.278,0.805,3.735
0,LGBM,0.944,0.66,0.511,0.339,0.407,0.868,2.19
0,RF,0.943,0.502,0.857,0.004,0.007,0.856,18.973
0,kNN,0.943,0.5,0.0,0.0,0.0,0.508,0.07
0,QDA,0.374,0.655,0.081,0.972,0.15,0.778,1.42
0,MLP,0.943,0.5,0.0,0.0,0.0,0.703,31.838
