# All features

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

### ML models ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.ml_training import train_evaluate_single

### Other configurations ###
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
# #import warnings library
# import warnings
# # ignore all warnings
# warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(working_memory=1024*10) 

## Preprocessed data

In [2]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df_train = pd.read_csv(data_location + "train_filled_mapped.csv")
df_train = optimize_dtypes(df_train)
# df_train.head()

df_test = pd.read_csv(data_location + "test_filled_mapped.csv")
df_test = optimize_dtypes(df_test)


# Separate target variable from feature variables
X_train = df_train.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_train = df_train['HadHeartAttack']

# Separate target variable from feature variables
X_test = df_test.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y_test = df_test['HadHeartAttack']

In [3]:
original_dtypes = X_train.dtypes

X_train = (X_train - X_train.min(axis=0)) / (X_train.max(axis=0)-X_train.min(axis=0))              # min max scale
# X_train_scaled = (X_train - X_train.mean())/X_train.std() # If we use StandardScaler, the feature names will be lost, so we do it mannually.

# x_scaled.hist(figsize=(16, 20), bins=30, edgecolor="black") # plot to show features after scaling
# plt.subplots_adjust()

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_train[column] = X_train[column].astype(dtype)


original_dtypes = X_test.dtypes

# Apply Min-Max scaling based on training set statistics
X_test= (X_test - X_train.min(axis=0)) / (X_train.max(axis=0) - X_train.min(axis=0))

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_test[column] = X_test[column].astype(dtype)

In [4]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.939806,0.594107,0.43617,0.204102,0.278079,0.805187,3.735427


In [5]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 20086, number of negative: 333567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 746
[LightGBM] [Info] Number of data points in the train set: 353653, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056796 -> initscore=-2.809821
[LightGBM] [Info] Start training from score -2.809821


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.944047,0.659606,0.511271,0.33871,0.407474,0.868264,2.189588


In [6]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=-1, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.943369,0.501774,0.857143,0.003584,0.007139,0.855698,18.9726


In [7]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.943199,0.5,0.0,0.0,0.0,0.702928,31.83778


In [10]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [11]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.374454,0.655199,0.081286,0.971924,0.150025,0.778193,1.419627


In [12]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.943199,0.5,0.0,0.0,0.0,0.507879,0.0701


In [13]:
all_features_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
all_features_results = all_features_results.round(3)
all_features_results.to_csv('../Data/results_all_features.csv', index=False)
all_features_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.94,0.594,0.436,0.204,0.278,0.805,3.735
0,LGBM,0.944,0.66,0.511,0.339,0.407,0.868,2.19
0,RF,0.943,0.502,0.857,0.004,0.007,0.856,18.973
0,kNN,0.943,0.5,0.0,0.0,0.0,0.508,0.07
0,QDA,0.374,0.655,0.081,0.972,0.15,0.778,1.42
0,MLP,0.943,0.5,0.0,0.0,0.0,0.703,31.838


## Unpreprocessed data (only dropped NaN values and mapping)

In [3]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df = pd.read_csv(data_location + "heart_2022_no_nans.csv")
df = df.drop('State', axis = 1)
df.insert(len(df.columns)-1, 'HadHeartAttack', df.pop('HadHeartAttack'))



In [4]:
def map_categorical_features(df):
    """
    Map categorical features to integer values.

    Parameters:
    - df (DataFrame): The DataFrame to map categorical features.

    Returns:
    - df_mapped (DataFrame): The DataFrame with mapped categorical features.
    """
    map_lexicon = {
        'Sex': {'Female': 0, 'Male': 1},
        'GeneralHealth': {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4},
        'LastCheckupTime': {'5 or more years ago': 0, 'Within past 5 years (2 years but less than 5 years ago)': 1,
                            'Within past 2 years (1 year but less than 2 years ago)': 2, 'Within past year (anytime less than 12 months ago)': 3},
        'PhysicalActivities': {'No': 0, 'Yes': 1},
        'RemovedTeeth': {'All': 0, '6 or more, but not all': 1, '1 to 5': 2, 'None of them': 3},
        'HadAngina': {'No': 0, 'Yes': 1},
        'HadStroke': {'No': 0, 'Yes': 1},
        'HadAsthma': {'No': 0, 'Yes': 1},
        'HadSkinCancer': {'No': 0, 'Yes': 1},
        'HadCOPD': {'No': 0, 'Yes': 1},
        'HadDepressiveDisorder': {'No': 0, 'Yes': 1},
        'HadKidneyDisease': {'No': 0, 'Yes': 1},
        'HadArthritis': {'No': 0, 'Yes': 1},
        'HadDiabetes': {'No': 0, 'No, pre-diabetes or borderline diabetes': 1, 'Yes, but only during pregnancy (female)': 2, 'Yes': 3},
        'DeafOrHardOfHearing': {'No': 0, 'Yes': 1},
        'BlindOrVisionDifficulty': {'No': 0, 'Yes': 1},
        'DifficultyConcentrating': {'No': 0, 'Yes': 1},
        'DifficultyWalking': {'No': 0, 'Yes': 1},
        'DifficultyDressingBathing': {'No': 0, 'Yes': 1},
        'DifficultyErrands': {'No': 0, 'Yes': 1},
        'SmokerStatus': {'Never smoked': 0, 'Former smoker': 1, 'Current smoker - now smokes some days': 2, 'Current smoker - now smokes every day': 3},
        'ECigaretteUsage': {'Never used e-cigarettes in my entire life': 0, 'Not at all (right now)': 1, 'Use them some days': 2, 'Use them every day': 3},
        'ChestScan': {'No': 0, 'Yes': 1},
        'RaceEthnicityCategory': {'White only, Non-Hispanic': 0, 'Hispanic': 1, 'Other race only, Non-Hispanic': 2, 'Multiracial, Non-Hispanic': 3, 'Black only, Non-Hispanic': 4},
        'AgeCategory': {'Age 18 to 24': 1, 'Age 25 to 29': 2, 'Age 30 to 34': 3, 'Age 35 to 39': 4, 'Age 40 to 44': 5, 'Age 45 to 49': 6, 'Age 50 to 54': 7, 'Age 55 to 59': 8, 'Age 60 to 64': 9, 'Age 65 to 69': 10, 'Age 70 to 74': 11, 'Age 75 to 79': 12, 'Age 80 or older': 13},
        'AlcoholDrinkers': {'No': 0, 'Yes': 1},
        'HIVTesting': {'No': 0, 'Yes': 1},
        'FluVaxLast12': {'No': 0, 'Yes': 1},
        'PneumoVaxEver': {'No': 0, 'Yes': 1},
        'TetanusLast10Tdap': {'No, did not receive any tetanus shot in the past 10 years': 0, 'Yes, received tetanus shot but not sure what type': 1, 'Yes, received tetanus shot, but not Tdap': 2, 'Yes, received Tdap': 3},
        'HighRiskLastYear': {'No': 0, 'Yes': 1},
        'CovidPos': {'No': 0, 'Tested positive using home test without a health professional': 1, 'Yes': 2},
        'HadHeartAttack': {'No': 0, 'Yes': 1}
    }

    df_mapped = df.copy()
    for column, lexicon in map_lexicon.items():
        df_mapped[column] = df_mapped[column].map(lexicon)
    
    return df_mapped

In [5]:
df_mapped = map_categorical_features(df)


# Separate target variable from feature variables
X = df_mapped.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y = df_mapped['HadHeartAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=13)

original_dtypes = X_train.dtypes

X_train = (X_train - X_train.min(axis=0)) / (X_train.max(axis=0)-X_train.min(axis=0))              # min max scale
# X_train_scaled = (X_train - X_train.mean())/X_train.std() # If we use StandardScaler, the feature names will be lost, so we do it mannually.

# x_scaled.hist(figsize=(16, 20), bins=30, edgecolor="black") # plot to show features after scaling
# plt.subplots_adjust()

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_train[column] = X_train[column].astype(dtype)


original_dtypes = X_test.dtypes

# Apply Min-Max scaling based on training set statistics
X_test= (X_test - X_train.min(axis=0)) / (X_train.max(axis=0) - X_train.min(axis=0))

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_test[column] = X_test[column].astype(dtype)

In [14]:
#how many cases are included in the dataset
length = len(df_mapped)
#how many features are in the dataset
features = df_mapped.shape[1]-1 # - HadHeartAttack

# Number of cases with heart attack
hha_yes = len(df_mapped[df_mapped['HadHeartAttack']==1])

#Number of cases without heart attack
hha_no = len(df_mapped[df_mapped['HadHeartAttack']==0])

#Rate of heart attacks over all cases
rate = (float(hha_yes)/(length))*100

print ("There are "+ str(len(df_mapped))+" cases in this dataset")
print ("There are {}".format(features)+" features in this dataset")
print ("There are {}".format(hha_yes)+" cases with heart attack")
print ("There are {}".format(hha_no)+" cases without heart attack")
print ("The percentage of heart attack cases is: {:.2f}%".format(rate))

There are 246022 cases in this dataset
There are 38 features in this dataset
There are 13435 cases with heart attack
There are 232587 cases without heart attack
The percentage of heart attack cases is: 5.46%


In [6]:
xgb = XGBClassifier(booster='gbtree', validate_parameters=True, subsample=0.6 , random_state=13)
metrics_xgb = train_evaluate_single(xgb, X_train, y_train, X_test, y_test, classifier_name="XGB")

# Store metrics in a DataFrame
metrics_df_xgb = pd.DataFrame([metrics_xgb])
metrics_df_xgb

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.944518,0.546702,0.462995,0.100112,0.164627,0.814348,2.021778


In [7]:
lgbm = LGBMClassifier( boosting_type='gbdt', objective='binary', subsample=0.6, random_state=13)
metrics_lgbm = train_evaluate_single(lgbm, X_train, y_train, X_test, y_test, classifier_name="LGBM")

# Store metrics in a DataFrame
metrics_df_lgbm = pd.DataFrame([metrics_lgbm])
metrics_df_lgbm

# https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

[LightGBM] [Info] Number of positive: 10748, number of negative: 186069
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013510 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 739
[LightGBM] [Info] Number of data points in the train set: 196817, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.054609 -> initscore=-2.851398
[LightGBM] [Info] Start training from score -2.851398


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,LGBM,0.94216,0.570528,0.419125,0.153331,0.224523,0.796985,1.165124


In [8]:
rf = RandomForestClassifier(criterion='entropy', n_jobs=-1, class_weight='balanced', random_state=13)
metrics_rf = train_evaluate_single(rf, X_train, y_train, X_test, y_test, classifier_name="RF")

# Store metrics in a DataFrame
metrics_df_rf = pd.DataFrame([metrics_rf])
metrics_df_rf

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,RF,0.945392,0.5,0.0,0.0,0.0,0.821372,11.240809


In [9]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=-1)
metrics_knn = train_evaluate_single(knn, X_train, y_train, X_test, y_test, classifier_name="kNN")

# Store metrics in a DataFrame
metrics_df_knn = pd.DataFrame([metrics_knn])
metrics_df_knn

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,kNN,0.945392,0.5,0.0,0.0,0.0,0.500515,0.051234


In [10]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate_init=0.001, max_iter=1000, shuffle=True, random_state=13, tol=0.0001, verbose=False, warm_start=False, early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=20)
metrics_mlp = train_evaluate_single(mlp, X_train, y_train, X_test, y_test, classifier_name="MLP")

# Store metrics in a DataFrame
metrics_df_mlp = pd.DataFrame([metrics_mlp])
metrics_df_mlp

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,MLP,0.945392,0.5,0.0,0.0,0.0,0.545868,19.186951


In [11]:
# Ensure X_train and X_test are in float32 or float64 format
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [12]:
qda = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.7, store_covariance=False, tol=0.0001)
metrics_qda = train_evaluate_single(qda, X_train, y_train, X_test, y_test, classifier_name="QDA")

# Store metrics in a DataFrame
metrics_df_qda = pd.DataFrame([metrics_qda])
metrics_df_qda

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,QDA,0.516675,0.703526,0.094374,0.913286,0.17107,0.771322,0.83862


In [13]:
all_features_no_preprocessing_results = pd.concat([metrics_df_xgb, metrics_df_lgbm, metrics_df_rf, metrics_df_knn, metrics_df_qda, metrics_df_mlp])
all_features_no_preprocessing_results = all_features_no_preprocessing_results.round(3)
all_features_no_preprocessing_results.to_csv('../Data/results_all_features_no_preprocessing.csv', index=False)
all_features_no_preprocessing_results

Unnamed: 0,classifier,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,fit_time
0,XGB,0.945,0.547,0.463,0.1,0.165,0.814,2.022
0,LGBM,0.942,0.571,0.419,0.153,0.225,0.797,1.165
0,RF,0.945,0.5,0.0,0.0,0.0,0.821,11.241
0,kNN,0.945,0.5,0.0,0.0,0.0,0.501,0.051
0,QDA,0.517,0.704,0.094,0.913,0.171,0.771,0.839
0,MLP,0.945,0.5,0.0,0.0,0.0,0.546,19.187
