In [1]:
import kagglehub

# Pull the latest version of the dataset into the local kagglehub cache;
# `path` is whatever location the download call returns.
dataset_handle = "zobiabilal/heart-disease-risk-prediction"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/zobiabilal/heart-disease-risk-prediction/versions/1


In [2]:
import os
# Show which files the download placed in the dataset directory.
os.listdir(path)

['heart.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the single CSV shipped with the dataset into the working DataFrame.
csv_path = f'{path}/heart.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first rows of the raw data to see the column names and value formats.
df.head()

Unnamed: 0,General_Health,Exercise,Depression,Diabetes,Sex,Age_Category,Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Heart_Disease
0,Poor,No,No,No,Female,70-74,32.66,14.54,Yes,No,No
1,Very Good,No,No,Yes,Female,70-74,77.11,28.29,No,No,Yes
2,Very Good,Yes,No,Yes,Female,60-64,88.45,33.47,No,No,No
3,Poor,Yes,No,Yes,Male,75-79,93.44,28.73,No,Yes,Yes
4,Good,No,No,No,Male,80+,88.45,24.37,Yes,Yes,No


In [6]:
# (rows, columns) of the raw dataset.
df.shape

(999, 11)

In [7]:
# Report each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   General_Health       999 non-null    object 
 1   Exercise             999 non-null    object 
 2   Depression           999 non-null    object 
 3   Diabetes             999 non-null    object 
 4   Sex                  999 non-null    object 
 5   Age_Category         999 non-null    object 
 6   Weight_(kg)          999 non-null    float64
 7   BMI                  999 non-null    float64
 8   Smoking_History      999 non-null    object 
 9   Alcohol_Consumption  999 non-null    object 
 10  Heart_Disease        999 non-null    object 
dtypes: float64(2), object(9)
memory usage: 86.0+ KB


In [8]:
# Look at the raw rows once more (nothing has been modified yet) before encoding.
df.head()

Unnamed: 0,General_Health,Exercise,Depression,Diabetes,Sex,Age_Category,Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Heart_Disease
0,Poor,No,No,No,Female,70-74,32.66,14.54,Yes,No,No
1,Very Good,No,No,Yes,Female,70-74,77.11,28.29,No,No,Yes
2,Very Good,Yes,No,Yes,Female,60-64,88.45,33.47,No,No,No
3,Poor,Yes,No,Yes,Male,75-79,93.44,28.73,No,Yes,Yes
4,Good,No,No,No,Male,80+,88.45,24.37,Yes,Yes,No


In [9]:
# List the distinct raw age bands so they can be merged into coarser groups.
df['Age_Category'].unique()

array(['70-74', '60-64', '75-79', '80+', '65-69', '50-54', '45-49',
       '18-24', '30-34', '55-59', '35-39', '40-44', '25-29'], dtype=object)

In [10]:
# Ordinal code for self-reported health, from worst (0) to best (4).
health_codes = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very Good': 3, 'Excellent': 4}
df['General_Health'] = df['General_Health'].map(health_codes)

# Binary Yes/No answers become 1/0.
yes_no_codes = {'No': 0, 'Yes': 1}
for column in ('Exercise', 'Depression', 'Diabetes'):
    df[column] = df[column].map(yes_no_codes)

# Sex is coded Female -> 0, Male -> 1; later cells split the data on this code.
df['Sex'] = df['Sex'].map({'Female': 0, 'Male': 1})


In [11]:
def merge_age_groups(age_group):
    """Collapse a fine-grained age band into a coarser one.

    Parameters
    ----------
    age_group : str
        A raw age-band label such as ``'45-49'`` or ``'80+'``.

    Returns
    -------
    str or None
        The merged band label (``'20-29'``, ``'30-39'``, ``'40-49'``,
        ``'50-59'``, ``'60-69'`` or ``'70+'``). ``None`` is returned for any
        value that is not one of the known raw labels. Note that the
        ``'20-29'`` band also absorbs ``'18-24'``.
    """
    # Each entry pairs the raw labels with the merged label they map to.
    merged_bands = (
        (('18-24', '25-29'), '20-29'),
        (('30-34', '35-39'), '30-39'),
        (('40-44', '45-49'), '40-49'),
        (('50-54', '55-59'), '50-59'),
        (('60-64', '65-69'), '60-69'),
        (('70-74', '75-79', '80+'), '70+'),
    )
    for raw_labels, merged_label in merged_bands:
        if age_group in raw_labels:
            return merged_label
    # Unknown labels fall through without a match.
    return None

In [12]:
# Replace every raw age band with its merged band, element by element.
df['Age_Category'] = df['Age_Category'].map(merge_age_groups)

In [13]:
# Confirm that only the merged age bands remain after the mapping.
df['Age_Category'].unique()

array(['70+', '60-69', '50-59', '40-49', '20-29', '30-39'], dtype=object)

In [14]:
# One-hot encode the merged age bands. The encoder expects a 2-D input, so the
# column is selected as a single-column frame and converted to an (n, 1) array.
age = df[['Age_Category']].to_numpy()
enc = OneHotEncoder(categories='auto').fit(age)

# Show the generated feature names (one per learned category).
new_features = enc.get_feature_names_out()
print(new_features)

# Densify the encoder output and wrap it in a DataFrame for concatenation.
encoded_age = enc.transform(age).toarray()
new_age = pd.DataFrame(encoded_age)

['x0_20-29' 'x0_30-39' 'x0_40-49' 'x0_50-59' 'x0_60-69' 'x0_70+']


In [15]:
# Label the one-hot columns with the categories the encoder actually learned,
# in the encoder's own column order, instead of a hand-typed list that could
# silently mislabel columns if the age bands (or their order) ever change.
new_age.columns = list(enc.categories_[0])

In [16]:
# Swap the categorical age column for its one-hot columns: drop the original
# and append the encoded columns on the right (rows align on the shared index).
df = pd.concat([df.drop(columns='Age_Category'), new_age], axis=1)

In [17]:
# Check the frame after the age encoding: one-hot age columns appended, Age_Category gone.
df.head()

Unnamed: 0,General_Health,Exercise,Depression,Diabetes,Sex,Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Heart_Disease,20-29,30-39,40-49,50-59,60-69,70+
0,0,0,0,0,0,32.66,14.54,Yes,No,No,0.0,0.0,0.0,0.0,0.0,1.0
1,3,0,0,1,0,77.11,28.29,No,No,Yes,0.0,0.0,0.0,0.0,0.0,1.0
2,3,1,0,1,0,88.45,33.47,No,No,No,0.0,0.0,0.0,0.0,1.0,0.0
3,0,1,0,1,1,93.44,28.73,No,Yes,Yes,0.0,0.0,0.0,0.0,0.0,1.0
4,2,0,0,0,1,88.45,24.37,Yes,Yes,No,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# Inspect the labels present in the target column before encoding it.
df['Heart_Disease'].unique()

array(['No', 'Yes'], dtype=object)

In [19]:
# Encode the remaining Yes/No columns, including the target, as 1/0.
yes_no_codes = {'No': 0, 'Yes': 1}
for column in ('Smoking_History', 'Alcohol_Consumption', 'Heart_Disease'):
    df[column] = df[column].map(yes_no_codes)



In [20]:
# Verify that every column is numeric now that all categorical fields are encoded.
df.head()

Unnamed: 0,General_Health,Exercise,Depression,Diabetes,Sex,Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Heart_Disease,20-29,30-39,40-49,50-59,60-69,70+
0,0,0,0,0,0,32.66,14.54,1,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,0,0,1,0,77.11,28.29,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0
2,3,1,0,1,0,88.45,33.47,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,1,0,1,1,93.44,28.73,0,1,1,0.0,0.0,0.0,0.0,0.0,1.0
4,2,0,0,0,1,88.45,24.37,1,1,0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
# Separate the feature table from the target vector (Heart_Disease as an array).
X = df.drop(columns='Heart_Disease')
y = df['Heart_Disease'].to_numpy()

In [22]:
# Remove low-variance features (threshold 0.1) and print the feature-matrix
# shape before and after so the number of dropped columns is visible.
# X is rebound to the selector's output.
selector = VarianceThreshold(threshold=0.1)
print(X.shape)
X = selector.fit(X).transform(X)
print(X.shape)

(999, 15)
(999, 11)


In [23]:
# Boolean row masks for the two Sex codes (0 = Female, 1 = Male).
gender_0_mask = df['Sex'].eq(0)
gender_1_mask = df['Sex'].eq(1)

# Group sizes, printed so any imbalance between the groups is visible.
count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the selected features using statistics from all rows.
# NOTE(review): the scaler is fit on the full dataset before any
# cross-validation split, so held-out folds influence the scaling - confirm
# this is acceptable for the experiment.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Partition the scaled features and the targets by group.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  687
Male:  312


In [24]:
# Shared experiment configuration: a single seed drives both the fold
# assignment and the model construction.
seed = 42

# 5-fold cross-validation with shuffling, reproducible via the seed.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Model collection built by the project helper from the same seed.
models = fl.build_models(seed)

# Accumulator that the experiment runs fill with their result frames.
results_list = []

In [25]:
# Start from an empty accumulator. run_experiment fills results_list in place,
# so without this reset re-running the cell would append a second copy of
# every fold's results and distort the downstream statistics.
results_list.clear()

# (banner, group label, features, targets) for each group, run in this order.
experiment_groups = [
    ("Starting experiments for Female(0)", 'Female', X_scaled_Gender_0, y_Gender_0),
    ("Starting experiments for Male(1)", 'Male', X_scaled_Gender_1, y_Gender_1),
]
for banner, group_name, X_group, y_group in experiment_groups:
    print(banner)
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# Stack the collected result frames into one table and print it.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.000000  1.000000  0.000000  1.000000       0     123   
1     2  Female  0.000000  0.992000  0.008000  1.000000       0     124   
2     3  Female  0.000000  1.000000  0.000000  1.000000       0     124   
3     4  Female  0.000000  1.000000  0.000000  1.000000       0     121   
4     5  Female  0.000000  1.000000  0.000000  1.000000       0     121   
5     1    Male  0.000000  1.000000  0.000000  1.000000       0      51   
6     2    Male  0.000000  1.000000  0.000000  1.000000       0      45   
7     3    Male  0.000000  1.000000  0.000000  1.000000       0      50   
8     4    Male  



In [26]:
# Stack the result frames accumulated in results_list into a single table
# (the same expression that produced final_results_df in the previous cell).
results_df = pd.concat(results_list, ignore_index=True)

In [27]:
result_path = './results/K19_result.xlsx'

# to_excel does not create missing parent folders, so make sure the output
# directory exists; otherwise the save fails on a fresh checkout.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# Persist the combined results without the DataFrame index column.
results_df.to_excel(result_path, index=False)

In [28]:
# Reload the saved results from disk and preview them.
# NOTE: this rebinds `df`, which until now held the encoded feature table;
# from this cell onward `df` is the results table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.0,1.0,0.0,1.0,0,123,0,15,...,0,15,0.066667,0.96748,0.03252,0.933333,1,119,4,14
1,2,Female,0.0,0.992,0.008,1.0,0,124,1,13,...,5,12,0.153846,0.96,0.04,0.846154,2,120,5,11
2,3,Female,0.0,1.0,0.0,1.0,0,124,0,13,...,3,12,0.076923,0.983871,0.016129,0.923077,1,122,2,12
3,4,Female,0.0,1.0,0.0,1.0,0,121,0,16,...,2,15,0.0625,0.975207,0.024793,0.9375,1,118,3,15
4,5,Female,0.0,1.0,0.0,1.0,0,121,0,16,...,2,15,0.0625,0.958678,0.041322,0.9375,1,116,5,15


In [29]:
# Group label handed to every test call.
# NOTE(review): presumably selects the reference group - confirm against
# fairtl_statisticaltest.perform_t_tests.
label = 'Female'

# Run the statistical tests once per model, in the original order.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.4237107971667934))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.440686016488678))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(18.0), pvalue=np.float64(0.2933255737660211))
DT -TPR: TtestResult(statistic=np.float64(0.7250098857760101), pvalue=np.float64(0.48910648372750565), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(-2.3990659903426494), pvalue=np.float64(0.04323970669327355), df=np.float64(8.0))
DT - FN/FP: TtestResult(statistic=np.float64(-1.1753945483386177), pvalue=np.float64(0.273630641040447), df=np.float64(8.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(13.5), pvalue=np.float64(0.9160510722818964))
RF - FPR: TtestResult(statistic=np.float64(-3.8736054333179313), pvalue=np.float64(0.004717231828408018), df=np.float64(8.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(20.0), pvalue=np.float64(0.15079365079365079))
LR -TPR: Mannwhit