In [18]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("thedevastator/exploring-risk-factors-for-cardiovascular-diseas")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/thedevastator/exploring-risk-factors-for-cardiovascular-diseas/versions/2


In [19]:
import os
os.listdir(path)

['heart_data.csv']

In [20]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [21]:
df = pd.read_csv(f'{path}/heart_data.csv')
df.head()

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [22]:
df.shape

(70000, 14)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        70000 non-null  int64  
 1   id           70000 non-null  int64  
 2   age          70000 non-null  int64  
 3   gender       70000 non-null  int64  
 4   height       70000 non-null  int64  
 5   weight       70000 non-null  float64
 6   ap_hi        70000 non-null  int64  
 7   ap_lo        70000 non-null  int64  
 8   cholesterol  70000 non-null  int64  
 9   gluc         70000 non-null  int64  
 10  smoke        70000 non-null  int64  
 11  alco         70000 non-null  int64  
 12  active       70000 non-null  int64  
 13  cardio       70000 non-null  int64  
dtypes: float64(1), int64(13)
memory usage: 7.5 MB


In [24]:
a = df['gender'].unique()

In [25]:
df['gender'] = df['gender'].map({a[0]:0, a[1]:1})
df['gender'].unique()

array([0, 1])

In [26]:
y = df['cardio'].values
X = df.drop('cardio', axis=1)

In [27]:
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit_transform(X)
print(X.shape)

(70000, 13)
(70000, 11)


In [28]:
# build mask
gender_0_mask = df['gender'] == 0
gender_1_mask = df['gender'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  24470
Male:  45530


In [29]:
seed = 42
models = fl.build_models(seed)

results_list = []

kf = KFold(n_splits=15, shuffle=True, random_state=seed)

In [13]:
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN

In [14]:
results_df = pd.concat(results_list, ignore_index=True)

In [15]:
result_path = './results/k25_result.xlsx'
results_df.to_excel(result_path, index=False)

In [16]:
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.680682,0.759309,0.240691,0.319318,599,571,181,281,...,169,302,0.271591,0.917553,0.082447,0.728409,239,690,62,641
1,2,Female,0.683274,0.773131,0.226869,0.316726,576,610,179,267,...,165,253,0.223013,0.936629,0.063371,0.776987,188,739,50,655
2,3,Female,0.700865,0.744836,0.255164,0.299135,567,613,210,242,...,226,214,0.241038,0.917375,0.082625,0.758962,195,755,68,614
3,4,Female,0.698113,0.737157,0.262843,0.301887,555,617,220,240,...,215,248,0.232704,0.917563,0.082437,0.767296,185,768,69,610
4,5,Female,0.690909,0.767038,0.232962,0.309091,570,619,188,255,...,162,271,0.221818,0.916976,0.083024,0.778182,183,740,67,642


In [17]:
label = 'Female'

fl.perform_t_tests(df, 'SVM', label)
fl.perform_t_tests(df, 'DT', label)
fl.perform_t_tests(df, 'RF', label)
fl.perform_t_tests(df, 'LR', label)
fl.perform_t_tests(df, 'KNN', label)
fl.perform_t_tests(df, 'ANN', label)
fl.perform_t_tests(df, 'NB', label)


SVM -TPR: TtestResult(statistic=np.float64(-1.051224234202105), pvalue=np.float64(0.3021440679237131), df=np.float64(28.0))
SVM - FPR: TtestResult(statistic=np.float64(0.11738654148837806), pvalue=np.float64(0.9076193212479762), df=np.float64(21.990232109095345))
SVM - FN/FP: TtestResult(statistic=np.float64(1.527601021166017), pvalue=np.float64(0.13783048532728398), df=np.float64(28.0))
DT -TPR: MannwhitneyuResult(statistic=np.float64(95.0), pvalue=np.float64(0.48073111045562256))
DT - FPR: TtestResult(statistic=np.float64(0.7874664747166839), pvalue=np.float64(0.437624367977426), df=np.float64(28.0))
DT - FN/FP: TtestResult(statistic=np.float64(1.4198834616056277), pvalue=np.float64(0.1691932673553153), df=np.float64(22.762696567415905))
RF -TPR: TtestResult(statistic=np.float64(-1.5846690471098062), pvalue=np.float64(0.12427237087773485), df=np.float64(28.0))
RF - FPR: TtestResult(statistic=np.float64(0.8370335828636922), pvalue=np.float64(0.4096612486464998), df=np.float64(28.0))
R