In [53]:
import kagglehub

# Fetch the Kaggle brain-stroke dataset through kagglehub.
# The handle carries no version suffix, so the latest published version is requested.
dataset_handle = "jillanisofttech/brain-stroke-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Report the local directory that holds the files; later cells read from it.
print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/jillanisofttech/brain-stroke-dataset/versions/1


In [54]:
import os
# List the entries of the downloaded dataset directory.
os.listdir(path)

['brain_stroke.csv']

In [55]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [56]:
# Read the stroke table from the downloaded dataset directory and preview it.
csv_file = os.path.join(path, 'brain_stroke.csv')
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [57]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(4981, 11)

In [58]:
# Summarize column dtypes and non-null counts of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [59]:
# Distinct smoking_status labels present in the data.
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [60]:
# Integer codes for the text-valued columns encoded in this cell.
# NOTE(review): the smoking_status codes impose an ordering
# (never smoked < Unknown < formerly smoked < smokes) — confirm that is intended.
category_codes = {
    'smoking_status': {'never smoked': 0, 'Unknown': 1, 'formerly smoked': 2, 'smokes': 3},
    'gender': {'Female': 0, 'Male': 1},
    'ever_married': {'No': 0, 'Yes': 1},
}

for column, codes in category_codes.items():
    # Re-running the cell must be a no-op: Series.map on already-encoded
    # integers finds no matching key and would turn the whole column into NaN.
    if df[column].isin(list(codes.values())).all():
        continue

    encoded = df[column].map(codes)

    # Series.map yields NaN for any label missing from the mapping; fail loudly
    # instead of letting NaN flow into feature selection and scaling.
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in '{column}': {list(unmapped)}")

    df[column] = encoded

In [61]:
# Distinct work_type labels present in the data.
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children'], dtype=object)

In [62]:
# Distinct Residence_type labels present in the data.
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [63]:
# One-hot encode work_type. The column is selected as a single-column frame so
# the encoder receives a 2-D (n_rows, 1) array.
enc = OneHotEncoder(categories='auto')
worktype = df[['work_type']].to_numpy()
enc.fit(worktype)

# Show the generated feature names (one per category found in the data).
new_features = enc.get_feature_names_out()
print(new_features)

# The transform result is densified with .toarray() before being wrapped in a frame.
new_worktype = pd.DataFrame(enc.transform(worktype).toarray())

['x0_Govt_job' 'x0_Private' 'x0_Self-employed' 'x0_children']


In [64]:
# Label the one-hot columns from the fitted encoder's own category order rather
# than a hand-typed list, so the names cannot drift out of step with the columns.
new_worktype.columns = list(enc.categories_[0])

In [65]:
# Swap the text work_type column for its one-hot indicator columns, then preview.
df = pd.concat([df.drop(columns='work_type'), new_worktype], axis=1)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Private,Self-employed,children
0,1,67.0,0,1,1,Urban,228.69,36.6,2,1,0.0,1.0,0.0,0.0
1,1,80.0,0,1,1,Rural,105.92,32.5,0,1,0.0,1.0,0.0,0.0
2,0,49.0,0,0,1,Urban,171.23,34.4,3,1,0.0,1.0,0.0,0.0
3,0,79.0,1,0,1,Rural,174.12,24.0,0,1,0.0,0.0,1.0,0.0
4,1,81.0,0,0,1,Urban,186.21,29.0,2,1,0.0,1.0,0.0,0.0


In [66]:
# One-hot encode Residence_type. The column is selected as a single-column frame
# so the encoder receives a 2-D (n_rows, 1) array.
enc2 = OneHotEncoder(categories='auto')
residence = df[['Residence_type']].to_numpy()
enc2.fit(residence)

# Show the generated feature names (one per category found in the data).
new_features = enc2.get_feature_names_out()
print(new_features)

# The transform result is densified with .toarray() before being wrapped in a frame.
new_residence = pd.DataFrame(enc2.transform(residence).toarray())

['x0_Rural' 'x0_Urban']


In [67]:
# Label the one-hot columns from the fitted encoder's own category order rather
# than a hand-typed list, so the names cannot drift out of step with the columns.
new_residence.columns = list(enc2.categories_[0])

In [68]:
# Swap the text Residence_type column for its one-hot indicator columns, then preview.
df = pd.concat([df.drop(columns='Residence_type'), new_residence], axis=1)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Private,Self-employed,children,Rural,Urban
0,1,67.0,0,1,1,228.69,36.6,2,1,0.0,1.0,0.0,0.0,0.0,1.0
1,1,80.0,0,1,1,105.92,32.5,0,1,0.0,1.0,0.0,0.0,1.0,0.0
2,0,49.0,0,0,1,171.23,34.4,3,1,0.0,1.0,0.0,0.0,0.0,1.0
3,0,79.0,1,0,1,174.12,24.0,0,1,0.0,0.0,1.0,0.0,1.0,0.0
4,1,81.0,0,0,1,186.21,29.0,2,1,0.0,1.0,0.0,0.0,0.0,1.0


In [69]:
# Separate the feature table from the stroke target (kept as a NumPy array).
X = df.drop(columns='stroke')
y = df['stroke'].to_numpy()

In [70]:
# Re-check dtypes after encoding: every column should now be numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   int64  
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   int64  
 5   avg_glucose_level  4981 non-null   float64
 6   bmi                4981 non-null   float64
 7   smoking_status     4981 non-null   int64  
 8   stroke             4981 non-null   int64  
 9   Govt_job           4981 non-null   float64
 10  Private            4981 non-null   float64
 11  Self-employed      4981 non-null   float64
 12  children           4981 non-null   float64
 13  Rural              4981 non-null   float64
 14  Urban              4981 non-null   float64
dtypes: float64(9), int64(6)
memory usage: 583.8 KB


In [71]:
# Drop low-variance features, printing the feature-matrix shape before and after.
# NOTE(review): the threshold is applied to unscaled columns, and a 0/1 column
# has variance p*(1-p), so indicator columns whose positive rate is below
# roughly 11% fall under 0.1 and are removed — confirm that is intended.
print(X.shape)
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)
print(X.shape)

(4981, 14)
(4981, 12)


In [72]:
# Boolean row masks for the two gender codes (0 = Female, 1 = Male).
gender_0_mask = df['gender'].eq(0)
gender_1_mask = df['gender'].eq(1)

# Group sizes: summing a boolean mask counts its True entries.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardize every retained feature using statistics from the full data set.
# NOTE(review): the scaler is fitted before the cross-validation split, so
# held-out folds contribute to the mean/std — confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Slice the scaled features and the labels into per-gender subsets.
X_scaled_Gender_0, X_scaled_Gender_1 = X_scaled[gender_0_mask], X_scaled[gender_1_mask]
y_Gender_0, y_Gender_1 = y[gender_0_mask], y[gender_1_mask]

Female:  2907
Male:  2074


In [73]:
# Shared experiment setup: a single seed is passed to both the fold splitter
# and the model factory.
seed = 42
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
models = fl.build_models(seed)

# Accumulator handed to fl.run_experiment; its contents are concatenated afterwards.
results_list = list()

In [74]:
# Run the cross-validated experiments once per gender group.
# fl.run_experiment is handed results_list and the list is concatenated below,
# so it is emptied first: re-running this cell must not accumulate duplicate rows.
results_list.clear()

experiment_groups = [
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
]
for group_label, group_code, group_X, group_y in experiment_groups:
    print(f"Starting experiments for {group_label}({group_code})")
    fl.run_experiment(kf, models, group_X, group_y, group_label, results_list)

# Stack everything collected in results_list into one table with a fresh index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group  SVM_TPR  SVM_TNR  SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0     1  Female      0.0      1.0      0.0      1.0       0     556       0   
1     2  Female      0.0      1.0      0.0      1.0       0     557       0   
2     3  Female      0.0      1.0      0.0      1.0       0     552       0   
3     4  Female      0.0      1.0      0.0      1.0       0     546       0   
4     5  Female      0.0      1.0      0.0      1.0       0     556       0   
5     1    Male      0.0      1.0      0.0      1.0       0     397       0   
6     2    Male      0.0      1.0      0.0      1.0       0     393       0   
7     3    Male      0.0      1.0      0.0      1.0       0 



In [75]:
# Combine the frames collected in results_list into one table, renumbering the index.
results_df = pd.concat(results_list, ignore_index=True)

In [76]:
# Persist the combined results as an Excel workbook (row index omitted).
result_path = './results/k12_result.xlsx'

# DataFrame.to_excel does not create missing parent folders, so make sure the
# output directory exists before writing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [77]:
# Reload the saved results from disk and preview them.
# NOTE: this rebinds `df`, which until this cell held the stroke dataset.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0,1,0,1,0,556,0,26,...,2,25,0.461538,0.872302,0.127698,0.538462,12,485,71,14
1,2,Female,0,1,0,1,0,557,0,25,...,3,25,0.44,0.885099,0.114901,0.56,11,493,64,14
2,3,Female,0,1,0,1,0,552,0,29,...,2,29,0.482759,0.860507,0.139493,0.517241,14,475,77,15
3,4,Female,0,1,0,1,0,546,0,35,...,0,34,0.4,0.888278,0.111722,0.6,14,485,61,21
4,5,Female,0,1,0,1,0,556,0,25,...,1,24,0.32,0.839928,0.160072,0.68,8,467,89,17


In [78]:
# Run fl.perform_t_tests on the reloaded results for every model, keeping the
# original call order so the printed report is unchanged.
label = 'Female'

for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
SVM - FN/FP: TtestResult(statistic=np.float64(2.6438156312704884), pvalue=np.float64(0.02953789799685505), df=np.float64(8.0))
DT -TPR: TtestResult(statistic=np.float64(-0.030611326779845176), pvalue=np.float64(0.9763294185966797), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(-0.7114553139479731), pvalue=np.float64(0.497019913914162), df=np.float64(8.0))
DT - FN/FP: TtestResult(statistic=np.float64(0.448584312304482), pvalue=np.float64(0.6656328042644181), df=np.float64(8.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
RF - FPR: TtestResult(statistic=np.float64(-1.6446616503839806), pvalue=np.float64(0.1386602517571117), df=np.float64(8.0))
RF - FN/FP: TtestResult(statistic=np.float64(1.6363561120285681), pvalue=np.float64(0.14040224284863254), df=np.float64