In [30]:
import kagglehub

# Pull the newest published version of the dataset and remember where it landed.
dataset_handle = "zzettrkalpakbal/full-filled-brain-stroke-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/zzettrkalpakbal/full-filled-brain-stroke-dataset/versions/2


In [31]:
import os
# Show which files the downloaded dataset directory contains.
os.listdir(path)

['full_filled_stroke_data (1).csv', 'full_data.csv']

In [32]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [33]:
# Read the stroke CSV out of the downloaded dataset directory and preview it.
csv_location = '{}/full_filled_stroke_data (1).csv'.format(path)
df = pd.read_csv(csv_location)
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,31.555602,never smoked,1
1,Female,59.0,0,0,Yes,Private,Rural,76.15,30.242937,Unknown,1
2,Male,78.0,0,1,Yes,Private,Urban,219.84,30.698951,Unknown,1
3,Male,57.0,0,1,No,Govt_job,Urban,217.08,33.80841,Unknown,1
4,Male,58.0,0,0,Yes,Private,Rural,189.84,31.378534,Unknown,1


In [34]:
# (rows, columns) of the freshly loaded frame.
df.shape

(201, 11)

In [35]:
# Per-column dtype and non-null counts; used to spot the text columns that still need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             201 non-null    object 
 1   age                201 non-null    float64
 2   hypertension       201 non-null    int64  
 3   heart_disease      201 non-null    int64  
 4   ever_married       201 non-null    object 
 5   work_type          201 non-null    object 
 6   Residence_type     201 non-null    object 
 7   avg_glucose_level  201 non-null    float64
 8   bmi                201 non-null    float64
 9   smoking_status     201 non-null    object 
 10  stroke             201 non-null    int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 17.4+ KB


In [36]:
# Inspect the distinct gender labels before choosing an encoding.
df['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [37]:
# Encode gender as a binary feature (Female -> 0, Male -> 1).
gender_codes = {'Female': 0, 'Male': 1}
# Series.map turns every value that is not a dict key into NaN, so running this
# cell a second time (column already 0/1) would blank the whole column. Skip the
# mapping when the column is already encoded, and fail loudly on unknown labels.
if not df['gender'].isin(list(gender_codes.values())).all():
    df['gender'] = df['gender'].map(gender_codes)
assert df['gender'].notna().all(), "gender contains labels missing from gender_codes"

In [38]:
# Preview the frame to confirm gender now holds numeric codes.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,61.0,0,0,Yes,Self-employed,Rural,202.21,31.555602,never smoked,1
1,0,59.0,0,0,Yes,Private,Rural,76.15,30.242937,Unknown,1
2,1,78.0,0,1,Yes,Private,Urban,219.84,30.698951,Unknown,1
3,1,57.0,0,1,No,Govt_job,Urban,217.08,33.80841,Unknown,1
4,1,58.0,0,0,Yes,Private,Rural,189.84,31.378534,Unknown,1


In [39]:
# Inspect the distinct ever_married labels before choosing an encoding.
df['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [40]:
# Encode ever_married as a binary feature (No -> 0, Yes -> 1).
ever_married_codes = {'No': 0, 'Yes': 1}
# Series.map turns every value that is not a dict key into NaN, so re-running
# this cell on the already-encoded column would blank it. Only map while the
# raw labels are still present, then verify that nothing was lost.
if not df['ever_married'].isin(list(ever_married_codes.values())).all():
    df['ever_married'] = df['ever_married'].map(ever_married_codes)
assert df['ever_married'].notna().all(), "ever_married contains labels missing from ever_married_codes"

In [41]:
# Inspect the distinct work_type labels; this column is one-hot encoded in the next cell.
df['work_type'].unique()

array(['Self-employed', 'Private', 'Govt_job', 'children'], dtype=object)

In [42]:
# One-hot encode work_type. The encoder is fed a bare single-column array rather
# than a DataFrame, which is why the printed feature names carry the "x0_" prefix.
enc = OneHotEncoder()
worktype = df['work_type'].to_numpy().reshape(-1, 1)
enc.fit(worktype)
new_features = enc.get_feature_names_out()
print(new_features)
# transform() output is densified with toarray() before being wrapped in a DataFrame.
worktype_dense = enc.transform(worktype).toarray()
new_worktype = pd.DataFrame(worktype_dense)

['x0_Govt_job' 'x0_Private' 'x0_Self-employed' 'x0_children']


In [43]:
# Label the one-hot columns with the categories the encoder actually learned, in
# the encoder's own column order. A hand-typed list would silently mislabel the
# columns if the set or ordering of work_type values ever changed.
new_worktype.columns = list(enc.categories_[0])

In [44]:
# Swap the raw work_type text column for its one-hot indicator columns
# (appended on the right, aligned on the row index).
df = pd.concat([df.drop(columns='work_type'), new_worktype], axis=1)

In [45]:
# Preview the frame to confirm work_type was replaced by its indicator columns.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Private,Self-employed,children
0,0,61.0,0,0,1,Rural,202.21,31.555602,never smoked,1,0.0,0.0,1.0,0.0
1,0,59.0,0,0,1,Rural,76.15,30.242937,Unknown,1,0.0,1.0,0.0,0.0
2,1,78.0,0,1,1,Urban,219.84,30.698951,Unknown,1,0.0,1.0,0.0,0.0
3,1,57.0,0,1,0,Urban,217.08,33.80841,Unknown,1,1.0,0.0,0.0,0.0
4,1,58.0,0,0,1,Rural,189.84,31.378534,Unknown,1,0.0,1.0,0.0,0.0


In [46]:
# Encode Residence_type as a binary feature (Rural -> 0, Urban -> 1).
residence_codes = {'Rural': 0, 'Urban': 1}
# Series.map turns every value that is not a dict key into NaN. This column is
# never inspected with .unique() beforehand, so an unexpected label (or a second
# run of this cell) would silently inject NaNs. Guard the re-run and verify.
if not df['Residence_type'].isin(list(residence_codes.values())).all():
    df['Residence_type'] = df['Residence_type'].map(residence_codes)
assert df['Residence_type'].notna().all(), "Residence_type contains labels missing from residence_codes"

In [47]:
# Inspect the distinct smoking_status labels before choosing an encoding.
df['smoking_status'].unique()

array(['never smoked', 'Unknown', 'formerly smoked', 'smokes'],
      dtype=object)

In [48]:
# Encode smoking_status as integer codes 0-3.
# NOTE(review): this imposes an ordering with 'Unknown' as the lowest level;
# confirm that is intended rather than one-hot encoding as done for work_type.
smoking_codes = {'Unknown': 0, 'never smoked': 1, 'formerly smoked': 2, 'smokes': 3}
# Series.map turns every value that is not a dict key into NaN, so re-running
# this cell on the already-encoded column would blank it. Only map while the
# raw labels are still present, then verify that nothing was lost.
if not df['smoking_status'].isin(list(smoking_codes.values())).all():
    df['smoking_status'] = df['smoking_status'].map(smoking_codes)
assert df['smoking_status'].notna().all(), "smoking_status contains labels missing from smoking_codes"

In [49]:
# Preview the frame once the text columns have been converted to numeric codes.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Private,Self-employed,children
0,0,61.0,0,0,1,0,202.21,31.555602,1,1,0.0,0.0,1.0,0.0
1,0,59.0,0,0,1,0,76.15,30.242937,0,1,0.0,1.0,0.0,0.0
2,1,78.0,0,1,1,1,219.84,30.698951,0,1,0.0,1.0,0.0,0.0
3,1,57.0,0,1,0,1,217.08,33.80841,0,1,1.0,0.0,0.0,0.0
4,1,58.0,0,0,1,0,189.84,31.378534,0,1,0.0,1.0,0.0,0.0


In [50]:
# Separate the prediction target from the feature columns.
target_column = 'stroke'
X = df.drop(columns=target_column)
y = df[target_column].to_numpy()

In [51]:
# Filter out low-variance features (threshold 0.1, measured on the unscaled
# columns) and report the feature-matrix shape before and after.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)
print(X.shape)

(201, 13)
(201, 12)


In [52]:
# Boolean row selectors, one per gender code (0 = Female, 1 = Male).
gender_0_mask = df['gender'].eq(0)
gender_1_mask = df['gender'].eq(1)

# Group sizes.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features using statistics from all rows of both groups.
# NOTE(review): the scaler is fitted before any cross-validation split, so
# held-out folds contribute to the mean/std -- confirm whether fl.run_experiment
# is expected to handle scaling per fold instead.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Slice the scaled features and the labels into the two gender groups.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  97
Male:  104


In [53]:
# One seed shared by the model factory and the cross-validation splitter.
seed = 42

# Accumulator that is handed to fl.run_experiment in the next cell.
results_list = list()

models = fl.build_models(seed)

# Shuffled 5-fold stratified splitter.
kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

In [54]:
# results_list is passed into fl.run_experiment and concatenated afterwards, i.e.
# it is filled in place. Empty it first: otherwise re-running this cell stacks a
# second copy of every fold under the first and distorts the later statistics.
results_list.clear()

# Cross-validate every model on the female subgroup ...
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

# ... and on the male subgroup, using the same splitter and model set.
print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Combine the collected result frames into a single table and show it.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT




Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group  SVM_TPR  SVM_TNR  SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0     1  Female     0.00    1.000    0.000     1.00       0      15       0   
1     2  Female     0.25    0.875    0.125     0.75       1      14       2   
2     3  Female     0.00    1.000    0.000     1.00       0      15       0   
3     4  Female     0.00    1.000    0.000     1.00       0      15       0   
4     5  Female     0.00    1.000    0.000     1.00       0      15       0   
5     1    Male     0.00    1.000    0.000     1.00       0      17       0   
6     2    Male     0.00    1.000    0.000     1.00       0      17       0   
7     3    Male     0.00    1.000    0.000     1.00       0 



In [55]:
# Stack the collected result frames row-wise into one table with a fresh 0..n-1 index.
results_df = pd.concat(results_list, axis=0, ignore_index=True)

In [56]:
# Persist the per-fold metrics as an Excel workbook (row index omitted).
result_path = './results/k18_result.xlsx'
# to_excel does not create missing parent folders, so make sure ./results exists;
# on a fresh checkout the write would otherwise fail.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [57]:
# Read the saved metrics back from disk and preview them.
# Note: this rebinds `df`, which up to this point held the stroke dataset.
df = pd.read_excel(io=result_path)
df.head(n=5)

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.0,1.0,0.0,1.0,0,15,0,5,...,2,2,0.4,1.0,0.0,0.6,2,15,0,3
1,2,Female,0.25,0.875,0.125,0.75,1,14,2,3,...,3,3,0.5,0.3125,0.6875,0.5,2,5,11,2
2,3,Female,0.0,1.0,0.0,1.0,0,15,0,4,...,1,4,0.0,1.0,0.0,1.0,0,15,0,4
3,4,Female,0.0,1.0,0.0,1.0,0,15,0,4,...,1,3,0.25,0.933333,0.066667,0.75,1,14,1,3
4,5,Female,0.0,1.0,0.0,1.0,0,15,0,4,...,3,2,0.25,0.733333,0.266667,0.75,1,11,4,3


In [58]:
# Group label handed to every fl.perform_t_tests call below.
label = 'Female'

# Run the significance tests once per model, in the original reporting order.
# A loop replaces seven copy-pasted calls so the model list lives in one place.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(14.0), pvalue=np.float64(0.7971696931337416))
DT -TPR: TtestResult(statistic=np.float64(-1.0145065254199583), pvalue=np.float64(0.3400377613123261), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(0.9448711245555644), pvalue=np.float64(0.37238323438936843), df=np.float64(8.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(10.0), pvalue=np.float64(0.6742358755985722))
RF -TPR: MannwhitneyuResult(statistic=np.float64(14.5), pvalue=np.float64(0.7432568200144034))
RF - FPR: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=np.float64(1.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(13.5), pvalue=np.float64(0.9160510722818964))
LR -TPR: MannwhitneyuResult(statistic=np.float64(18.0), pvalue=np.float64(0.2585329597855