In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shashwatwork/cerebral-stroke-predictionimbalaced-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/shashwatwork/cerebral-stroke-predictionimbalaced-dataset/versions/1


In [2]:
import os
os.listdir(path)

['dataset.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
df = pd.read_csv(f"{path}/dataset.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [5]:
df.shape

(43400, 12)

In [6]:
df = df.drop_duplicates().reset_index(drop=True)

In [7]:
df.shape

(43400, 12)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


In [9]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [10]:
df.drop('id', axis=1, inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29072 entries, 0 to 29071
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             29072 non-null  object 
 1   age                29072 non-null  float64
 2   hypertension       29072 non-null  int64  
 3   heart_disease      29072 non-null  int64  
 4   ever_married       29072 non-null  object 
 5   work_type          29072 non-null  object 
 6   Residence_type     29072 non-null  object 
 7   avg_glucose_level  29072 non-null  float64
 8   bmi                29072 non-null  float64
 9   smoking_status     29072 non-null  object 
 10  stroke             29072 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 2.4+ MB


In [12]:
df = df[df['gender'] != 'Other']
df.reset_index(drop=True, inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29065 entries, 0 to 29064
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             29065 non-null  object 
 1   age                29065 non-null  float64
 2   hypertension       29065 non-null  int64  
 3   heart_disease      29065 non-null  int64  
 4   ever_married       29065 non-null  object 
 5   work_type          29065 non-null  object 
 6   Residence_type     29065 non-null  object 
 7   avg_glucose_level  29065 non-null  float64
 8   bmi                29065 non-null  float64
 9   smoking_status     29065 non-null  object 
 10  stroke             29065 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 2.4+ MB


In [14]:
df['ever_married'] = df['ever_married'].map({'No': 0, 'Yes': 1})

In [15]:
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [16]:
enc = OneHotEncoder(categories='auto')
worktype = df['work_type'].values.reshape(-1, 1)
enc.fit(worktype)
new_features = enc.get_feature_names_out()
print(new_features)
new_worktype = pd.DataFrame(enc.transform(worktype).toarray())

['x0_Govt_job' 'x0_Never_worked' 'x0_Private' 'x0_Self-employed'
 'x0_children']


In [17]:
new_worktype.columns = ['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children']

In [18]:
df = pd.concat([df, new_worktype], axis=1)
df.drop('work_type', axis=1, inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Never_worked,Private,Self-employed,children
0,Male,58.0,1,0,1,Urban,87.96,39.2,never smoked,0,0.0,0.0,1.0,0.0,0.0
1,Female,70.0,0,0,1,Rural,69.04,35.9,formerly smoked,0,0.0,0.0,1.0,0.0,0.0
2,Female,52.0,0,0,1,Urban,77.59,17.7,formerly smoked,0,0.0,0.0,1.0,0.0,0.0
3,Female,75.0,0,1,1,Rural,243.53,27.0,never smoked,0,0.0,0.0,0.0,1.0,0.0
4,Female,32.0,0,0,1,Rural,77.67,32.3,smokes,0,0.0,0.0,1.0,0.0,0.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29065 entries, 0 to 29064
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             29065 non-null  object 
 1   age                29065 non-null  float64
 2   hypertension       29065 non-null  int64  
 3   heart_disease      29065 non-null  int64  
 4   ever_married       29065 non-null  int64  
 5   Residence_type     29065 non-null  object 
 6   avg_glucose_level  29065 non-null  float64
 7   bmi                29065 non-null  float64
 8   smoking_status     29065 non-null  object 
 9   stroke             29065 non-null  int64  
 10  Govt_job           29065 non-null  float64
 11  Never_worked       29065 non-null  float64
 12  Private            29065 non-null  float64
 13  Self-employed      29065 non-null  float64
 14  children           29065 non-null  float64
dtypes: float64(8), int64(4), object(3)
memory usage: 3.3+ MB


In [20]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [21]:
df['gender'] = df['gender'].map({'Female': 0, 'Male': 1})

In [22]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Never_worked,Private,Self-employed,children
0,1,58.0,1,0,1,Urban,87.96,39.2,never smoked,0,0.0,0.0,1.0,0.0,0.0
1,0,70.0,0,0,1,Rural,69.04,35.9,formerly smoked,0,0.0,0.0,1.0,0.0,0.0
2,0,52.0,0,0,1,Urban,77.59,17.7,formerly smoked,0,0.0,0.0,1.0,0.0,0.0
3,0,75.0,0,1,1,Rural,243.53,27.0,never smoked,0,0.0,0.0,0.0,1.0,0.0
4,0,32.0,0,0,1,Rural,77.67,32.3,smokes,0,0.0,0.0,1.0,0.0,0.0


In [23]:
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [24]:
enc2 = OneHotEncoder(categories='auto')
residence = df['Residence_type'].values.reshape(-1, 1)
enc2.fit(residence)
new_features = enc2.get_feature_names_out()
print(new_features)
new_residence = pd.DataFrame(enc2.transform(residence).toarray())

['x0_Rural' 'x0_Urban']


In [25]:
new_residence.columns = ['Urban', 'Rural']

In [26]:
df = pd.concat([df, new_residence], axis=1)
df.drop('Residence_type', axis=1, inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Never_worked,Private,Self-employed,children,Urban,Rural
0,1,58.0,1,0,1,87.96,39.2,never smoked,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0,70.0,0,0,1,69.04,35.9,formerly smoked,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,52.0,0,0,1,77.59,17.7,formerly smoked,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0,75.0,0,1,1,243.53,27.0,never smoked,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0,32.0,0,0,1,77.67,32.3,smokes,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [27]:
df['smoking_status'].unique()

array(['never smoked', 'formerly smoked', 'smokes'], dtype=object)

In [28]:
df['smoking_status'] = df['smoking_status'].map({'never smoked': 0, 'formerly smoked': 1, 'smokes': 2})
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,Govt_job,Never_worked,Private,Self-employed,children,Urban,Rural
0,1,58.0,1,0,1,87.96,39.2,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0,70.0,0,0,1,69.04,35.9,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,52.0,0,0,1,77.59,17.7,1,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0,75.0,0,1,1,243.53,27.0,0,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0,32.0,0,0,1,77.67,32.3,2,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [29]:
y = df['stroke'].values
X = df.drop('stroke', axis=1)

In [30]:
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit_transform(X)
print(X.shape)

(29065, 15)
(29065, 11)


In [31]:
# build mask
gender_0_mask = df['gender'] == 0
gender_1_mask = df['gender'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  17852
Male:  11213


In [32]:
seed = 42
models = fl.build_models(seed)

results_list = []

kf = KFold(n_splits=10, shuffle=True, random_state=seed)

In [33]:
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN



In [34]:
results_df = pd.concat(results_list, ignore_index=True)

In [35]:
result_path = './results/K52_result.xlsx'
results_df.to_excel(result_path, index=False)

In [36]:
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0,1,0,1,0,1760,0,26,...,0,26,0.0,0.977841,0.022159,1.0,0,1721,39,26
1,2,Female,0,1,0,1,0,1750,0,36,...,0,36,0.111111,0.978286,0.021714,0.888889,4,1712,38,32
2,3,Female,0,1,0,1,0,1749,0,36,...,0,36,0.055556,0.975415,0.024585,0.944444,2,1706,43,34
3,4,Female,0,1,0,1,0,1766,0,19,...,0,19,0.0,0.976784,0.023216,1.0,0,1725,41,19
4,5,Female,0,1,0,1,0,1751,0,34,...,0,34,0.176471,0.981725,0.018275,0.823529,6,1719,32,28


In [37]:
label = 'Female'

fl.perform_t_tests(df, 'SVM', label)
fl.perform_t_tests(df, 'DT', label)
fl.perform_t_tests(df, 'RF', label)
fl.perform_t_tests(df, 'LR', label)
fl.perform_t_tests(df, 'KNN', label)
fl.perform_t_tests(df, 'ANN', label)
fl.perform_t_tests(df, 'NB', label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
SVM - FN/FP: TtestResult(statistic=np.float64(2.928204477054587), pvalue=np.float64(0.008980732767502027), df=np.float64(18.0))
DT -TPR: TtestResult(statistic=np.float64(1.15340754723006), pvalue=np.float64(0.2638295665907278), df=np.float64(18.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(4.0), pvalue=np.float64(0.0005828399431792743))
DT - FN/FP: TtestResult(statistic=np.float64(1.5204662907371955), pvalue=np.float64(0.1457655595759537), df=np.float64(18.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
RF - FPR: MannwhitneyuResult(statistic=np.float64(62.0), pvalue=np.float64(0.30771074846953017))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(52.5), pvalue=np.float64(0.879514088297801))
LR -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
LR - FPR: 