In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("fatemehmehrparvar/dementia")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/fatemehmehrparvar/dementia/versions/1


In [2]:
import os
os.listdir(path)

['OPTIMAL_combined_3studies_6feb2020.csv']

In [3]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
df = pd.read_csv(f'{path}/OPTIMAL_combined_3studies_6feb2020.csv')
df.head()

Unnamed: 0,ID,age,gender,dementia,dementia_all,educationyears,EF,PS,Global,diabetes,...,hypercholesterolemia,lacunes_num,fazekas_cat,study,study1,SVD Simple Score,SVD Amended Score,Fazekas,lac_count,CMB_count
0,1,52.67,male,0.0,0,11.0,-2.403333,-1.29,-1.287,0,...,Yes,more-than-zero,2 to 3,scans,scans,3.0,7.0,3,>5,>=1
1,10,64.58,male,0.0,0,10.0,1.28,0.36,0.744,0,...,Yes,more-than-zero,0 to 1,scans,scans,2.0,3.0,1,1 to 2,>=1
2,100,74.92,male,0.0,0,8.0,-1.44,-1.52,-0.922,0,...,Yes,more-than-zero,0 to 1,scans,scans,1.0,2.0,1,1 to 2,0
3,101,74.83,male,1.0,1,9.0,,-2.136271,-1.301102,0,...,Yes,more-than-zero,2 to 3,scans,scans,2.0,4.0,2,3 to 5,0
4,102,79.25,male,0.0,0,10.0,-0.92,-1.493333,-0.924,0,...,Yes,more-than-zero,2 to 3,scans,scans,2.0,3.0,2,1 to 2,0


In [5]:
df.shape

(1842, 22)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1842 entries, 0 to 1841
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    1842 non-null   int64  
 1   age                   1842 non-null   float64
 2   gender                1842 non-null   object 
 3   dementia              1808 non-null   float64
 4   dementia_all          1842 non-null   int64  
 5   educationyears        1842 non-null   float64
 6   EF                    1634 non-null   float64
 7   PS                    1574 non-null   float64
 8   Global                1534 non-null   float64
 9   diabetes              1842 non-null   int64  
 10  smoking               1831 non-null   object 
 11  hypertension          1842 non-null   object 
 12  hypercholesterolemia  1842 non-null   object 
 13  lacunes_num           1842 non-null   object 
 14  fazekas_cat           1842 non-null   object 
 15  study                

In [7]:
df.drop(['SVD Simple Score', 'SVD Amended Score', 'ID'], axis=1, inplace=True)

In [8]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1510 non-null   float64
 1   gender                1510 non-null   object 
 2   dementia              1510 non-null   float64
 3   dementia_all          1510 non-null   int64  
 4   educationyears        1510 non-null   float64
 5   EF                    1510 non-null   float64
 6   PS                    1510 non-null   float64
 7   Global                1510 non-null   float64
 8   diabetes              1510 non-null   int64  
 9   smoking               1510 non-null   object 
 10  hypertension          1510 non-null   object 
 11  hypercholesterolemia  1510 non-null   object 
 12  lacunes_num           1510 non-null   object 
 13  fazekas_cat           1510 non-null   object 
 14  study                 1510 non-null   object 
 15  study1               

In [9]:
df.head()

Unnamed: 0,age,gender,dementia,dementia_all,educationyears,EF,PS,Global,diabetes,smoking,hypertension,hypercholesterolemia,lacunes_num,fazekas_cat,study,study1,Fazekas,lac_count,CMB_count
0,52.67,male,0.0,0,11.0,-2.403333,-1.29,-1.287,0,current-smoker,Yes,Yes,more-than-zero,2 to 3,scans,scans,3,>5,>=1
1,64.58,male,0.0,0,10.0,1.28,0.36,0.744,0,ex-smoker,Yes,Yes,more-than-zero,0 to 1,scans,scans,1,1 to 2,>=1
2,74.92,male,0.0,0,8.0,-1.44,-1.52,-0.922,0,never-smoker,Yes,Yes,more-than-zero,0 to 1,scans,scans,1,1 to 2,0
3,79.25,male,0.0,0,10.0,-0.92,-1.493333,-0.924,0,ex-smoker,Yes,Yes,more-than-zero,2 to 3,scans,scans,2,1 to 2,0
4,62.75,male,0.0,0,24.0,-0.38,-1.11,-0.526667,0,ex-smoker,Yes,Yes,zero,2 to 3,scans,scans,2,Zero,0


In [10]:
df['gender'] = df['gender'].map({'female': 0, 'male': 1})

In [11]:
df['smoking'] = df['smoking'].map({'never-smoker': 0, 'ex-smoker': 1, 'current-smoker': 2})

In [12]:
df['hypertension'].unique()
df['hypercholesterolemia'].unique()

array(['Yes', 'No'], dtype=object)

In [13]:
df['hypertension'] = df['hypertension'].map({'No': 0, 'Yes': 1})
df['hypercholesterolemia'] = df['hypercholesterolemia'].map({'No': 0, 'Yes': 1})

In [14]:
df['study'].unique()

array(['scans', 'rundmc', 'ASPS-elderly', 'ASPS-family'], dtype=object)

In [15]:
df['lacunes_num'] = df['lacunes_num'].map({'zero': 0, 'more-than-zero': 1})
df['fazekas_cat'] = df['fazekas_cat'].map({'0 to 1': 0, '2 to 3': 1})


In [16]:
enc = OneHotEncoder(categories='auto')
study = df['study'].values.reshape(-1, 1)
enc.fit(study)
new_features = enc.get_feature_names_out()
print(new_features)
new_study = pd.DataFrame(enc.transform(study).toarray())

['x0_ASPS-elderly' 'x0_ASPS-family' 'x0_rundmc' 'x0_scans']


In [17]:
new_study.columns = ['ASPS-elderly', 'ASPS-family', 'rundmc', 'scans']

In [18]:
df = pd.concat([df, new_study], axis=1)
df.drop('study', axis=1, inplace=True)

In [19]:
df.head()

Unnamed: 0,age,gender,dementia,dementia_all,educationyears,EF,PS,Global,diabetes,smoking,...,lacunes_num,fazekas_cat,study1,Fazekas,lac_count,CMB_count,ASPS-elderly,ASPS-family,rundmc,scans
0,52.67,1,0.0,0,11.0,-2.403333,-1.29,-1.287,0,2,...,1,1,scans,3,>5,>=1,0.0,0.0,0.0,1.0
1,64.58,1,0.0,0,10.0,1.28,0.36,0.744,0,1,...,1,0,scans,1,1 to 2,>=1,0.0,0.0,0.0,1.0
2,74.92,1,0.0,0,8.0,-1.44,-1.52,-0.922,0,0,...,1,0,scans,1,1 to 2,0,0.0,0.0,0.0,1.0
3,79.25,1,0.0,0,10.0,-0.92,-1.493333,-0.924,0,1,...,1,1,scans,2,1 to 2,0,0.0,0.0,0.0,1.0
4,62.75,1,0.0,0,24.0,-0.38,-1.11,-0.526667,0,1,...,0,1,scans,2,Zero,0,0.0,0.0,0.0,1.0


In [20]:
df['study1'].unique()

array(['scans', 'rundmc', 'ASPS'], dtype=object)

In [21]:
enc2 = OneHotEncoder(categories='auto')
study1 = df['study1'].values.reshape(-1, 1)
enc2.fit(study1)
new_features = enc2.get_feature_names_out()
print(new_features)
new_study1 = pd.DataFrame(enc2.transform(study1).toarray())

['x0_ASPS' 'x0_rundmc' 'x0_scans']


In [22]:
new_study1.columns = ['ASPS', 'rundmc', 'scans']

In [23]:
df = pd.concat([df, new_study1], axis=1)
df.drop('study1', axis=1, inplace=True)

In [24]:
df.head()

Unnamed: 0,age,gender,dementia,dementia_all,educationyears,EF,PS,Global,diabetes,smoking,...,Fazekas,lac_count,CMB_count,ASPS-elderly,ASPS-family,rundmc,scans,ASPS,rundmc.1,scans.1
0,52.67,1,0.0,0,11.0,-2.403333,-1.29,-1.287,0,2,...,3,>5,>=1,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,64.58,1,0.0,0,10.0,1.28,0.36,0.744,0,1,...,1,1 to 2,>=1,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,74.92,1,0.0,0,8.0,-1.44,-1.52,-0.922,0,0,...,1,1 to 2,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,79.25,1,0.0,0,10.0,-0.92,-1.493333,-0.924,0,1,...,2,1 to 2,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,62.75,1,0.0,0,24.0,-0.38,-1.11,-0.526667,0,1,...,2,Zero,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1510 non-null   float64
 1   gender                1510 non-null   int64  
 2   dementia              1510 non-null   float64
 3   dementia_all          1510 non-null   int64  
 4   educationyears        1510 non-null   float64
 5   EF                    1510 non-null   float64
 6   PS                    1510 non-null   float64
 7   Global                1510 non-null   float64
 8   diabetes              1510 non-null   int64  
 9   smoking               1510 non-null   int64  
 10  hypertension          1510 non-null   int64  
 11  hypercholesterolemia  1510 non-null   int64  
 12  lacunes_num           1510 non-null   int64  
 13  fazekas_cat           1510 non-null   int64  
 14  Fazekas               1510 non-null   int64  
 15  lac_count            

In [26]:
df['lac_count'].unique()

array(['>5', '1 to 2', 'Zero', '3 to 5'], dtype=object)

In [27]:
df['lac_count'] = df['lac_count'].map({'Zero': 0, '1 to 2': 1, '3 to 5': 2, '>5': 3})

In [28]:
df['CMB_count'].unique()

array(['>=1', '0'], dtype=object)

In [29]:
df['CMB_count'] = df['CMB_count'].map({'0': 0, '>=1': 1})

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1510 non-null   float64
 1   gender                1510 non-null   int64  
 2   dementia              1510 non-null   float64
 3   dementia_all          1510 non-null   int64  
 4   educationyears        1510 non-null   float64
 5   EF                    1510 non-null   float64
 6   PS                    1510 non-null   float64
 7   Global                1510 non-null   float64
 8   diabetes              1510 non-null   int64  
 9   smoking               1510 non-null   int64  
 10  hypertension          1510 non-null   int64  
 11  hypercholesterolemia  1510 non-null   int64  
 12  lacunes_num           1510 non-null   int64  
 13  fazekas_cat           1510 non-null   int64  
 14  Fazekas               1510 non-null   int64  
 15  lac_count            

In [31]:
y = df['dementia_all'].values
X = df.drop('dementia_all', axis=1)

In [32]:
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit_transform(X)
print(X.shape)

(1510, 23)
(1510, 18)


In [33]:
# build mask
gender_0_mask = df['gender'] == 0
gender_1_mask = df['gender'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  808
Male:  702


In [34]:
seed = 42
models = fl.build_models(seed)

results_list = []

kf = KFold(n_splits=10, shuffle=True, random_state=seed)

In [35]:
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group  SVM_TPR   SVM_TNR   SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  \
0      1  Female      0.0  1.000000  0.000000      1.0       0      79   
1      2  Female      0.0  1.000000  0.000000      1.0       0      80   
2      3  Female      0.0  1.000000  0.000000      1.0       0      79   
3      4  Female      0.0  1.000000  0.000000      1.0       0      79   
4      5  Female      0.0  1.000000  0.000000      1.0       0      79   
5      6  Female      0.0  1.000000  0.000000      1.0       0      79   
6      7  Female      0.0  1.000000  0.000000      1.0       0      78   
7      8  Female      0.0  1.000000  0.000000      1.0       0      77   
8      9  Female      0.0



In [36]:
results_df = pd.concat(results_list, ignore_index=True)

In [37]:
result_path = './results/k102_result.xlsx'
results_df.to_excel(result_path, index=False)

In [38]:
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.0,1.0,0.0,1.0,0,79,0,2,...,0,2,0.5,0.924051,0.075949,0.5,1,73,6,1
1,2,Female,0.0,1.0,0.0,1.0,0,80,0,1,...,0,1,1.0,0.9125,0.0875,0.0,1,73,7,0
2,3,Female,0.0,1.0,0.0,1.0,0,79,0,2,...,0,2,0.0,0.924051,0.075949,1.0,0,73,6,2
3,4,Female,0.0,1.0,0.0,1.0,0,79,0,2,...,0,2,1.0,0.886076,0.113924,0.0,2,70,9,0
4,5,Female,0.0,1.0,0.0,1.0,0,79,0,2,...,0,2,0.5,0.898734,0.101266,0.5,1,71,8,1


In [39]:
label = 'Female'

fl.perform_t_tests(df, 'SVM', label)
fl.perform_t_tests(df, 'DT', label)
fl.perform_t_tests(df, 'RF', label)
fl.perform_t_tests(df, 'LR', label)
fl.perform_t_tests(df, 'KNN', label)
fl.perform_t_tests(df, 'ANN', label)
fl.perform_t_tests(df, 'NB', label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(40.0), pvalue=np.float64(0.16748875557485465))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(35.0), pvalue=np.float64(0.07787247147704081))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(59.5), pvalue=np.float64(0.480176889906077))
DT -TPR: MannwhitneyuResult(statistic=np.float64(25.5), pvalue=np.float64(0.03227416338303555))
DT - FPR: TtestResult(statistic=np.float64(-2.976316413876007), pvalue=np.float64(0.008091279748588105), df=np.float64(18.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(57.0), pvalue=np.float64(0.6215988993579933))
RF -TPR: MannwhitneyuResult(statistic=np.float64(30.5), pvalue=np.float64(0.07592963414955697))
RF - FPR: MannwhitneyuResult(statistic=np.float64(35.0), pvalue=np.float64(0.07787247147704081))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(70.0), pvalue=np.float64(0.1265494644605128))
LR -TPR: MannwhitneyuResult(statistic=np.float64(36.5), pvalue=np.float64(0.19593793942588678))
