In [6]:
import kagglehub

# Download the latest version of the dataset through kagglehub.
# `path` is the local directory holding the downloaded files.
path = kagglehub.dataset_download("morisqed/mental-health-data")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/morisqed/mental-health-data/versions/1


In [7]:
import os
# List the files contained in the downloaded dataset directory.
os.listdir(path)

['med dataset.csv']

In [8]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Load the survey CSV from the downloaded dataset directory.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded '/'.
df = pd.read_csv(os.path.join(path, 'med dataset.csv'))

In [10]:
# (rows, columns) of the freshly loaded frame.
df.shape

(886, 20)

In [11]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 886 entries, 0 to 885
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         886 non-null    int64  
 1   age        886 non-null    int64  
 2   year       886 non-null    int64  
 3   sex        886 non-null    int64  
 4   glang      886 non-null    int64  
 5   part       886 non-null    int64  
 6   job        886 non-null    int64  
 7   stud_h     886 non-null    int64  
 8   health     886 non-null    int64  
 9   psyt       886 non-null    int64  
 10  jspe       886 non-null    int64  
 11  qcae_cog   886 non-null    int64  
 12  qcae_aff   886 non-null    int64  
 13  amsp       886 non-null    int64  
 14  erec_mean  886 non-null    float64
 15  cesd       886 non-null    int64  
 16  stai_t     886 non-null    int64  
 17  mbi_ex     886 non-null    int64  
 18  mbi_cy     886 non-null    int64  
 19  mbi_ea     886 non-null    int64  
dtypes: float64

In [12]:
# Distinct values taken by the `health` column.
df['health'].unique()

array([3, 4, 5, 2, 1])

In [13]:
# Distinct codes present in the `sex` column.
df['sex'].unique()

array([1, 2, 3])

In [14]:
# Number of rows per `sex` code.
df['sex'].value_counts()

sex
2    606
1    275
3      5
Name: count, dtype: int64

In [15]:
# Drop the rows coded sex == 3 so that only the two groups coded 1 and 2 remain.
# reset_index gives the filtered frame a contiguous 0..n-1 index, so boolean
# masks derived from `df` line up by label as well as by position with the
# NumPy arrays built from it later.
df = df[df['sex'] != 3].reset_index(drop=True)

In [16]:
# Sanity check: show which `sex` codes are left after the filter.
print(df['sex'].unique())

[1 2]


In [17]:
# (rows, columns) after removing the sex == 3 rows.
df.shape

(881, 20)

In [18]:
# Target: the `health` column as a NumPy array.
y = df['health'].to_numpy()
# Features: every remaining column.
# NOTE(review): this keeps `id` (presumably a row identifier - confirm) and
# `sex` (the column used to split the groups) as model inputs; consider
# dropping them before modelling.
X = df.drop(columns='health')

In [19]:
# Feature count before selection.
print(X.shape)

# Remove features whose variance falls below 0.1.
# Under scikit-learn's default output setting `X` is a plain array afterwards,
# so column names are no longer attached to it.
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)

# Feature count after selection.
print(X.shape)

(881, 19)
(881, 18)


In [20]:
# Boolean row selectors for the two groups:
# sex == 2 is reported as "Female", sex == 1 as "Male".
gender_0_mask = df['sex'].eq(2)
gender_1_mask = df['sex'].eq(1)

# Group sizes.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features using statistics of the whole sample.
# NOTE(review): the scaler is fitted before the K-fold split, so held-out rows
# contribute to the scaling statistics - consider fitting it per training fold.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Split the scaled features and the target into the two groups.
X_scaled_Gender_0, X_scaled_Gender_1 = X_scaled[gender_0_mask], X_scaled[gender_1_mask]
y_Gender_0, y_Gender_1 = y[gender_0_mask], y[gender_1_mask]

Female:  606
Male:  275


In [21]:
# One seed shared by the model factory and the fold splitter.
seed = 42

# Shuffled 5-fold splitter, reproducible through the seed.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Models to evaluate, built by the project helper module.
models = fl.build_models(seed)

# Accumulator handed to fl.run_experiment and concatenated afterwards.
results_list = []

In [22]:
# Start from an empty accumulator: `results_list` is created in an earlier
# cell, so re-running this cell alone would otherwise add a second copy of
# every result to it (run_experiment presumably appends to the list it is
# given - its return value is unused and the list is concatenated below).
results_list.clear()

# (group name, group code shown in the log, scaled features, target)
group_runs = (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
)
for group_name, group_code, X_group, y_group in group_runs:
    print(f"Starting experiments for {group_name}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# One row per (fold, group) with the per-model metric columns.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group  SVM_TPR  SVM_TNR  SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0     1  Female      0.0      0.0      0.0      0.0       0       0       0   
1     2  Female      1.0      0.0      1.0      0.0       1       0       1   
2     3  Female      1.0      0.0      0.0      0.0       2       0       0   
3     4  Female      1.0      0.0      0.0      0.0       3       0       0   
4     5  Female      1.0      0.0      0.0      0.0       3       0       0   
5     1    Male      0.0      0.0      0.0      0.0       0       0       0   
6     2    Male      0.0      0.0      0.0      0.0       0       0       0   
7     3    Male      0.0      0.0      0.0      0.0       0 



In [23]:
# Combine the collected per-run frames into a single results table
# (same concatenation as `final_results_df`, kept under the name used below).
results_df = pd.concat(results_list, ignore_index=True)

In [24]:
# Persist the results table as an Excel workbook.
result_path = './results/K57_result.xlsx'
# to_excel does not create missing parent directories, so make sure the
# output folder exists (a no-op when it is already there).
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [25]:
# Reload the saved results table from disk and preview the first rows.
# NOTE(review): this rebinds `df`, which held the survey data until now; the
# results table has no `sex` / `health` columns, so earlier cells that read
# them cannot be re-run after this point - consider a distinct name.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,8,0,0,0
1,2,Female,1,0,1,0,1,0,1,0,...,2,0,1,0,1,0,7,0,1,0
2,3,Female,1,0,0,0,2,0,0,0,...,0,0,1,0,0,0,7,0,0,0
3,4,Female,1,0,0,0,3,0,0,0,...,0,0,1,0,1,0,3,0,1,0
4,5,Female,1,0,0,0,3,0,0,0,...,0,0,1,0,1,0,7,0,4,0


In [26]:
# Group label forwarded to the test helper.
label = 'Female'

# Run the helper once per model, in the original order.
# Despite the helper's name, the printed results are MannwhitneyuResult objects.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(22.5), pvalue=np.float64(0.019964453305216043))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
DT -TPR: MannwhitneyuResult(statistic=np.float64(21.5), pvalue=np.float64(0.058212739779278815))
DT - FPR: MannwhitneyuResult(statistic=np.float64(17.5), pvalue=np.float64(0.2703441406547801))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(17.5), pvalue=np.float64(0.17971249487899976))
RF -TPR: MannwhitneyuResult(statistic=np.float64(22.5), pvalue=np.float64(0.019964453305216043))
RF - FPR: MannwhitneyuResult(statistic=np.float64(17.5), pvalue=np.float64(0.2703441406547801))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
LR -TPR: MannwhitneyuResult(statistic=np.float64(22.5), pvalue=np.float64(0.019964453305216043))
LR - FPR: MannwhitneyuResult(statistic=np.float64(20.0),