In [87]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shariful07/student-mental-health")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/shariful07/student-mental-health/versions/3


In [88]:
import os
os.listdir(path)

['Student Mental health.csv']

In [89]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [90]:
df = pd.read_csv(f'{path}/Student Mental health.csv')
df.head()

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


In [91]:
df.shape

(101, 11)

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Timestamp                                     101 non-null    object 
 1   Choose your gender                            101 non-null    object 
 2   Age                                           100 non-null    float64
 3   What is your course?                          101 non-null    object 
 4   Your current year of Study                    101 non-null    object 
 5   What is your CGPA?                            101 non-null    object 
 6   Marital status                                101 non-null    object 
 7   Do you have Depression?                       101 non-null    object 
 8   Do you have Anxiety?                          101 non-null    object 
 9   Do you have Panic attack?                     101 non-null    obj

In [93]:
df.drop(['Timestamp', 'What is your course?'], axis=1, inplace=True)

In [94]:
df.head()

Unnamed: 0,Choose your gender,Age,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,Female,18.0,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,Male,21.0,year 2,3.00 - 3.49,No,No,Yes,No,No
2,Male,19.0,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,Female,22.0,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,Male,23.0,year 4,3.00 - 3.49,No,No,No,No,No


In [95]:
df['Choose your gender'].unique()

array(['Female', 'Male'], dtype=object)

In [96]:
df['Choose your gender'] = df['Choose your gender'].map({'Female': 0, 'Male': 1})

In [97]:
df['Your current year of Study'].unique()

array(['year 1', 'year 2', 'Year 1', 'year 3', 'year 4', 'Year 2',
       'Year 3'], dtype=object)

In [98]:
enc = OneHotEncoder(categories='auto')
current_year = df['Your current year of Study'].values.reshape(-1, 1)
enc.fit(current_year)
new_features = enc.get_feature_names_out()
print(new_features)
new_current_year = pd.DataFrame(enc.transform(current_year).toarray())

['x0_Year 1' 'x0_Year 2' 'x0_Year 3' 'x0_year 1' 'x0_year 2' 'x0_year 3'
 'x0_year 4']


In [99]:
new_current_year.columns = ['Year 1', 'Year 2', 'Year 3', 'year 1', 'year 2', 'year 3', 'year 4']

In [100]:
df = pd.concat([df, new_current_year], axis=1)
df.drop('Your current year of Study', axis=1, inplace=True)

In [101]:
df.head()

Unnamed: 0,Choose your gender,Age,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?,Year 1,Year 2,Year 3,year 1,year 2,year 3,year 4
0,0,18.0,3.00 - 3.49,No,Yes,No,Yes,No,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,21.0,3.00 - 3.49,No,No,Yes,No,No,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,19.0,3.00 - 3.49,No,Yes,Yes,Yes,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,22.0,3.00 - 3.49,Yes,Yes,No,No,No,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,23.0,3.00 - 3.49,No,No,No,No,No,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [102]:
df['What is your CGPA?'].unique()

array(['3.00 - 3.49', '3.50 - 4.00', '3.50 - 4.00 ', '2.50 - 2.99',
       '2.00 - 2.49', '0 - 1.99'], dtype=object)

In [103]:
df['What is your CGPA?'] = df['What is your CGPA?'].str.strip()
df['What is your CGPA?'].unique()

array(['3.00 - 3.49', '3.50 - 4.00', '2.50 - 2.99', '2.00 - 2.49',
       '0 - 1.99'], dtype=object)

In [104]:
df['What is your CGPA?'] = df['What is your CGPA?'].map({'0 - 1.99': 0, '2.00 - 2.49': 1, '2.50 - 2.99': 2, '3.00 - 3.49': 3, '3.50 - 4.00': 4})
df['What is your CGPA?'].unique()

array([3, 4, 2, 1, 0])

In [105]:
df.head()

Unnamed: 0,Choose your gender,Age,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?,Year 1,Year 2,Year 3,year 1,year 2,year 3,year 4
0,0,18.0,3,No,Yes,No,Yes,No,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,21.0,3,No,No,Yes,No,No,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,19.0,3,No,Yes,Yes,Yes,No,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,22.0,3,Yes,Yes,No,No,No,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,23.0,3,No,No,No,No,No,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [106]:
df['Marital status'] = df['Marital status'].map({'No': 0, 'Yes': 1})
df['Do you have Depression?'] = df['Do you have Depression?'].map({'No': 0, 'Yes': 1})
df['Do you have Anxiety?'] = df['Do you have Anxiety?'].map({'No': 0, 'Yes': 1})
df['Do you have Panic attack?'] = df['Do you have Panic attack?'].map({'No': 0, 'Yes': 1})
df['Did you seek any specialist for a treatment?'] = df['Did you seek any specialist for a treatment?'].map({'No': 0, 'Yes': 1})

In [107]:
df.head()

Unnamed: 0,Choose your gender,Age,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?,Year 1,Year 2,Year 3,year 1,year 2,year 3,year 4
0,0,18.0,3,0,1,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,21.0,3,0,0,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,19.0,3,0,1,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,22.0,3,1,1,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,23.0,3,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Choose your gender                            101 non-null    int64  
 1   Age                                           100 non-null    float64
 2   What is your CGPA?                            101 non-null    int64  
 3   Marital status                                101 non-null    int64  
 4   Do you have Depression?                       101 non-null    int64  
 5   Do you have Anxiety?                          101 non-null    int64  
 6   Do you have Panic attack?                     101 non-null    int64  
 7   Did you seek any specialist for a treatment?  101 non-null    int64  
 8   Year 1                                        101 non-null    float64
 9   Year 2                                        101 non-null    flo

In [109]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Choose your gender                            100 non-null    int64  
 1   Age                                           100 non-null    float64
 2   What is your CGPA?                            100 non-null    int64  
 3   Marital status                                100 non-null    int64  
 4   Do you have Depression?                       100 non-null    int64  
 5   Do you have Anxiety?                          100 non-null    int64  
 6   Do you have Panic attack?                     100 non-null    int64  
 7   Did you seek any specialist for a treatment?  100 non-null    int64  
 8   Year 1                                        100 non-null    float64
 9   Year 2                                        100 non-null    floa

In [110]:
y = df['Do you have Depression?'].values
X = df.drop('Do you have Depression?', axis=1)

In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Choose your gender                            100 non-null    int64  
 1   Age                                           100 non-null    float64
 2   What is your CGPA?                            100 non-null    int64  
 3   Marital status                                100 non-null    int64  
 4   Do you have Depression?                       100 non-null    int64  
 5   Do you have Anxiety?                          100 non-null    int64  
 6   Do you have Panic attack?                     100 non-null    int64  
 7   Did you seek any specialist for a treatment?  100 non-null    int64  
 8   Year 1                                        100 non-null    float64
 9   Year 2                                        100 non-null    floa

In [112]:
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit_transform(X)
print(X.shape)

(100, 14)
(100, 9)


In [113]:
# build mask
gender_0_mask = df['Choose your gender'] == 0
gender_1_mask = df['Choose your gender'] == 1

count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_Gender_0 = X_scaled[gender_0_mask]
X_scaled_Gender_1 = X_scaled[gender_1_mask]
y_Gender_0 = y[gender_0_mask]
y_Gender_1 = y[gender_1_mask]

Female:  75
Male:  25


In [114]:
seed = 42
models = fl.build_models(seed)

results_list = []

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

In [115]:
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  0.400000  0.933333  0.066667  0.600000       4      14   
1     2  Female  0.400000  1.000000  0.000000  0.600000       4      15   
2     3  Female  0.666667  1.000000  0.000000  0.333333       6      16   
3     1    Male  0.000000  1.000000  0.000000  1.000000       0       7   
4     2    Male  0.000000  1.000000  0.000000  1.000000       0       6   
5     3    Male  0.000000  1.000000  0.000000  1.000000       0       6   

   SVM_FP  SVM_FN  ...  ANN_FP  ANN_FN    NB_TPR    NB_TNR    NB_FPR  \
0       1       6  ...   



In [116]:
results_df = pd.concat(results_list, ignore_index=True)

In [117]:
result_path = './results/K48_result.xlsx'
results_df.to_excel(result_path, index=False)

In [118]:
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.4,0.933333,0.066667,0.6,4,14,1,6,...,2,4,0.4,1.0,0.0,0.6,4,15,0,6
1,2,Female,0.4,1.0,0.0,0.6,4,15,0,6,...,3,1,0.4,1.0,0.0,0.6,4,15,0,6
2,3,Female,0.666667,1.0,0.0,0.333333,6,16,0,3,...,4,1,0.555556,1.0,0.0,0.444444,5,16,0,4
3,1,Male,0.0,1.0,0.0,1.0,0,7,0,2,...,0,2,0.5,1.0,0.0,0.5,1,7,0,1
4,2,Male,0.0,1.0,0.0,1.0,0,6,0,2,...,0,2,0.0,1.0,0.0,1.0,0,6,0,2


In [119]:
label = 'Female'

fl.perform_t_tests(df, 'SVM', label)
fl.perform_t_tests(df, 'DT', label)
fl.perform_t_tests(df, 'RF', label)
fl.perform_t_tests(df, 'LR', label)
fl.perform_t_tests(df, 'KNN', label)
fl.perform_t_tests(df, 'ANN', label)
fl.perform_t_tests(df, 'NB', label)

SVM -TPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.05934643879191985))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.5049850750938458))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.6428348264908044))
DT -TPR: TtestResult(statistic=np.float64(0.6327363945441675), pvalue=np.float64(0.5612720742914208), df=np.float64(4.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(1.0), pvalue=np.float64(0.1840386271964254))
DT - FN/FP: TtestResult(statistic=np.float64(0.9159291318336055), pvalue=np.float64(0.41152733622135773), df=np.float64(4.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.6579050194284821))
RF - FPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.6579050194284821))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(3.0), pvalue=np.float64(0.6579050194284821))
LR -TPR: MannwhitneyuResult(statistic=np.float64(6.0), pvalue=np.float64(0.657905