In [2]:
import arff
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import TargetEncoder

from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

from sklearn.ensemble import IsolationForest 
from sklearn.neighbors import LocalOutlierFactor

from imblearn.over_sampling import SMOTE


In [3]:
with open(r"../student-mental-health-analysis/data/raw/depression_dataset.arff", 'r') as f:
    dataset = arff.load(f)

In [4]:
df = pd.DataFrame(dataset['data'], columns=[a[0] for a in dataset['attributes']])

In [5]:
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2.0,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1.0
1,8.0,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0.0
2,26.0,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0.0
3,30.0,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1.0
4,32.0,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0.0


In [6]:
print("\nMissing values before handling:")
print(df.isnull().sum())


Missing values before handling:
id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [7]:
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mean(), inplace=True)
        print(f"Filled missing values in '{col}' with MEAN")

Filled missing values in 'Financial Stress' with MEAN


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [8]:
cat_cols = df.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)
        print(f"Filled missing values in '{col}' with MODE")

In [9]:
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [10]:
num_cols = df.select_dtypes(include=[np.number]).columns
df_numeric = df[num_cols]

iso = IsolationForest(contamination=0.05, random_state=42) 
iso_preds = iso.fit_predict(df_numeric) 
iso_outliers = iso_preds == -1

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05) 
lof_preds = lof.fit_predict(df_numeric) 
lof_outliers = lof_preds == -1

combined_outliers = iso_outliers | lof_outliers
print("Number of outliers detected:", combined_outliers.sum())

df = df[~combined_outliers].reset_index(drop=True)
print("After removing outliers:", df.shape)

Number of outliers detected: 2723
After removing outliers: (25178, 18)


In [None]:
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

te_city = TargetEncoder()
df['City'] = te_city.fit_transform(df['City'], df['Depression'])

In [12]:
prof_counts = df['Profession'].value_counts()

rare_prof = prof_counts[prof_counts < 10].index
df['Profession'] = LabelEncoder().fit_transform(df['Profession'].apply(lambda x: 'Other' if x in rare_prof else x))

In [13]:
df['Sleep Duration'] = df['Sleep Duration'].replace('Others', 'More than 8 hours')

sleep_order = ['Less than 5 hours', '5-6 hours', '7-8 hours', 'More than 8 hours']
ordinal_sleep = OrdinalEncoder(categories=[sleep_order])
df['Sleep Duration'] = ordinal_sleep.fit_transform(df[['Sleep Duration']])


In [14]:
df['Dietary Habits'] = df['Dietary Habits'].replace('Others', 'Moderate')

diet_order = ['Unhealthy', 'Moderate', 'Healthy']
ordinal_diet = OrdinalEncoder(categories=[diet_order])
df['Dietary Habits'] = ordinal_diet.fit_transform(df[['Dietary Habits']])

In [15]:
te_degree = TargetEncoder()
df['Degree'] = te_degree.fit_transform(df['Degree'], df['Depression'])

In [16]:
le_suicide = LabelEncoder()
df['Have you ever had suicidal thoughts ?'] = le_suicide.fit_transform(df['Have you ever had suicidal thoughts ?'])

le_family = LabelEncoder()
df['Family History of Mental Illness'] = le_family.fit_transform(df['Family History of Mental Illness'])

In [17]:
print(df.dtypes)

id                                       float64
Gender                                     int32
Age                                      float64
City                                     float64
Profession                                 int32
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                           float64
Dietary Habits                           float64
Degree                                   float64
Have you ever had suicidal thoughts ?      int32
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness           int32
Depression                               float64
dtype: object


In [18]:
df.head(5)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,83.0,1,24.0,0.571429,1,3.0,0.0,6.1,3.0,0.0,1.0,1.0,0.729001,1,11.0,1.0,1,1.0
1,91.0,1,33.0,0.577532,1,3.0,0.0,7.03,4.0,0.0,0.0,2.0,0.558348,1,10.0,2.0,1,0.0
2,94.0,1,27.0,0.609773,1,5.0,0.0,7.04,1.0,0.0,0.0,1.0,0.511254,0,10.0,1.0,1,1.0
3,100.0,0,19.0,0.633952,1,2.0,0.0,8.52,4.0,0.0,0.0,0.0,0.729001,0,6.0,2.0,1,0.0
4,106.0,1,29.0,0.571776,1,3.0,0.0,8.58,3.0,0.0,3.0,1.0,0.511254,1,10.0,2.0,1,1.0


In [19]:
numerical_cols = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
                  'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']

X_num = df[numerical_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_num)

df_scaled = pd.DataFrame(X_scaled, columns=numerical_cols)

df[numerical_cols] = df_scaled

In [20]:
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,83.0,1,-0.364916,0.571429,1,-0.113165,0.0,-1.08459,0.049268,-0.00874,1.0,1.0,0.729001,1,1.030903,-1.514206,1,1.0
1,91.0,1,1.510116,0.577532,1,-0.113165,0.0,-0.439964,0.799247,-0.00874,0.0,2.0,0.558348,1,0.753778,-0.810531,1,0.0
2,94.0,1,0.260094,0.609773,1,1.355907,0.0,-0.433033,-1.450691,-0.00874,0.0,1.0,0.511254,0,0.753778,-1.514206,1,1.0
3,100.0,0,-1.406601,0.633952,1,-0.847701,0.0,0.592823,0.799247,-0.00874,0.0,0.0,0.729001,0,-0.354722,-0.810531,1,0.0
4,106.0,1,0.676768,0.571776,1,-0.113165,0.0,0.634412,0.049268,-0.00874,3.0,1.0,0.511254,1,0.753778,-0.810531,1,1.0


In [21]:
df.drop(columns=['id'], inplace=True)

X = df.drop('Depression', axis=1)
y = df['Depression']

In [22]:
selector = SelectKBest(mutual_info_classif, k=10)
X_selected = selector.fit_transform(X, y)

selected_features = X.columns[selector.get_support()]
print("\nSelected top 10 features:")
print(selected_features)


Selected top 10 features:
Index(['Age', 'Profession', 'Academic Pressure', 'Study Satisfaction',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness'],
      dtype='object')


In [23]:
df = df[selected_features]
df['Depression'] = y 

In [24]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(df.drop('Depression', axis=1))

In [25]:
print("\nNumber of PCA components:", pca.n_components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)


Number of PCA components: 7
Explained variance ratio: [0.22927584 0.16307705 0.15664156 0.14430105 0.13290754 0.10036167
 0.04077053]


In [26]:
df.head()

Unnamed: 0,Age,Profession,Academic Pressure,Study Satisfaction,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,-0.364916,1,-0.113165,0.049268,1.0,0.729001,1,1.030903,-1.514206,1,1.0
1,1.510116,1,-0.113165,0.799247,2.0,0.558348,1,0.753778,-0.810531,1,0.0
2,0.260094,1,1.355907,-1.450691,1.0,0.511254,0,0.753778,-1.514206,1,1.0
3,-1.406601,1,-0.847701,0.799247,0.0,0.729001,0,-0.354722,-0.810531,1,0.0
4,0.676768,1,-0.113165,0.049268,1.0,0.511254,1,0.753778,-0.810531,1,1.0


In [27]:
numeric_cols = df.select_dtypes(include=['number']).columns
X = df[numeric_cols].drop(columns=['Depression'], errors='ignore')
y = df['Depression']

counts_before = y.value_counts()

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

counts_after = y_res.value_counts()

print("counts_before :",counts_before)
print("counts_after :",counts_after)

df_resampled = pd.DataFrame(X_res, columns=X.columns)
df_resampled['Depression'] = y_res

counts_before : Depression
1.0    15088
0.0    10090
Name: count, dtype: int64
counts_after : Depression
1.0    15088
0.0    15088
Name: count, dtype: int64


In [28]:
df_resampled.to_csv(r'../student-mental-health-analysis/results/outputs/preprocessed_dataset.csv', index=False)