# **Cloning the git repo to use the source codes**

In [1]:
!git clone https://github.com/IJustKen/depression_project.git

fatal: destination path 'depression_project' already exists and is not an empty directory.


In [30]:
!git pull

remote: Enumerating objects: 7, done.[K
remote: Counting objects:  14% (1/7)[Kremote: Counting objects:  28% (2/7)[Kremote: Counting objects:  42% (3/7)[Kremote: Counting objects:  57% (4/7)[Kremote: Counting objects:  71% (5/7)[Kremote: Counting objects:  85% (6/7)[Kremote: Counting objects: 100% (7/7)[Kremote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects:  25% (1/4)[Kremote: Compressing objects:  50% (2/4)[Kremote: Compressing objects:  75% (3/4)[Kremote: Compressing objects: 100% (4/4)[Kremote: Compressing objects: 100% (4/4), done.[K
remote: Total 4 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects:  25% (1/4)Unpacking objects:  50% (2/4)Unpacking objects:  75% (3/4)Unpacking objects: 100% (4/4)Unpacking objects: 100% (4/4), 1.08 KiB | 553.00 KiB/s, done.
From https://github.com/IJustKen/depression_project
   9529538..ad43dbc  main       -> origin/main
Updating 9529538..ad43dbc
Fast-forward
 funcs/feature_se

In [31]:
import sys
sys.path.append('/content/depression_project/funcs')
sys.path.append('/content/depression_project/data')

In [32]:
%cd depression_project

/content/depression_project/depression_project


# **Importing Necessary Functions**

In [33]:
# Imports from data_preprocessing.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from funcs.data_preprocessing import (
    num_cat_column_analysis,
    rm_rows_with_val,
    rm_rows_with_rare_cats,
    one_hot_encode_dataframe,
    Z_Scaler,
    MinMax_Scaler,
)

# Imports from eda_plotting.py
from funcs.eda_plotting import (
    analyze_feature_distributions,
    plot_feature_target_correlations,
    plot_grouped_boxplots,
)

# Imports from feature_selection.py
from funcs.feature_selection import (
    feat_eng,
    select_features_with_rf,
    select_features_with_lasso,
    apply_pca,
)

# Imports from classification.py
from funcs.classification import (
    classify_svc,
    classify_rf,
    classify_logistic,
)

# Imports from model_evaluation.py
from funcs.model_evaluation import (
    compare_clf,
)

# Imports from hyperparameter_tuning.py
from funcs.hyperparameter_tuning import (
    best_svc_params_gridsearch,
    best_svc_params_randomizedsearch,
)

# Imports from regression.py
from funcs.regression import (
    get_training_scores,
    regress_linear,
    regress_bagging,
    regress_rf,
    compare_regs,
)

# **Data Preprocessing**

In [34]:
import pandas as pd
# %cd ..
data = pd.read_csv("data/student_depression_dataset.csv")

In [35]:
data

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.90,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,140685,Female,27.0,Surat,Student,5.0,0.0,5.75,5.0,0.0,'5-6 hours',Unhealthy,'Class 12',Yes,7.0,1.0,Yes,0
27897,140686,Male,27.0,Ludhiana,Student,2.0,0.0,9.40,3.0,0.0,'Less than 5 hours',Healthy,MSc,No,0.0,3.0,Yes,0
27898,140689,Male,31.0,Faridabad,Student,3.0,0.0,6.61,4.0,0.0,'5-6 hours',Unhealthy,MD,No,12.0,2.0,No,0
27899,140690,Female,18.0,Ludhiana,Student,5.0,0.0,6.88,2.0,0.0,'Less than 5 hours',Healthy,'Class 12',Yes,10.0,5.0,No,1


In [36]:
data.describe()

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Depression
count,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0
mean,70442.149421,25.8223,3.141214,0.00043,7.656104,2.943837,0.000681,7.156984,0.585499
std,40641.175216,4.905687,1.381465,0.043992,1.470707,1.361148,0.044394,3.707642,0.492645
min,2.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35039.0,21.0,2.0,0.0,6.29,2.0,0.0,4.0,0.0
50%,70684.0,25.0,3.0,0.0,7.77,3.0,0.0,8.0,1.0
75%,105818.0,30.0,4.0,0.0,8.92,4.0,0.0,10.0,1.0
max,140699.0,59.0,5.0,5.0,10.0,5.0,4.0,12.0,1.0


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [38]:
data.isnull().sum()

Unnamed: 0,0
id,0
Gender,0
Age,0
City,0
Profession,0
Academic Pressure,0
Work Pressure,0
CGPA,0
Study Satisfaction,0
Job Satisfaction,0


In [39]:
num_cols,cat_cols=num_cat_column_analysis(data)

Column Analysis for: id
    - Total Unique Values (n): 27901
    - Type: Numerical/High Cardinality
    - Min: 2
    - Max: 140699
    - Mean: 70442.15
Column Analysis for: Gender
    - Total Unique Values (n): 2
    - Type: Categorical/Ordinal/Low Cardinality
    - Value Counts:
Gender
Male      15547
Female    12354
Name: count, dtype: int64
Column Analysis for: Age
    - Total Unique Values (n): 34
    - Type: Numerical/High Cardinality
    - Min: 18.0
    - Max: 59.0
    - Mean: 25.82
Column Analysis for: City
    - Total Unique Values (n): 52
    - Type: Categorical/Ordinal/Low Cardinality
    - Value Counts:
City
Kalyan                  1570
Srinagar                1372
Hyderabad               1340
Vasai-Virar             1290
Lucknow                 1155
Thane                   1139
Ludhiana                1111
Agra                    1094
Surat                   1078
Kolkata                 1066
Jaipur                  1036
Patna                   1007
Visakhapatnam            

In [40]:
print(f"**Numerical Columns** ({len(num_cols)}):")
print(num_cols)
print()
print(f"**Categorical Columns** ({len(cat_cols)}):")
print(cat_cols)

**Numerical Columns** (9):
['id', 'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Depression']

**Categorical Columns** (9):
['Gender', 'City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Financial Stress', 'Family History of Mental Illness']


In [41]:
X=data.drop('Depression',axis=1)
data_new=X.drop('id',axis=1).copy()

In [42]:
Y=data['Depression']

In [43]:
column_name = 'Dietary Habits'
value_to_drop = 'Others'
data_new=rm_rows_with_val(data_new,column_name,value_to_drop)

Dataset Size Before Drop: 27901 rows
Rows Dropped (Dietary Habits is 'Others'): 12
Dataset Size After Drop: 27889 rows


In [44]:
column_name = 'Sleep Duration'
value_to_drop = 'Others'
data_new=rm_rows_with_val(data_new,column_name,value_to_drop)

Dataset Size Before Drop: 27889 rows
Rows Dropped (Sleep Duration is 'Others'): 18
Dataset Size After Drop: 27871 rows


In [45]:
data_new=rm_rows_with_rare_cats(data_new,10)
#note that financial stress is still a categorical column instead of numerical
#we have now removed the '?'.

Starting with 27871 rows.
----------------------------------------
| Gender              : 0 rare categories found.
| City                : 22 rare categories found.
| Profession          : 13 rare categories found.
| Sleep Duration      : 0 rare categories found.
| Dietary Habits      : 0 rare categories found.
| Degree              : 0 rare categories found.
| Have you ever had suicidal thoughts ?: 0 rare categories found.
| Financial Stress    : 1 rare categories found.
| Family History of Mental Illness: 0 rare categories found.
----------------------------------------
Processing Complete.
Total Rows Removed: 60
Final Rows Remaining: 27811


In [46]:
num_cols1,cat_cols1=num_cat_column_analysis(data_new)

Column Analysis for: Gender
    - Total Unique Values (n): 2
    - Type: Categorical/Ordinal/Low Cardinality
    - Value Counts:
Gender
Male      15497
Female    12314
Name: count, dtype: int64
Column Analysis for: Age
    - Total Unique Values (n): 34
    - Type: Numerical/High Cardinality
    - Min: 18.0
    - Max: 59.0
    - Mean: 25.82
Column Analysis for: City
    - Total Unique Values (n): 30
    - Type: Categorical/Ordinal/Low Cardinality
    - Value Counts:
City
Kalyan           1563
Srinagar         1370
Hyderabad        1337
Vasai-Virar      1287
Lucknow          1154
Thane            1139
Ludhiana         1107
Agra             1090
Surat            1078
Kolkata          1064
Jaipur           1033
Patna            1006
Pune              968
Visakhapatnam     967
Ahmedabad         946
Bhopal            933
Chennai           884
Meerut            820
Rajkot            815
Bangalore         765
Delhi             765
Ghaziabad         743
Mumbai            697
Vadodara          6

In [47]:
data_num=data_new[num_cols1]
data_cat=data_new[cat_cols1]

In [48]:
#Financial stress needs to go to numerical columns after removal of  '?'.
data_num['Financial Stress']=data_cat['Financial Stress']
data_cat=data_cat.drop('Financial Stress',axis=1)
data_num['Financial Stress'] = pd.to_numeric(data_num['Financial Stress'], errors='raise')

# #target is a numerical column
# data_num=data_num.drop(['id','Depression'],axis=1)

# # removal of '?'
#column_name = 'Financial Stress'
# print(f"Initial unique values: {data_num[column_name].unique()}")
# data_num[column_name] = data_num[column_name].replace('?', np.nan)
# data_num['Financial Stress'] = pd.to_numeric(data_num['Financial Stress'], errors='raise')
# print(f"New data type for '{column_name}': {data_num[column_name].dtype}")
# print(f"Number of missing values (NaN) now: {data_num[column_name].isna().sum()}")

# #imputation with median
# median_value = data_num[column_name].median()
# data_num[column_name]=data_num[column_name].fillna(median_value)
# print(f"Imputed NaN with median: {median_value}")
# print(f"Missing values after imputation: {data_num[column_name].isna().sum()}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_num['Financial Stress']=data_cat['Financial Stress']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_num['Financial Stress'] = pd.to_numeric(data_num['Financial Stress'], errors='raise')


In [49]:
data_num.describe()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress
count,27811.0,27811.0,27811.0,27811.0,27811.0,27811.0,27811.0,27811.0
mean,25.819496,3.141059,0.000431,7.65595,2.943871,0.000683,7.159254,3.140088
std,4.907076,1.381904,0.044063,1.471008,1.360952,0.044466,3.706694,1.437018
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,21.0,2.0,0.0,6.28,2.0,0.0,4.0,2.0
50%,25.0,3.0,0.0,7.77,3.0,0.0,8.0,3.0
75%,30.0,4.0,0.0,8.92,4.0,0.0,10.0,4.0
max,59.0,5.0,5.0,10.0,5.0,4.0,12.0,5.0


In [50]:
data_num.columns

Index(['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours',
       'Financial Stress'],
      dtype='object')

In [51]:
data_cat.columns

Index(['Gender', 'City', 'Profession', 'Sleep Duration', 'Dietary Habits',
       'Degree', 'Have you ever had suicidal thoughts ?',
       'Family History of Mental Illness'],
      dtype='object')

In [52]:
#label encoding using .map() for two columns sleep duration and dietary habits as they have ordinal categories.
sleep_mapping = {
    "'Less than 5 hours'": 1,
    "'5-6 hours'": 2,
    "'7-8 hours'": 3,
    "'More than 8 hours'":4,

    # Add other categories if they exist (e.g., 'More than 8 hours': 4)
}
diet_mapping = {
    'Unhealthy': 1,
    'Moderate': 2,
    'Healthy': 3
}

data_cat['Sleep Duration_Encoded'] = data_cat['Sleep Duration'].map(sleep_mapping)
data_cat['Dietary Habits_Encoded'] = data_cat['Dietary Habits'].map(diet_mapping)

In [53]:
data_cat

Unnamed: 0,Gender,City,Profession,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Family History of Mental Illness,Sleep Duration_Encoded,Dietary Habits_Encoded
0,Male,Visakhapatnam,Student,'5-6 hours',Healthy,B.Pharm,Yes,No,2,3
1,Female,Bangalore,Student,'5-6 hours',Moderate,BSc,No,Yes,2,2
2,Male,Srinagar,Student,'Less than 5 hours',Healthy,BA,No,Yes,1,3
3,Female,Varanasi,Student,'7-8 hours',Moderate,BCA,Yes,Yes,3,2
4,Female,Jaipur,Student,'5-6 hours',Moderate,M.Tech,Yes,No,2,2
...,...,...,...,...,...,...,...,...,...,...
27896,Female,Surat,Student,'5-6 hours',Unhealthy,'Class 12',Yes,Yes,2,1
27897,Male,Ludhiana,Student,'Less than 5 hours',Healthy,MSc,No,Yes,1,3
27898,Male,Faridabad,Student,'5-6 hours',Unhealthy,MD,No,No,2,1
27899,Female,Ludhiana,Student,'Less than 5 hours',Healthy,'Class 12',Yes,No,1,3


In [54]:
data_cat['Dietary Habits_Encoded'].value_counts()

Unnamed: 0_level_0,count
Dietary Habits_Encoded,Unnamed: 1_level_1
1,10287
2,9897
3,7627


In [55]:
data_cat['Sleep Duration_Encoded'].value_counts()

Unnamed: 0_level_0,count
Sleep Duration_Encoded,Unnamed: 1_level_1
1,8290
3,7324
2,6166
4,6031


In [61]:
df_new_feat = feat_eng(data_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Total_Pressure'] = df['Academic Pressure'] + df['Work Pressure']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # we could assume the opposite for someone prone to depression.


KeyError: 'Total_Pressure'

In [57]:
data_ohe=data_cat.drop(['Sleep Duration','Dietary Habits','Sleep Duration_Encoded','Dietary Habits_Encoded'],axis=1)
df_new_feat[['Sleep Duration','Dietary Habits']]=data_cat[['Sleep Duration_Encoded','Dietary Habits_Encoded']]
data_num_scaled_Z=Z_Scaler(df_new_feat)
data_num_scaled_MM=MinMax_Scaler(df_new_feat)

NameError: name 'df_new_feat' is not defined

In [58]:
data_num_scaled_MM

NameError: name 'data_num_scaled_MM' is not defined

In [59]:
data_num_scaled_Z

NameError: name 'data_num_scaled_Z' is not defined

In [60]:
data_ohe.describe()

Unnamed: 0,Gender,City,Profession,Degree,Have you ever had suicidal thoughts ?,Family History of Mental Illness
count,27811,27811,27811,27811,27811,27811
unique,2,30,1,28,2,2
top,Male,Kalyan,Student,'Class 12',Yes,No
freq,15497,1563,27811,6074,17594,14348


In [None]:
data_ohe=one_hot_encode_dataframe(data_ohe)

In [None]:
data_ohe.columns

In [None]:
data_final=pd.concat([data_num_scaled_Z,data_ohe],axis=1)

In [None]:
data_final #final preprocessed data with scaled numerical columns adn label encoded columns and One hot encoded column(bulk of th e columns)
#69 columns ==> 59 One hot encoded : 2 z_scaled label encoded : 8 z_scaled numerical columns

# **Training the classification model**

In [None]:
from sklearn.model_selection import train_test_split
Y = Y.reindex(data_final.index)
X_train, X_test, Y_train, Y_test = train_test_split(data_final, Y, test_size=0.2, random_state=42, stratify=Y)

In [None]:
selected_features_list = select_features_with_rf(X_train, Y_train)

#Selected features only
X_train_final = X_train[selected_features_list]
X_test_final = X_test[selected_features_list] # Make sure to apply to X_test as well


In [None]:
classify_logistic(X_train_final, Y_train, X_test_final, Y_test)

In [None]:
classify_svc(X_train_final, Y_train, X_test_final, Y_test)

In [None]:
classify_rf(X_train_final, Y_train, X_test_final, Y_test)

# **Applying PCA**

In [None]:
# Now, apply PCA to retain 95% of the variance
X_train_pca, X_test_pca, pca = apply_pca(X_train_final, X_test_final, n_components=0.95)

# Or, apply PCA to get exactly 10 components
# X_train_pca, X_test_pca, pca = apply_pca(X_train_final, X_test_final, n_components=10)

In [None]:

classify_logistic(X_train_pca, Y_train, X_test_pca, Y_test, class_weight='balanced')
#X_train_pca.shape

# **HyperParameter Tuning**

In [None]:
from scipy.stats import uniform

C = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2]
gammaValues = ["scale", "auto"]
param_grid = {
    "kernel": ["linear", "rbf", "poly"],
    "C": C,
    "degree": [i for i in range(1, 20)],
    "gamma": gammaValues,
    "decision_function_shape": ["ovo", "ovr"],
}

In [None]:
param_grid['C']

In [None]:
best_clf = best_svc_params_gridsearch(X_train_final, Y_train, param_grid)

In [None]:
#C=9.999999999999999e-06, decision_function_shape=ovo, degree=1, gamma=scale, kernel=linear;

trial_clf = SVC(C=9.999999999999999e-06, decision_function_shape='ovo', degree=1, gamma='scale', kernel='linear')

trial_clf.fit(X_train_final, Y_train)
Y_pred = trial_clf.predict(X_test_final)
accuracy_score(Y_test,Y_pred)

In [None]:
from sklearn.metrics import recall_score
print(recall_score(Y_test,Y_pred))

In [None]:
Y.value_counts()