In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the survey CSV into `data` and list the columns it provides.
data = pd.read_csv(filepath_or_buffer="/content/final_depression_dataset_1.csv")
data.columns

Index(['Name', 'Gender', 'Age', 'City', 'Working Professional or Student',
       'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [3]:
# Inspect the distinct values found in the 'Name' column.
data['Name'].unique()

array(['Pooja', 'Reyansh', 'Manvi', 'Isha', 'Aarav', 'Sanya', 'Zara',
       'Kiran', 'Kush', 'Damini', 'Rudransh', 'Pratham', 'Raghavendra',
       'Siddharth', 'Rahil', 'Charvi', 'Yuvraj', 'Advait', 'Palak',
       'Sanket', 'Ritik', 'Vibha', 'Harshil', 'Tushar', 'Anushka',
       'Rashi', 'Monika', 'Aaradhya', 'Ranveer', 'Leela', 'Suhani',
       'Shlok', 'Rhea', 'Gagan', 'Tanisha', 'Tina', 'Lakshay', 'Vidya',
       'Jai', 'Ayush', 'Vivaan', 'Anaya', 'Mayank', 'Nisha', 'Krishna',
       'Trisha', 'Nirvaan', 'Chirag', 'Rishi', 'Kashish', 'Parth',
       'Tanya', 'Neil', 'Ritika', 'Kunal', 'Armaan', 'Prachi', 'Khushi',
       'Riya', 'Janvi', 'Vikram', 'Mihir', 'Anand', 'Rohan', 'Shivam',
       'Harsha', 'Yogesh', 'Ila', 'Nikhil', 'Eshita', 'Nikita', 'Bhavna',
       'Manan', 'Kian', 'Pranav', 'Aarush', 'Abhishek', 'Shreya',
       'Tanmay', 'Shiv', 'Prisha', 'Vedant', 'Vani', 'Vidhi', 'Shaurya',
       'Diya', 'Utkarsh', 'Barkha', 'Avni', 'Vaishnavi', 'Ishaani',
       'Navya', 'De

In [4]:
def categorize_by_name(name):
  """Bucket a name by its first letter into 'A-M' or 'N-Z'.

  The comparison is case-insensitive and ignores leading whitespace, so
  'aarav' and ' Aarav' land in 'A-M' exactly like 'Aarav'.

  Args:
    name: The name to classify (a string).

  Returns:
    'A-M' when the first character is a letter from A to M, otherwise 'N-Z'.
    'N-Z' is the catch-all bucket: it also covers empty strings and names
    that do not start with an ASCII letter.
  """
  # [:1] instead of [0] so an empty name yields '' rather than an IndexError.
  first_letter = name.strip()[:1].upper()
  if 'A' <= first_letter <= 'M':
    return 'A-M'
  return 'N-Z'

# Tag every row with its first-letter bucket, then count the rows per bucket.
data['Names_grouped'] = data['Name'].map(categorize_by_name)
data['Names_grouped'].value_counts()

Unnamed: 0_level_0,count
Names_grouped,Unnamed: 1_level_1
A-M,1282
N-Z,1274


In [5]:
# Cross-tabulate the first-letter name bucket against the Depression label.
# NOTE: despite its name, this table is keyed on 'Depression', not on suicidal thoughts.
name_group_by_suicide = pd.crosstab(
    index=data['Names_grouped'],
    columns=data['Depression'],
)
name_group_by_suicide
# The name does not seem to matter: both buckets show almost identical
# Depression counts. Gender is already available as its own column, so the
# name can be dropped.
# A title such as "Sir" or "Mrs" inside a name might have carried information,
# but none of the names satisfy that.
# Gender is kept, and the model will learn on its own how significant it is;
# there is no need to split the names in any other fashion.

Depression,No,Yes
Names_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1
A-M,1051,231
N-Z,1050,224


In [6]:
# Working frame without the name columns; a non-inplace drop returns a new
# DataFrame, so `data` itself is left untouched.
data_in_work = data.drop(columns=['Name', 'Names_grouped'])
data_in_work.head()

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Female,37,Ghaziabad,Working Professional,Teacher,,2.0,,,4.0,7-8 hours,Moderate,MA,No,6,2,No,No
1,Male,60,Kalyan,Working Professional,Financial Analyst,,4.0,,,3.0,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No
2,Female,42,Bhopal,Working Professional,Teacher,,2.0,,,3.0,5-6 hours,Moderate,M.Com,No,0,2,No,No
3,Female,44,Thane,Working Professional,Teacher,,3.0,,,5.0,7-8 hours,Healthy,MD,Yes,1,2,Yes,No
4,Male,48,Indore,Working Professional,UX/UI Designer,,4.0,,,3.0,7-8 hours,Moderate,BE,Yes,6,5,Yes,No


In [7]:
# Gender should be one-hot encoded.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
gender_encoded = ohe.fit_transform(data_in_work[['Gender']])
# Reuse the source frame's index: pd.concat(axis=1) aligns rows on index labels,
# so an encoded frame with a fresh RangeIndex would be misaligned (and padded
# with NaN) whenever data_in_work does not carry the default 0..n-1 index.
gender_encoded_df = pd.DataFrame(
    gender_encoded,
    columns=ohe.get_feature_names_out(['Gender']),
    index=data_in_work.index,
)
# Append the indicator columns and remove the original text column in one step.
data_in_work = pd.concat([data_in_work, gender_encoded_df], axis=1).drop(columns=['Gender'])
data_in_work.head()

Unnamed: 0,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Gender_Female,Gender_Male
0,37,Ghaziabad,Working Professional,Teacher,,2.0,,,4.0,7-8 hours,Moderate,MA,No,6,2,No,No,1.0,0.0
1,60,Kalyan,Working Professional,Financial Analyst,,4.0,,,3.0,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No,0.0,1.0
2,42,Bhopal,Working Professional,Teacher,,2.0,,,3.0,5-6 hours,Moderate,M.Com,No,0,2,No,No,1.0,0.0
3,44,Thane,Working Professional,Teacher,,3.0,,,5.0,7-8 hours,Healthy,MD,Yes,1,2,Yes,No,1.0,0.0
4,48,Indore,Working Professional,UX/UI Designer,,4.0,,,3.0,7-8 hours,Moderate,BE,Yes,6,5,Yes,No,0.0,1.0


In [8]:
"""
The dataset itself is pretty small, and it very likely contains mostly linear
relationships, so we will first try a decision tree (DT) with an extensive grid search,
then an SVC; if these two do not yield proper results, some relatively broad and
shallow DL models will be tried. An ensemble might also be a good idea; I'll see.
The data is in tabular form, seems to contain mostly linear relationships, and is
not big enough for big models. The targets are also imbalanced, so a model is slightly biased towards
saying that a particular patient does not have depression based solely on
the ratio of 1/0 targets (roughly 1 to 5), which might also hurt a DL model
more than a classical ML model.

"""
# (data['CGPA'].unique() < 5.5).sum()
def categorize_by_cgpa(cgpa):
  """Bucket a CGPA value into a one-point-wide string label.

  Args:
    cgpa: A numeric CGPA, or a missing value (NaN, None or pd.NA).

  Returns:
    'NaN' for missing values, '<5' for values below 5, one of '5-6', '6-7',
    '7-8', '8-9', '9-10' for values in the half-open interval
    [lower, lower + 1), and '10+' for values of 10 or more.
  """
  # pd.isna also accepts None / pd.NA, for which np.isnan raises a TypeError.
  if pd.isna(cgpa):
    return 'NaN'
  # Values under 5 used to fall through to the final branch and were
  # mislabelled '10+'; they now get a bucket of their own.
  if cgpa < 5:
    return '<5'
  for lower in range(5, 10):
    if lower <= cgpa < lower + 1:
      return f'{lower}-{lower + 1}'
  return '10+'

In [9]:
# Label every row with its CGPA bucket, then count the rows per bucket.
data_in_work['cgpa_grouped'] = data_in_work['CGPA'].map(categorize_by_cgpa)
data_in_work['cgpa_grouped'].value_counts()

Unnamed: 0_level_0,count
cgpa_grouped,Unnamed: 1_level_1
,2054
8-9,111
5-6,104
9-10,103
7-8,102
6-7,80
10+,2


In [10]:
# Mean CGPA over the rows that have a value (Series.mean skips NaN by default).
data_in_work['CGPA'].mean()

7.567808764940239

In [11]:
"""
Now we shall check (we have sufficient data for it, and it is also a good idea to
verify that our data is not biased in any way) which of the groups has the highest probability
of illness; people without it should not be judged differently. I will then decide whether the mean is a good idea.
"""
# Depression counts per CGPA bucket, plus the share of 'Yes' rows within each bucket.
cgba_and_depression = pd.crosstab(
    index=data_in_work['cgpa_grouped'],
    columns=data_in_work['Depression'],
).assign(yes_ratio=lambda counts: counts['Yes'] / (counts['Yes'] + counts['No']))
cgba_and_depression
# CGPA does not seem to show a linear trend across its buckets, but it could
# still have some influence on, or correlation with, other variables.

Depression,No,Yes,yes_ratio
cgpa_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10+,1,1,0.5
5-6,56,48,0.461538
6-7,37,43,0.5375
7-8,57,45,0.441176
8-9,47,64,0.576577
9-10,52,51,0.495146
,1851,203,0.098832


In [13]:
# data_in_work.corr()
# Before this, the values have to be changed to numeric ones.
# For sleep duration I will take the mean of ([0] and [3]); when only an 8 is mentioned I assume 9
# (only people confident in that statement would vouch for it, hence not 8.5).
# For less than 5 I will take 4.5; four is too low.
data_in_work['Sleep Duration'].value_counts()

Unnamed: 0_level_0,count
Sleep Duration,Unnamed: 1_level_1
7-8 hours,658
Less than 5 hours,648
5-6 hours,628
More than 8 hours,622


In [32]:
# The 'Sleep Duration' column has no NaNs in this dataset, so a small function is
# enough to derive a numeric sleep-duration (sd) value from each text category.
# Author's note: this numeric mapping was later judged invalid/redundant, with
# one-hot encoding named as the intended replacement.
def adjust_sleep_duration(sd):
  """Map a sleep-duration text category to an approximate number of hours.

  The category is recognised by the digits it contains:
    contains '5' and '6'    (e.g. '5-6 hours')          -> 5.5
    contains '5' but no '6' (e.g. 'Less than 5 hours')  -> 4.5
    contains '7' and '8'    (e.g. '7-8 hours')          -> 7.5
    contains '8' but no '7' (e.g. 'More than 8 hours')  -> 9.0

  Args:
    sd: The category text. Non-string values (e.g. NaN) are tolerated.

  Returns:
    The hours as a float, or NaN when `sd` is not a string or matches none of
    the patterns above.
  """
  if not isinstance(sd, str):
    # `'5' in sd` raises TypeError on NaN/None; report those as unknown instead.
    return float('nan')
  if '5' in sd:
    return 5.5 if '6' in sd else 4.5
  if '8' in sd:
    # The open-ended "more than 8" category is represented by 9; it is returned
    # as a float so the result type does not vary between categories.
    return 7.5 if '7' in sd else 9.0
  # Explicit "unknown" marker instead of an implicit None.
  return float('nan')

In [33]:
# data_in_work['sd_prepared'] = data_in_work['Sleep Duration'].apply(adjust_sleep_duration)
# data_in_work['sd_prepared'].value_counts()

Unnamed: 0_level_0,count
sd_prepared,Unnamed: 1_level_1
7.5,658
4.5,648
5.5,628
9.0,622


In [None]:
# Numeric sleep duration derived from the 'Sleep Duration' text categories.
data_in_work['sd_prepared'] = data_in_work['Sleep Duration'].apply(adjust_sleep_duration)

In [34]:
# Depression label counts for each numeric sleep-duration value.
pd.crosstab(
    index=data_in_work['Depression'],
    columns=data_in_work['sd_prepared'],
)

sd_prepared,4.5,5.5,7.5,9.0
Depression,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,507,519,544,531
Yes,141,109,114,91


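The more sleep, the less depression: the share of "Yes" falls from about 22% for less than 5 hours of sleep to about 15% for more than 8 hours.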

In [36]:
# The text sleep column and the CGPA bucket helper are no longer needed.
data_in_work = data_in_work.drop(columns=['Sleep Duration', 'cgpa_grouped'])

In [37]:
# Preview the first rows of the working frame after the column changes.
data_in_work.head()

Unnamed: 0,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Gender_Female,Gender_Male,sd_prepared
0,37,Ghaziabad,Working Professional,Teacher,,2.0,,,4.0,Moderate,MA,No,6,2,No,No,1.0,0.0,7.5
1,60,Kalyan,Working Professional,Financial Analyst,,4.0,,,3.0,Unhealthy,B.Com,Yes,0,4,Yes,No,0.0,1.0,5.5
2,42,Bhopal,Working Professional,Teacher,,2.0,,,3.0,Moderate,M.Com,No,0,2,No,No,1.0,0.0,5.5
3,44,Thane,Working Professional,Teacher,,3.0,,,5.0,Healthy,MD,Yes,1,2,Yes,No,1.0,0.0,7.5
4,48,Indore,Working Professional,UX/UI Designer,,4.0,,,3.0,Moderate,BE,Yes,6,5,Yes,No,0.0,1.0,7.5
