Manual Feature Engineering

In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [100]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [101]:
train.describe()

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,140700.0,140700.0,27897.0,112782.0,27898.0,27897.0,112790.0,140700.0,140696.0,140700.0
mean,70349.5,40.388621,3.142273,2.998998,7.658636,2.94494,2.974404,6.252679,2.988983,0.181713
std,40616.735775,12.384099,1.380457,1.405771,1.464466,1.360197,1.416078,3.853615,1.413633,0.385609
min,0.0,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,35174.75,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,70349.5,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,105524.25,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,140699.0,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


In [102]:
train.drop(['id', 'Name'], axis=1, inplace=True)

In [103]:
train.isnull().sum()

Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

For 'Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction',  you can just map Nan values to 0.

In [104]:
train['Academic Pressure'] = train['Academic Pressure'].fillna(0)
train['Work Pressure'] = train['Work Pressure'].fillna(0)
train['Study Satisfaction'] = train['Study Satisfaction'].fillna(0)
train['Job Satisfaction'] = train['Job Satisfaction'].fillna(0)

In [105]:
train['Profession'] = train['Profession'].fillna('None')
train['CGPA'] = train['CGPA'].fillna(-1)
train['Dietary Habits'] = train['Dietary Habits'].fillna('Not Mentioned')
train['Degree'] = train['Degree'].fillna('None')
train['Financial Stress'] = train['Financial Stress'].fillna(3)
train.isnull().sum()

Gender                                   0
Age                                      0
City                                     0
Working Professional or Student          0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [106]:
train.dtypes

Gender                                    object
Age                                      float64
City                                      object
Working Professional or Student           object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness          object
Depression                                 int64
dtype: object

In [107]:
print("Gender:", train['Gender'].unique())
print("City:", train['City'].nunique(), "Unique values")
print("P or S:", train['Working Professional or Student'].unique())
print("Profession:", train['Profession'].nunique(), "Unique values")
print("Sleep Duration:", train['Sleep Duration'].unique())
print("Dietary Habits:", train['Dietary Habits'].unique())
print("Degree:", train['Degree'].nunique(), "Unique values")
print("Suicidal Thoughts:", train['Have you ever had suicidal thoughts ?'].unique())
print("Family History:", train['Family History of Mental Illness'].unique())

Gender: ['Female' 'Male']
City: 98 Unique values
P or S: ['Working Professional' 'Student']
Profession: 65 Unique values
Sleep Duration: ['More than 8 hours' 'Less than 5 hours' '5-6 hours' '7-8 hours'
 'Sleep_Duration' '1-2 hours' '6-8 hours' '4-6 hours' '6-7 hours'
 '10-11 hours' '8-9 hours' '40-45 hours' '9-11 hours' '2-3 hours'
 '3-4 hours' 'Moderate' '55-66 hours' '4-5 hours' '9-6 hours' '1-3 hours'
 'Indore' '45' '1-6 hours' '35-36 hours' '8 hours' 'No' '10-6 hours'
 'than 5 hours' '49 hours' 'Unhealthy' 'Work_Study_Hours' '3-6 hours'
 '45-48 hours' '9-5' 'Pune' '9-5 hours']
Dietary Habits: ['Healthy' 'Unhealthy' 'Moderate' 'Yes' 'Pratham' 'BSc' 'Gender' '3'
 'More Healthy' 'Less than Healthy' 'Mihir' '1.0' 'Hormonal' 'Electrician'
 'Not Mentioned' 'No Healthy' 'Less Healthy' 'M.Tech' 'Vegas' 'No' 'Male'
 'Indoor' 'Class 12' '2']
Degree: 116 Unique values
Suicidal Thoughts: ['No' 'Yes']
Family History: ['No' 'Yes']


Some features you can encode with just 1's and 0's. Some features need to be cleaned and some have a lot of different values.

In [108]:
train['Gender'] = train['Gender'].map({'Female': 1, 'Male': 0})
train['Working Professional or Student'] = train['Working Professional or Student'].map({'Working Professional': 0, 'Student': 1})
train['Have you ever had suicidal thoughts ?'] = train['Have you ever had suicidal thoughts ?'].map({'No': 0, 'Yes': 1})
train['Family History of Mental Illness'] = train['Family History of Mental Illness'].map({'No': 0, 'Yes': 1})

'Sleep Duration' has a lot of garbage answers. Using my intuition to map these values. I will fill the garbage values with the mean after I map the proper answers.

In [109]:

sleep_map = {
    'More than 8 hours': 9,
    'Less than 5 hours': 4,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    '1-2 hours': 1.5,
    '6-8 hours': 7.0,
    '4-6 hours': 5.0,
    '6-7 hours': 6.5,
    '10-11 hours': 10.5,
    '8-9 hours': 8.5,
    '9-11 hours': 10.0,
    '2-3 hours': 2.5,
    '3-4 hours': 3.5,
    '4-5 hours': 4.5,
    '9-6 hours': 7.5,
    '1-3 hours': 2.0,
    '1-6 hours': 3.5,
    '8 hours': 8.0,
    '10-6 hours': 8.0,
    '3-6 hours': 4.5,
    '9-5': 7.0,
    '9-5 hours': 7.0,
    'Unhealthy': 3
}

garbage_values = {
    'Sleep_Duration', 'Moderate', 'Work_Study_Hours',
    'Indore', '45', 'Pune', 'No', 'than 5 hours', '55-66 hours',
    '35-36 hours', '45-48 hours', '40-45 hours', '49 hours'
}

train['Sleep Duration'] = train['Sleep Duration'].map(sleep_map)
train.loc[train['Sleep Duration'].isin(garbage_values), 'Sleep Duration'] = np.nan
train['Sleep Duration'] = train['Sleep Duration'].fillna(train['Sleep Duration'].median())
train['Sleep Duration']

0         9.0
1         4.0
2         5.5
3         4.0
4         5.5
         ... 
140695    5.5
140696    7.5
140697    9.0
140698    5.5
140699    4.0
Name: Sleep Duration, Length: 140700, dtype: float64

In [110]:
train['Dietary Habits'].value_counts()

Dietary Habits
Moderate             49705
Unhealthy            46227
Healthy              44741
Not Mentioned            4
Yes                      2
No                       2
More Healthy             2
Class 12                 1
Indoor                   1
Male                     1
Vegas                    1
M.Tech                   1
Less Healthy             1
No Healthy               1
Hormonal                 1
Electrician              1
1.0                      1
Mihir                    1
Less than Healthy        1
3                        1
Gender                   1
BSc                      1
Pratham                  1
2                        1
Name: count, dtype: int64

In [111]:
#Dietary Habits: ['Healthy' 'Unhealthy' 'Moderate' 'Yes' 'Pratham' 'BSc' 'Gender' '3'
# 'More Healthy' 'Less than Healthy' 'Mihir' '1.0' 'Hormonal' 'Electrician'
# 'Not Mentioned' 'No Healthy' 'Less Healthy' 'M.Tech' 'Vegas' 'No' 'Male'
# 'Indoor' 'Class 12' '2']
dietary_map = {
    'Unhealthy': 0,
    'Moderate': 1,
    'Healthy': 2,
    'Yes': 2,
    '3': 0, # I'm assuming they are saying 3/10
    'More Healthy': 2,
    'Less than Healthy': 0,
    '1.0': 0,
    'Hormonal': 1,
    'Electrician': 0, # Joking about electricians having difficult jobs? Or maybe they mixed up the questions somehow. Either cacse they are probably unhealthy.
    'No Healthy': 0, # Self explanatory
    'Less Healthy': 0,
    'No': 0,
    '2': 0
}

garbage_values = {
    'Pratham',
    'BSc',
    'Gender', # How does someone even come up with this answer
    'Mihir',
    'Not Mentioned',
    'M.tech',
    'Vegas',
    'Male',
    'Indoor',
    'Class 12'
}

train['Dietary Habits'] = train['Dietary Habits'].map(dietary_map)
train.loc[train['Dietary Habits'].isin(garbage_values), 'Dietary Habits'] = np.nan
train['Dietary Habits'] = train['Dietary Habits'].fillna(1)

train['Dietary Habits'] = train['Dietary Habits'].astype('int8')

train['Dietary Habits']


0         2
1         0
2         2
3         1
4         0
         ..
140695    0
140696    1
140697    1
140698    1
140699    2
Name: Dietary Habits, Length: 140700, dtype: int8

In [112]:
pd.set_option('display.max_columns', None)
train['Profession'].value_counts().to_frame().T

Profession,None,Teacher,Content Writer,Architect,Consultant,HR Manager,Pharmacist,Doctor,Business Analyst,Entrepreneur,Chemist,Chef,Educational Consultant,Data Scientist,Researcher,Lawyer,Customer Support,Marketing Manager,Pilot,Travel Consultant,Plumber,Sales Executive,Manager,Judge,Electrician,Financial Analyst,Software Engineer,Civil Engineer,UX/UI Designer,Digital Marketer,Accountant,Finanancial Analyst,Mechanical Engineer,Graphic Designer,Research Analyst,Investment Banker,Student,Academic,Unemployed,Profession.1,Yogesh,BCA,MBA,LLM,PhD,Patna,Analyst,Pranav,Visakhapatnam,M.Ed,Moderate,Nagpur,B.Ed,Unveil,BBA,MBBS,Working Professional,Medical Doctor,City Manager,FamilyVirar,Dev,BE,B.Com,Family Consultant,Yuvraj
count,36630,24906,7814,4370,4229,4022,3893,3255,3161,2968,2967,2862,2852,2390,2328,2212,2055,1976,1913,1860,1748,1739,1737,1712,1582,1574,1510,1470,1452,1372,1339,1329,1177,1163,690,393,7,5,3,3,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Going to put similar professions into bins. I most likely misplaced something.

In [113]:
profession_map = {
    # Engineering/Tech
    'Software Engineer': 'Tech',
    'Civil Engineer': 'Tech',
    'Mechanical Engineer': 'Tech',
    'Electrical Engineer': 'Tech',
    'UX/UI Designer': 'Tech',
    'Graphic Designer': 'Tech',
    'Research Analyst': 'Tech',
    'Analyst': 'Tech',
    'Data Scientist': 'Tech',
    'Academic': 'Tech',
    'Researcher': 'Tech', # I was debating between tech and other

    # Finance/Business
    'Business Analyst': 'Business',
    'Financial Analyst': 'Finance',
    'Finanancial Analyst': 'Finance',  # spelling fix
    'Investment Banker': 'Finance',
    'Consultant': 'Business',
    'Manager': 'Business',
    'City Manager': 'Business',
    'Entrepreneur': 'Business',

    # Healthcare
    'Doctor': 'Healthcare',
    'Medical Doctor': 'Healthcare',
    'Pharmacist': 'Healthcare',
    'MBBS': 'Healthcare',
    'Chemist': 'Healthcare',

    # Education
    'Teacher': 'Education',
    'Educational Consultant': 'Education',
    'Academic': 'Education',

    # Legal
    'Lawyer': 'Legal',
    'Judge': 'Legal',

    # Marketing & Writing
    'Digital Marketer': 'Marketing',
    'Marketing Manager': 'Marketing',
    'Content Writer': 'Marketing',

    # Services
    'Customer Support': 'Service',
    'Sales Executive': 'Service',
    'Chef': 'Service',
    'Pilot': 'Service',
    'Travel Consultant': 'Service',
    'Plumber': 'Service',
    'Electrician': 'Service',
    'Family Consultant': 'Service',
    'HR Manager': 'Service'
}


other_professions = {
    'Profession', 'None', 'Moderate', 'Yogesh', 'Dev', 'Pranav', 'Unveil', 'Yuvraj',
    'BCA', 'MBA', 'LLM', 'M.Ed', 'B.Ed', 'PhD', 'BBA', 'BE', 'B.Com',
    'Nagpur', 'Patna', 'Visakhapatnam', 'FamilyVirar', 'Student', 'Unemployed', 'Working Professional'
}

train.loc[train['Profession'].isin(other_professions), 'Profession'] = np.nan
train['Profession'] = train['Profession'].map(profession_map)
train['Profession'] = train['Profession'].fillna('Other')

train['Profession'].value_counts()


Profession
Other         42379
Education     27763
Service       17782
Tech          12181
Business      12096
Marketing     11162
Healthcare    10117
Legal          3924
Finance        3296
Name: count, dtype: int64

In [114]:
le = LabelEncoder()
train['Profession'] = le.fit_transform(train['Profession'])
train['Profession'].value_counts()

Profession
6    42379
1    27763
7    17782
8    12181
0    12096
5    11162
3    10117
4     3924
2     3296
Name: count, dtype: int64

In [115]:
train['City'].unique()

array(['Ludhiana', 'Varanasi', 'Visakhapatnam', 'Mumbai', 'Kanpur',
       'Ahmedabad', 'Thane', 'Nashik', 'Bangalore', 'Patna', 'Rajkot',
       'Jaipur', 'Pune', 'Lucknow', 'Meerut', 'Agra', 'Surat',
       'Faridabad', 'Hyderabad', 'Srinagar', 'Ghaziabad', 'Kolkata',
       'Chennai', 'Kalyan', 'Nagpur', 'Vadodara', 'Vasai-Virar', 'Delhi',
       'Bhopal', 'Indore', 'Ishanabad', 'Vidhi', 'Ayush', 'Gurgaon',
       'Krishna', 'Aishwarya', 'Keshav', 'Harsha', 'Nalini', 'Aditya',
       'Malyansh', 'Raghavendra', 'Saanvi', 'M.Tech', 'Bhavna',
       'Less Delhi', 'Nandini', 'M.Com', 'Plata', 'Atharv', 'Pratyush',
       'City', '3.0', 'Less than 5 Kalyan', 'MCA', 'Mira', 'Moreadhyay',
       'Morena', 'Ishkarsh', 'Kashk', 'Mihir', 'Vidya', 'Tolkata', 'Anvi',
       'Krinda', 'Ayansh', 'Shrey', 'Ivaan', 'Vaanya', 'Gaurav', 'Harsh',
       'Reyansh', 'Kashish', 'Kibara', 'Vaishnavi', 'Chhavi', 'Parth',
       'Mahi', 'Tushar', 'MSc', 'No', 'Rashi', 'ME', 'Molkata',
       'Researcher', '

These are all cities in India. I am going to add a feature for crime.

In [None]:
valid_cities = {
    'Ludhiana', 'Varanasi', 'Visakhapatnam', 'Mumbai', 'Kanpur',
    'Ahmedabad', 'Thane', 'Nashik', 'Bangalore', 'Patna', 'Rajkot',
    'Jaipur', 'Pune', 'Lucknow', 'Meerut', 'Agra', 'Surat',
    'Faridabad', 'Hyderabad', 'Srinagar', 'Ghaziabad', 'Kolkata',
    'Chennai', 'Kalyan', 'Nagpur', 'Vadodara', 'Vasai-Virar', 'Delhi',
    'Bhopal', 'Indore', 'Gurgaon'
}

all_values = set(train['City'].unique())
garbage_cities = all_values - valid_cities
train.loc[~train['City'].isin(valid_cities), 'City'] = np.nan

# Frequency encoding cause I am going to rely on models like XGBoost
city_counts = train['City'].value_counts()
train['CityFreq'] = train['City'].map(city_counts).astype('float32')
train['CityFreq'] = train['CityFreq'].fillna(train['CityFreq'].mean())

crime_map = {
    'Kolkata': 86.5,       # lowest major city rate :contentReference[oaicite:1]{index=1}
    'Chennai': 169.2,      # one of safer cities :contentReference[oaicite:2]{index=2}
    'Surat': 215.3,        # in safe list :contentReference[oaicite:3]{index=3}
    'Pune': 256.8,         # safe list & NCRB data :contentReference[oaicite:4]{index=4}
    'Hyderabad': 259.9,    # safe list & NCRB data :contentReference[oaicite:5]{index=5}
    'Bangalore': 337.3,    # safe list & cities ranking :contentReference[oaicite:6]{index=6}
    'Ahmedabad': 360.1,    # safe list & state data :contentReference[oaicite:7]{index=7}
    'Mumbai': 376.3,       # safe list & NCRB data :contentReference[oaicite:8]{index=8}
    'Nagpur': 364.0,       # NCRB city data :contentReference[oaicite:9]{index=9}
    'Nasik': 389.2,        # NCRB city data :contentReference[oaicite:10]{index=10}
    'Rajkot': 401.6,       # NCRB city data :contentReference[oaicite:11]{index=11}
    'Meerut': 283.4,       # NCRB city data :contentReference[oaicite:12]{index=12}
    'Srinagar': 216.5,     # NCRB city data :contentReference[oaicite:13]{index=13}
    'Varanasi': 175.7,     # NCRB city data :contentReference[oaicite:14]{index=14}
    'Ludhiana': 252.5,     # NCRB city data :contentReference[oaicite:15]{index=15}
    'Vasai-Virar': 141.4,  # NCRB city data :contentReference[oaicite:16]{index=16}
    'Patna': 528.2,        # NCRB city data :contentReference[oaicite:17]{index=17}
    'Lucknow': 455.1,      # NCRB city data :contentReference[oaicite:18]{index=18}
    'Delhi': 1479.9        # NCRB city data (highest) :contentReference[oaicite:19]{index=19}
}

train['CityCrimeRate'] = train['City'].map(crime_map)
train['CityCrimeRate'] = np.log1p(train['CityCrimeRate'])
train['CityCrimeRate'] = train['CityCrimeRate'].fillna(train['CityCrimeRate'].median())

train.drop('City', axis=1,inplace=True)
train

Unnamed: 0,Gender,Age,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,CityFreq,CityCrimeRate
0,1,49.0,0,7,0.0,5.0,-1.00,0.0,2.0,9.0,2,BHM,0,1.0,2.0,0,0,5226.0,5.535364
1,0,26.0,0,1,0.0,4.0,-1.00,0.0,3.0,4.0,0,LLB,1,7.0,3.0,0,1,4606.0,5.174453
2,0,33.0,1,6,5.0,0.0,8.97,2.0,0.0,5.5,2,B.Pharm,1,3.0,1.0,0,1,5176.0,5.564137
3,0,22.0,0,1,0.0,5.0,-1.00,0.0,1.0,4.0,1,BBA,1,10.0,1.0,1,1,4966.0,5.933041
4,1,30.0,0,0,0.0,1.0,-1.00,0.0,1.0,5.5,0,BBA,1,9.0,4.0,1,0,4398.0,5.564137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,1,18.0,0,6,0.0,5.0,-1.00,0.0,4.0,5.5,0,Class 12,0,2.0,4.0,1,1,5613.0,5.889155
140696,1,41.0,0,5,0.0,5.0,-1.00,0.0,4.0,7.5,1,B.Tech,1,6.0,5.0,1,0,4496.0,5.564137
140697,1,24.0,0,5,0.0,3.0,-1.00,0.0,1.0,9.0,1,B.Com,0,4.0,4.0,0,0,5689.0,4.471639
140698,1,49.0,0,7,0.0,5.0,-1.00,0.0,2.0,5.5,1,ME,1,10.0,1.0,0,0,5074.0,5.382199


In [117]:
train['Degree'].value_counts().to_frame().T

Degree,Class 12,B.Ed,B.Arch,B.Com,B.Pharm,BCA,M.Ed,MCA,BBA,BSc,MSc,LLM,M.Pharm,M.Tech,B.Tech,LLB,BHM,MBA,BA,ME,MD,MHM,BE,PhD,M.Com,MBBS,MA,M.Arch,UX/UI Designer,B.Sc,Kalyan,M,LLBA,None,BArch,L.Ed,BPharm,P.Com,Nalini,BEd,B,Degree.1,Jhanvi,Bhopal,MEd,LL B.Ed,LLTech,M_Tech,5.88,Pihu,HCA,Marsh,Lata,S.Arch,BB,LHM,8.56,Entrepreneur,Aarav,B.Student,E.Tech,M.S,Navya,Mihir,RCA,B B.Com,LCA,N.Pharm,Doctor,CGPA,LLEd,LLS,Esha,Working Professional,Mthanya,B.3.79,K.Ed,Mahika,24,M. Business Analyst,Brithika,ACA,Badhya,HR Manager,Unite,P.Pharm,MPharm,Data Scientist,LL.Com,Business Analyst,H_Pharm,Class 11,20,S.Tech,Veda,BH,MPA,S.Pharm,Vrinda,Bhavesh,Brit,B.B.Arch,7.06,B BA,5.56,Ritik,B.03,5.61,0,Plumber,BPA,Vivaan,MTech,29,LLCom,Advait
count,14729,11691,8742,8113,5856,5739,5668,5234,5030,5027,4879,4647,4537,4475,4425,4348,4305,3818,3750,3632,3383,3286,3104,3103,3094,3082,2887,5,4,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [118]:
degree_map = {
    'Class 12': "Highschool",
    'Class 11': "Highschool",

    # Bachelor's
    'BSc': "Bachelor's",
    'B.Sc': "Bachelor's",
    'BA': "Bachelor's",
    'B.Com': "Bachelor's",
    'BBA': "Bachelor's",
    'B.Arch': "Bachelor's",
    'B.Tech': "Bachelor's",
    'BTech': "Bachelor's",
    'BE': "Bachelor's",
    'LLB': "Bachelor's",
    'B.Pharm': "Bachelor's",
    'BPharm': "Bachelor's",
    'B.Ed': "Bachelor's",
    'BHM': "Bachelor's",
    'BCA': "Bachelor's",
    'BPA': "Bachelor's",
    'MBBS': "Bachelor's",

    # Master's
    'MSc': "Master's",
    'M.Sc': "Master's",
    'MA': "Master's",
    'M.Com': "Master's",
    'MCA': "Master's",
    'MBA': "Master's",
    'M.Ed': "Master's",
    'M.Tech': "Master's",
    'ME': "Master's",
    'MHM': "Master's",
    'LLM': "Master's",
    'MPharm': "Master's",
    'M.Pharm': "Master's",
    'MArch': "Master's",
    'M.Arch': "Master's",
    'M.S': "Master's",
    'MPA': "Master's",

    # Doctorate
    'PhD': "Doctorate",
    'MD': "Doctorate",
    'Doctor': "Doctorate",
}

train['Degree'] = train['Degree'].map(degree_map).fillna('Unknown')
train['Degree'] = train['Degree'].map({'Unknown': 0, 'Highschool': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'Doctorate': 3})
# Mapping 'Unknown' to 0 along with 'Highschool'
train['Degree'].value_counts()




Degree
1    73218
2    46165
0    14830
3     6487
Name: count, dtype: int64

In [119]:
train.dtypes

Gender                                     int64
Age                                      float64
Working Professional or Student            int64
Profession                                 int32
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                           float64
Dietary Habits                              int8
Degree                                     int64
Have you ever had suicidal thoughts ?      int64
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness           int64
Depression                                 int64
CityFreq                                 float32
CityCrimeRate                            float64
dtype: object

In [120]:
train.isnull().sum()

Gender                                   0
Age                                      0
Working Professional or Student          0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
CityFreq                                 0
CityCrimeRate                            0
dtype: int64

Values are all numerical and null values are fixed.

In [121]:
y = train['Depression']
X = train.drop('Depression', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=62)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (112560, 18)
X_test shape: (28140, 18)
y_train shape: (112560,)
y_test shape: (28140,)


Parameters found in optuna.ipynb

In [122]:
params = {'n_estimators': 529, 
            'max_depth': 3, 
            'learning_rate': 0.1420285246256477, 
            'subsample': 0.6617422555272853, 
            'colsample_bytree': 0.6053925503341293, 
            'gamma': 1.7799292133978808, 
            'reg_alpha': 2.5110255593175745, 
            'reg_lambda': 3.4353207613114574,
            'random_state': 1006,
            'n_jobs':-1,
            "eval_metric": "logloss"
            }

final_model = Pipeline([
        ("xgb", XGBClassifier(**params))
])

print("Train accuracy:", np.mean(cross_val_score(final_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)))
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Train accuracy: 0.9393390191897654
Test accuracy: 0.940227434257285
