In [1]:
library(tidyverse) 
library(data.table) 
library(patchwork) 

## Theme 1
## Shared plot theme: theme_classic() with larger text and horizontal
## major grid lines.
## NOTE(review): assumes the "Econ Sans Cnd" font families are available to
## the graphics device -- confirm in the target environment.
theme_custom <- theme_classic() +
  theme(
    ## Title, subtitle, caption (all left-aligned via hjust = 0)
    plot.title = element_text(
      family = "Econ Sans Cnd bold", face = "bold", size = 22,
      hjust = 0, vjust = 1, margin = margin(b = 10)
    ),
    plot.subtitle = element_text(
      family = "Econ Sans Cnd regular", size = 18, hjust = 0
    ),
    plot.caption = element_text(
      family = "Econ Sans Cnd light", face = "bold", size = 12,
      hjust = 0, colour = "#6F8793"
    ),
    ## Axis tick labels
    axis.text.x = element_text(family = "Econ Sans Cnd light", size = 18),
    axis.text.y = element_text(family = "Econ Sans Cnd light", size = 18),
    ## Legend: placed above the panel, no title
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 15, hjust = 0, vjust = .1),
    ## Horizontal major grid lines
    panel.grid.major.y = element_line(color = "#6F8793")
  )
               

## Set the notebook plot size by writing the `repr.plot.width` and
## `repr.plot.height` options.
##   width  : value stored in `repr.plot.width`  (default 22)
##   heigth : value stored in `repr.plot.height` (default 8); the spelling is
##            part of the signature and is kept so existing calls still work
## Returns whatever options() returns (the previous values, invisibly).
## NOTE(review): units are presumably inches, as read by the notebook's plot
## renderer -- confirm.
figsize <- function(width = 22, heigth = 8) {
  options(
    repr.plot.width = width,
    repr.plot.height = heigth
  )
}

## Apply the default size for the rest of the notebook.
figsize()

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     


── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors



Attaching package: ‘data.table’




The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year




The following objects are masked from ‘package:dplyr’:

    between, first, last




The following object is masked from ‘package:purrr’:

    transpose




# Reading Data

In [2]:
## Load the original survey dataset plus the competition train/test split
## and the sample submission file.
original <- read_csv(
  "/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv"
)
train <- read_csv(
  "/kaggle/input/playground-series-s4e11/train.csv"
)
test <- read_csv(
  "/kaggle/input/playground-series-s4e11/test.csv"
)
sample_submission <- read_csv(
  "/kaggle/input/playground-series-s4e11/sample_submission.csv"
)

## Keep the test ids aside; the `id` column is dropped from `test` later on.
test_id <- test[["id"]]

Rows: 2556 Columns: 19


── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): Name, Gender, City, Working Professional or Student, Profession, S...
dbl  (8): Age, Academic Pressure, Work Pressure, CGPA, Study Satisfaction, J...



ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


Rows: 140700 Columns: 20


── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Name, Gender, City, Working Professional or Student, Profession, S...
dbl (10): id, Age, Academic Pressure, Work Pressure, CGPA, Study Satisfactio...



ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


Rows: 93800 Columns: 19


── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Name, Gender, City, Working Professional or Student, Profession, S...
dbl  (9): id, Age, Academic Pressure, Work Pressure, CGPA, Study Satisfactio...



ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


Rows: 93800 Columns: 2


── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (2): id, Depression



ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
## Preview the first five rows of the original survey data.
head(original, n = 5)

Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>
Pooja,Female,37,Ghaziabad,Working Professional,Teacher,,2,,,4,7-8 hours,Moderate,MA,No,6,2,No,No
Reyansh,Male,60,Kalyan,Working Professional,Financial Analyst,,4,,,3,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No
Manvi,Female,42,Bhopal,Working Professional,Teacher,,2,,,3,5-6 hours,Moderate,M.Com,No,0,2,No,No
Isha,Female,44,Thane,Working Professional,Teacher,,3,,,5,7-8 hours,Healthy,MD,Yes,1,2,Yes,No
Aarav,Male,48,Indore,Working Professional,UX/UI Designer,,4,,,3,7-8 hours,Moderate,BE,Yes,6,5,Yes,No


In [4]:
## Preview the first five rows of the training data.
head(train, n = 5)

id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>
0,Aaradhya,Female,49,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1,2,No,0
1,Vivan,Male,26,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7,3,No,1
2,Yuvraj,Male,33,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3,1,No,1
3,Yuvraj,Male,22,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10,1,Yes,1
4,Rhea,Female,30,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9,4,Yes,0


In [5]:
## Preview the first five rows of the test data.
head(test, n = 5)

id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
140700,Shivam,Male,53,Visakhapatnam,Working Professional,Judge,,2.0,,,5.0,Less than 5 hours,Moderate,LLB,No,9,3,Yes
140701,Sanya,Female,58,Kolkata,Working Professional,Educational Consultant,,2.0,,,4.0,Less than 5 hours,Moderate,B.Ed,No,6,4,No
140702,Yash,Male,53,Jaipur,Working Professional,Teacher,,4.0,,,1.0,7-8 hours,Moderate,B.Arch,Yes,12,4,No
140703,Nalini,Female,23,Rajkot,Student,,5.0,,6.84,1.0,,More than 8 hours,Moderate,BSc,Yes,10,4,No
140704,Shaurya,Male,47,Kalyan,Working Professional,Teacher,,5.0,,,5.0,7-8 hours,Moderate,BCA,Yes,3,4,No


In [6]:
## Tag every row with the dataset it came from so the three sources can be
## told apart after they are stacked.
original <- original %>%
  mutate(data_indicator = "original")

## `id` is dropped because `original` has no such column.
train <- train %>%
  select(-id) %>%
  mutate(data_indicator = "train")

## The test set has no label; -999 is used as a placeholder so the columns
## line up with the other two frames.
test <- test %>%
  mutate(
    data_indicator = "test",
    Depression = -999
  ) %>%
  select(-id)

## Stack original, train, and test into one data frame.
## `Depression` is character ("Yes"/"No") in `original` but numeric in
## `train`/`test`; after rbind() the combined column is character.
full <- rbind(original, train, test)

In [7]:
## Check both ends of the combined frame (the first rows come from
## `original`, the last rows from `test`).
head(full, n = 3)
tail(full, n = 3)

Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,data_indicator
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
Pooja,Female,37,Ghaziabad,Working Professional,Teacher,,2,,,4,7-8 hours,Moderate,MA,No,6,2,No,No,original
Reyansh,Male,60,Kalyan,Working Professional,Financial Analyst,,4,,,3,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No,original
Manvi,Female,42,Bhopal,Working Professional,Teacher,,2,,,3,5-6 hours,Moderate,M.Com,No,0,2,No,No,original


Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,data_indicator
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
Rishi,Male,24,Visakhapatnam,Student,,1.0,,7.51,4.0,,7-8 hours,Moderate,B.Tech,No,7,1,No,-999,test
Eshita,Female,23,Kalyan,Working Professional,Marketing Manager,,4.0,,,2.0,5-6 hours,Healthy,BA,Yes,7,5,Yes,-999,test
Gauri,Female,43,Varanasi,Working Professional,Educational Consultant,,5.0,,,2.0,More than 8 hours,Healthy,B.Ed,No,11,2,No,-999,test


In [8]:
## Clean the column names in two steps: every punctuation character becomes
## a space, then every space becomes an underscore.
## Note: a name ending in " ?" therefore ends in two underscores.
full <- rename_with(full, function(col_names) {
  no_punct <- gsub("[[:punct:]]", " ", col_names)
  gsub(" ", "_", no_punct)
})

In [9]:
## List the cleaned column names.
colnames(full)

In [10]:
## Count missing values per column, largest first.
## apply(..., sum) over the logical is.na() matrix gives one integer count
## per column, named by column; data.frame() turns those names into row
## names, which are then moved into a regular `rowname` column.
data.frame(NA_count = apply(is.na(full), 2, sum)) %>%
  rownames_to_column() %>%
  arrange(desc(NA_count))

rowname,NA_count
<chr>,<int>
Academic_Pressure,189890
CGPA,189890
Study_Satisfaction,189890
Profession,61935
Work_Pressure,47198
Job_Satisfaction,47186
Dietary_Habits,9
Degree,4
Financial_Stress,4
Name,0


In [11]:
## Inspects duplicate count

In [12]:
## Encodes 'Depression' feature

In [13]:
## Rename features: replace spaces with _ and strip punctuation

# An Investigation on Numerical Features

# An Investigation on Categorical Features