In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)
        
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Ignore every warning category (Warning is the base class of all warnings),
# so library deprecation/user warnings are hidden for the rest of the notebook.
warnings.filterwarnings("ignore", category=Warning)

# 1. data loading and preview

In [None]:
# Sample submission file; preview its shape and first two rows.
sample_df = pd.read_csv(
    '/kaggle/input/playground-series-s4e11/sample_submission.csv'
)
print(sample_df.shape)  # recorded run: (93800, 2)
sample_df.head(2)

In [None]:
# Test set; preview its shape and first two rows.
test_df = pd.read_csv(
    '/kaggle/input/playground-series-s4e11/test.csv'
)
print(test_df.shape)  # recorded run: (93800, 19)
test_df.head(2)

In [None]:
# Training set; preview its shape and first two rows.
train_df = pd.read_csv(
    '/kaggle/input/playground-series-s4e11/train.csv'
)
print(train_df.shape)  # recorded run: (140700, 20)
train_df.head(2)

In [None]:
# Print the dtype and non-null count of every training column.
train_df.info()
# numerical and categorical data with NaN values

# 2. data description
- CGPA : Cumulative Grade Point Average


# 3. EDA
## 1. Target, Depression ratio

In [None]:
def class_count_ratio(df, column):
    """Summarise how often each class of ``column`` occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose classes are counted.

    Returns
    -------
    pandas.DataFrame
        One row per class (in ``value_counts`` order) with the class label as
        the first column, followed by ``count`` and ``ratio``, where ``ratio``
        is ``count`` divided by the sum of all counts.
    """
    counts = df[column].value_counts()
    return (
        counts.to_frame(name='count')
        .assign(ratio=counts / counts.sum())
        .reset_index()
    )

In [None]:
# Target, Depression class, count and ratio
# (class balance of the label column in the training set)
class_count_ratio(train_df, 'Depression')

## 2. Depression ratio by Gender

In [None]:
def depression_by_object_analysis(df, object_column, depression_column):
    """Summarise the target column per class of a grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    object_column : str
        Column whose classes define the groups.
    depression_column : str
        Target column that is summed per group; the sum equals the number of
        positive rows when the column holds 0/1 values.

    Returns
    -------
    pandas.DataFrame
        One row per class (in ``value_counts`` order) with the class label as
        the first column, followed by ``count`` (rows in the class),
        ``depression_count`` (sum of the target in the class) and
        ``depression_ratio`` (``depression_count / count``).
    """
    rows_per_class = df[object_column].value_counts()
    target_sum_per_class = df.groupby(object_column)[depression_column].sum()

    summary = rows_per_class.to_frame(name='count')
    # Assignment aligns on the class labels in the index, not on position.
    summary = summary.assign(depression_count=target_sum_per_class)
    summary = summary.assign(
        depression_ratio=summary['depression_count'] / summary['count']
    )
    return summary.reset_index()

In [None]:
# Depression_by_Gender
# Row count, summed target and their ratio for each Gender class.
depression_by_object_analysis(train_df, 'Gender', 'Depression')
# Men have slightly higher depression_ratio than women.

## 3. Depression count by Age

In [None]:
# Countplot of Depression by Age
# Cast Age to int on a plotting copy only (presumably so the tick labels
# render as whole numbers); train_df itself is left untouched so that this
# EDA cell does not silently change the data used by later cells.
age_plot_df = train_df.assign(Age=train_df['Age'].astype(int))
plt.figure(figsize=(14, 2))
sns.countplot(x="Age", hue='Depression', data=age_plot_df)
plt.title("Countplot of Depression by Age")
plt.show()
# The younger the generation, the higher the rate of depression.

## 4. Depression ratio by City

In [None]:
# Row counts per City, split by the Depression label.
fig, ax = plt.subplots(figsize=(16, 2))
sns.countplot(data=train_df, x="City", hue='Depression', ax=ax)
ax.set_title("Countplot of Depression by City")
plt.setp(ax.get_xticklabels(), rotation=75, ha='right')
plt.show()

In [None]:
# Per-city summary, restricted to cities that have at least 6 rows.
depression_by_city = depression_by_object_analysis(
    df=train_df, object_column='City', depression_column='Depression')
depression_by_city_filt = depression_by_city.loc[depression_by_city['count'].ge(6)]
print(depression_by_city_filt.shape)
depression_by_city_filt.head(1)

In [None]:
# Bar chart of depression_ratio per city, highest ratio first.
depression_by_city_filt = depression_by_city_filt.sort_values(
    'depression_ratio', ascending=False)

fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_city_filt, x="City", y="depression_ratio", ax=ax)
ax.set_title("Depression ratio by city")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

## 5. Depression Ratio by Working Professional or Student

In [None]:
# Row count, summed target and their ratio for each class of
# 'Working Professional or Student'.
depression_by_object_analysis(train_df, 'Working Professional or Student', 'Depression')

## 6. Profession

In [None]:
# Row counts per Profession, split by the Depression label.
fig, ax = plt.subplots(figsize=(14, 2))
sns.countplot(data=train_df, x="Profession", hue='Depression', ax=ax)
ax.set_title("Countplot of Depression by Profession")
plt.setp(ax.get_xticklabels(), rotation=60, ha='right')
plt.show()

In [None]:
# Per-profession summary, restricted to professions that have at least 10 rows.
profession_by_depression = depression_by_object_analysis(
    df=train_df, object_column='Profession', depression_column='Depression')
profession_by_depression_filt = profession_by_depression.loc[
    profession_by_depression['count'].ge(10)]
print(profession_by_depression_filt.shape)
profession_by_depression_filt.head(5)

In [None]:
# Bar chart of depression_ratio per profession, highest ratio first.
profession_by_depression_filt = profession_by_depression_filt.sort_values(
    'depression_ratio', ascending=False)

fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=profession_by_depression_filt, x="Profession", y="depression_ratio", ax=ax)
ax.set_title("Depression Ratio by Profession")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()
# Author's note from the recorded run:
# Graphic Designer, Judge - Chemist, Pharmacist, Entrepreneur, Content Writer

## 7. Academic & Work Pressure

In [None]:
# Side-by-side countplots of the Depression label for the two pressure columns.
fig, axes = plt.subplots(1, 2, figsize=(14, 2))
for ax, pressure_column in zip(axes, ["Academic Pressure", "Work Pressure"]):
    sns.countplot(data=train_df, x=pressure_column, hue='Depression', ax=ax)
    ax.set_title(f"Countplot of Depression by {pressure_column}")

plt.tight_layout()
plt.show()

## 8. CGPA

In [None]:
# CGPA
# Count and share of each distinct CGPA value in the training set.
class_count_ratio(train_df, 'CGPA')

In [None]:
# Bucket the rows that have a CGPA into ten quantile groups labelled Q1..Q10.
CGPA_df = train_df[['CGPA', 'Depression']].dropna().reset_index(drop=True)
CGPA_df['CGPA_group'] = pd.qcut(
    CGPA_df['CGPA'], q=10, labels=[f'Q{decile}' for decile in range(1, 11)])
print(CGPA_df.shape)
CGPA_df.head()

In [None]:
# Per-group summary for the CGPA quantile groups, displayed sorted by group.
depression_by_CGPA_group = depression_by_object_analysis(
    df=CGPA_df, object_column='CGPA_group', depression_column='Depression')
depression_by_CGPA_group.sort_values('CGPA_group')

In [None]:
# Bar chart of depression_ratio per CGPA quantile group, in group order.
depression_by_CGPA_group = depression_by_CGPA_group.sort_values('CGPA_group')

fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_CGPA_group, x="CGPA_group", y="depression_ratio", ax=ax)
ax.set_title("Depression Ratio by CGPA_group")
plt.show()
# Students in the upper and middle CGPA (Q7, 8, 6) tend to be more depressed.

## 9. Study & Job Satisfaction

In [None]:
# Per-level summary for Study Satisfaction, highest satisfaction level first.
depression_by_study = depression_by_object_analysis(
    df=train_df, object_column='Study Satisfaction', depression_column='Depression')
depression_by_study.sort_values('Study Satisfaction', ascending=False)

In [None]:
# Per-level summary for Job Satisfaction, highest satisfaction level first.
depression_by_job = depression_by_object_analysis(
    df=train_df, object_column='Job Satisfaction', depression_column='Depression')
depression_by_job.sort_values('Job Satisfaction', ascending=False)

In [None]:
# Side-by-side countplots of the Depression label for the two satisfaction columns.
fig, axes = plt.subplots(1, 2, figsize=(14, 2))
for ax, satisfaction_column in zip(axes, ["Study Satisfaction", "Job Satisfaction"]):
    sns.countplot(data=train_df, x=satisfaction_column, hue='Depression', ax=ax)
    ax.set_title(f"Countplot of Depression by {satisfaction_column}")

plt.tight_layout()
plt.show()

## 10. Sleep Duration

In [None]:
# Three most frequent Sleep Duration classes with their count and ratio.
class_count_ratio(df=train_df, column='Sleep Duration').head(3)

In [None]:
# select Sleep Duration
# Keep only the rows whose Sleep Duration is one of the four listed classes;
# every other value (including NaN) is dropped from this analysis frame.
main_sleep_durations = ['Less than 5 hours', '5-6 hours', '7-8 hours', 'More than 8 hours']
sleep_df = train_df[
    train_df['Sleep Duration'].isin(main_sleep_durations)
].reset_index(drop=True)

print(sleep_df.shape)
sleep_df.head(1)

In [None]:
# Depression by selected Sleep Duration analysis
# Row count, summed target and their ratio for each retained Sleep Duration class.
depression_by_sleep_duration_filt = depression_by_object_analysis(sleep_df, 'Sleep Duration', 'Depression')
depression_by_sleep_duration_filt
# NOTE(review): the remark that used to follow this table compared men and
# women; it was carried over from the Gender cell and does not describe this
# Sleep Duration table.

In [None]:
# Bar chart of depression_ratio per Sleep Duration class, highest ratio first.
depression_by_sleep_duration_filt = depression_by_sleep_duration_filt.sort_values(
    'depression_ratio', ascending=False)

fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_sleep_duration_filt, x="Sleep Duration", y="depression_ratio", ax=ax)
ax.set_title("Depression Ratio by Sleep Duration")
plt.show()
# Getting enough sleep can help reduce depression.

## 11. Dietary Habits

In [None]:
# Three most frequent Dietary Habits classes with their count and ratio.
class_count_ratio(df=train_df, column='Dietary Habits').head(3)

In [None]:
# select main Dietary_Habits data
# Keep only the rows whose Dietary Habits is one of the three listed classes;
# every other value (including NaN) is dropped from this analysis frame.
main_dietary_habits = ['Moderate', 'Unhealthy', 'Healthy']
Dietary_Habits_filt = train_df[
    train_df['Dietary Habits'].isin(main_dietary_habits)
].reset_index(drop=True)

print(Dietary_Habits_filt.shape)
Dietary_Habits_filt.head(1)

In [None]:
# Per-class summary for the retained Dietary Habits classes, highest ratio first.
depression_by_object_analysis_filt = depression_by_object_analysis(
    df=Dietary_Habits_filt, object_column='Dietary Habits', depression_column='Depression')
depression_by_object_analysis_filt.sort_values('depression_ratio', ascending=False)

In [None]:
# Bar chart of depression_ratio per retained Dietary Habits class.
fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_object_analysis_filt, x="Dietary Habits", y="depression_ratio", ax=ax)
ax.set_title("Depression ratio by Dietary Habits")
plt.show()

## 12. Degree

In [None]:
# Count and share of every Degree class; show the five most frequent.
depression_by_degree = class_count_ratio(df=train_df, column='Degree')
print(depression_by_degree.shape)
depression_by_degree.head(5)
# total len of case 115, 27 classes have enough cases

In [None]:
# Per-degree summary, restricted to degrees that have at least 6 rows.
depression_by_degree = depression_by_object_analysis(
    df=train_df, object_column='Degree', depression_column='Depression')
depression_by_degree_filt = (
    depression_by_degree
    .loc[depression_by_degree['count'].ge(6)]
    .reset_index(drop=True)
)
print(depression_by_degree_filt.shape)
depression_by_degree_filt.head(1)

In [None]:
# Bar chart of depression_ratio per degree, highest ratio first.
depression_by_degree_filt = depression_by_degree_filt.sort_values(
    'depression_ratio', ascending=False)

fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_degree_filt, x="Degree", y="depression_ratio", ax=ax)
ax.set_title("Depression Ratio by Degree")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()


## 13. Have you ever had suicidal thoughts ?

In [None]:
# depression by "Have you ever had suicidal thoughts ?"
# Row count, summed target and their ratio for each answer class.
depression_by_object_analysis(train_df, 'Have you ever had suicidal thoughts ?', 'Depression')

## 14. Work/Study Hours

In [None]:
# Per-value summary for Work/Study Hours; show the five largest hour values.
depression_by_work_study_hours = depression_by_object_analysis(
    df=train_df, object_column='Work/Study Hours', depression_column='Depression')
depression_by_work_study_hours.sort_values('Work/Study Hours', ascending=False).head(5)
# too much Work/Study Hours increase depression ratio

In [None]:
# Bar chart of depression_ratio per Work/Study Hours value.
fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_work_study_hours, x="Work/Study Hours", y="depression_ratio", ax=ax)
ax.set_title("Depression Ratio by work/study hours")
plt.show()

## 15. Financial Stress

In [None]:
# Per-level summary for Financial Stress; show the five highest stress levels.
depression_by_Financial_Stress = depression_by_object_analysis(
    df=train_df, object_column='Financial Stress', depression_column='Depression')
depression_by_Financial_Stress.sort_values('Financial Stress', ascending=False).head(5)
# Financial Stress increase depression ratio

In [None]:
# Bar chart of depression_ratio per Financial Stress level.
fig, ax = plt.subplots(figsize=(14, 2))
sns.barplot(data=depression_by_Financial_Stress, x="Financial Stress", y="depression_ratio", ax=ax)
ax.set_title("Depression Ratio by Financial Stress")
plt.show()

## 16. Family History of Mental Illness

In [None]:
# depression_by_Family_Mental_Illness_History
# Row count, summed target and their ratio for each class of
# 'Family History of Mental Illness'.
depression_by_Family_Mental_Illness_History = depression_by_object_analysis(
    train_df, 'Family History of Mental Illness', 'Depression')
depression_by_Family_Mental_Illness_History
# NOTE(review): the remark that used to follow this table was about Financial
# Stress; it was carried over from the previous section and does not describe
# this Family History table.

# 4. feature engineering

In [None]:
# Split the raw frames into model inputs and label:
#   feature - training columns without id, Name and the label
#   target  - the label as a one-column DataFrame
#   test    - test columns without id and Name
feature = train_df.drop(columns=['id', 'Name', 'Depression'])
target = train_df[['Depression']]
test = test_df.drop(columns=['id', 'Name'])

feature.shape, target.shape, test.shape

In [None]:
# Number of missing entries in each training column.
missing_values = train_df.isna().sum()
print(missing_values)

## 1. Numerical data

In [None]:
# numeric variables in feature data
# 'number' selects every numeric dtype regardless of platform, whereas the
# 'int' / 'float' aliases resolve to one specific width each.
feature_num = feature.select_dtypes(include='number')

# Take the same columns, in the same order, from the test set so that both
# frames stay aligned for the scaler that is fitted on the training columns.
test_num = test[feature_num.columns]

feature_num.shape, test_num.shape
# (140700, 8), (93800, 8)

In [None]:
# count of NaN values
# Total number of missing cells across all numeric columns (train, test).
feature_num.isna().sum().sum(), test_num.isna().sum().sum()
# (394240, 262652)

In [None]:
# imputation for NaN
# from sklearn.impute import KNNImputer
# imputer = KNNImputer(n_neighbors=3)
# feature_num_imputed = imputer.fit_transform(feature_num)
# test_num_imputed = imputer.transform(test_num)
# feature_num_imputed = pd.DataFrame(feature_num_imputed, columns=feature_num.columns)
# test_num_imputed = pd.DataFrame(test_num_imputed, columns=test_num.columns)
# feature_num_imputed.shape, test_num_imputed.shape
# It's taking too long. So let's try using data with NaN.

## 2. Standardization

In [None]:
# Names of the numeric columns that are standardized in the next cell.
feature_num.columns

In [None]:
# Standardization
# Fit the scaler on the training columns only, then apply it to train and test
# using the SAME column list, so the per-column statistics learned from train
# are applied to the matching test columns.

from sklearn.preprocessing import StandardScaler

num_columns = feature_num.columns

scaler = StandardScaler()
scaler.fit(feature[num_columns])

feature_num_st = pd.DataFrame(scaler.transform(feature[num_columns]), columns=num_columns)
test_num_st = pd.DataFrame(scaler.transform(test[num_columns]), columns=num_columns)

print(feature_num_st.shape, test_num_st.shape)
feature_num_st.head()

In [None]:
# Fill remaining NaN values with the per-column median of the standardized
# TRAINING data. The same training medians are used for the test set so that
# both sets are imputed with identical values.
train_medians = feature_num_st.median()
feature_num_st = feature_num_st.fillna(train_medians)
test_num_st = test_num_st.fillna(train_medians)
print(feature_num_st.isna().sum().sum(), test_num_st.isna().sum().sum())

## 3. Categorical data

In [None]:
# categoric variables in feature data
# Work on explicit copies: later cells overwrite columns of these frames, and
# that must not depend on whether the selection is a view of feature / test.
feature_cat = feature.select_dtypes(include=['object']).copy()

# Take the same columns, in the same order, from the test set so the two
# frames always describe the same categorical variables.
test_cat = test[feature_cat.columns].copy()

feature_cat.shape, test_cat.shape
# ((140700, 9), (93800, 9))

In [None]:
# Number of distinct classes in each categorical training column.
for column, n_classes in feature_cat.nunique().items():
    print(f"{column} unique? {n_classes}")
# different classes in categorical columns

In [None]:
# Number of distinct classes in each categorical test column.
for column, n_classes in test_cat.nunique().items():
    print(f"{column} unique? {n_classes}")
# different classes in categorical columns

## 4. select common classes in feature and test 

In [None]:
# Shapes of the categorical train / test frames before class mapping.
feature_cat.shape, test_cat.shape
# (140700, 9), (93800, 9)

In [None]:
# Total number of missing cells in the categorical training frame.
feature_cat.isna().sum().sum()

In [None]:
# Print the dtype and non-null count of every categorical training column.
feature_cat.info()

In [None]:
# For every categorical column, collect the set of values that occur in BOTH
# the training and the test data.
common_classes_col1 = set(feature_cat['Gender']) & set(test_cat['Gender'])
common_classes_col2 = set(feature_cat['City']) & set(test_cat['City'])
common_classes_col3 = (
    set(feature_cat['Working Professional or Student'])
    & set(test_cat['Working Professional or Student'])
)
common_classes_col4 = set(feature_cat['Profession']) & set(test_cat['Profession'])
common_classes_col5 = set(feature_cat['Sleep Duration']) & set(test_cat['Sleep Duration'])
common_classes_col6 = set(feature_cat['Dietary Habits']) & set(test_cat['Dietary Habits'])
common_classes_col7 = set(feature_cat['Degree']) & set(test_cat['Degree'])
common_classes_col8 = (
    set(feature_cat['Have you ever had suicidal thoughts ?'])
    & set(test_cat['Have you ever had suicidal thoughts ?'])
)
common_classes_col9 = (
    set(feature_cat['Family History of Mental Illness'])
    & set(test_cat['Family History of Mental Illness'])
)

In [None]:
# Values of Gender that appear in both train and test.
common_classes_col1

In [None]:
def map_to_common_classes(df, column, common_classes):
    """Replace every value of ``column`` that is not a common class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``; it is not modified.
    column : str
        Name of the column to map.
    common_classes : container
        Values to keep unchanged (membership is tested with ``in``).

    Returns
    -------
    pandas.Series
        Same index and name as ``df[column]``; values found in
        ``common_classes`` are kept, all others become ``'unknown'``.
    """
    def keep_or_unknown(value):
        if value in common_classes:
            return value
        return 'unknown'

    return df[column].map(keep_or_unknown)

In [None]:
# new categorical_data with common classes & unknown
# One entry per categorical column, paired with the set of values that occur
# in both train and test for that column.
common_classes_by_column = {
    'Gender': common_classes_col1,
    'City': common_classes_col2,
    'Working Professional or Student': common_classes_col3,
    'Profession': common_classes_col4,
    'Sleep Duration': common_classes_col5,
    'Dietary Habits': common_classes_col6,
    'Degree': common_classes_col7,
    'Have you ever had suicidal thoughts ?': common_classes_col8,
    'Family History of Mental Illness': common_classes_col9,
}

# Apply the identical mapping to train and test: values outside the common set
# become 'unknown' in both frames.
for column, common_classes in common_classes_by_column.items():
    feature_cat[column] = map_to_common_classes(feature_cat, column, common_classes)
    test_cat[column] = map_to_common_classes(test_cat, column, common_classes)

In [None]:
# Shape of the categorical training frame after class mapping.
feature_cat.shape

In [None]:
# one_hot_encoding
# The training frame defines the dummy-column layout (first class of each
# variable dropped as the baseline).
feature_cat_ohe = pd.get_dummies(feature_cat, columns=feature_cat.columns,
                                 dtype=int, drop_first=True)

# Encode the test frame with every class kept, then align it to the training
# layout: columns the training set does not have are discarded, and training
# columns that never occur in the test set are added as all-zero columns.
# This guarantees that test_cat_ohe has exactly the same columns, in the same
# order, as feature_cat_ohe even when the two sets contain different classes.
test_cat_ohe = pd.get_dummies(test_cat, columns=test_cat.columns, dtype=int)
test_cat_ohe = test_cat_ohe.reindex(columns=feature_cat_ohe.columns, fill_value=0)

print(feature_cat_ohe.shape, test_cat_ohe.shape)
feature_cat_ohe.head(2)

In [None]:
# Total number of missing cells left in the mapped categorical frames (train, test).
feature_cat.isna().sum().sum(), test_cat.isna().sum().sum()

In [None]:
# Total number of missing cells in the one-hot encoded frames (train, test).
feature_cat_ohe.isna().sum().sum(), test_cat_ohe.isna().sum().sum()

In [None]:
# Final design matrices: standardized numeric columns followed by the one-hot
# encoded categorical columns, joined column-wise.
feature_tf = pd.concat(objs=[feature_num_st, feature_cat_ohe], axis='columns')
test_tf = pd.concat(objs=[test_num_st, test_cat_ohe], axis='columns')
feature_tf.shape, test_tf.shape, target.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    feature_tf,
    target,
    test_size=0.2,
    random_state=2411,
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

# 5. Modeling
## 1. Fully Connected Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Take the input width from the training matrix instead of hard-coding it, so
# the network keeps matching the data when the number of encoded columns
# changes (e.g. a different set of one-hot classes).
n_features = X_train.shape[1]

# Fully connected binary classifier: 64 -> 32 -> 16 ReLU units with dropout
# after the first two hidden layers, and a single sigmoid output unit.
model = Sequential([Dense(units=64, activation='relu', input_dim=n_features),
                    Dropout(0.4),
                    Dense(units=32, activation='relu'),
                    Dropout(0.2),
                    Dense(units=16, activation='relu'),
                    Dense(units=1, activation='sigmoid')])

model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs in mini-batches of 64, evaluating on the validation
# split after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=10,
    verbose=1,
)

In [None]:
# Report the metrics recorded for the last training epoch.
final_metric_labels = [
    ("Final training loss:", 'loss'),
    ("Final validation loss:", 'val_loss'),
    ("Final training accuracy:", 'accuracy'),
    ("Final validation accuracy:", 'val_accuracy'),
]
for label, metric_key in final_metric_labels:
    print(label, history.history[metric_key][-1])

# 6. Submission

In [None]:
# Predict probabilities for the test set and label a row 1 when its
# probability is greater than 0.4999, otherwise 0.
test_pred_proba = model.predict(test_tf)
above_cutoff = test_pred_proba > 0.4999
test_pred = pd.DataFrame(above_cutoff.astype(int), columns=['Depression'])
test_pred.tail()

In [None]:
# Pair every test id with its predicted label.
submission = pd.DataFrame(
    {
        'id': test_df['id'],
        'Depression': test_pred['Depression'],
    }
)
print(submission.shape)
submission.tail()

In [None]:
# submission, Depression class, count and ratio
# (share of predicted 0 / 1 labels, to compare with the training class balance)
class_count_ratio(submission, 'Depression')
# Author's recorded ratios from earlier runs:
# train_df 0.181713
#  test_df 0.174424 0.5 0.175586 0.49 0.175586

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)