In [11]:
import os
from collections import Counter
import numpy as np
import pandas as pd

In [3]:
INP_PATH = "./../inp/"
train_df = pd.read_csv(os.path.join(INP_PATH, "train.csv"))
test_df = pd.read_csv(os.path.join(INP_PATH, "test.csv"))
sub_df = pd.read_csv(os.path.join(INP_PATH, "sample_submission.csv"))

In [4]:
train_df.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [21]:
train_df.nunique()

id                                       140700
Name                                        422
Gender                                        2
Age                                          43
City                                         98
Working Professional or Student               2
Profession                                   64
Academic Pressure                             5
Work Pressure                                 5
CGPA                                        331
Study Satisfaction                            5
Job Satisfaction                              5
Sleep Duration                               36
Dietary Habits                               23
Degree                                      115
Have you ever had suicidal thoughts ?         2
Work/Study Hours                             13
Financial Stress                              5
Family History of Mental Illness              2
Depression                                    2
dtype: int64

In [58]:
def map_sleep_hours(response):
    label = -1
    if response == "Less Than 5 hours":
        label = 0
    elif response == "5-6 hours":
        label = 1
    elif response == "7-8 hours":
        label = 2
    elif response == "More than 8 hours":
        label = 3

    return label

def map_diet(response):
    label = -1
    if response == "Unhealthy":
        label = 0
    elif response == "Moderate":
        label = 1
    elif response == "Healthy":
        label = 2
    
    return label
    pass

def preprocess(df):
    df = df.copy()
    # drop the id col
    df.drop(columns='id', inplace=True)

    # convert name to counts
    df['Name'] = df["Name"].map(Counter(df["Name"]))
    
    # Female as 0, Male as 1
    df["Gender"] = df["Gender"].map({"Female": 0, "Male": 1})

    # convert City to counts
    df["City"] = df["City"].map(Counter(df["City"]))

    # working professional or student: binary col
    df["Working Professional or Student"] = df["Working Professional or Student"].map({
        "Working Professional": 1,
        'Student': 0
    })

    # profession to counts
    df['Profession'] = df['Profession'].map(Counter(df['Profession']))

    # # fill academic pressure as -1 for all working professionals
    df['Academic Pressure'] = df['Academic Pressure'].fillna(-1)

    # same for Work Pressue: Combine these columns in future
    df['Work Pressure'] = df['Work Pressure'].fillna(-1)

    # same for CGPA
    df['CGPA'] = df['CGPA'].fillna(-1)
    df['Study Satisfaction'] = df['Study Satisfaction'].fillna(-1)
    df['Job Satisfaction'] = df['Job Satisfaction'].fillna(-1)
    
    df['Sleep Duration'] = df["Sleep Duration"].apply(map_sleep_hours)
    df['Dietary Habits'] = df['Dietary Habits'].apply(map_diet)

    # ignore degree col for now, high cardinality
    df = df.drop(columns="Degree")

    df["Have you ever had suicidal thoughts ?"] = df["Have you ever had suicidal thoughts ?"].map({
        "Yes": 1, "No": 0
    })

    df["Family History of Mental Illness"] = df["Family History of Mental Illness"].map({
        "Yes": 1, "No": 0
    })

    # financial stress NaN?? Dear lord.
    df["Financial Stress"] = df["Financial Stress"].fillna(-1)

    assert df.isnull().sum().sum() == 0, "stil some NaN left"
    return df

In [60]:
processed_train_df = preprocess(train_df)
processed_test_df = preprocess(test_df)

In [67]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [80]:
skf = StratifiedKFold(5, shuffle=True, random_state=2024)

# params = {
#     'objective': 'binary:logistic',     # For binary classification
#     'eval_metric': 'error',             # "error" gives accuracy; can also use 'logloss' for log-loss
#     'scale_pos_weight': 4.0,            # Approximate ratio of 80:20 (negative:positive) to balance classes
#     'max_depth': 6,                     # Depth of the trees; higher for more complex relationships
#     'learning_rate': 0.1,               # Smaller step sizes; can be tuned for faster convergence
#     'n_estimators': 500,                # Number of trees; increase if the model underfits
#     'subsample': 0.8,                   # Fraction of samples used per tree
#     'colsample_bytree': 0.8,            # Fraction of features used per tree
#     'gamma': 1,                         # Minimum loss reduction to split a node; helps reduce overfitting
#     'min_child_weight': 5,              # Minimum sum of weights for child nodes; helps reduce overfitting
#     'reg_alpha': 0.5,                   # L1 regularization; adds sparsity to the model
#     'reg_lambda': 1.0                   # L2 regularization; stabilizes model by reducing weights
# }

for fold, (train_idx, val_idx) in enumerate(skf.split(processed_train_df, processed_train_df['Depression'])):
    x_train = processed_train_df.iloc[train_idx].drop(columns='Depression')
    y_train = processed_train_df.iloc[train_idx]['Depression']
    x_val = processed_train_df.iloc[val_idx].drop(columns='Depression')
    y_val = processed_train_df.iloc[val_idx]['Depression']

    model = XGBClassifier()
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print(f"{fold} : {accuracy_score(y_true=y_val, y_pred=y_pred)}")

0 : 0.9386638237384506
1 : 0.937953091684435
2 : 0.9374911158493248
3 : 0.9392679459843639
4 : 0.9367093105899076


In [81]:
y_pred_test = model.predict(processed_test_df)

In [82]:
sub_df['Depression'] = y_pred_test
sub_df

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0
...,...,...
93795,234495,0
93796,234496,1
93797,234497,0
93798,234498,1


In [83]:
sub_df.to_csv("./../op/baseline.csv", index=False)