# Depression Risk Prediction

**Objective**: Predict depression risk based on various demographic, academic, and lifestyle factors

**Algorithm**: XGBoost with hyperparameter optimization

**Validation**: Stratified K-Fold Cross Validation

**Metrics**: AUC-ROC, Accuracy, F1-Score

**Results**: 0.9745, 0.9381, 0.8252

In [1]:
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None

## Data Preprocessing and Feature Engineering

In [50]:
def preprocess(path):
    df = pd.read_csv(path, index_col=0)

    column_types = {
        "numerical": [
            "Age",
            "Work/Study Hours"
            "CGPA",              # Has NA
        ],
        "ordinal": [
            "Academic Pressure", # Has NA
            "Study Satisfaction",# Has NA
            "Work Pressure",     # Has NA
            "Job Satisfaction",  # Has NA
            "Financial Stress",
            "Sleep Quality",     # Engineered
        ],
        "categorical": [
            "Gender",
            "Working Professional or Student",
            "Dietary Habits",
            "Have you ever had suicidal thoughts ?", 
            "Family History of Mental Illness",
            "Sleep Duration",                  # Dropped
        ],
        "categorical_high": [
            "City", 
            "Profession", # Has NA
            "Degree", 
        ],
        "target": "Depression"
    }

    sleep_time_map = {
        "Less than 5 hours": 1.0,
        "5-6 hours": 2.0,
        "6-7 hours": 3.0,
        "7-8 hours": 4.0,
        "More than 8 hours": 5.0,
        "3-4 hours": 1.0,
        "4-5 hours": 1.0,
        "2-3 hours": 1.0,
        "4-6 hours": 2.0,
        "6-8 hours": 4.0,
    }
    
    df["Sleep Quality"] = df["Sleep Duration"].map(sleep_time_map)

    df.loc[
        (df['Profession'].isna()) & (df['Working Professional or Student'] == 'Student'), 'Profession'
    ] = 'Student'
    
    for col in column_types['ordinal']:
        df[col].fillna(-1, inplace=True)
    
    df["CGPA"].fillna(-1, inplace=True)

    
    cols = ["Profession", "Degree", "Dietary Habits", "City"]
    
    for col in cols:
        
        df[col].fillna("NA", inplace=True)
        
        valid = df[col].value_counts()
        valid = valid[valid > 50].index.tolist()
        
        df[col] = df[col].apply(
            lambda x: x if x in valid else "Other"
        )
        
    df = df.drop(columns=["Name", "Sleep Duration"], errors="ignore")

    return df

In [51]:
train_path = '/kaggle/input/predicting-depression-machine-learning-challenge/train.csv'
test_path = '/kaggle/input/predicting-depression-machine-learning-challenge/test.csv'

In [None]:
df = preprocess(train_path)
df_test = preprocess(test_path)

In [32]:
df

Unnamed: 0_level_0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Sleep Quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,Female,49.0,Ludhiana,Working Professional,Chef,-1.0,5.0,-1.00,-1.0,2.0,Healthy,BHM,No,1.0,2.0,No,0,5.0
1,Male,26.0,Varanasi,Working Professional,Teacher,-1.0,4.0,-1.00,-1.0,3.0,Unhealthy,LLB,Yes,7.0,3.0,No,1,1.0
2,Male,33.0,Visakhapatnam,Student,,5.0,-1.0,8.97,2.0,-1.0,Healthy,B.Pharm,Yes,3.0,1.0,No,1,2.0
3,Male,22.0,Mumbai,Working Professional,Teacher,-1.0,5.0,-1.00,-1.0,1.0,Moderate,BBA,Yes,10.0,1.0,Yes,1,1.0
4,Female,30.0,Kanpur,Working Professional,Business Analyst,-1.0,1.0,-1.00,-1.0,1.0,Unhealthy,BBA,Yes,9.0,4.0,Yes,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,Female,18.0,Ahmedabad,Working Professional,,-1.0,5.0,-1.00,-1.0,4.0,Unhealthy,Class 12,No,2.0,4.0,Yes,1,2.0
140696,Female,41.0,Hyderabad,Working Professional,Content Writer,-1.0,5.0,-1.00,-1.0,4.0,Moderate,B.Tech,Yes,6.0,5.0,Yes,0,4.0
140697,Female,24.0,Kolkata,Working Professional,Marketing Manager,-1.0,3.0,-1.00,-1.0,1.0,Moderate,B.Com,No,4.0,4.0,No,0,5.0
140698,Female,49.0,Srinagar,Working Professional,Plumber,-1.0,5.0,-1.00,-1.0,2.0,Moderate,ME,Yes,10.0,1.0,No,0,2.0


## Feature Encoding and Model Selection

In [41]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import xgboost as xgb

def encode(X_train, X_test, y_train, column_types):

    # Scale Numerical Features
    scaler = StandardScaler()
    X_train[column_types["numerical"]] = scaler.fit_transform(X_train[column_types["numerical"]])
    X_test[column_types["numerical"]] = scaler.transform(X_test[column_types["numerical"]])
    
    # Encode Ordinal Features
    ordinal_encoder = OrdinalEncoder()
    X_train[column_types["ordinal"]] = ordinal_encoder.fit_transform(X_train[column_types["ordinal"]])
    X_test[column_types["ordinal"]] = ordinal_encoder.transform(X_test[column_types["ordinal"]])
    
    # Encode Categorical Features (One-Hot Encoding for Low Cardinality)
    X_train = pd.get_dummies(X_train, columns=column_types["categorical"], drop_first=True)
    X_test = pd.get_dummies(X_test, columns=column_types["categorical"], drop_first=True)
    
    # Encode High-Cardinality Categorical Features (Target Encoding)
    target_encoder = TargetEncoder()
    X_train[column_types["categorical_high"]] = target_encoder.fit_transform(X_train[column_types["categorical_high"]], y_train)
    X_test[column_types["categorical_high"]] = target_encoder.transform(X_test[column_types["categorical_high"]])

    # Ensure same feature columns in train and test
    missing_cols = set(X_train.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0
    X_test = X_test[X_train.columns]

    return (X_train, X_test, y_train)


def xgbmodel(X_train, X_test, y_train, column_types):

    (X_train, X_test, y_train) = encode(X_train, X_test, y_train, column_types)

    # Define Stratified K-Fold CV
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # XGBoost Model
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=5,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        use_label_encoder=False,
        random_state=1
    )
    
    # Train with Cross Validation
    auc_scores = []
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
        xgb_model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            early_stopping_rounds=50,
            verbose=100
        )
        y_val_pred = xgb_model.predict_proba(X_val_fold)[:, 1]
        auc = roc_auc_score(y_val_fold, y_val_pred)
        auc_scores.append(auc)
    
    print(f"Mean AUC-ROC on CV: {np.mean(auc_scores):.4f}")
    
    return (X_test, xgb_model) 

### Preparing Data for Training

In [34]:
column_types_final = {
    "numerical": [
        "Age", 
        "Work/Study Hours", 
        "CGPA",
    ],
    "ordinal": [
        "Academic Pressure",
        "Study Satisfaction",
        "Work Pressure",
        "Job Satisfaction",
        "Financial Stress",
        "Sleep Quality",
    ],
    "categorical": [
        "Gender",
        "Working Professional or Student",
        "Dietary Habits",
        "Have you ever had suicidal thoughts ?", 
        "Family History of Mental Illness",
    ],
    "categorical_high": [
        "City", 
        "Profession",
        "Degree", 
    ],
    "target": "Depression"
}

In [42]:
X = df.drop(columns=["Depression"])
y = df["Depression"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

## Model Training

In [43]:
(X_test, xgb_model) = xgbmodel(X_train, X_test, y_train, column_types_final)



[0]	validation_0-auc:0.95411
[100]	validation_0-auc:0.96687
[200]	validation_0-auc:0.96907
[300]	validation_0-auc:0.97077
[400]	validation_0-auc:0.97208
[500]	validation_0-auc:0.97295
[600]	validation_0-auc:0.97345
[700]	validation_0-auc:0.97372
[800]	validation_0-auc:0.97387
[900]	validation_0-auc:0.97399
[999]	validation_0-auc:0.97409




[0]	validation_0-auc:0.95844
[100]	validation_0-auc:0.97024
[200]	validation_0-auc:0.97203
[300]	validation_0-auc:0.97347
[400]	validation_0-auc:0.97466
[500]	validation_0-auc:0.97546
[600]	validation_0-auc:0.97589
[700]	validation_0-auc:0.97614
[800]	validation_0-auc:0.97628
[900]	validation_0-auc:0.97637
[999]	validation_0-auc:0.97643




[0]	validation_0-auc:0.95605
[100]	validation_0-auc:0.96861
[200]	validation_0-auc:0.97058
[300]	validation_0-auc:0.97209
[400]	validation_0-auc:0.97311
[500]	validation_0-auc:0.97389
[600]	validation_0-auc:0.97427
[700]	validation_0-auc:0.97449
[800]	validation_0-auc:0.97465
[900]	validation_0-auc:0.97471
[999]	validation_0-auc:0.97475




[0]	validation_0-auc:0.95572
[100]	validation_0-auc:0.96829
[200]	validation_0-auc:0.97049
[300]	validation_0-auc:0.97216
[400]	validation_0-auc:0.97353
[500]	validation_0-auc:0.97436
[600]	validation_0-auc:0.97486
[700]	validation_0-auc:0.97514
[800]	validation_0-auc:0.97531
[900]	validation_0-auc:0.97541
[999]	validation_0-auc:0.97549




[0]	validation_0-auc:0.95789
[100]	validation_0-auc:0.96976
[200]	validation_0-auc:0.97194
[300]	validation_0-auc:0.97345
[400]	validation_0-auc:0.97461
[500]	validation_0-auc:0.97534
[600]	validation_0-auc:0.97579
[700]	validation_0-auc:0.97601
[800]	validation_0-auc:0.97615
[900]	validation_0-auc:0.97621
[999]	validation_0-auc:0.97625
Mean AUC-ROC on CV: 0.9754


## Model Evaluation

In [44]:
y_test_pred = xgb_model.predict(X_test)
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_test_proba)
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Test AUC-ROC: {auc_score:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-Score: {f1:.4f}")

Test AUC-ROC: 0.9745
Test Accuracy: 0.9381
Test F1-Score: 0.8252


### For better accuracy, re-train the model on the entire training set

In [46]:
X_train_final = df.drop(columns=["Depression"])
y_train_final = df["Depression"]
X_test_final = df_test

(X_test_final, xgb_model_final) = xgbmodel(X_train_final, X_test_final, y_train_final, column_types_final)



[0]	validation_0-auc:0.95655
[100]	validation_0-auc:0.96814
[200]	validation_0-auc:0.97038
[300]	validation_0-auc:0.97201
[400]	validation_0-auc:0.97318
[500]	validation_0-auc:0.97408
[600]	validation_0-auc:0.97462
[700]	validation_0-auc:0.97489
[800]	validation_0-auc:0.97508
[900]	validation_0-auc:0.97520
[999]	validation_0-auc:0.97527




[0]	validation_0-auc:0.95356
[100]	validation_0-auc:0.96631
[200]	validation_0-auc:0.96869
[300]	validation_0-auc:0.97041
[400]	validation_0-auc:0.97180
[500]	validation_0-auc:0.97268
[600]	validation_0-auc:0.97319
[700]	validation_0-auc:0.97351
[800]	validation_0-auc:0.97368
[900]	validation_0-auc:0.97379
[999]	validation_0-auc:0.97386




[0]	validation_0-auc:0.95753
[100]	validation_0-auc:0.96882
[200]	validation_0-auc:0.97073
[300]	validation_0-auc:0.97228
[400]	validation_0-auc:0.97340
[500]	validation_0-auc:0.97414
[600]	validation_0-auc:0.97458
[700]	validation_0-auc:0.97485
[800]	validation_0-auc:0.97503
[900]	validation_0-auc:0.97514
[999]	validation_0-auc:0.97523




[0]	validation_0-auc:0.95852
[100]	validation_0-auc:0.97061
[200]	validation_0-auc:0.97249
[300]	validation_0-auc:0.97380
[400]	validation_0-auc:0.97493
[500]	validation_0-auc:0.97575
[600]	validation_0-auc:0.97615
[700]	validation_0-auc:0.97638
[800]	validation_0-auc:0.97651
[900]	validation_0-auc:0.97657
[999]	validation_0-auc:0.97663




[0]	validation_0-auc:0.95678
[100]	validation_0-auc:0.96826
[200]	validation_0-auc:0.97044
[300]	validation_0-auc:0.97211
[400]	validation_0-auc:0.97327
[500]	validation_0-auc:0.97414
[600]	validation_0-auc:0.97464
[700]	validation_0-auc:0.97492
[800]	validation_0-auc:0.97510
[900]	validation_0-auc:0.97521
[999]	validation_0-auc:0.97527
Mean AUC-ROC on CV: 0.9753


## Final Predictions

In [47]:
y_test_pred_final = xgb_model_final.predict(X_test_final)

In [49]:
predictions_df = pd.DataFrame({
    "id": X_test_final.index,
    "Depression": y_test_pred_final
})

predictions_df.to_csv("predictions.csv", index=False)