# Enhanced Mental Health Prediction Notebook

### Key Highlights:
- Comprehensive Data Cleaning and Feature Engineering.
- Advanced Modeling using Stratified K-Fold.
- Model Stacking and Feature Importance.
- Submission Generation with Optimized Predictions.


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 42

## Load Data
Load the training and test datasets and preview the first few rows.

In [9]:
# train_df = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
# test_df = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')
# submission_df = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

train_df.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


### Initial Data Inspection

Check dataset structure, missing values, duplicates, and descriptive statistics.

In [10]:
print(train_df.info())
print('-' * 50)
print(train_df.describe())
print('-' * 50)
print(train_df.isnull().sum())
print('-' * 50)
print('Duplicated Rows:', train_df.duplicated().sum())
print('-' * 50)
print('Number of Rows:', train_df.shape[0])
print('Number of Columns:', train_df.shape[1])
print('-' * 50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

## Data Cleaning
Handle missing values, ambiguous categories, and inconsistent data.

In [11]:
# Replace ambiguous or irrelevant values
values_to_replace = [
    'Pratham', 'BSc', 'Gender', '3', 'Mihir', '1.0',
    'Electrician', 'M.Tech', 'Vegas', 'Male',
    'Indoor', 'Class 12', '2'
]
train_df.replace(values_to_replace, 'Unknown', inplace=True)

# Fill missing values
categorical_cols = ['Profession', 'Dietary Habits', 'Degree']
numerical_cols = ['Age', 'Work Pressure', 'Job Satisfaction', 'Financial Stress']

for col in categorical_cols:
    train_df[col].fillna('Unknown', inplace=True)
    test_df[col].fillna('Unknown', inplace=True)

for col in numerical_cols:
    median_value = train_df[col].median()
    train_df[col].fillna(median_value, inplace=True)
    test_df[col].fillna(median_value, inplace=True)

### Standardize `Dietary Habits` Column

In [12]:
train_df['Dietary Habits'] = train_df['Dietary Habits'].replace({
    'More Healthy': 'Healthy',
    'Yes': 'Healthy',
    'No': 'Unhealthy',
    'No Healthy': 'Unhealthy',
    'Less Healthy': 'Moderate',
    'Less than Healthy': 'Unhealthy',
    'Hormonal': 'Moderate'
})

## Feature Engineering
Transform features such as `Sleep Duration` and encode categorical variables.

In [13]:
# Simplify Sleep Duration
def group_sleep_duration(duration):
    if pd.isnull(duration):
        return 'Unknown'
    elif 'hours' not in duration:
        return 'Less than 3 hours'
    elif '3-5' in duration or '5-7' in duration:
        return '3-7 hours'
    elif '7-9' in duration:
        return '7-9 hours'
    else:
        return 'More than 9 hours'

# Apply transformation and handle missing values
train_df['duration'] = train_df['Sleep Duration'].apply(group_sleep_duration)
test_df['duration'] = test_df['Sleep Duration'].apply(group_sleep_duration)

# Drop the original column
train_df.drop(columns=['Sleep Duration'], inplace=True, errors='ignore')
test_df.drop(columns=['Sleep Duration'], inplace=True, errors='ignore')

# Encode Categorical Variables
def encode_categorical_features(train, test, categorical_features):
    """Encodes categorical features using OrdinalEncoder."""
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    train[categorical_features] = encoder.fit_transform(train[categorical_features])
    test[categorical_features] = encoder.transform(test[categorical_features])
    return train, test

# Identify categorical features
categorical_features = train_df.select_dtypes(include='object').columns

# Handle missing values for categorical features
train_df[categorical_features] = train_df[categorical_features].fillna('Unknown')
test_df[categorical_features] = test_df[categorical_features].fillna('Unknown')

# Encode features
train_df, test_df = encode_categorical_features(train_df, test_df, categorical_features)


## Modeling and Evaluation
Evaluate multiple models using Stratified K-Fold and select the best.

In [14]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Prepare Data
X = train_df.drop(columns=['id', 'Depression'])
y = train_df['Depression']
X_test = test_df.drop(columns=['id'])

# Identify categorical columns
categorical_columns = X.select_dtypes(include='object').columns.tolist()

# Convert categorical columns to strings
for col in categorical_columns:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# CatBoost Hyperparameters
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'depth': 6,
    'loss_function': 'Logloss',
    'random_state': RANDOM_STATE,
    'eval_metric': 'AUC',
}

# Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scores = []
test_preds = []

# Define CatBoost Pool with cat_features
X_test_pool = Pool(X_test, cat_features=[X.columns.get_loc(col) for col in categorical_columns])


In [15]:
print("Performing Stratified K-Fold Cross-Validation with CatBoost...")
for i, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # Create training and validation splits
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Create CatBoost Pools
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=[X.columns.get_loc(col) for col in categorical_columns])
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=[X.columns.get_loc(col) for col in categorical_columns])

    # Train CatBoost
    model = CatBoostClassifier(**catboost_params)
    model.fit(X_train_pool, eval_set=X_valid_pool, verbose=False, early_stopping_rounds=200)

    # Validation Metrics
    val_pred = model.predict(X_valid_pool)
    val_proba = model.predict_proba(X_valid_pool)[:, 1]
    accuracy = accuracy_score(y_val_fold, val_pred)
    roc_auc = roc_auc_score(y_val_fold, val_proba)

    # Collect Results
    scores.append(accuracy)
    print(f"Fold {i+1} - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")

    # Predict on Test Set
    test_preds.append(model.predict_proba(X_test_pool)[:, 1])

# Ensemble Test Predictions
final_test_preds = np.mean(test_preds, axis=0)


Performing Stratified K-Fold Cross-Validation with CatBoost...
Fold 1 - Accuracy: 0.9388, ROC AUC: 0.9750
Fold 2 - Accuracy: 0.9372, ROC AUC: 0.9729
Fold 3 - Accuracy: 0.9400, ROC AUC: 0.9752
Fold 4 - Accuracy: 0.9412, ROC AUC: 0.9766
Fold 5 - Accuracy: 0.9390, ROC AUC: 0.9748


In [16]:
print(f"\nCross-validated Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")


Cross-validated Accuracy: 0.9392 ± 0.0013


## Submission
Use the best model for test predictions and generate the submission file.

In [17]:
submission_df['Depression'] = (final_test_preds > 0.5).astype(int)
submission_df.to_csv('submission.csv', index=False)
print("Submission file created!")

Submission file created!
