# Imports and Installs

In [None]:
!pip3 install -U xgboost catboost optuna hillclimbers



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from hillclimbers import climb_hill, partial
import optuna
import logging

# Data Loading

In [None]:
# Load the competition splits. 'Name' is dropped from both frames; 'id' is
# dropped from train only, so it remains available on the test frame.
data = pd.read_csv('train.csv').drop(columns=['Name', 'id'])
test_data = pd.read_csv('test.csv').drop(columns=['Name'])

# Load the extra dataset and recode its Yes/No target as 1/0.
original = pd.read_csv('final_depression_dataset_1.csv').drop(columns=['Name'])
original = original.assign(Depression=original['Depression'].map({'No': 0, 'Yes': 1}))

# Stack the extra rows under the competition rows with a fresh 0..n-1 index.
data_comb = pd.concat([data, original], ignore_index=True)
data = data_comb.copy()

In [None]:
# Integer codes for the binary / ordinal survey answers.
yes_no_codes = {"No": 0, "Yes": 1}

# Degrees grouped by level: 0 = Class 12, 1 = bachelor-level, 2 = master-level, 3 = PhD.
degree_levels = {
    0: ["Class 12"],
    1: ["B.Sc", "BHM", "B.Ed", "B.Arch", "B.Com", "BBA", "B.Pharm", "BE", "LLB"],
    2: ["ME", "M.Tech", "MA", "MBA", "MCA", "MHM"],
    3: ["PhD"],
}

encoding_dict = {
    "Gender": {"Female": 0, "Male": 1},
    "Sleep Duration": {
        "Less than 5 hours": 0,
        "5-6 hours": 1,
        "7-8 hours": 2,
        "More than 8 hours": 3,
    },
    "Dietary Habits": {"Unhealthy": 0, "Moderate": 1, "Healthy": 2},
    "Degree": {
        degree: level
        for level, degrees in degree_levels.items()
        for degree in degrees
    },
    "Have you ever had suicidal thoughts ?": dict(yes_no_codes),
    "Family History of Mental Illness": dict(yes_no_codes),
}

# Apply the codes to a copy of the training frame; `data` itself is untouched.
# Series.map turns any value that is absent from the mapping into NaN.
data_mapped = data.copy()
for column in encoding_dict:
    if column not in data.columns:
        continue
    data_mapped[column] = data_mapped[column].map(encoding_dict[column])

# Data Preprocessing

## Data Cleaning

In [None]:
# Impute missing values in both frames in place: the column mean for numeric
# columns, the most frequent value for everything else.
#
# NOTE(review): if 'Academic Pressure' / 'Work Pressure' contain NaN they are
# mean-filled here too, which turns the later `pd.isna(row['Work Pressure'])`
# fallback in the Encoding cell into a no-op -- confirm that this is intended.
for d in [data, test_data]:
  # Build the NaN mask once per frame instead of once per column.
  columns_with_nan = d.columns[d.isna().any()]
  for column in columns_with_nan:
    if pd.api.types.is_numeric_dtype(d[column]):
      fill_value = d[column].mean()
    else:
      modes = d[column].mode()  # NaN is excluded by default
      if modes.empty:
        # Column holds nothing but NaN: there is no value to impute with.
        continue
      fill_value = modes.iloc[0]
    # Assign the result back instead of calling fillna(inplace=True) on
    # d[column]: the chained in-place form acts on a temporary object and is
    # the pattern pandas warns will stop working in 3.0.
    d[column] = d[column].fillna(fill_value)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  d[column].fillna(d[column].mean(), inplace=True)


## Encoding

In [None]:
def _as_category(column):
  """Cast a column to 'category'; float64 columns are truncated to int first."""
  if column.dtypes == 'float':
    return column.astype(int).astype('category')
  return column.astype('category')


def _add_pressure_features(frame):
  """Return a copy of `frame` with the pressure columns merged and 'Life Difficulty' added.

  'Pressure' takes 'Work Pressure' and falls back to 'Academic Pressure' where
  'Work Pressure' is NaN. The two source columns are then dropped and
  'Life Difficulty' is set to 'Financial Stress' * 'Pressure'.
  """
  pressure = frame['Work Pressure'].fillna(frame['Academic Pressure'])
  engineered = frame.drop(columns=['Academic Pressure', 'Work Pressure'])
  engineered['Pressure'] = pressure
  engineered['Life Difficulty'] = frame['Financial Stress'] * pressure
  return engineered


# All-categorical views of the cleaned frames (used later as CatBoost cat_features).
data_cat = data.apply(_as_category)
test_cat = test_data.apply(_as_category)

# One-hot encode the non-numeric columns. The encoder is fitted on train only;
# with handle_unknown='ignore', categories seen only in test encode as all zeros.
nominal_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
data_ohe_values = ohe.fit_transform(data[nominal_cols])
ohe_feature_names = ohe.get_feature_names_out(nominal_cols)
data_ohe = pd.DataFrame(data_ohe_values, columns=ohe_feature_names, index=data.index)
test_data_ohe = pd.DataFrame(ohe.transform(test_data[nominal_cols]), columns=ohe_feature_names, index=test_data.index)
data = data.drop(columns=nominal_cols).join(data_ohe)
test_data = test_data.drop(columns=nominal_cols).join(test_data_ohe)

# Rebind both names to the engineered frames. The previous
# `for d in [...]: d = d.drop(...)` loop only rebound the loop variable, so the
# column drop and 'Life Difficulty' never reached `data` / `test_data`.
data = _add_pressure_features(data)
test_data = _add_pressure_features(test_data)

## Train-Test Split

In [None]:
# Kaggle already ships separate train and test files, so nothing is split here;
# the target is only separated from the training features.
y_train = data['Depression']
X_train = data.drop(columns=[y_train.name])
X_test = test_data  # same object as test_data, not a copy

## Normalisation

In [None]:
# Z-score every non-nominal training column using the mean / standard deviation
# of train and test stacked together.
nominal_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=np.number).columns.tolist()
comb = pd.concat([X_train, X_test])

# Coerce the numeric columns of all three frames; unparsable values become NaN.
for frame in (X_train, X_test, comb):
  frame[numerical_cols] = frame[numerical_cols].apply(pd.to_numeric, errors='coerce')

columns_to_scale = [name for name in X_train.columns if name not in nominal_cols]
for name in columns_to_scale:
  center = comb[name].mean()
  spread = comb[name].std()
  X_train[name] = (X_train[name] - center) / spread
  X_test[name] = (X_test[name] - center) / spread

# Model Handling

## Model Selection

In [None]:
# CatBoost settings. Apart from `cat_features` and `verbose`, the values match
# the tuning result recorded under "Hyperparameter Tuning" below.
CB_params = dict(
    # Every column of the all-categorical frame except the target.
    cat_features=data_cat.columns.drop("Depression").tolist(),
    iterations=1442,
    depth=5,
    learning_rate=0.06905364829124568,
    l2_leaf_reg=0.0066929612736035355,
    border_count=162,
    random_strength=0.7450573059421522,
    bootstrap_type="Bernoulli",
    od_type="Iter",
    od_wait=45,
    min_data_in_leaf=4,
    scale_pos_weight=0.6987162615326387,
    verbose=100,
)

# XGBoost settings.
XGB_params = {
    # Task definition
    "booster": "gbtree", "objective": "binary:logistic", "eval_metric": "error",
    # Tree shape and learning rate
    "max_depth": 6, "eta": 0.1, "gamma": 0.5, "min_child_weight": 5,
    # Row / column subsampling
    "subsample": 0.8, "colsample_bytree": 0.8,
    # Regularisation and class weighting
    "lambda": 1, "alpha": 0.5, "scale_pos_weight": 1,
    # Early stopping on the eval_set passed to fit()
    "early_stopping_rounds": 50,
}

# Base learners for the ensemble, keyed by the short names used downstream.
models = dict(
    XGB=XGBClassifier(random_state=42, **XGB_params),
    LR=LogisticRegression(random_state=42),
    CB=CatBoostClassifier(task_type="GPU", devices="0", random_state=42, **CB_params),
)

## Model Evaluation

In [None]:
def cross_validate(model, X_train, X_test, y_train, cv=2):
  """Stratified k-fold cross-validation for a single classifier.

  The model is refitted on every fold. CatBoost and XGBoost models receive the
  held-out fold as `eval_set` (CatBoost additionally with `use_best_model=True`
  and `early_stopping_rounds=50`); any other model is fitted with a plain
  `fit(X, y)`.

  NOTE: this function is defined again in the next cell; that later definition
  shadows this one.
  NOTE: because the held-out fold is also the evaluation set, the fold accuracy
  is slightly optimistic whenever early stopping is active.

  Args:
    model: Classifier exposing `fit` and `predict_proba`.
    X_train: Training features (DataFrame; rows are selected with `.iloc`).
    X_test: Test features with the same columns as `X_train`.
    y_train: Target aligned row-for-row with `X_train`; assumed to be 0/1.
    cv: Number of stratified folds.

  Returns:
    Tuple `(mean_accuracy, oof_pred, test_pred)`:
      mean_accuracy: Mean fold accuracy at a 0.5 probability cut-off.
      oof_pred: Out-of-fold positive-class probability for every training row.
      test_pred: Positive-class probability for every test row, averaged over folds.
  """
  skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
  accuracy_scores = []
  oof_pred = np.zeros(len(y_train))
  test_pred = np.zeros(len(X_test))
  model_kind = model.__class__.__name__

  for train_index, valid_index in skf.split(X_train, y_train):
    # Model training
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    if model_kind == 'CatBoostClassifier':
      model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)],
                use_best_model=True,
                early_stopping_rounds=50)
    elif model_kind == 'XGBClassifier':
      model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)])
    else:
      # LogisticRegression and any other estimator: plain fit, so no model
      # type is ever left unfitted before the prediction step.
      model.fit(X_train_fold, y_train_fold)

    # Model predictions: keep probabilities rather than hard labels, because
    # the downstream blending and threshold search operate on scores in [0, 1].
    valid_proba = model.predict_proba(X_valid_fold)[:, 1]
    oof_pred[valid_index] = valid_proba
    test_pred += model.predict_proba(X_test)[:, 1] / cv
    accuracy_scores.append(
        accuracy_score(y_valid_fold, (valid_proba >= 0.5).astype(int)))

  # Performance summary
  return np.mean(accuracy_scores), oof_pred, test_pred

In [None]:
def cross_validate(model, X_train, X_test, y_train, cv=2):
  """Stratified k-fold cross-validation for a single classifier.

  The model is refitted on every fold. CatBoost and XGBoost models receive the
  held-out fold as `eval_set` (CatBoost additionally with `use_best_model=True`
  and `early_stopping_rounds=50`); any other model is fitted with a plain
  `fit(X, y)`.

  NOTE: a function with this name is also defined in the previous cell; this
  later definition is the one in effect.
  NOTE: because the held-out fold is also the evaluation set, the fold accuracy
  is slightly optimistic whenever early stopping is active.

  Args:
    model: Classifier exposing `fit` and `predict_proba`.
    X_train: Training features (DataFrame; rows are selected with `.iloc`).
    X_test: Test features with the same columns as `X_train`.
    y_train: Target aligned row-for-row with `X_train`; assumed to be 0/1.
    cv: Number of stratified folds.

  Returns:
    Tuple `(mean_accuracy, oof_pred, test_pred)`:
      mean_accuracy: Mean fold accuracy at a 0.5 probability cut-off.
      oof_pred: Out-of-fold positive-class probability for every training row.
      test_pred: Positive-class probability for every test row, averaged over folds.
  """
  skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
  accuracy_scores = []
  oof_pred = np.zeros(len(y_train))
  test_pred = np.zeros(len(X_test))
  model_kind = model.__class__.__name__

  for train_index, valid_index in skf.split(X_train, y_train):
    # Model training
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    if model_kind == 'CatBoostClassifier':
      model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)],
                use_best_model=True,
                early_stopping_rounds=50)
    elif model_kind == 'XGBClassifier':
      model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)])
    else:
      # LogisticRegression and any other estimator: plain fit, so no model
      # type is ever left unfitted before the prediction step.
      model.fit(X_train_fold, y_train_fold)

    # Model predictions: keep probabilities rather than hard labels, because
    # the downstream blending and threshold search operate on scores in [0, 1].
    valid_proba = model.predict_proba(X_valid_fold)[:, 1]
    oof_pred[valid_index] = valid_proba
    test_pred += model.predict_proba(X_test)[:, 1] / cv
    accuracy_scores.append(
        accuracy_score(y_valid_fold, (valid_proba >= 0.5).astype(int)))

  # Performance summary
  return np.mean(accuracy_scores), oof_pred, test_pred

# Cross-validate every base learner and keep its out-of-fold and test
# predictions, keyed by model name, for the ensemble step.
oof_preds, test_preds = {}, {}
for model_name, model in models.items():
  # 'id' is an identifier, not a feature, so it is removed for every model.
  X_tr = X_train
  X_te = X_test.drop(columns=['id'])

  if model_name == 'CB':
    # CatBoost sees each feature twice: the processed numeric version
    # (suffixed "_copy") alongside the all-categorical version.
    X_train_cb = pd.concat(
        [X_train.add_suffix('_copy'), data_cat.drop(columns=["Depression"])],
        axis=1)
    X_test_cb = pd.concat(
        [X_te.add_suffix('_copy'), test_cat.drop(columns=["id"])],
        axis=1)
    X_tr, X_te = X_train_cb, X_test_cb

  mean_accuracy, oof_pred, test_pred = cross_validate(model, X_tr, X_te, y_train, cv=5)
  oof_preds[model_name] = oof_pred
  test_preds[model_name] = test_pred

[0]	validation_0-error:0.18166
[1]	validation_0-error:0.18166
[2]	validation_0-error:0.18166
[3]	validation_0-error:0.18166
[4]	validation_0-error:0.12750
[5]	validation_0-error:0.09902
[6]	validation_0-error:0.08540
[7]	validation_0-error:0.08129
[8]	validation_0-error:0.07636
[9]	validation_0-error:0.07364
[10]	validation_0-error:0.07165
[11]	validation_0-error:0.06987
[12]	validation_0-error:0.06911
[13]	validation_0-error:0.06774
[14]	validation_0-error:0.06771
[15]	validation_0-error:0.06708
[16]	validation_0-error:0.06642
[17]	validation_0-error:0.06614
[18]	validation_0-error:0.06558
[19]	validation_0-error:0.06575
[20]	validation_0-error:0.06495
[21]	validation_0-error:0.06478
[22]	validation_0-error:0.06492
[23]	validation_0-error:0.06436
[24]	validation_0-error:0.06415
[25]	validation_0-error:0.06383
[26]	validation_0-error:0.06331
[27]	validation_0-error:0.06324
[28]	validation_0-error:0.06293
[29]	validation_0-error:0.06310
[30]	validation_0-error:0.06289
[31]	validation_0-

CatBoostError: catboost/cuda/cuda_lib/cuda_manager.cpp:201: Condition violated: `State == nullptr'

## Model Tuning

### Hyperparameter Tuning

In [None]:
# Done in separate notebook already

{'iterations': 1442, 'depth': 5, 'learning_rate': 0.06905364829124568, 'l2_leaf_reg': 0.0066929612736035355, 'border_count': 162, 'random_strength': 0.7450573059421522, 'bootstrap_type': 'Bernoulli', 'od_type': 'Iter', 'od_wait': 45, 'min_data_in_leaf': 4, 'scale_pos_weight': 0.6987162615326387}

### Re-evaluation

# Ensemble Learning

In [None]:
# One column per base learner, in a fixed order shared by both frames.
blend_members = ['LR', 'CB', 'XGB']
oof_pred_df = pd.DataFrame({member: oof_preds[member] for member in blend_members})
test_pred_df = pd.DataFrame({member: test_preds[member] for member in blend_members})

def thresholded_accuracy_score(y_true, y_pred):
    """Accuracy of `y_pred` against `y_true` after cutting the scores at 0.5.

    Scores greater than or equal to 0.5 count as class 1, everything else as class 0.
    """
    return accuracy_score(y_true, (y_pred >= 0.5).astype(int))

# Blend the base learners by hill climbing on the out-of-fold predictions,
# maximising accuracy at a 0.5 cut-off.
blended_test_preds, oof_blend = climb_hill(
    train=data[['Depression']],
    oof_pred_df=oof_pred_df,
    test_pred_df=test_pred_df,
    target='Depression',
    objective='maximize',
    eval_metric=partial(thresholded_accuracy_score),
    negative_weights=False,
    precision=0.01,
    plot_hill=False,
    return_oof_preds=True
)

# Candidate cut-offs 0.00, 0.01, ..., 1.00. The previous np.arange call was
# given seven positional arguments, which np.arange does not accept.
thresholds = np.linspace(0, 1, 101)
# Pick the cut-off with the highest out-of-fold accuracy (first one wins on ties).
best_threshold = max(thresholds, key=lambda t: accuracy_score(y_train, (oof_blend >= t).astype(int)))
print(f"Optimal Threshold: {best_threshold:.2f}")
final_test_preds = (blended_test_preds >= best_threshold).astype(int)

final_blended_accuracy = accuracy_score(
    data['Depression'],  # True labels
    (oof_blend >= best_threshold).astype(int)  # Blended scores cut at the chosen threshold
)

print(f"Final blended accuracy: {final_blended_accuracy:.4f}")

# Submission

In [None]:
# Write the thresholded 0/1 labels. Casting the raw blended scores with
# astype(int) would truncate every value below 1 to 0.
submission = pd.DataFrame({'id': X_test["id"], 'Depression': final_test_preds})
submission['Depression'] = submission['Depression'].astype(int)
submission.to_csv('blended_submission.csv', index=False)

0.9382871357498223 (base)

0.9383013503909027 (normalise on train+test)

0.938711106850682 (append original dataset)

0.9387320477972129 (Stas feature engineering)

0.9412940583293394 (final)

0.9323346 (cat features)

0.9335096 (regular features)

0.9336834 (combined features)