In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import optuna
train = pd.read_csv('train.csv')
train.drop(['id', 'Name'], axis=1, inplace=True)
train.dtypes


Gender                                    object
Age                                      float64
City                                      object
Working Professional or Student           object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness          object
Depression                                 int64
dtype: object

In [7]:
num_attribs   = ['Age', 'Academic Pressure', 'Work Pressure',
                 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']
cat_attribs   = ['Gender', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration',
                 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
preproc = ColumnTransformer(
    transformers=[
        ("num",  Pipeline([
            ("impute", SimpleImputer(strategy="median")),
            ("scale",  StandardScaler())
        ]), num_attribs),
        ("cat",  Pipeline([
            ("impute", SimpleImputer(strategy="most_frequent")),
            ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_attribs)
    ]
)
y = train['Depression']
X = train.drop('Depression', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=62)

In [None]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "random_state": 62,
        "tree_method": 'hist',
        "n_jobs": -1
    }

    # Create pipeline with preprocessing + model
    pipeline = Pipeline([
        ("preprocessing", preproc),
        ("xgb", XGBClassifier(**params))
    ])

    # Use Stratified CV to preserve class balance
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=62)
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)

    return np.mean(scores)

# Run the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

Trial 47 finished with value: 0.9397299218194741 and parameters: {'n_estimators': 529, 'max_depth': 3, 'learning_rate': 0.1420285246256477, 'subsample': 0.6617422555272853, 'colsample_bytree': 0.6053925503341293, 'gamma': 1.7799292133978808, 'reg_alpha': 2.5110255593175745, 'reg_lambda': 3.4353207613114574}. Best is trial 47 with value: 0.9397299218194741.

In [None]:

# Print best result
print("Best trial:")
print(study.best_trial.params)
# Build final pipeline using best params
best_params = study.best_trial.params
best_params.update({
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "random_state": 62
})

final_model = Pipeline([
    ("preprocessing", preproc),
    ("xgb", XGBClassifier(**best_params))
])

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Best trial:
{'n_estimators': 529, 'max_depth': 3, 'learning_rate': 0.1420285246256477, 'subsample': 0.6617422555272853, 'colsample_bytree': 0.6053925503341293, 'gamma': 1.7799292133978808, 'reg_alpha': 2.5110255593175745, 'reg_lambda': 3.4353207613114574}


Parameters: { "use_label_encoder" } are not used.



Test accuracy: 0.9396943852167733


Test accuracy: 0.9396943852167733
Train accuracy: 0.9397299218194741

In [None]:
for i in range(17, 23):
    params = {'n_estimators': 529, 
            'max_depth': 3, 
            'learning_rate': 0.1420285246256477, 
            'subsample': 0.6617422555272853, 
            'colsample_bytree': 0.6053925503341293, 
            'gamma': 1.7799292133978808, 
            'reg_alpha': 2.5110255593175745, 
            'reg_lambda': 3.4353207613114574,
            'random_state': i + 987,
            'n_jobs':-1,
            "eval_metric": "logloss"
            }

    final_model = Pipeline([
        ("preprocessing", preproc),
        ("xgb", XGBClassifier(**params))
    ])

    print((i +987), "Train Accuracy:", np.mean(cross_val_score(final_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)))
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)
    print((i +987), "Test accuracy:", accuracy_score(y_test, y_pred))
    print("--------------------------")


1004 Train Accuracy: 0.9394633972992181
1004 Test accuracy: 0.9398720682302771
--------------------------
1005 Train Accuracy: 0.9394456289978678
1005 Test accuracy: 0.9403340440653873
--------------------------
1006 Train Accuracy: 0.9397388059701492
1006 Test accuracy: 0.940724946695096
--------------------------
1007 Train Accuracy: 0.9395167022032694
1007 Test accuracy: 0.9404051172707889
--------------------------
1008 Train Accuracy: 0.9394278606965175
1008 Test accuracy: 0.9395522388059702
--------------------------
1009 Train Accuracy: 0.9396055437100213
1009 Test accuracy: 0.9395877754086709
--------------------------


In [None]:
params = {'n_estimators': 529, 
            'max_depth': 3, 
            'learning_rate': 0.1420285246256477, 
            'subsample': 0.6617422555272853, 
            'colsample_bytree': 0.6053925503341293, 
            'gamma': 1.7799292133978808, 
            'reg_alpha': 2.5110255593175745, 
            'reg_lambda': 3.4353207613114574,
            'random_state': 1006,
            'n_jobs':-1,
            "eval_metric": "logloss"
            }

final_model = Pipeline([
        ("preprocessing", preproc),
        ("xgb", XGBClassifier(**params))
])

np.mean(cross_val_score(final_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1))
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Test accuracy: 0.940724946695096


In [None]:
test = pd.read_csv('test.csv')
pred = final_model.predict(test)

In [None]:
sample = pd.read_csv('sample_submission.csv')
sample['Depression'] = pred
sample.to_csv('submission2.csv', index=False)

In [10]:
from catboost import CatBoostClassifier
params = {
    'iterations': 1766, 
    'learning_rate': 0.31965848232720384, 
    'depth': 3, 'l2_leaf_reg': 1.6093665037487805, 
    'bagging_temperature': 0.879762393824321, 
    'random_strength': 0.33996391481994964, 
    'border_count': 214,
    'random_state': 62,
    'task_type': 'CPU',
    'verbose': 0
}

cat_pipeline = Pipeline([
        ("preprocessing", preproc),
        ("cat", CatBoostClassifier(**params))
    ])

print("Train accuracy:", np.mean(cross_val_score(cat_pipeline, X_train, y_train, cv=5, scoring='accuracy')))
cat_pipeline.fit(X_train, y_train)
y_pred = cat_pipeline.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Train accuracy: 0.937366737739872
Test accuracy: 0.9381307746979389


In [None]:
pred = cat_pipeline.predict(test)
sample = pd.read_csv('sample_submission.csv')
sample['Depression'] = pred
sample.to_csv('submission3.csv', index=False)

In [15]:
Best_parameters = {'boosting_type': 'gbdt', 
                  'num_leaves': 26, 
                  'learning_rate': 0.25174389995069674, 
                  'feature_fraction': 0.5573841124238385, 
                  'bagging_fraction': 0.8452530741496862, 
                  'bagging_freq': 6, 
                  'min_child_samples': 5, 
                  'lambda_l1': 6.993044800464817, 
                  'lambda_l2': 0.7965595052137724,
                  'random_state': 62,
                  'num_threads':-1}
from lightgbm import LGBMClassifier
lg_pipeline = Pipeline([
        ("preprocessing", preproc),
        ("lgb", LGBMClassifier(**Best_parameters))
    ])
print("Train accuracy:", np.mean(cross_val_score(lg_pipeline, X_train, y_train, cv=5, scoring='accuracy')))
lg_pipeline.fit(X_train, y_train)
y_pred = lg_pipeline.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Train accuracy: 0.9392057569296375
Test accuracy: 0.9403340440653873


In [16]:
pred = lg_pipeline.predict(test)
sample = pd.read_csv('sample_submission.csv')
sample['Depression'] = pred
sample.to_csv('submission4.csv', index=False)