In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold,train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
import optuna

In [2]:
train_df = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [4]:
train_df["contact"].unique()

array(['cellular', 'unknown', 'telephone'], dtype=object)

In [5]:
def  one_hot_encode(df, columns):
    for col in columns:
        df = pd.get_dummies(df, columns=[col], prefix=col, dtype=float)
    return df

In [6]:
non_int_cols =  train_df.select_dtypes(include=["object"]).columns.tolist()
encoded_train_df = one_hot_encode(train_df, non_int_cols)

In [7]:
non_int_cols

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [8]:
X = train_df.drop(['y', 'id'], axis=1)
X.loc[X['pdays'] == -1, 'pdays'] = 99999
Y = train_df['y']

In [9]:
test_df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


In [10]:
def goal(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    train_pool = Pool(X_train, y_train, cat_features=non_int_cols)
    test_pool = Pool(X_test, y_test, cat_features=non_int_cols)
    model = CatBoostClassifier(
        iterations= trial.suggest_int('iterations', 1000, 4000),
        learning_rate= trial.suggest_loguniform('learning_rate', 0.03, 0.1),
        depth = trial.suggest_int('depth', 7, 16),
        random_state = 42,
        eval_metric = 'AUC',
        early_stopping_rounds= 50,  
        verbose=0
    )
    model.fit(train_pool, eval_set=test_pool)
    y_prob = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_prob)

In [11]:
experiment = optuna.create_study(direction='maximize')
experiment.optimize(goal, n_trials=5, show_progress_bar=True)
best_params = experiment.best_trial.params
print("\nBest Hyperparameters from Optuna:")
print(best_params)

[I 2025-08-19 13:29:18,664] A new study created in memory with name: no-name-97b72fad-1b9f-49c5-b9ec-e719d6b19256


  0%|          | 0/5 [00:00<?, ?it/s]

  learning_rate= trial.suggest_loguniform('learning_rate', 0.03, 0.1),


[I 2025-08-19 13:40:42,040] Trial 0 finished with value: 0.9670790119887632 and parameters: {'iterations': 1173, 'learning_rate': 0.08959803716392568, 'depth': 7}. Best is trial 0 with value: 0.9670790119887632.


  learning_rate= trial.suggest_loguniform('learning_rate', 0.03, 0.1),


[I 2025-08-19 13:57:45,877] Trial 1 finished with value: 0.9660554228731485 and parameters: {'iterations': 3777, 'learning_rate': 0.05600947714870335, 'depth': 13}. Best is trial 0 with value: 0.9670790119887632.


  learning_rate= trial.suggest_loguniform('learning_rate', 0.03, 0.1),


[I 2025-08-19 14:18:29,593] Trial 2 finished with value: 0.9643080260838454 and parameters: {'iterations': 2552, 'learning_rate': 0.04983277804710917, 'depth': 16}. Best is trial 0 with value: 0.9670790119887632.


  learning_rate= trial.suggest_loguniform('learning_rate', 0.03, 0.1),


[I 2025-08-19 14:54:16,494] Trial 3 finished with value: 0.9673665625059034 and parameters: {'iterations': 3716, 'learning_rate': 0.035969738703490656, 'depth': 7}. Best is trial 3 with value: 0.9673665625059034.


  learning_rate= trial.suggest_loguniform('learning_rate', 0.03, 0.1),


[I 2025-08-19 15:18:34,059] Trial 4 finished with value: 0.9661274442898954 and parameters: {'iterations': 2506, 'learning_rate': 0.03562808968405534, 'depth': 13}. Best is trial 3 with value: 0.9673665625059034.

Best Hyperparameters from Optuna:
{'iterations': 3716, 'learning_rate': 0.035969738703490656, 'depth': 7}
