In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold,train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
import optuna

In [None]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at a different location by editing a single line.
DATA_DIR = "/kaggle/input/playground-series-s5e8"

# Load the labelled training data and the unlabelled test data.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# List the distinct values present in the "contact" column.
train_df["contact"].unique()

In [None]:
def one_hot_encode(df, columns):
    """Return a copy of ``df`` with each listed column replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    columns : iterable of str
        Column names to expand, processed one at a time in the given order.

    Returns
    -------
    pandas.DataFrame
        Frame in which every listed column is replaced by float-valued
        indicator columns named ``<column>_<value>``. When ``columns`` is
        empty, ``df`` itself is returned.
    """
    encoded = df
    for name in columns:
        # Each call drops `name` and appends its indicator columns at the end.
        encoded = pd.get_dummies(encoded, columns=[name], prefix=name, dtype=float)
    return encoded

In [None]:
# Names of every object-dtype column in the training frame; later cells pass
# this list to CatBoost as the categorical feature set.
non_int_cols = list(train_df.select_dtypes(include=["object"]).columns)

# One-hot encoded variant of the training frame.
# NOTE(review): no later cell visible here reads `encoded_train_df`; the model
# is fit on the raw frame with native categorical handling — confirm whether
# this is still needed.
encoded_train_df = one_hot_encode(train_df, non_int_cols)

In [None]:
# Show the object-dtype column names collected in `non_int_cols`.
non_int_cols

In [None]:
# Target vector.
Y = train_df['y']

# Feature matrix: everything except the target and the row identifier.
X = train_df.drop(columns=['y', 'id'])

# Recode the -1 marker in `pdays` to a large positive value.
# NOTE(review): presumably -1 means "not previously contacted" — confirm
# against the dataset description.
X.loc[X['pdays'].eq(-1), 'pdays'] = 99999

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
def goal(trial):
    """Optuna objective: fit CatBoost on a fixed 80/20 split and score it.

    Reads the module-level ``X``, ``Y`` and ``non_int_cols``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``iterations``, ``learning_rate`` and ``depth``.

    Returns
    -------
    float
        ROC AUC of the fitted model on the held-out 20% of the data.
    """
    # Same split on every trial (fixed seed), so trials are comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    train_pool = Pool(X_train, y_train, cat_features=non_int_cols)
    test_pool = Pool(X_test, y_test, cat_features=non_int_cols)
    model = CatBoostClassifier(
        iterations=trial.suggest_int('iterations', 1000, 4000),
        # `suggest_loguniform` is deprecated in Optuna; `suggest_float` with
        # log=True samples the same log-uniform range.
        learning_rate=trial.suggest_float('learning_rate', 0.03, 0.1, log=True),
        depth=trial.suggest_int('depth', 7, 16),
        random_state=42,
        eval_metric='AUC',
        early_stopping_rounds=50,
        verbose=0
    )
    # The held-out split drives early stopping and is also the set scored
    # below, so the returned AUC is measured on data that influenced the fit.
    model.fit(train_pool, eval_set=test_pool)
    y_prob = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_prob)

In [None]:
# Optuna hyperparameter search over `goal` (disabled); uncomment to re-run it.
# experiment = optuna.create_study(direction='maximize')
# experiment.optimize(goal, n_trials=5, show_progress_bar=True)
# best_params = experiment.best_trial.params
# print("\nBest Hyperparameters from Optuna:")
# print(best_params)

In [None]:
# Hold out 20% of the training data for early stopping and validation
# (same split parameters as the Optuna objective `goal`).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
train_pool = Pool(X_train, y_train, cat_features=non_int_cols)
test_pool = Pool(X_test, y_test, cat_features=non_int_cols)

# Final model with fixed hyperparameters.
# NOTE(review): presumably these values came from an earlier Optuna run — confirm.
model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.07,
    depth=8,
    random_state=42,
    eval_metric='AUC',
    early_stopping_rounds=50,
    verbose=0
)
model.fit(train_pool, eval_set=test_pool)

# Positive-class probabilities on the held-out split.
y_prob = model.predict_proba(X_test)[:, 1]

# Report the hold-out score; without this the validation predictions were
# computed but never inspected (training itself is silent with verbose=0).
val_auc = roc_auc_score(y_test, y_prob)
print(f"Validation AUC: {val_auc:.5f}")

In [None]:
# Keep the identifiers for the submission file.
test_ids = test_df['id']

# Build the inference features in a separate frame rather than overwriting
# `test_df`: once 'id' has been dropped from `test_df`, re-running this cell
# raises KeyError on the lookup above. Leaving `test_df` intact makes the
# cell safe to re-run.
test_features = test_df.drop(columns='id')

# Apply the same `pdays` recoding that was applied to the training features.
test_features.loc[test_features['pdays'] == -1, 'pdays'] = 99999

# Positive-class probability for every test row.
out = model.predict_proba(test_features)[:, 1]

In [None]:
# Pair each test id with its predicted positive-class probability and write
# the result to disk without the index column.
submission = test_ids.to_frame(name='id').assign(y=out)
submission.to_csv('submission.csv', index=False)