In [1]:
import numpy as np
import pandas as pd
import optuna
import logging
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Load the data
file_path_train = '/kaggle/input/playground-series-s4e1/train.csv'
file_path_test = '/kaggle/input/playground-series-s4e1/test.csv'

df_train = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)
submission_id = df_test['id'].reset_index(drop=True)

# Preprocess the data
df_train = df_train.drop(['id', 'CustomerId', 'Surname'], axis=1)
df_test = df_test.drop(['id', 'CustomerId', 'Surname'], axis=1)

categorical_columns = ['Geography', 'Gender']
df_train[categorical_columns] = df_train[categorical_columns].astype('category')
df_test[categorical_columns] = df_test[categorical_columns].astype('category')


X = df_train.drop(columns='Exited')
y = df_train['Exited']

cat_columns = X.select_dtypes(include='category').columns.tolist()

# Define the objective function for optimization
def objective(trial):
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 1, log=True),
        "iterations": trial.suggest_int("iterations", 100, 3000),
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10, log=True),
        "border_count": trial.suggest_int("border_count", 1, 255),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 100.00, log=True),
        "random_seed": 1,
        "eval_metric": "AUC",
        "loss_function": "Logloss",
        "verbose": False,
        "cat_features": categorical_columns
    }

    X_train_split, X_val, y_train_split, y_val = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

    model = CatBoostClassifier(**param)
    model.fit(X_train_split, y_train_split, eval_set=(X_val, y_val), verbose=False, use_best_model=True)
    
    y_val_pred = model.predict_proba(X_val)[:, 1]
    auc_val = roc_auc_score(y_val, y_val_pred)
    
    return auc_val

# Perform hyperparameter optimization
logging.getLogger('optuna').setLevel(logging.WARNING)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200, n_jobs=-1, show_progress_bar=True)
best_params = study.best_params
print("Best parameters:", best_params)

# Train the final model with the best parameters found
final_model = CatBoostClassifier(**best_params, cat_features=cat_columns)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

# Fit the model on the training data and evaluate on the validation set
final_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False, use_best_model=True) 
    
# Make predictions on the test dataset
predictions = final_model.predict_proba(df_test)[:, 1]

# Prepare the submission dataframe
submission = pd.DataFrame({
    'id': submission_id,
    'Exited': predictions
})

print('Submission head:', submission.head(10))

# Save the submission dataframe to a CSV file
submission.to_csv('catboost_optuna_submission.csv', index=False)

  0%|          | 0/200 [00:00<?, ?it/s]

Best parameters: {'learning_rate': 0.2515922647546112, 'iterations': 1595, 'depth': 3, 'l2_leaf_reg': 5.241553007957681, 'border_count': 173, 'random_strength': 7.10231301962833e-05, 'bagging_temperature': 61.51574661216057}
Submission head:        id    Exited
0  165034  0.023758
1  165035  0.827006
2  165036  0.024594
3  165037  0.238475
4  165038  0.368895
5  165039  0.048995
6  165040  0.043631
7  165041  0.104858
8  165042  0.614568
9  165043  0.009172
