- Import libraries and load the datasets.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Load the competition train and test CSVs, using the `id` column as the row index
# so that it is not treated as a feature downstream.
train = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv", index_col='id')
test = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv", index_col='id')

- Define features

In [2]:
# Columns that are parsed as numbers before imputation and feature engineering.
numerical_features = [
    'age',
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

# Columns handled as categories (missing values become 'unknown').
categorical_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'day',
    'month',
    'poutcome',
]

# Name of the label column in the training data.
target = 'y'

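- Preprocessing: impute missing values, add three engineered features, and cast every feature to a string so that CatBoost treats all of them as categorical.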

In [3]:
def preprocess_data(df, numerical_features, categorical_features):
    """Impute, engineer features, and cast every feature column to string.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
        Must contain the columns 'balance', 'age', 'duration', 'campaign' and
        'pdays' (used for the engineered features) in addition to the listed
        feature columns.
    numerical_features : list of str
        Columns coerced to numeric (unparseable values become NaN) and then
        imputed with that column's median **within ``df``**.
    categorical_features : list of str
        Columns whose missing values are replaced by the string 'unknown'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with three extra columns ('balance_per_age',
        'duration_campaign_ratio', 'pdays_binary') and with every numerical,
        engineered and categorical column converted to ``str``.
    """
    df = df.copy()

    for col in numerical_features:
        numeric = pd.to_numeric(df[col], errors='coerce')
        # Assign the filled Series back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary object and is a silent
        # no-op under pandas Copy-on-Write, which would leave NaNs to become the
        # literal string 'nan' in the cast below.
        df[col] = numeric.fillna(numeric.median())

    # Engineered features; the +1 keeps the denominator non-zero when it is 0.
    df['balance_per_age'] = df['balance'] / (df['age'] + 1)
    df['duration_campaign_ratio'] = df['duration'] / (df['campaign'] + 1)
    # 1 when pdays is greater than -1, else 0.
    df['pdays_binary'] = (df['pdays'] > -1).astype(int)

    # Numeric and engineered columns are deliberately stringified so that they
    # can be passed to the model as categorical features.
    all_numerical = numerical_features + ['balance_per_age', 'duration_campaign_ratio', 'pdays_binary']
    for col in all_numerical:
        df[col] = df[col].astype(str)

    for col in categorical_features:
        df[col] = df[col].fillna('unknown').astype(str)

    return df

# Full list of model inputs: the raw numerical and categorical columns plus the
# three engineered columns that preprocess_data adds.
all_features = [
    *numerical_features,
    *categorical_features,
    'balance_per_age',
    'duration_campaign_ratio',
    'pdays_binary',
]

# Run the same preprocessing over both frames (each is imputed independently).
train, test = (
    preprocess_data(frame, numerical_features, categorical_features)
    for frame in (train, test)
)

- Split data

In [4]:
# Separate features from the label. Both lines use the `target` constant so the
# label column name is defined in exactly one place.
X = train.drop(columns=[target])
y = train[target]

# Stratified 80/20 hold-out split.
# NOTE(review): none of X_train / X_test / y_train / y_test is used by the
# cross-validation or submission cells visible in this notebook — confirm
# whether this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

- CatBoost model parameters

In [5]:
# Keyword arguments passed to every CatBoostClassifier that is built.
params = {
    # Boosting schedule: many small steps, cut short by early stopping.
    'iterations': 12000,
    'learning_rate': 0.02,
    'early_stopping_rounds': 500,
    # Tree shape and regularisation.
    'depth': 4,
    'l2_leaf_reg': 1,
    # Every feature was cast to str during preprocessing, so all of them are
    # declared categorical.
    'cat_features': all_features,
    # Hardware. NOTE(review): '0:1' presumably selects GPU devices 0 and 1 —
    # confirm against the CatBoost `devices` format.
    'task_type': 'GPU',
    'devices': '0:1',
    # Reproducibility and logging (one log line every 500 iterations).
    'random_seed': 0,
    'verbose': 500,
}

- Cross-validation

In [6]:
# Stratified K-fold cross-validation.
# Each fold trains a fresh model on the training part, uses the held-out part as
# the eval set, stores out-of-fold (OOF) probabilities for the held-out rows,
# and adds 1/n_splits of its test-set probabilities to the running average.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test))
fold_scores = []  # ROC-AUC of each fold on its own validation rows

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{n_splits}")
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    model = CatBoostClassifier(**params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(test)[:, 1] / n_splits

    fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
    fold_scores.append(fold_auc)
    print(f"Fold {fold + 1} ROC-AUC: {fold_auc:.5f}")

# Two different summaries, each reported under its own label: the mean of the
# per-fold AUCs, and the single AUC computed over all pooled OOF predictions.
# The original printed the pooled score under the "Mean CV" label.
roc_auc = roc_auc_score(y, oof_preds)
print(f"Mean CV ROC-AUC: {np.mean(fold_scores):.5f} (std {np.std(fold_scores):.5f})")
print(f"OOF ROC-AUC: {roc_auc:.5f}")

Fold 1/5
0:	learn: 0.6635734	test: 0.6633718	best: 0.6633718 (0)	total: 201ms	remaining: 40m 10s
500:	learn: 0.1478775	test: 0.1414461	best: 0.1414461 (500)	total: 23s	remaining: 8m 48s
1000:	learn: 0.1440100	test: 0.1374373	best: 0.1374373 (1000)	total: 45.5s	remaining: 8m 19s
1500:	learn: 0.1423869	test: 0.1359122	best: 0.1359122 (1500)	total: 1m 8s	remaining: 7m 57s
2000:	learn: 0.1413483	test: 0.1350032	best: 0.1350032 (2000)	total: 1m 31s	remaining: 7m 37s
2500:	learn: 0.1405889	test: 0.1344100	best: 0.1344100 (2500)	total: 1m 54s	remaining: 7m 15s
3000:	learn: 0.1399805	test: 0.1340129	best: 0.1340129 (3000)	total: 2m 18s	remaining: 6m 54s
3500:	learn: 0.1394743	test: 0.1336919	best: 0.1336919 (3500)	total: 2m 41s	remaining: 6m 32s
4000:	learn: 0.1389818	test: 0.1333775	best: 0.1333775 (4000)	total: 3m 4s	remaining: 6m 9s
4500:	learn: 0.1385760	test: 0.1331577	best: 0.1331577 (4500)	total: 3m 28s	remaining: 5m 46s
5000:	learn: 0.1382039	test: 0.1329692	best: 0.1329692 (5000)	tota

In [7]:
# Build the submission file from the sample-submission template.
sub = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# Attach predictions by `id` rather than by row position, so a template whose
# rows are ordered differently from test.csv cannot silently misalign them.
preds_by_id = pd.Series(test_preds, index=test.index)
sub['y'] = preds_by_id.reindex(sub['id']).to_numpy()
assert sub['y'].notna().all(), "sample_submission contains ids that are missing from test"

sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,id,y
0,750000,0.010403
1,750001,0.145457
2,750002,0.000332
3,750003,0.000125
4,750004,0.039538
