In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/binary-classification/train.csv
/kaggle/input/binary-classification/test.csv


In [2]:
data1 = pd.read_csv('/kaggle/input/binary-classification/train.csv')
data2 = pd.read_csv('/kaggle/input/binary-classification/test.csv')

train_df = data1.copy()
test_df = data2.copy()

print("First 5 rows of train_df: \n", train_df.head(5))
print("First 5 rows of test_df: \n", test_df.head(5))

First 5 rows of train_df: 
    id  age          job  marital  education default  balance housing loan  \
0   0   42   technician  married  secondary      no        7      no   no   
1   1   38  blue-collar  married  secondary      no      514      no   no   
2   2   36  blue-collar  married  secondary      no      602     yes   no   
3   3   27      student   single  secondary      no       34     yes   no   
4   4   26   technician  married  secondary      no      889     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome  y  
0  cellular   25   aug       117         3     -1         0  unknown  0  
1   unknown   18   jun       185         1     -1         0  unknown  0  
2   unknown   14   may       111         2     -1         0  unknown  0  
3   unknown   28   may        10         2     -1         0  unknown  0  
4  cellular    3   feb       902         1     -1         0  unknown  1  
First 5 rows of test_df: 
        id  age            job  marital

In [3]:
train_df['was_contacted_previous'] = train_df['pdays'].apply(lambda x: 1 if x > 0 else 0)
test_df['was_contacted_previous'] = test_df['pdays'].apply(lambda x: 1 if x > 0 else 0)

In [4]:
train_df.loc[train_df['pdays'] == -1, 'pdays'] = 99999
test_df.loc[test_df['pdays'] == -1, 'pdays'] = 99999

In [5]:
test_id_placeholder = test_df['id']

In [8]:
cat_cols=train_df.select_dtypes(include=['object']).columns
num_cols=train_df.select_dtypes(include=['int']).columns

print(f'Total Categorical Columns {len(cat_cols)}')
print(f'Total Numerical Columns {len(num_cols)}')

Total Categorical Columns 9
Total Numerical Columns 10


In [9]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['id', 'y'])
y = train_df['y']

test_df = test_df.drop(columns='id')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [10]:
!pip install optuna



In [11]:
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
cat_cols.append('was_contacted_previous')

In [12]:
!pip install catboost



In [13]:
from sklearn.model_selection import StratifiedKFold,train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
import optuna

In [14]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'random_state': 42,
        'eval_metric': 'AUC',
        'verbose': 0,
        'early_stopping_rounds': 50,
        'task_type': 'GPU',
        'devices': '0'
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    test_pool = Pool(X_test, y_test, cat_features=cat_cols)
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=test_pool)
    y_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    return roc_auc

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=True)
best_params = study.best_trial.params
print("\nBest Hyperparameters from Optuna:")
print(best_params)

[I 2025-08-20 06:53:51,801] A new study created in memory with name: no-name-5565219e-183b-4f74-8f17-73e466943b70


  0%|          | 0/5 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2025-08-20 06:55:34,601] Trial 0 finished with value: 0.9633940853792613 and parameters: {'iterations': 2190, 'learning_rate': 0.02083421194848568, 'depth': 5, 'l2_leaf_reg': 6.967823559599055}. Best is trial 0 with value: 0.9633940853792613.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2025-08-20 06:59:01,499] Trial 1 finished with value: 0.9653766542692024 and parameters: {'iterations': 1985, 'learning_rate': 0.020337373509605548, 'depth': 9, 'l2_leaf_reg': 0.2903485650325186}. Best is trial 1 with value: 0.9653766542692024.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2025-08-20 07:00:16,235] Trial 2 finished with value: 0.9640052290058427 and parameters: {'iterations': 884, 'learning_rate': 0.024383544762065106, 'depth': 8, 'l2_leaf_reg': 0.014007557156310949}. Best is trial 1 with value: 0.9653766542692024.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2025-08-20 07:04:04,045] Trial 3 finished with value: 0.9644000887373474 and parameters: {'iterations': 1667, 'learning_rate': 0.010673067160784306, 'depth': 10, 'l2_leaf_reg': 0.0017069245156134643}. Best is trial 1 with value: 0.9653766542692024.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
Default metric period is 5 because AUC is/are not implemented for GPU


[I 2025-08-20 07:08:46,580] Trial 4 finished with value: 0.9656628881676753 and parameters: {'iterations': 2055, 'learning_rate': 0.02665115954238175, 'depth': 10, 'l2_leaf_reg': 2.5791476284739505}. Best is trial 4 with value: 0.9656628881676753.

Best Hyperparameters from Optuna:
{'iterations': 2055, 'learning_rate': 0.02665115954238175, 'depth': 10, 'l2_leaf_reg': 2.5791476284739505}


In [16]:
best_params['random_state'] = 42
best_params['eval_metric'] = 'AUC'
best_params['verbose'] = 100
best_params['early_stopping_rounds'] = 50

In [18]:
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_df))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"\n--- Starting Fold {fold_ + 1}/{NFOLDS} ---")
    
    trn_data = X.iloc[trn_idx]
    val_data = X.iloc[val_idx]
    trn_y = y.iloc[trn_idx]
    val_y = y.iloc[val_idx]
    
    model = CatBoostClassifier(**best_params)
    
    model.fit(trn_data, trn_y, cat_features=cat_cols,
              eval_set=(val_data, val_y), use_best_model=True)
    
    oof_preds[val_idx] = model.predict_proba(val_data)[:, 1]
    test_preds += model.predict_proba(test_df)[:, 1] / NFOLDS


--- Starting Fold 1/5 ---
0:	test: 0.9336256	best: 0.9336256 (0)	total: 863ms	remaining: 29m 32s
100:	test: 0.9587289	best: 0.9587289 (100)	total: 1m 18s	remaining: 25m 13s
200:	test: 0.9622410	best: 0.9622410 (200)	total: 2m 36s	remaining: 24m 3s
300:	test: 0.9637732	best: 0.9637732 (300)	total: 3m 54s	remaining: 22m 47s
400:	test: 0.9646771	best: 0.9646771 (400)	total: 5m 13s	remaining: 21m 31s
500:	test: 0.9652126	best: 0.9652126 (500)	total: 6m 30s	remaining: 20m 10s
600:	test: 0.9656835	best: 0.9656835 (600)	total: 7m 49s	remaining: 18m 56s
700:	test: 0.9660649	best: 0.9660649 (700)	total: 9m 8s	remaining: 17m 39s
800:	test: 0.9663794	best: 0.9663794 (800)	total: 10m 26s	remaining: 16m 21s
900:	test: 0.9666294	best: 0.9666294 (900)	total: 11m 47s	remaining: 15m 5s
1000:	test: 0.9668239	best: 0.9668239 (1000)	total: 13m 6s	remaining: 13m 48s
1100:	test: 0.9670005	best: 0.9670005 (1100)	total: 14m 26s	remaining: 12m 31s
1200:	test: 0.9671329	best: 0.9671329 (1200)	total: 15m 47s	re

In [19]:
print("OOF (Out-of-Fold) ROC AUC Score:", roc_auc_score(y, oof_preds))

OOF (Out-of-Fold) ROC AUC Score: 0.9673219211669997


In [21]:
submission = pd.DataFrame({'id': test_id_placeholder, 'y': test_preds})

submission.to_csv('submission2.csv', index=False)