In [11]:
import os
# Enumerate every file below the Kaggle input mount (depth-first, in the order
# os.walk yields them) and echo its full path so the dataset layout is visible.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/diabetes-prediction-challenge/sample_submission.csv
/kaggle/input/diabetes-prediction-challenge/train.csv
/kaggle/input/diabetes-prediction-challenge/test.csv


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [13]:
# Read the competition's train and test tables into DataFrames (train first,
# then test — the same order the two names are unpacked in).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/diabetes-prediction-challenge/train.csv',
        '/kaggle/input/diabetes-prediction-challenge/test.csv',
    )
)

In [14]:
# Name of the label column to predict.
target = 'diagnosed_diabetes'

# Independent copy of the test ids, kept aside for the submission file.
test_id = test.loc[:, 'id'].copy()

In [15]:
# Feature matrix = every train column except the label; y = the label itself.
# NOTE(review): if train carries an 'id' column like test does, it stays in X
# as a model feature — confirm that is intended.
y = train[target]
X = train.drop(columns=[target])

# Working copy of the test table used for inference.
test_X = test.copy()

# 80/20 hold-out split (seeded).
# NOTE(review): X_train / y_train are rebound inside the cross-validation loop
# further down, so after that loop they no longer refer to this hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Keyword arguments forwarded to CatBoostClassifier(**cat_params).
cat_params = dict(
    # Objective and the metric reported on the eval set.
    loss_function="Logloss",
    eval_metric="AUC",
    # Boosting budget and step size.
    iterations=8000,
    learning_rate=0.03,
    # Tree shape and regularisation.
    depth=6,
    l2_leaf_reg=6,
    random_strength=1.0,
    # Row sampling.
    bootstrap_type="Bayesian",
    bagging_temperature=0.8,
    min_data_in_leaf=50,
    # Overfitting detector (early stopping) settings.
    od_type="Iter",
    od_wait=300,
    # Reproducibility and logging.
    random_seed=42,
    verbose=0,
    # Hardware selection: train on GPU, device ids "0,1".
    task_type="GPU",
    devices="0,1",
)

In [17]:
# Accumulators filled by the cross-validation loop:
#   oof_preds  – one out-of-fold prediction slot per training row
#   test_preds – running fold-averaged prediction per test row
#   fold_auc   – validation AUC of each fold, in fold order
n_train_rows, n_test_rows = len(X), len(test_X)
oof_preds = np.zeros(n_train_rows)
test_preds = np.zeros(n_test_rows)
fold_auc = list()

In [18]:
# Number of cross-validation folds; also used to average the test predictions.
N_SPLITS = 5

# Shuffled, seeded, label-stratified fold generator.
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=42,
)

In [19]:
# Columns of X with object/category dtype are treated as categorical.
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# CatBoost receives them as positional column indices within X.
cat_features = list(map(X.columns.get_loc, cat_cols))

In [20]:
# Stratified K-fold training loop.
#
# For each fold: fit a fresh CatBoostClassifier on the training part, score the
# held-out part (stored in oof_preds and fold_auc), and add this fold's share
# of the test prediction to test_preds so that it ends up as the fold average.
#
# The accumulators are (re)created here so the cell is idempotent: without the
# reset, re-running this cell alone would keep adding onto test_preds and keep
# appending to fold_auc from the previous run.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_X))
fold_auc = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}")

    # Positional row selection for this fold.
    # NOTE: this rebinds the notebook-level names X_train / y_train.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(
        X_train,
        y_train,
        cat_features=cat_features
    )

    val_pool = Pool(
        X_val,
        y_val,
        cat_features=cat_features
    )

    # A new model per fold so no state carries over between folds.
    model = CatBoostClassifier(**cat_params)

    # The validation pool doubles as the eval set for best-iteration selection
    # (use_best_model) and progress is logged every 200 iterations.
    model.fit(
        train_pool,
        eval_set=val_pool,
        use_best_model=True,
        verbose=200
    )

    # Column 1 of predict_proba is taken as the positive-class probability.
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred

    auc = roc_auc_score(y_val, val_pred)
    fold_auc.append(auc)
    print(f"AUC: {auc:.5f}")

    # Each fold contributes 1/N_SPLITS of the final test prediction.
    test_preds += model.predict_proba(test_X)[:, 1] / N_SPLITS



Fold 1


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6801629	best: 0.6801629 (0)	total: 11.2s	remaining: 1d 52m 12s
200:	test: 0.7079708	best: 0.7079708 (200)	total: 19.1s	remaining: 12m 22s
400:	test: 0.7141898	best: 0.7141898 (400)	total: 27.1s	remaining: 8m 33s
600:	test: 0.7195157	best: 0.7195157 (600)	total: 34.9s	remaining: 7m 10s
800:	test: 0.7215494	best: 0.7215494 (800)	total: 42.8s	remaining: 6m 24s
1000:	test: 0.7228965	best: 0.7228965 (1000)	total: 50.8s	remaining: 5m 54s
1200:	test: 0.7237533	best: 0.7237533 (1200)	total: 58.7s	remaining: 5m 32s
1400:	test: 0.7244149	best: 0.7244149 (1400)	total: 1m 6s	remaining: 5m 13s
1600:	test: 0.7249525	best: 0.7249525 (1600)	total: 1m 14s	remaining: 4m 57s
1800:	test: 0.7252902	best: 0.7252913 (1799)	total: 1m 22s	remaining: 4m 43s
2000:	test: 0.7255989	best: 0.7255995 (1999)	total: 1m 30s	remaining: 4m 30s
2200:	test: 0.7258779	best: 0.7258779 (2200)	total: 1m 37s	remaining: 4m 18s
2400:	test: 0.7260296	best: 0.7260339 (2388)	total: 1m 45s	remaining: 4m 7s
2600:	test: 0.726

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6758430	best: 0.6758430 (0)	total: 42.1ms	remaining: 5m 36s
200:	test: 0.7056096	best: 0.7056096 (200)	total: 7.92s	remaining: 5m 7s
400:	test: 0.7122957	best: 0.7122957 (400)	total: 15.7s	remaining: 4m 58s
600:	test: 0.7167280	best: 0.7167280 (600)	total: 23.6s	remaining: 4m 50s
800:	test: 0.7191916	best: 0.7191916 (800)	total: 31.5s	remaining: 4m 43s
1000:	test: 0.7205236	best: 0.7205236 (1000)	total: 39.4s	remaining: 4m 35s
1200:	test: 0.7213929	best: 0.7213929 (1200)	total: 47.3s	remaining: 4m 27s
1400:	test: 0.7220161	best: 0.7220163 (1397)	total: 55.1s	remaining: 4m 19s
1600:	test: 0.7224405	best: 0.7224405 (1600)	total: 1m 2s	remaining: 4m 11s
1800:	test: 0.7227533	best: 0.7227533 (1800)	total: 1m 10s	remaining: 4m 4s
2000:	test: 0.7231274	best: 0.7231280 (1998)	total: 1m 18s	remaining: 3m 56s
2200:	test: 0.7234013	best: 0.7234013 (2200)	total: 1m 26s	remaining: 3m 48s
2400:	test: 0.7236078	best: 0.7236112 (2389)	total: 1m 34s	remaining: 3m 41s
2600:	test: 0.7237213	b

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6784193	best: 0.6784193 (0)	total: 42.3ms	remaining: 5m 38s
200:	test: 0.7064514	best: 0.7064514 (200)	total: 7.93s	remaining: 5m 7s
400:	test: 0.7128807	best: 0.7128807 (400)	total: 15.8s	remaining: 4m 59s
600:	test: 0.7176679	best: 0.7176679 (600)	total: 23.7s	remaining: 4m 51s
800:	test: 0.7200297	best: 0.7200297 (800)	total: 31.6s	remaining: 4m 44s
1000:	test: 0.7214217	best: 0.7214217 (1000)	total: 39.5s	remaining: 4m 36s
1200:	test: 0.7223614	best: 0.7223614 (1200)	total: 47.4s	remaining: 4m 28s
1400:	test: 0.7229205	best: 0.7229220 (1397)	total: 55.3s	remaining: 4m 20s
1600:	test: 0.7233903	best: 0.7233933 (1599)	total: 1m 3s	remaining: 4m 12s
1800:	test: 0.7237869	best: 0.7237875 (1799)	total: 1m 11s	remaining: 4m 4s
2000:	test: 0.7240574	best: 0.7240574 (2000)	total: 1m 18s	remaining: 3m 56s
2200:	test: 0.7243275	best: 0.7243279 (2198)	total: 1m 26s	remaining: 3m 48s
2400:	test: 0.7245067	best: 0.7245094 (2397)	total: 1m 34s	remaining: 3m 41s
2600:	test: 0.7246650	b

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6787901	best: 0.6787901 (0)	total: 41.8ms	remaining: 5m 34s
200:	test: 0.7070994	best: 0.7070994 (200)	total: 7.96s	remaining: 5m 8s
400:	test: 0.7134764	best: 0.7134764 (400)	total: 15.8s	remaining: 4m 59s
600:	test: 0.7189320	best: 0.7189320 (600)	total: 23.7s	remaining: 4m 51s
800:	test: 0.7212846	best: 0.7212846 (800)	total: 31.5s	remaining: 4m 42s
1000:	test: 0.7227188	best: 0.7227188 (1000)	total: 39.2s	remaining: 4m 34s
1200:	test: 0.7235798	best: 0.7235798 (1200)	total: 47.1s	remaining: 4m 26s
1400:	test: 0.7241860	best: 0.7241860 (1400)	total: 55s	remaining: 4m 18s
1600:	test: 0.7246035	best: 0.7246035 (1600)	total: 1m 2s	remaining: 4m 11s
1800:	test: 0.7251246	best: 0.7251246 (1800)	total: 1m 10s	remaining: 4m 3s
2000:	test: 0.7254325	best: 0.7254325 (2000)	total: 1m 18s	remaining: 3m 55s
2200:	test: 0.7257083	best: 0.7257083 (2200)	total: 1m 26s	remaining: 3m 47s
2400:	test: 0.7259434	best: 0.7259444 (2398)	total: 1m 34s	remaining: 3m 39s
2600:	test: 0.7261015	bes

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6761394	best: 0.6761394 (0)	total: 898ms	remaining: 1h 59m 43s
200:	test: 0.7076713	best: 0.7076713 (200)	total: 8.96s	remaining: 5m 47s
400:	test: 0.7141535	best: 0.7141535 (400)	total: 17.1s	remaining: 5m 23s
600:	test: 0.7185747	best: 0.7185747 (600)	total: 25s	remaining: 5m 7s
800:	test: 0.7209180	best: 0.7209180 (800)	total: 32.8s	remaining: 4m 55s
1000:	test: 0.7223792	best: 0.7223792 (1000)	total: 40.7s	remaining: 4m 44s
1200:	test: 0.7231424	best: 0.7231424 (1200)	total: 48.6s	remaining: 4m 35s
1400:	test: 0.7238185	best: 0.7238196 (1399)	total: 56.5s	remaining: 4m 26s
1600:	test: 0.7242969	best: 0.7242970 (1598)	total: 1m 4s	remaining: 4m 17s
1800:	test: 0.7247138	best: 0.7247138 (1800)	total: 1m 12s	remaining: 4m 9s
2000:	test: 0.7250273	best: 0.7250273 (2000)	total: 1m 20s	remaining: 4m
2200:	test: 0.7252640	best: 0.7252640 (2200)	total: 1m 28s	remaining: 3m 52s
2400:	test: 0.7254198	best: 0.7254198 (2400)	total: 1m 36s	remaining: 3m 44s
2600:	test: 0.7255861	best

In [22]:
# Assemble the submission table (test id + averaged predicted probability under
# the target column name) and write it to CSV without the index column.
submission = pd.DataFrame({'id': test_id, target: test_preds})
submission.to_csv('Single_CB_submission.csv', index=False)