In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the labelled training split and the unlabelled test split from the
# current working directory.
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

DATA PREP

In [3]:
# Columns grouped by how they are treated downstream: cat_cols is handed to
# CatBoost as `cat_features`; num_cols lists the remaining feature columns.
num_cols = [
    'Age',
    'BP',
    'Cholesterol',
    'Max HR',
    'ST depression',
]
cat_cols = [
    'Sex',
    'Chest pain type',
    'FBS over 120',
    'EKG results',
    'Exercise angina',
    'Slope of ST',
    'Number of vessels fluro',
    'Thallium',
]

In [4]:
# Feature matrix: every training column except the row id and the target.
X = data_train.drop(["id", "Heart Disease"], axis = 1)

# Binary target: 'Presence' -> 1, 'Absence' -> 0.
y = data_train['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Series.map leaves NaN for any value that is not a key of the dict, so a
# differently spelled or missing label would silently become a NaN target.
# Stop here with the offending labels rather than inside the training loop.
if y.isna().any():
    raise ValueError(
        "Unexpected 'Heart Disease' labels: "
        f"{data_train.loc[y.isna(), 'Heart Disease'].unique().tolist()}"
    )

In [5]:
# Test features: the test frame without its row identifier column.
X_test_final = data_test.drop(columns=['id'])

In [6]:
n_splits = 5

In [7]:
# Shuffled, seeded stratified splitter so the fold assignment is reproducible.
sfk = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [8]:
# Accumulators for the cross-validation run: a list for per-fold scores, one
# slot per training row for out-of-fold predictions, and one slot per test row
# for the fold-averaged test prediction.
scores = []
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test_final.shape[0])

In [9]:
# Stratified K-fold training of CatBoost.
#
# For every fold this trains a fresh model on the training indices, records
# the out-of-fold (OOF) positive-class probabilities and the fold AUC, and adds
# the fold's test-set probabilities (divided by n_splits) to `test_preds`, so
# that after the loop `test_preds` holds the mean prediction over all folds.

# Re-create the accumulators in this cell so that re-running it starts from
# zero; `test_preds` is updated with `+=` and would otherwise be accumulated a
# second time on top of the previous run.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test_final))
scores = []

for fold, (train_idx, val_idx) in enumerate(sfk.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        n_estimators=8000,
        learning_rate=0.03,
        depth=6,
        eval_metric='AUC',
        cat_features=cat_cols,
        random_seed=42,
        verbose=100
    )
    # The validation fold is also the early-stopping set, so the fold AUC
    # reported below is measured on data that influenced the stopping point.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100
    )

    # Store positive-class probabilities for the held-out rows and score them.
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred
    fold_auc = roc_auc_score(y_val, val_pred)
    scores.append(fold_auc)

    test_preds += model.predict_proba(X_test_final)[:, 1] / n_splits
    print(f"Fold {fold} completed and predictions accumulated!")
    print(f"Fold {fold} validation AUC: {fold_auc:.6f}")

# Summary over all folds: spread of the per-fold AUCs and the AUC of the
# stitched-together out-of-fold predictions.
print(f"Mean fold AUC: {np.mean(scores):.6f} (std {np.std(scores):.6f})")
print(f"OOF AUC: {roc_auc_score(y, oof_preds):.6f}")

0:	test: 0.9352988	best: 0.9352988 (0)	total: 331ms	remaining: 44m 8s
100:	test: 0.9537225	best: 0.9537225 (100)	total: 14.6s	remaining: 19m 4s
200:	test: 0.9545271	best: 0.9545271 (200)	total: 33s	remaining: 21m 20s
300:	test: 0.9547884	best: 0.9547884 (300)	total: 50.4s	remaining: 21m 30s
400:	test: 0.9549551	best: 0.9549551 (400)	total: 1m 8s	remaining: 21m 37s
500:	test: 0.9551313	best: 0.9551313 (500)	total: 1m 27s	remaining: 21m 47s
600:	test: 0.9553162	best: 0.9553162 (600)	total: 1m 45s	remaining: 21m 38s
700:	test: 0.9554404	best: 0.9554404 (700)	total: 2m 4s	remaining: 21m 31s
800:	test: 0.9555267	best: 0.9555267 (800)	total: 2m 21s	remaining: 21m 9s
900:	test: 0.9555945	best: 0.9555945 (900)	total: 2m 41s	remaining: 21m 9s
1000:	test: 0.9556568	best: 0.9556568 (1000)	total: 3m	remaining: 21m 1s
1100:	test: 0.9556990	best: 0.9556991 (1098)	total: 3m 18s	remaining: 20m 42s
1200:	test: 0.9557268	best: 0.9557268 (1200)	total: 3m 37s	remaining: 20m 29s
1300:	test: 0.9557458	best:

In [10]:
# Two-column submission frame: the test ids next to the fold-averaged
# predicted probability stored in `test_preds`.
submission = data_test[['id']].assign(**{'Heart Disease': test_preds})

In [11]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf='submission2.csv', index=False)

In [12]:
# Plain-text preview of the first five submission rows.
print(submission.head(5))

       id  Heart Disease
0  630000       0.953485
1  630001       0.006801
2  630002       0.988663
3  630003       0.003895
4  630004       0.202500
