In [1]:
from google.colab import files
uploaded = files.upload()


Saving train.csv to train.csv


In [2]:
import pandas as pd

train = pd.read_csv('/content/train.csv')
print(train.columns)

Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol',
       'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina',
       'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium',
       'Heart Disease'],
      dtype='object')


In [3]:
import numpy as np
import pandas as pd
!pip install catboost

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [5]:
test  = pd.read_csv('/content/test.csv')

print("Train shape:", train.shape)
print("Test shape :", test.shape)

Train shape: (630000, 15)
Test shape : (270000, 14)


In [6]:
TARGET = 'Heart Disease'

y = train[TARGET]
X = train.drop(columns=[TARGET])
X_test = test.copy()

In [7]:
# Encode target labels
y = y.map({'Absence': 0, 'Presence': 1})

In [8]:
print(y.unique())

[1 0]


In [9]:
# Convert everything to numeric (non-numeric → NaN)
X = X.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

In [10]:
X = X.fillna(X.median())
X_test = X_test.fillna(X_test.median())


In [11]:
print("NaNs in X:", X.isna().sum().sum())
print("NaNs in X_test:", X_test.isna().sum().sum())



NaNs in X: 0
NaNs in X_test: 0


In [12]:
# Fill missing values safely
X = X.fillna(X.median(numeric_only=True))
X_test = X_test.fillna(X_test.median(numeric_only=True))

# Optional scaling (helps boosting stability)
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X)
X_test[X_test.columns] = scaler.transform(X_test)


In [13]:
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


In [14]:
def run_cv(model, X, y, name):
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)

        val_prob = model.predict_proba(X_val)[:, 1]
        test_prob = model.predict_proba(X_test)[:, 1]

        oof_preds[val_idx] = val_prob
        test_preds += test_prob / skf.n_splits

        loss = log_loss(y_val, val_prob)
        print(f"{name} | Fold {fold+1} Log Loss: {loss:.5f}")

    overall = log_loss(y, oof_preds)
    print(f"{name} | Overall Log Loss: {overall:.5f}\n")

    return oof_preds, test_preds


In [15]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

cat = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    verbose=0,
    random_seed=42
)


In [16]:
xgb_oof, xgb_test = run_cv(xgb, X, y, "XGBoost")
lgbm_oof, lgbm_test = run_cv(lgbm, X, y, "LightGBM")
cat_oof, cat_test = run_cv(cat, X, y, "CatBoost")


XGBoost | Fold 1 Log Loss: 0.26813
XGBoost | Fold 2 Log Loss: 0.27093
XGBoost | Fold 3 Log Loss: 0.26870
XGBoost | Fold 4 Log Loss: 0.27007
XGBoost | Fold 5 Log Loss: 0.26767
XGBoost | Overall Log Loss: 0.26910

[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
LightGBM | Fold 1 Log Loss: 0.26812
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053272 seconds.
You can set `fo

In [17]:
ensemble_oof = (xgb_oof + lgbm_oof + cat_oof) / 3
ensemble_test = (xgb_test + lgbm_test + cat_test) / 3

print("Ensemble LogLoss:", log_loss(y, ensemble_oof))

Ensemble LogLoss: 0.2689519623881229


In [18]:
results = pd.DataFrame({
    "Model": ["XGBoost", "LightGBM", "CatBoost", "Ensemble"],
    "LogLoss": [
        log_loss(y, xgb_oof),
        log_loss(y, lgbm_oof),
        log_loss(y, cat_oof),
        log_loss(y, ensemble_oof)
    ]
})

results

Unnamed: 0,Model,LogLoss
0,XGBoost,0.269101
1,LightGBM,0.269259
2,CatBoost,0.269419
3,Ensemble,0.268952


In [19]:
submission = pd.DataFrame({
    "id": test['id'],
    "Heart Disease": ensemble_test
})

submission.to_csv("submission.csv", index=False)
print("submission.csv created ✅")

submission.csv created ✅
