In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Load the competition train/test tables from the Kaggle input directory.
print("Loading data...")
train = pd.read_csv('/kaggle/input/playground-series-s6e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s6e2/test.csv')

# Encode the string target as 0/1. Series.map() converts any label that is
# missing from the mapping into NaN without warning, so validate the result
# here and fail with a clear message instead of letting NaN targets reach
# the cross-validation / fitting code further down.
target_map = {'Presence': 1, 'Absence': 0}
encoded_target = train['Heart Disease'].map(target_map)
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected = train.loc[unmapped_rows, 'Heart Disease'].unique().tolist()
    raise ValueError(
        f"Unexpected 'Heart Disease' labels {unexpected}; "
        f"expected only {sorted(target_map)}"
    )
train['Heart Disease'] = encoded_target

# Separate model inputs from the target; the 'id' column is dropped from both
# the training and the test features so it is never used as a predictor.
X = train.drop(columns=['id', 'Heart Disease'])
y = train['Heart Disease']
test_features = test.drop(columns=['id'])

# Column groups. Their order matters: it fixes the column layout of the
# array produced by the ColumnTransformer below.
numeric_features = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']
categorical_features = ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
                        'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']

# Scale numeric columns; integer-encode categorical columns, mapping any
# category not seen during fit to -1.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            categorical_features,
        ),
    ])

# Positions of the categorical columns in the transformed matrix. The
# transformer outputs are concatenated in the order listed above ('num' then
# 'cat'), so the categorical block starts right after the numeric block.
# Deriving the range from the list lengths keeps it correct if either feature
# list is edited (the previous hard-coded range(5, 13) would silently go stale).
cat_start = len(numeric_features)
cat_indices = list(range(cat_start, cat_start + len(categorical_features)))

# Phase 1b Improvements: Hyperparameter Tuning
# -------------------------------------------------
# Author's note (the search itself is not part of this script): a
# RandomizedSearchCV run favoured stronger L2 regularization, a larger
# min_samples_leaf and fewer leaf nodes to curb overfitting, with a reported
# Out-of-Fold ROC AUC of 0.95515.
hgb_params = dict(
    max_iter=500,
    learning_rate=0.05,
    max_leaf_nodes=20,
    min_samples_leaf=100,
    l2_regularization=1.0,
    categorical_features=cat_indices,  # column positions after preprocessing
    random_state=42,
)
hgb = HistGradientBoostingClassifier(**hgb_params)

# Random Forest as the second, lower-weight ensemble member; depth and split
# size are capped to limit memorization of the training data.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=20,
    n_jobs=-1,
    random_state=42,
)

# Soft voting averages predicted probabilities; HGB carries 85% of the weight
# and the Random Forest the remaining 15%.
base_models = [('hgb', hgb), ('rf', rf)]
vote_weights = [0.85, 0.15]
ensemble = VotingClassifier(estimators=base_models, voting='soft', weights=vote_weights)

# Full model: preprocessing followed by the weighted ensemble.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ensemble', ensemble),
])

# Phase 1: Stratified K-Fold CV
# -------------------------------------------------
# Each row is scored exactly once by a model that did not see it during
# fitting; the collected out-of-fold probabilities give one overall AUC.
print('Starting 5-Fold Stratified CV...')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(train))

for fold_no, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    # Refit the whole pipeline (preprocessing included) on this fold's
    # training rows only.
    pipeline.fit(X.iloc[fit_rows], y.iloc[fit_rows])

    # Column 1 of predict_proba is the probability of the positive class.
    holdout_probs = pipeline.predict_proba(X.iloc[holdout_rows])[:, 1]
    oof_preds[holdout_rows] = holdout_probs

    fold_auc = roc_auc_score(y.iloc[holdout_rows], holdout_probs)
    print(f'Fold {fold_no} ROC AUC: {fold_auc:.5f}')

overall_auc = roc_auc_score(y, oof_preds)
print(f'\nOverall OOF (Out-Of-Fold) ROC AUC: {overall_auc:.5f}')


# Final model: train on all labelled data and write the submission file
# -------------------------------------------------
print("\nTraining on full dataset for final submission...")
pipeline.fit(X, y)

print("Predicting on unseen test data...")
# Keep only the positive-class probability (second predict_proba column).
test_probs = pipeline.predict_proba(test_features)[:, 1]

# Two-column submission: the test ids alongside the predicted probabilities.
submission = test[['id']].copy()
submission['Heart Disease'] = test_probs

submission.to_csv('submission.csv', index=False)
print("Saved submission.csv successfully! Ready for Kaggle submission.")


/kaggle/input/playground-series-s6e2/sample_submission.csv
/kaggle/input/playground-series-s6e2/train.csv
/kaggle/input/playground-series-s6e2/test.csv
Loading data...
Training the Soft-Voting Ensemble model...
Predicting on unseen test data...
Saved submission.csv successfully! Ready for Kaggle submission.
