In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 1. Load Data (Kaggle default paths)
# Both CSVs are read from the same competition input directory.
DATA_DIR = '/kaggle/input/playground-series-s6e2'
print("Loading data...")
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# 2. Preprocess Target
# Encode the text labels as integers so that class 1 means 'Presence'; the
# prediction step further down keeps column 1 of predict_proba.
# Series.map() silently converts any value that is missing from the dict into
# NaN, so validate the result and fail here with a clear message rather than
# letting NaN targets reach model fitting.
LABEL_TO_INT = {'Presence': 1, 'Absence': 0}
encoded_target = train['Heart Disease'].map(LABEL_TO_INT)
unmapped = encoded_target.isna()
if unmapped.any():
    unexpected = train.loc[unmapped, 'Heart Disease'].unique().tolist()
    raise ValueError(
        f"Unexpected 'Heart Disease' labels "
        f"(expected one of {sorted(LABEL_TO_INT)}): {unexpected}"
    )
train['Heart Disease'] = encoded_target

# Features are every column except the row id and the target.
X = train.drop(columns=['id', 'Heart Disease'])
y = train['Heart Disease']
test_features = test.drop(columns=['id'])

# 3. Build the Ensemble Model
# HistGradientBoosting is scikit-learn's histogram-based gradient booster
# (comparable in approach to LightGBM).
hgb = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.05, max_leaf_nodes=31, random_state=42)

# Random Forest is a bagged tree model blended in as a second, independently
# trained learner. n_jobs=-1 builds the 500 trees on all available CPU cores
# instead of one; with random_state fixed the fitted forest is unchanged.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

# Soft voting takes a weighted average of the two models' class probabilities,
# which yields the continuous scores a ranking metric such as ROC AUC needs.
ensemble = VotingClassifier(
    estimators=[('hgb', hgb), ('rf', rf)],
    voting='soft',
    weights=[0.7, 0.3]  # 70% gradient booster / 30% forest; a hand-picked split, not tuned here
)

# Chain scaling and the ensemble so the scaler is fitted on the training data
# only and the same transform is re-applied at prediction time.
# NOTE(review): both ensemble members are tree-based, which normally do not
# need feature scaling; the step is kept so the pipeline layout is unchanged.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ensemble', ensemble)
])

# 4. Train Model
# Fits every pipeline step (scaler, then the voting ensemble) on the full
# training set.
print("Training the Soft-Voting Ensemble model...")
pipeline.fit(X, y)

# 5. Predict Probabilities for Test Set
# predict_proba yields one column per class; column 1 (the probability of
# class 1) is the value that gets submitted.
print("Predicting on unseen test data...")
class_probabilities = pipeline.predict_proba(test_features)
test_probs = class_probabilities[:, 1]

# 6. Create Submission File
# Start from the test ids and attach the predicted probability column.
submission = test[['id']].assign(**{'Heart Disease': test_probs})

submission.to_csv('submission.csv', index=False)
print("Saved submission.csv successfully! Ready for Kaggle submission.")

/kaggle/input/playground-series-s6e2/sample_submission.csv
/kaggle/input/playground-series-s6e2/train.csv
/kaggle/input/playground-series-s6e2/test.csv
Loading data...
Training the Soft-Voting Ensemble model...
Predicting on unseen test data...
Saved submission.csv successfully! Ready for Kaggle submission.
