# In [None]:
# =========================
# Kaggle Playground Series S5E9 - Predict BPM of Songs
# End-to-End Notebook (Kaggle Environment)
# =========================

# 1. Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

# 2. Load Data
# Read the three competition CSVs (train, test, sample submission), in that
# order, from the relative "../input" directory.
train, test, sample_sub = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/playground-series-s5e9/train.csv",
        "../input/playground-series-s5e9/test.csv",
        "../input/playground-series-s5e9/sample_submission.csv",
    )
]

# Report the (rows, columns) shape of each frame as a quick sanity check.
for shape_label, loaded_frame in (
    ("Train shape:", train),
    ("Test shape:", test),
    ("Sample submission shape:", sample_sub),
):
    print(shape_label, loaded_frame.shape)

# 3. Quick EDA
# Show the training columns, then the columns with the highest missing-value
# counts (sorted descending; head() keeps its default number of rows).
train_columns = train.columns.tolist()
print("\nTrain columns:", train_columns)

missing_counts = train.isna().sum()
most_missing = missing_counts.sort_values(ascending=False).head()
print("\nMissing values:\n", most_missing)

# Target distribution: 50-bin histogram with a KDE overlay.
target_ax = sns.histplot(train["BeatsPerMinute"], bins=50, kde=True)
target_ax.set_title("Distribution of Target (BeatsPerMinute)")
plt.show()

# 4. Prepare Features
# Split the training frame into a feature matrix X and a target vector y, and
# give the test frame exactly the same feature columns in the same order.
TARGET = "BeatsPerMinute"
ID_COL = "ID"

y = train[TARGET]

# Row-identifier columns must not be used as features. Match ID_COL
# case-insensitively: an exact "ID" comparison would silently leave an
# identifier spelled "id" / "Id" inside X and the models would train on it.
# NOTE(review): confirm the identifier spelling in the competition CSVs.
id_cols = [col for col in train.columns if str(col).lower() == ID_COL.lower()]

X = train.drop(columns=[TARGET, *id_cols])
features = X.columns.tolist()

# Selecting by `features` removes the test identifier (and any other column
# the models are not trained on), fixes the column order, and fails fast with
# a KeyError if a training feature is missing from the test file.
X_test = test[features].copy()

print("Number of features:", len(features))

# 5. Cross-validation setup
# Five shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# To store predictions: for each base model, one zero-initialised
# out-of-fold vector (one entry per training row) and one zero-initialised
# test-prediction vector (one entry per test row).
n_train_rows = len(X)
n_test_rows = len(X_test)

oof_lgb = np.zeros(n_train_rows)
oof_xgb = np.zeros(n_train_rows)
oof_cat = np.zeros(n_train_rows)

preds_lgb = np.zeros(n_test_rows)
preds_xgb = np.zeros(n_test_rows)
preds_cat = np.zeros(n_test_rows)

# 6. LightGBM
# Train one LGBMRegressor per fold with early stopping on the held-out fold,
# fill the out-of-fold predictions, and average the per-fold test predictions.
print("\nTraining LightGBM...")
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(
        n_estimators=10000,  # upper bound; early stopping decides the real count
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
        # enabled with a non-zero `subsample_freq` (bagging_freq).
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
    )
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
    )

    oof_lgb[val_idx] = model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test prediction.
    preds_lgb += model.predict(X_test) / kf.n_splits

    fold_rmse = np.sqrt(mean_squared_error(y_val, oof_lgb[val_idx]))
    print("  Fold", fold + 1, "RMSE:", fold_rmse)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("LightGBM CV RMSE:", np.sqrt(mean_squared_error(y, oof_lgb)))

# 7. XGBoost
# Train one XGBRegressor per fold with early stopping on the held-out fold,
# fill the out-of-fold predictions, and average the per-fold test predictions.
print("\nTraining XGBoost...")
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = xgb.XGBRegressor(
        n_estimators=10000,  # upper bound; early stopping decides the real count
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
        # Passing `eval_metric` / `early_stopping_rounds` to fit() was
        # deprecated in XGBoost 1.6 and removed in 2.0 (TypeError); the
        # estimator constructor is the supported place for both.
        eval_metric="rmse",
        early_stopping_rounds=100,
    )
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    oof_xgb[val_idx] = model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test prediction.
    preds_xgb += model.predict(X_test) / kf.n_splits

    fold_rmse = np.sqrt(mean_squared_error(y_val, oof_xgb[val_idx]))
    print("  Fold", fold + 1, "RMSE:", fold_rmse)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("XGBoost CV RMSE:", np.sqrt(mean_squared_error(y, oof_xgb)))

# 8. CatBoost
# Train one CatBoostRegressor per fold with early stopping on the held-out
# fold, fill the out-of-fold predictions, and average the per-fold test
# predictions.
print("\nTraining CatBoost...")
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        iterations=10000,  # upper bound; early stopping decides the real count
        learning_rate=0.05,
        depth=8,
        random_seed=42,
        loss_function="RMSE",
        verbose=False,
    )
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100)

    oof_cat[val_idx] = model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test prediction.
    preds_cat += model.predict(X_test) / kf.n_splits

    fold_rmse = np.sqrt(mean_squared_error(y_val, oof_cat[val_idx]))
    print("  Fold", fold + 1, "RMSE:", fold_rmse)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("CatBoost CV RMSE:", np.sqrt(mean_squared_error(y, oof_cat)))

# 9. Stacking with Ridge
# Level-2 design matrices: one column per base model (LightGBM, XGBoost,
# CatBoost), rows are training samples (OOF) or test samples (fold average).
stacked_oof = np.vstack([oof_lgb, oof_xgb, oof_cat]).T
stacked_preds = np.vstack([preds_lgb, preds_xgb, preds_cat]).T

# Score the meta-model on rows it was NOT fitted on. Fitting Ridge on all of
# `stacked_oof` and predicting the same rows reports an in-sample RMSE, which
# is optimistically biased. Re-using `kf` keeps the meta folds on the same
# seeded splitter as the base models.
final_oof = np.zeros(len(y))
for meta_trn_idx, meta_val_idx in kf.split(stacked_oof):
    fold_meta = Ridge(alpha=1.0)
    fold_meta.fit(stacked_oof[meta_trn_idx], y.iloc[meta_trn_idx])
    final_oof[meta_val_idx] = fold_meta.predict(stacked_oof[meta_val_idx])

# The model used for the submission is still fitted on every OOF row.
meta_model = Ridge(alpha=1.0)
meta_model.fit(stacked_oof, y)
final_preds = meta_model.predict(stacked_preds)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("\nFinal Stacked RMSE:", np.sqrt(mean_squared_error(y, final_oof)))

# 10. Save Submission
# Build the submission from a copy of the sample-submission frame with the
# "BeatsPerMinute" column set to the stacked test predictions, then write it
# to CSV without the index.
submission = sample_sub.assign(BeatsPerMinute=final_preds)
submission.to_csv("submission.csv", index=False)

print("\nSubmission file saved as submission.csv")
# Last expression of the cell: displays the first rows in a notebook.
submission.head()
