# 🎵 Kaggle Playground Series – Season 5, Episode 9  
### Predicting Song BPM  

This notebook explores the **Kaggle Playground Series – Season 5, Episode 9** competition, where the goal is to **predict the beats-per-minute (BPM) of songs** from a set of tabular song features. The competition data (train and test) is synthetic: it was generated by a deep learning model trained on the original BPM Prediction Challenge dataset.  

---

## 📑 Notebook Outline  
1. **Exploratory Data Analysis (EDA):** understand the dataset, feature distributions, and relationships with the target variable.  
2. **Data Visualization:** highlight trends, correlations, and patterns that may guide feature engineering.  
3. **Baseline Modeling:** start with simple regression models and move towards tree-based models.  
4. **Model Tuning & Ensembling:** optimize models and combine predictions for improved performance.  
5. **Submission Preparation:** generate predictions for the test set and create a submission file.  

---

In [None]:
# Authenticate with Kaggle so the competition files can be downloaded.
# NOTE(review): kagglehub.login() presumably prompts for credentials interactively,
# which would block an unattended "Run All" — confirm for non-interactive runs.
import kagglehub
kagglehub.login()

In [None]:
# Download the competition files; the returned location is used later to
# build the paths of train.csv and test.csv.
data_path = kagglehub.competition_download('playground-series-s5e9')

print('Data source import complete.')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings from any library are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training and test tables from the downloaded competition folder.
data, test_data = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# EDA

In [None]:
# (rows, columns) of the training table.
data.shape

In [None]:
# First rows of the training table.
data.head()

In [None]:
# Last rows of the training table.
data.tail()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Per-column count and percentage of missing values, most affected columns first.
null_mask = data.isna()
missing_table = pd.DataFrame({
    'Missing Values': null_mask.sum(),
    'Percentage (%)': (null_mask.mean() * 100).round(2),
})

print(missing_table.sort_values(by='Missing Values', ascending=False))

In [None]:
# Number of distinct values in each column.
data.nunique()

# Data visualisation

Target variable distribution

In [None]:
# Histogram with a KDE overlay of the target column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['BeatsPerMinute'], kde=True, bins=30, ax=ax)
ax.set_title("Distribution of target")
plt.show()

In [None]:
# Numeric columns (target included) without the row identifier.
numeric_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)
numeric_cols.remove('id')

# One histogram per numeric column on a shared figure.
numeric_frame = data[numeric_cols]
numeric_frame.hist(bins=30, figsize=(20, 15), edgecolor='black')
plt.suptitle("Histograms of Numeric Features", fontsize=18)
plt.show()

In [None]:
plt.figure(figsize=(16, 6))

# Pairwise correlations of the numeric columns. 'id' is a row identifier,
# not a feature, so it is left out of the matrix.
data_corr = data.drop(columns='id').corr(numeric_only=True)

# Plot the correlation matrix itself. Calling .corr() on it a second time would
# show correlations between correlation profiles instead of between features.
heatmap = sns.heatmap(data_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12})

plt.show()

Top 5 correlated features with target

In [None]:
# Correlation of every numeric column with the target, target itself excluded.
target_corr = data[numeric_cols].corr()['BeatsPerMinute']
correlations = target_corr.drop('BeatsPerMinute')

# Rank by absolute strength and keep the five strongest feature names.
ranked_strength = correlations.abs().sort_values(ascending=False)
top_features = ranked_strength.head(5).index.tolist()

print("Top 5 features correlated with target:")
print(correlations[top_features])

Scatter plots for top 5 correlated features with target

In [None]:
# One scatter plot per top feature against the target, titled with its correlation.
for feat_name in top_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=data, x=feat_name, y='BeatsPerMinute', ax=ax)
    ax.set_title(f"{feat_name} vs target (corr={correlations[feat_name]:.4f})")
    plt.show()

Boxplots for outlier detection

In [None]:
# Numeric columns (target included) without the row identifier.
numeric_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)
numeric_cols.remove('id')

# One boxplot per numeric column.
for column_name in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=data[column_name], ax=ax)
    ax.set_title(f"Boxplot for {column_name}")
    plt.show()

# Data preprocessing

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.pipeline import Pipeline

In [None]:
# Target vector and feature matrix (identifier and target removed).
y = data['BeatsPerMinute']
X = data.drop(columns=['BeatsPerMinute', 'id'])

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Split the identifier column off the test features. The guard makes the cell
# safe to re-run: once 'id' has been removed, test_id keeps its earlier value
# instead of the cell failing with a KeyError.
if 'id' in test_data.columns:
    test_id = test_data['id'].copy()
    test_data = test_data.drop(columns='id')

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both tables, keeping column names and row index.
scaler = RobustScaler()
scaler.fit(X)

X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
test_scaled = pd.DataFrame(
    scaler.transform(test_data),
    columns=test_data.columns,
    index=test_data.index,
)

# Model building

In [None]:
! pip install optuna

In [None]:
import lightgbm as lgb
from sklearn.metrics import root_mean_squared_error
import optuna

In [None]:
# params = {
#     "objective": "regression",
#     "metric": "rmse",
#     "boosting_type": "gbdt",
#     "n_estimators": 5000,
#     "learning_rate": 0.01,
#     "num_leaves": 31,
#     "max_depth": -1,
#     "feature_fraction": 0.8,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 5,
#     "lambda_l1": 0.1,
#     "lambda_l2": 0.1,
#     "min_child_samples": 20,
#     "verbose": -1,
#     "random_state": 42
# }

In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold CV RMSE of an early-stopped LightGBM regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tree-structure, subsampling and
        regularisation hyperparameters.

    Returns
    -------
    float
        Mean validation RMSE over the folds (lower is better).

    Notes
    -----
    Reads the notebook-level ``X_scaled`` and ``y``. The mean early-stopped
    number of boosting rounds is stored on the trial under the
    ``"best_iteration"`` user attribute, so the round count that produced the
    reported score is not lost once the trial finishes.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        # Upper bound only; early stopping decides the effective number of rounds.
        "n_estimators": 5000,
        "learning_rate": 0.01,
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        # LightGBM treats max_depth <= 0 as "no depth limit".
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 1.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 1.0),
        "verbose": -1,
        "random_state": 42,
        "num_threads": -1,
    }

    # Fixed seed: every trial is scored on the same three splits.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    rmse_scores = []
    best_iterations = []

    for train_idx, valid_idx in kf.split(X_scaled, y):
        X_train, X_valid = X_scaled.iloc[train_idx], X_scaled.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric="rmse",
            # verbose=False keeps the per-fold early-stopping messages out of
            # the output of a multi-trial search.
            callbacks=[lgb.early_stopping(300, verbose=False)],
        )

        # Fall back to the round cap if no best iteration was recorded.
        best_iter = model.best_iteration_ or params["n_estimators"]
        best_iterations.append(best_iter)

        preds = model.predict(X_valid, num_iteration=best_iter)
        rmse_scores.append(root_mean_squared_error(y_valid, preds))

    # Keep the tuned number of rounds alongside the sampled parameters.
    trial.set_user_attr("best_iteration", int(round(np.mean(best_iterations))))

    return np.mean(rmse_scores)

In [None]:
# Seed the sampler so the hyperparameter search suggests the same trial
# sequence on every Restart & Run All.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

print("Best params:", study.best_params)
print("Best RMSE:", study.best_value)

In [None]:
# Rebuild the full configuration used during tuning: the sampled values plus
# the fixed settings that study.best_params does not contain.
best_trial = study.best_trial
best_params = dict(best_trial.params)
best_params.update({
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
    # Use a recorded early-stopped round count if the trial carries one;
    # otherwise keep the 5000-round cap used during tuning.
    "n_estimators": int(best_trial.user_attrs.get("best_iteration", 5000)),
    "verbose": -1,
    "random_state": 42,
    "num_threads": -1,
})

lgb_model = lgb.LGBMRegressor(**best_params)
# Fit on the scaled features: the search was run on X_scaled and predictions
# are made on test_scaled, so training on raw X would put the split thresholds
# on a different scale than the inference data.
lgb_model.fit(X_scaled, y)

In [None]:
# Predict BPM for the scaled test features.
# NOTE(review): lgb_model is fit without an eval_set or early-stopping callback,
# so best_iteration_ presumably holds no early-stopped value here and all trees
# are used — confirm against the installed LightGBM version.
test_preds = lgb_model.predict(test_scaled, num_iteration=lgb_model.best_iteration_)

In [None]:
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# oof_preds = np.zeros(len(X))
# test_preds = np.zeros(len(test_scaled))

# for fold, (train_idx, valid_idx) in enumerate(kf.split(X_scaled, y)):
#     print(f"Training fold {fold+1}...")
#     X_train, X_valid = X_scaled.iloc[train_idx], X_scaled.iloc[valid_idx]
#     y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

#     model = lgb.LGBMRegressor(**params)
#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_valid, y_valid)],
#         eval_metric="rmse",
#         callbacks=[
#         lgb.early_stopping(stopping_rounds=300)]
#         )

#     oof_preds[valid_idx] = model.predict(X_valid, num_iteration=model.best_iteration_)
#     test_preds += model.predict(test_scaled, num_iteration=model.best_iteration_) / kf.n_splits

# rmse = root_mean_squared_error(y, oof_preds)
# print(f"OOF RMSE: {rmse:.5f}")

# Submission

In [None]:
# Submission frame: one row per test id. The prediction column carries the
# target's name ('BeatsPerMinute', as in the training data) rather than a
# generic 'y', which the competition scorer would not recognise.
submission = pd.DataFrame({
    'id': test_id,
    'BeatsPerMinute': test_preds,
})

In [None]:
# Display the submission frame as the cell output for a visual sanity check.
submission

In [None]:
# Write the submission to the working directory, without the pandas index column.
submission.to_csv('submission.csv', index=False)