SONG BEATS PER MINUTE PREDICTION (Regression Pipeline)  
Author: Mohit Kumar  
Date: 2025-09-09

In [1]:
# --- 1. IMPORTS ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

ModuleNotFoundError: No module named 'pandas'

In [None]:
# --- 2. DATA LOADING & SUMMARY ---
df = pd.read_csv("train.csv")
print("Shape :", df.shape)
print("\nMissing values:\n", df.isnull().sum())
print("\nDescriptive Stats:\n", df.describe())

Observation: No missing values found, and 524,164 rows x 11 columns loaded.

In [None]:
# --- 3. EXPLORATORY DATA ANALYSIS (EDA) ---

# 3.1: Target distribution
plt.figure(figsize=(8,4))
sns.histplot(df['BeatsPerMinute'], bins=50, kde=True)
plt.title('Distribution of BeatsPerMinute')
plt.xlabel('BPM')
plt.show()

Observation: BPM distributes mainly between 80 and 160, with a few high outliers.

In [None]:
# 3.2: Feature distributions (first 5 features)
numeric_cols = df.select_dtypes(include=['float64', 'int']).columns.drop(['id'])
for col in numeric_cols[:5]:
    plt.figure(figsize=(6, 2))
    sns.histplot(df[col], bins=40, kde=True)
    plt.title(col)
    plt.show()

In [None]:
# 3.3: Correlation heatmap
corr = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr[['BeatsPerMinute']].sort_values(by='BeatsPerMinute', ascending=False), annot=True, cmap='coolwarm')
plt.title('Correlation with BeatsPerMinute')
plt.show()

Observation: Energy and RhythmScore appear most correlated with BPM.

In [None]:
# --- 4. FEATURE/TARGET SEPARATION & TRAIN-VALIDATION SPLIT ---
x = df.drop(['id', 'BeatsPerMinute'], axis=1)
y = df['BeatsPerMinute']
print("Features shape:", x.shape)
print("Target shape:", y.shape)

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
print("X_train shape:", x_train.shape)
print("X_val shape:", x_val.shape)

In [None]:
# --- FEATURE SCALING ---
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
print("Scaled Training shape:", x_train_scaled.shape)
print("Scaled Validation shape:", x_val_scaled.shape)

In [None]:
# --- 5. MODEL TRAINING & EVALUATION ---

# 5.1 Linear Regression
lr_model = LinearRegression()
lr_model.fit(x_train_scaled, y_train)
y_val_pred = lr_model.predict(x_val_scaled)
mae_lr = mean_absolute_error(y_val, y_val_pred)
rmse_lr = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"[Linear Regression] Validation MAE: {mae_lr:.4f}")
print(f"[Linear Regression] Validation RMSE: {rmse_lr:.4f}")

In [None]:
# 5.2 Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1)
rf_model.fit(x_train_scaled, y_train)
y_val_pred_rf = rf_model.predict(x_val_scaled)
mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
print(f"[Random Forest] Validation MAE: {mae_rf:.4f}")
print(f"[Random Forest] Validation RMSE: {rmse_rf:.4f}")

In [None]:
# 5.3 Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
gb_model.fit(x_train_scaled, y_train)
y_val_pred_gb = gb_model.predict(x_val_scaled)
mae_gb = mean_absolute_error(y_val, y_val_pred_gb)
rmse_gb = np.sqrt(mean_squared_error(y_val, y_val_pred_gb))
print(f"[Gradient Boosting] Validation MAE: {mae_gb:.4f}")
print(f"[Gradient Boosting] Validation RMSE: {rmse_gb:.4f}")

In [None]:
# --- 6. MODEL COMPARISON SUMMARY ---
print("\n**MODEL COMPARISON TABLE**")
print(f"Linear Regression   MAE: {mae_lr:.4f}  RMSE: {rmse_lr:.4f}")
print(f"Random Forest       MAE: {mae_rf:.4f}  RMSE: {rmse_rf:.4f}")
print(f"Gradient Boosting   MAE: {mae_gb:.4f}  RMSE: {rmse_gb:.4f}")
print("\nObservation: Linear Regression and Gradient Boosting performed almost identically and best, suggesting data is quite linear or more advanced models need heavier tuning.")

In [None]:
# --- 7. RETRAIN FINAL (BEST) MODEL ON ALL TRAINING DATA & PREPARE SUBMISSION ---
# Using Gradient Boosting here based on validation performance

x_full = x
y_full = y
x_full_scaled = scaler.fit_transform(x_full)
gb_model_final = GradientBoostingRegressor(n_estimators=50, max_depth=5, random_state=42)
gb_model_final.fit(x_full_scaled, y_full)

In [None]:
# --- 8. PREDICT ON TEST DATA ---
test_df = pd.read_csv("test.csv")
test_ids = test_df['id']
x_test = test_df.drop(['id'], axis=1)
x_test_scaled = scaler.transform(x_test)
test_preds = gb_model_final.predict(x_test_scaled)

submission = pd.DataFrame({
    'ID': test_ids,
    'BeatsPerMinute': test_preds
})
submission.to_csv('submission.csv', index=False)
print("\n[INFO] submission.csv created successfully!")

 --- 9. PROJECT REFLECTION / CONCLUSION ---
1. In this project, I explored a large music dataset and built regression models to predict BeatsPerMinute.
2. Visualization of distributions and correlations helped in understanding the data.
3. Three models (Linear Regression, Random Forest, Gradient Boosting) were compared for accuracy.
3. Gradient Boosting and Linear Regression achieved the best MAE/RMSE scores, showing the data's relationships are mostly linear.
4. For larger datasets or faster training, I would consider GPU-accelerated models (XGBoost/LightGBM).