# Preparation

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Identify data

In [3]:
df_train = pd.read_csv("/kaggle/input/playground-series-s5e9/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s5e9/test.csv")

In [4]:
df_train.columns = df_train.columns.str.replace(' ', '_').str.lower()
df_test.columns = df_test.columns.str.replace(' ', '_').str.lower()

In [5]:
df_train.head(3)

Unnamed: 0,id,rhythmscore,audioloudness,vocalcontent,acousticquality,instrumentalscore,liveperformancelikelihood,moodscore,trackdurationms,energy,beatsperminute
0,0,0.60361,-7.636942,0.0235,5e-06,1e-06,0.051385,0.409866,290715.645,0.826267,147.5302
1,1,0.639451,-16.267598,0.07152,0.444929,0.349414,0.170522,0.65101,164519.5174,0.1454,136.15963
2,2,0.514538,-15.953575,0.110715,0.173699,0.453814,0.029576,0.423865,174495.5667,0.624667,55.31989


In [6]:
df_train.isnull().sum(), df_test.isnull().sum()

(id                           0
 rhythmscore                  0
 audioloudness                0
 vocalcontent                 0
 acousticquality              0
 instrumentalscore            0
 liveperformancelikelihood    0
 moodscore                    0
 trackdurationms              0
 energy                       0
 beatsperminute               0
 dtype: int64,
 id                           0
 rhythmscore                  0
 audioloudness                0
 vocalcontent                 0
 acousticquality              0
 instrumentalscore            0
 liveperformancelikelihood    0
 moodscore                    0
 trackdurationms              0
 energy                       0
 dtype: int64)

In [7]:
X = df_train.drop('beatsperminute', axis=1)
y = df_train['beatsperminute']

# Modelling process

In [9]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [10]:
import xgboost
from xgboost import XGBRegressor

In [11]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical columns
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_features)
])

# Create pipeline with XGBoost Regressor
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Prediction

In [13]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [14]:
# Generate predictions
y_pred = pipeline.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print stats
print(f"Model RMSE: {rmse:.4f}")

Model RMSE: 26.4519


In [15]:
y_test_pred = pipeline.predict(df_test)

In [16]:
submission_df = pd.read_csv('/kaggle/input/playground-series-s5e9/sample_submission.csv')
submission_df.to_csv('submission.csv', index=False)

In [17]:
print(submission_df.head())
print("Successfully saved as CSV file")

       id  BeatsPerMinute
0  524164         119.035
1  524165         119.035
2  524166         119.035
3  524167         119.035
4  524168         119.035
Successfully saved as CSV file
