# Modelling - Billboard Hot 100 & Spotify Track Data

## 1.0 Import Data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [3]:
# Load the preprocessed, merged table from disk; the first CSV column becomes the index.
data = pd.read_csv(
    filepath_or_buffer='../data/processed/prepro-merged',
    index_col=0,
)

# Preview the first five rows to sanity-check the columns that were loaded.
data.head(5)

Unnamed: 0,BB_Title,BB_Artist,PeakPos,danceability,energy,key,loudness,mode,speechiness,acousticness,...,tm_42,tm_43,tm_44,tm_45,tm_46,tm_47,tm_48,tm_49,tm_50,tm_51
0,Stay,The Kid LAROI & Justin Bieber,1,0.591,0.764,1.0,-5.484,1.0,0.0483,0.0383,...,0.0,0.0,0.0,0.0,3.0,-1.0,0.0,0.0,0.0,0.0
1,Bad Habits,Ed Sheeran,2,0.808,0.897,11.0,-3.712,0.0,0.0348,0.0469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Good 4 U,Olivia Rodrigo,1,0.563,0.664,9.0,-5.044,1.0,0.154,0.335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Rumors,Lizzo Featuring Cardi B,4,0.827,0.731,4.0,-5.524,0.0,0.088,0.13,...,0.0,0.0,2.0,0.0,4.0,-2.0,0.0,0.0,0.0,0.0
4,Kiss Me More,Doja Cat Featuring SZA,3,0.762,0.701,8.0,-3.541,1.0,0.0286,0.235,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Target column to predict.
y = data['PeakPos']

# Features: every column except the target and the free-text identifiers.
# 'Unnamed: 0' shows up in data.head() as a plain 0,1,2,... counter; it appears to be
# a leftover row index from an earlier CSV save rather than a property of the track,
# so it is removed to keep the model from learning row order.
# errors='ignore' applies only to this optional column, so a missing target or
# identifier column still raises.
X = (
    data
    .drop(columns=['PeakPos', 'BB_Title', 'BB_Artist'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 2.0 Refining the Model

In [6]:
# Two-step pipeline: standardise the features, then fit a random forest regressor.
# The forest's seed is fixed so repeated runs build the same trees.
RFpipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))

In [7]:
# Fit the whole pipeline (scaler + forest) on the training split only.
RFpipe.fit(X=X_train, y=y_train)

# Predict on both splits so train and test metrics can be compared side by side.
y_tr_pred, y_te_pred = RFpipe.predict(X_train), RFpipe.predict(X_test)

In [8]:
# R^2 as a (train, test) pair.
(
    r2_score(y_true=y_train, y_pred=y_tr_pred),
    r2_score(y_true=y_test, y_pred=y_te_pred),
)

(0.8705168732321016, 0.0773500832378754)

In [9]:
# Mean absolute error as a (train, test) pair, in the same units as PeakPos.
(
    mean_absolute_error(y_true=y_train, y_pred=y_tr_pred),
    mean_absolute_error(y_true=y_test, y_pred=y_te_pred),
)

(8.911822690638562, 24.198828633405636)

In [10]:
# Mean squared error as a (train, test) pair.
(
    mean_squared_error(y_true=y_train, y_pred=y_tr_pred),
    mean_squared_error(y_true=y_test, y_pred=y_te_pred),
)

(113.7607755114693, 805.8527851048444)

In [11]:
# 5-fold cross-validation of the full pipeline on the training split.
# No `scoring` argument is given, so the estimator's default scorer is used
# (R^2 for a regressor).
cv_results = cross_validate(estimator=RFpipe, X=X_train, y=y_train, cv=5)

# One validation score per fold.
cv_scores = cv_results['test_score']
cv_scores

array([0.06792352, 0.0417402 , 0.07250459, 0.05910489, 0.06425384])