# Preprocessing & Training - Billboard Hot 100 & Spotify Track Data

## 1.0 Import Data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
# Locations of the two processed tables read by this notebook.
songs_path = '../data/processed/songs_eda'
genres_path = '../data/processed/genres_eda'

# Unique song data; the first CSV column becomes the row index.
songs = pd.read_csv(songs_path, index_col=0)
# Genres by song (one row per song/genre pair); first CSV column is the row index.
genres = pd.read_csv(genres_path, index_col=0)

### 1.1 Unique Song Data

In [3]:
# Preview the first five rows of the song table.
songs.head(n=5)

Unnamed: 0,Date,BB_Title,BB_Artist,PeakPos,Weeks,SP_Title,SP_Artist,SP_id,Genres,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2021-08-28,Stay,The Kid LAROI & Justin Bieber,1,6,STAY (with Justin Bieber),The Kid LAROI,5HCyWlXZPP0y6Gqq8TgA20,australian hip hop,0.591,...,-5.484,1.0,0.0483,0.0383,0.0,0.103,0.478,169.928,141806.0,4.0
1,2021-08-28,Bad Habits,Ed Sheeran,2,8,Bad Habits,Ed Sheeran,6PQ88X9TkUIAUIZJHW2upE,"pop,uk pop",0.808,...,-3.712,0.0,0.0348,0.0469,3.1e-05,0.364,0.591,126.026,231041.0,4.0
2,2021-08-28,Good 4 U,Olivia Rodrigo,1,14,good 4 u,Olivia Rodrigo,4ZtFanR9U6ndgddUvNcjcG,pop,0.563,...,-5.044,1.0,0.154,0.335,0.0,0.0849,0.688,166.928,178147.0,4.0
3,2021-08-28,Rumors,Lizzo Featuring Cardi B,4,1,Rumors (feat. Cardi B),Lizzo,6KgtcmCF9Ky68XC7ezxl3s,"dance pop,escape room,minnesota hip hop,pop,tr...",0.827,...,-5.524,0.0,0.088,0.13,0.0,0.418,0.607,118.992,172833.0,4.0
4,2021-08-28,Kiss Me More,Doja Cat Featuring SZA,3,19,Kiss Me More (feat. SZA),Doja Cat,748mdHapucXQri7IAO8yFK,"dance pop,pop",0.762,...,-3.541,1.0,0.0286,0.235,0.000158,0.123,0.742,110.968,208867.0,4.0


In [4]:
# Print the column dtypes and non-null counts of the song table.
songs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4609 entries, 0 to 4608
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              4609 non-null   object 
 1   BB_Title          4609 non-null   object 
 2   BB_Artist         4609 non-null   object 
 3   PeakPos           4609 non-null   int64  
 4   Weeks             4609 non-null   int64  
 5   SP_Title          4609 non-null   object 
 6   SP_Artist         4609 non-null   object 
 7   SP_id             4609 non-null   object 
 8   Genres            4571 non-null   object 
 9   danceability      4609 non-null   float64
 10  energy            4609 non-null   float64
 11  key               4609 non-null   float64
 12  loudness          4609 non-null   float64
 13  mode              4609 non-null   float64
 14  speechiness       4609 non-null   float64
 15  acousticness      4609 non-null   float64
 16  instrumentalness  4609 non-null   float64


#### 1.1.2 Song Attributes Only

In [5]:
# Columns removed here: chart date, weeks count, Spotify title/artist/id and the
# raw genre string. What remains is BB_Title, BB_Artist, PeakPos and the
# per-track attribute columns.
non_attribute_cols = ['Date', 'Weeks', 'SP_Title', 'SP_Artist', 'SP_id', 'Genres']
song_atts = songs.drop(columns=non_attribute_cols)

song_atts.head(n=5)

Unnamed: 0,BB_Title,BB_Artist,PeakPos,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Stay,The Kid LAROI & Justin Bieber,1,0.591,0.764,1.0,-5.484,1.0,0.0483,0.0383,0.0,0.103,0.478,169.928,141806.0,4.0
1,Bad Habits,Ed Sheeran,2,0.808,0.897,11.0,-3.712,0.0,0.0348,0.0469,3.1e-05,0.364,0.591,126.026,231041.0,4.0
2,Good 4 U,Olivia Rodrigo,1,0.563,0.664,9.0,-5.044,1.0,0.154,0.335,0.0,0.0849,0.688,166.928,178147.0,4.0
3,Rumors,Lizzo Featuring Cardi B,4,0.827,0.731,4.0,-5.524,0.0,0.088,0.13,0.0,0.418,0.607,118.992,172833.0,4.0
4,Kiss Me More,Doja Cat Featuring SZA,3,0.762,0.701,8.0,-3.541,1.0,0.0286,0.235,0.000158,0.123,0.742,110.968,208867.0,4.0


### 1.2 Genres Data

In [6]:
# Preview the first five rows of the song/genre table.
genres.head(n=5)

Unnamed: 0,BB_Title,BB_Artist,Genres
0,Stay,The Kid LAROI & Justin Bieber,australian hip hop
1,Bad Habits,Ed Sheeran,pop
2,Bad Habits,Ed Sheeran,uk pop
3,Good 4 U,Olivia Rodrigo,pop
4,Rumors,Lizzo Featuring Cardi B,dance pop


#### 1.2.2 Genres Data, Categorical Data Converted to Numeric

In [7]:
# Number of rows carrying each genre label, most frequent label first.
genre_labels = genres['Genres']
counts = genre_labels.value_counts()
counts

pop                1661
rap                1540
dance pop          1064
trap                973
hip hop             955
                   ... 
sheffield indie       1
modern salsa          1
cedm                  1
classic j-pop         1
chill r&b             1
Name: Genres, Length: 444, dtype: int64

In [8]:
# Genre labels that occur fewer than 50 times in the genres table.
rare_genres = counts.loc[counts.lt(50)].index
# Boolean row mask: True where the row's genre is one of the rare labels.
mask = genres['Genres'].isin(rare_genres)

In [9]:
# Collapse every rare genre label (rows selected by `mask`) into a single
# 'Other' bucket. A single .loc call is used instead of the chained
# genres['Genres'][mask] = ... form: chained assignment raises
# SettingWithCopyWarning and does not modify `genres` at all when pandas
# Copy-on-Write is enabled.
genres.loc[mask, 'Genres'] = 'Other'
# Optional sanity check: genres.Genres.value_counts()

In [10]:
# Split the Genres column into one indicator column per genre with
# get_dummies(), then collapse to one row per (BB_Title, BB_Artist) by summing
# the indicators, so each genre column holds how many of that song's rows
# carried the label.
#
# Only the two key columns are kept next to the indicators. The raw 'Genres'
# strings must not reach .sum(): older pandas silently dropped that column, but
# pandas >= 2.0 (numeric_only=False by default) would concatenate the strings
# into a meaningless extra column.
song_keys = genres[['BB_Title', 'BB_Artist']]
genre_indicators = pd.get_dummies(genres['Genres'])
genre_dummies = pd.concat([song_keys, genre_indicators], axis=1)\
                .groupby(by=['BB_Title','BB_Artist'],as_index=False).sum()

genre_dummies.head()

Unnamed: 0,BB_Title,BB_Artist,Other,atl hip hop,atl trap,boy band,canadian contemporary r&b,canadian hip hop,canadian pop,chicago rap,...,rock,southern hip hop,talent show,toronto rap,trap,trap latino,tropical house,uk pop,urban contemporary,viral pop
0,#Beautiful,Mariah Carey Featuring Miguel,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,#SELFIE,The Chainsmokers,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,#thatPOWER,will.i.am Featuring Justin Bieber,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,$ave Dat Money,Lil Dicky Featuring Fetty Wap & Rich Homie Quan,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,'Tis The Damn Season,Taylor Swift,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# TODO (mentor discussion): should the genres data be merged with the songs data?

## 2.0 Train/Test Split

In [12]:
# PeakPos is the target; all remaining song_atts columns form X.
features = song_atts.drop(columns='PeakPos')
target = song_atts['PeakPos']

# Hold out 30% of the rows for testing; random_state is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=47
)

In [13]:
# Set the song identifiers aside (they keep the same row index as each split)
# and remove them from the feature frames.
names_list = ['BB_Title', 'BB_Artist']
names_train = X_train[names_list]
names_test = X_test[names_list]
# Reassign instead of drop(..., inplace=True): X_train / X_test are frames
# derived from song_atts by train_test_split, and mutating such derived frames
# in place is discouraged by pandas (it can raise SettingWithCopyWarning).
X_train = X_train.drop(columns=names_list)
X_test = X_test.drop(columns=names_list)
X_train.shape, X_test.shape

((3226, 13), (1383, 13))

## 3.0 Preprocessing

In [None]:
# Open question: how much of this belongs in Modelling instead?
# Candidates: scale the data (again), impute missing values, compute initial metrics.