In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import train_test_split
from scipy import stats

In [3]:
# Loads the cleaned dataset from disk and displays it
cleaned_csv_path = '../Datasets/cleaned_data.csv'
df = pd.read_csv(cleaned_csv_path)
df

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,65,0.08300,0.724,246333,0.689,0.000000,D,0.3040,-5.922,Minor,0.1350,146.496,4/4,0.6930
1,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,63,0.32300,0.685,186467,0.610,0.000000,C,0.1020,-5.221,Minor,0.0439,94.384,3/4,0.3230
2,R&B,Yung Bleu,Ice On My Baby (feat. Kevin Gates) - Remix,6muW8cSjJ3rusKJ0vH5olw,62,0.06750,0.762,199520,0.520,0.000004,F,0.1140,-5.237,Minor,0.0959,75.047,4/4,0.0862
3,R&B,Surfaces,Heaven Falls / Fall on Me,7yHqOZfsXYlicyoMt62yC6,61,0.36000,0.563,240597,0.366,0.002430,B,0.0955,-6.896,Minor,0.1210,85.352,4/4,0.7680
4,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,68,0.59600,0.653,213947,0.621,0.000000,B,0.0811,-5.721,Minor,0.0409,100.006,4/4,0.4660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224914,Soul,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.00384,0.687,326240,0.714,0.544000,D,0.0845,-10.626,Major,0.0316,115.542,4/4,0.9620
224915,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.03290,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.9690
224916,Soul,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,0.90100,0.517,166960,0.419,0.000000,D,0.0945,-8.282,Major,0.1480,84.135,4/4,0.8130
224917,Soul,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,0.26200,0.745,222442,0.704,0.000000,A,0.3330,-7.137,Major,0.1460,100.031,4/4,0.4890


In [745]:
# Lists the dtype of every column in df
df.dtypes

genre                object
artist_name          object
track_name           object
track_id             object
popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
dtype: object

In [746]:
# Prints the distinct values found in each categorical column
for column in ('mode', 'key', 'time_signature'):
    print(df[column].unique())

['Minor' 'Major']
['D' 'C' 'F' 'B' 'E' 'G' 'G#' 'A#' 'C#' 'A' 'F#' 'D#']
['4/4' '3/4' '5/4' '1/4' '0/4']


In [747]:
# Ordinal-encodes the 3 categorical columns and inserts each encoded column directly after its source column in df
mode = {'Major': 1, 'Minor': 0}
# Keys are numbered chromatically, from C = 0 up to B = 11
key = {'A' : 9, 'A#' : 10, 'B' : 11, 'C' : 0, 'C#' : 1, 'D' : 2, 
        'D#' : 3, 'E' : 4, 'F' : 5, 'F#' : 6, 'G' : 7, 'G#' : 8}
time_signature = {'4/4' : 4, '3/4' : 3, '5/4': 5, '1/4': 1, '0/4': 0}

for source_col, mapping in (('key', key), ('mode', mode), ('time_signature', time_signature)):
    encoded_col = source_col + '_num'
    # Drops a stale copy first so this cell can be re-run without insert() raising on a duplicate column
    if encoded_col in df.columns:
        df = df.drop(columns = encoded_col)
    # map() builds the numeric column directly instead of relying on replace()'s implicit
    # object -> int downcasting; a value missing from the mapping becomes NaN rather than
    # silently staying a string
    # The position is derived from the source column, so it stays correct if the column layout changes
    df.insert(df.columns.get_loc(source_col) + 1, encoded_col, df[source_col].map(mapping))

df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,...,key_num,liveness,loudness,mode,mode_num,speechiness,tempo,time_signature,time_signature_num,valence
0,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,65,0.083,0.724,246333,0.689,0.0,...,2,0.304,-5.922,Minor,0,0.135,146.496,4/4,4,0.693
1,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,63,0.323,0.685,186467,0.61,0.0,...,0,0.102,-5.221,Minor,0,0.0439,94.384,3/4,3,0.323
2,R&B,Yung Bleu,Ice On My Baby (feat. Kevin Gates) - Remix,6muW8cSjJ3rusKJ0vH5olw,62,0.0675,0.762,199520,0.52,4e-06,...,5,0.114,-5.237,Minor,0,0.0959,75.047,4/4,4,0.0862
3,R&B,Surfaces,Heaven Falls / Fall on Me,7yHqOZfsXYlicyoMt62yC6,61,0.36,0.563,240597,0.366,0.00243,...,11,0.0955,-6.896,Minor,0,0.121,85.352,4/4,4,0.768
4,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,68,0.596,0.653,213947,0.621,0.0,...,11,0.0811,-5.721,Minor,0,0.0409,100.006,4/4,4,0.466


In [748]:
# Builds one-hot (dummy) columns for 'time_signature', 'mode', and 'key', then appends them to df by row index
time_sig_dummy, mode_dummy, key_dummy = (
    pd.get_dummies(df[column]) for column in ('time_signature', 'mode', 'key')
)

for dummy_frame in (time_sig_dummy, mode_dummy, key_dummy):
    df = df.merge(dummy_frame, left_index = True, right_index = True)

df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,...,B,C,C#,D,D#,E,F,F#,G,G#
0,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,65,0.083,0.724,246333,0.689,0.0,...,0,0,0,1,0,0,0,0,0,0
1,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,63,0.323,0.685,186467,0.61,0.0,...,0,1,0,0,0,0,0,0,0,0
2,R&B,Yung Bleu,Ice On My Baby (feat. Kevin Gates) - Remix,6muW8cSjJ3rusKJ0vH5olw,62,0.0675,0.762,199520,0.52,4e-06,...,0,0,0,0,0,0,1,0,0,0
3,R&B,Surfaces,Heaven Falls / Fall on Me,7yHqOZfsXYlicyoMt62yC6,61,0.36,0.563,240597,0.366,0.00243,...,1,0,0,0,0,0,0,0,0,0
4,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,68,0.596,0.653,213947,0.621,0.0,...,1,0,0,0,0,0,0,0,0,0


In [749]:
# Shows how many rows each genre has
genre_counts = df['genre'].value_counts()
print(genre_counts)

Children’s Music    14756
Comedy               9681
Soundtrack           9646
Indie                9543
Jazz                 9441
Pop                  9386
Electronic           9377
Folk                 9299
Hip-Hop              9295
Rock                 9272
Alternative          9263
Classical            9256
Rap                  9232
World                9096
Soul                 9089
Blues                9023
R&B                  8992
Anime                8936
Reggaeton            8927
Ska                  8874
Reggae               8771
Dance                8701
Country              8664
Opera                8280
A Capella             119
Name: genre, dtype: int64


In [750]:
# Finalizes the rows that will be used to train and test the models

# Upper-cases 'artist_name' and 'track_name' so the duplicate check below ignores letter case
name_cols = ['artist_name', 'track_name']
df[name_cols] = df[name_cols].apply(lambda col: col.astype(str).str.upper())

# Keeps only the first row for each repeated track (by id, then by artist + title). Tracks listed under
# several genres are reduced to one row because this is treated as a multi-class problem, not a multi-label one
for dedup_keys in (['track_id'], name_cols):
    df.drop_duplicates(subset = dedup_keys, inplace = True)

# Renumbers the rows 0..n-1 after the drops
df.reset_index(drop = True, inplace = True)

In [751]:
# Columns used as model inputs
features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key_num',
    'liveness',
    'loudness',
    'mode_num',
    'speechiness',
    'tempo',
    'time_signature_num',
    'valence',
]

In [752]:
# Builds the feature matrix X and label vector y, then makes a 75/25 train/test split stratified by genre
X = df[features]
y = df['genre']
split_options = dict(test_size = 0.25, random_state = 0, stratify = y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [753]:
# Prints the shapes of the four split parts
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(126147, 12) (42049, 12) (126147,) (42049,)


In [754]:
# Displays df's (rows, columns) shape
df.shape

(168196, 40)

In [755]:
# Removes outlier rows from the training & test sets: a row is dropped when any feature lies
# 3 or more standard deviations away from the training-set mean. The bounds are computed from
# X_train only and then applied to both sets.
# NOTE(review): the bounds also apply to the encoded categorical columns in `features`
# (key_num, mode_num, time_signature_num) - confirm that dropping rows on those codes is intended.
# NOTE(review): re-running this cell recomputes the bounds from the already-filtered X_train,
# so it can remove further rows.

mean = X_train.mean(axis = 0)
std = X_train.std(axis = 0)
cut_off = std * 3
lower, upper = mean - cut_off, mean + cut_off

# Removes outliers from the training set
X_train = X_train[((X_train > lower) & (X_train < upper)).all(axis = 1)]
# Removes outliers from the testing set
X_test = X_test[((X_test > lower) & (X_test < upper)).all(axis = 1)]

# Keeps the labels aligned with the surviving feature rows; without this y_train / y_test
# would still contain the labels of the removed rows and no longer match X_train / X_test
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Filters dataframe by row indices with rows that have only non outlier values
df = df[df.index.isin(X_train.index.append(X_test.index))]

In [758]:
# Prints the shapes of df and the feature splits after outlier removal
print(*(frame.shape for frame in (df, X_train, X_test)))

(152864, 40) (114626, 12) (38238, 12)


In [757]:
# Saves the filtered dataframe to a csv file (without the index column)
output_path = 'final.csv'
df.to_csv(output_path, index = False)

# Reads the file back and previews the first rows
pd.read_csv(output_path).head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,...,B,C,C#,D,D#,E,F,F#,G,G#
0,R&B,MARY J. BLIGE,BE WITHOUT YOU - KENDU MIX,2YegxR5As7BeQuVp2U6pek,65,0.083,0.724,246333,0.689,0.0,...,0,0,0,1,0,0,0,0,0,0
1,R&B,RIHANNA,DESPERADO,6KFaHC9G178beAp7P0Vi5S,63,0.323,0.685,186467,0.61,0.0,...,0,1,0,0,0,0,0,0,0,0
2,R&B,YUNG BLEU,ICE ON MY BABY (FEAT. KEVIN GATES) - REMIX,6muW8cSjJ3rusKJ0vH5olw,62,0.0675,0.762,199520,0.52,4e-06,...,0,0,0,0,0,0,1,0,0,0
3,R&B,SURFACES,HEAVEN FALLS / FALL ON ME,7yHqOZfsXYlicyoMt62yC6,61,0.36,0.563,240597,0.366,0.00243,...,1,0,0,0,0,0,0,0,0,0
4,R&B,OLIVIA O'BRIEN,LOVE MYSELF,4XzgjxGKqULifVf7mnDIQK,68,0.596,0.653,213947,0.621,0.0,...,1,0,0,0,0,0,0,0,0,0
