In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

## Initialising Data

In [24]:
meta = pd.read_csv('meta_final.csv', index_col=0)

In [25]:
meta.rename(columns={'Track ID': 'track_id', 'tempo': 'tempo2'}, inplace = True)

In [26]:
meta

Unnamed: 0,track_id,length,zero_crossings,tempo2,spec_cent_mean,spec_cent_var,spectral_rolloff_mean,spectral_rolloff_var,chroma_mean0,chroma_var0,...,mfccs_mean15,mfccs_var15,mfccs_mean16,mfccs_var16,mfccs_mean17,mfccs_var17,mfccs_mean18,mfccs_var18,mfccs_mean19,mfccs_var19
0,2,660984,108342,161.0,0.365532,0.026789,6214.647549,2.281126e+06,0.734531,0.093535,...,1.253358,49.994880,-8.481487,53.345783,-2.040749,52.196274,-2.946624,51.77792,0.094077,40.441700
0,5,661560,66268,99.0,0.325568,0.032698,5437.441882,2.535670e+06,0.444994,0.075107,...,0.919858,66.263405,-4.126342,44.906055,-0.580666,44.026913,-1.600445,65.26340,1.350323,64.534930
0,10,660984,98404,112.0,0.310059,0.019171,4315.856035,6.900174e+05,0.280939,0.022039,...,-0.911099,42.735588,-3.207996,32.780640,3.043154,40.873420,-1.036512,30.22485,3.882601,29.444992
0,140,660984,29326,108.0,0.237486,0.032545,3863.175065,5.928252e+06,0.243959,0.082215,...,1.239001,43.232227,-5.205992,38.285038,-0.754437,39.603607,-6.319377,47.34824,-1.172379,44.371730
0,141,660984,40948,103.0,0.202888,0.019671,3315.637916,2.762245e+06,0.160931,0.059965,...,5.053833,99.240820,-0.170090,91.179344,3.087437,62.452350,-8.665928,70.66697,-1.098168,89.260025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,154308,660984,49268,65.0,0.292088,0.025934,3267.884577,2.954812e+06,0.403677,0.080006,...,3.838140,90.520256,4.203661,88.265230,2.577975,108.629930,1.229812,69.14249,-3.231690,65.682460
0,154309,660984,74250,96.0,0.272921,0.065196,4931.345368,9.547138e+06,0.246180,0.091449,...,-7.702279,40.592216,-10.535363,65.548470,1.828215,104.435350,7.517279,172.51152,8.652144,166.639270
0,154413,661560,25470,112.0,0.205912,0.020532,2706.338933,4.799625e+06,0.564777,0.114388,...,9.926634,178.120440,-0.730949,152.066010,-0.874068,254.400250,2.435402,206.16464,9.237540,209.552810
0,154414,660984,59693,99.0,0.279766,0.033045,4629.363461,3.583306e+06,0.230055,0.064261,...,-3.166733,45.321280,-7.242399,54.071130,-1.181526,68.359040,-7.297320,76.31981,-0.967366,86.632650


In [27]:
features = pd.read_csv('track_features_final.csv', index_col=0)

In [28]:
features

Unnamed: 0,track_id,title,artist,genre,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,2,Food,AWOL,Hip-Hop,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661
1,5,This World,AWOL,Hip-Hop,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661
2,10,Freeway,Kurt Vile,Pop,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590
3,140,Queen Of The Wires,Alec K. Redfearn & the Eyesores,Folk,0.376312,0.734079,0.265685,0.669581,0.085995,0.039068,107.952,0.609991
4,141,Ohio,Alec K. Redfearn & the Eyesores,Folk,0.963657,0.435933,0.075632,0.345493,0.105686,0.026658,33.477,0.163950
...,...,...,...,...,...,...,...,...,...,...,...,...
3277,154306,Yummy,Fleslit,Hip-Hop,0.008100,0.707000,0.354000,0.903000,0.109000,0.040600,100.015,0.069500
3278,154307,12:01 AM,Fleslit,Hip-Hop,0.059100,0.844000,0.346000,0.826000,0.054200,0.076500,130.012,0.234000
3279,154308,Devil Of Miami,Fleslit,Hip-Hop,0.005560,0.574000,0.417000,0.768000,0.363000,0.044400,139.979,0.272000
3280,154309,A1 Symphony,Fleslit,Hip-Hop,0.411000,0.706000,0.654000,0.851000,0.053900,0.514000,193.879,0.118000


In [29]:
df = features.merge(meta, on='track_id')

In [30]:
df.columns

Index(['track_id', 'title', 'artist', 'genre', 'acousticness', 'danceability',
       'energy', 'instrumentalness', 'liveness', 'speechiness', 'tempo',
       'valence', 'length', 'zero_crossings', 'tempo2', 'spec_cent_mean',
       'spec_cent_var', 'spectral_rolloff_mean', 'spectral_rolloff_var',
       'chroma_mean0', 'chroma_var0', 'chroma_mean1', 'chroma_var1',
       'chroma_mean2', 'chroma_var2', 'chroma_mean3', 'chroma_var3',
       'chroma_mean4', 'chroma_var4', 'chroma_mean5', 'chroma_var5',
       'chroma_mean6', 'chroma_var6', 'chroma_mean7', 'chroma_var7',
       'chroma_mean8', 'chroma_var8', 'chroma_mean9', 'chroma_var9',
       'chroma_mean10', 'chroma_var10', 'chroma_mean11', 'chroma_var11',
       'harm_mean', 'perc_mean', 'harm_var', 'perc_var', 'mfccs_mean0',
       'mfccs_var0', 'mfccs_mean1', 'mfccs_var1', 'mfccs_mean2', 'mfccs_var2',
       'mfccs_mean3', 'mfccs_var3', 'mfccs_mean4', 'mfccs_var4', 'mfccs_mean5',
       'mfccs_var5', 'mfccs_mean6', 'mfccs_var6', 'm

In [31]:
X = df.loc[:, 'zero_crossings':]

In [32]:
y = df[['liveness']]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## XG Boost

In [34]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

In [35]:
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)

In [36]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.159625


## Gridsearch

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor()
param_grid = {
    'eta': [0.01, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 500],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [3, 5],
    'reg_alpha': [1, 1.2],
    'reg_lambda': [1, 1.2],
    'subsample': [0.7, 0.8]}

gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=5, 
        n_jobs=4, 
        scoring='neg_mean_squared_error',
        verbose=3)
    
fitted_model = gs.fit(X_train, y_train)

pred = fitted_model.predict(X_test)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


In [38]:
print(np.sqrt(-gs.best_score_))
print(gs.best_params_)

0.1531433931136915
{'colsample_bytree': 0.7, 'eta': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}


In [39]:
r2_score(y_test, pred)

0.057753949196254206

In [40]:
model = xgb.XGBRegressor()
param_grid = {
    'eta': [0.01, 0.05],
    'n_estimators': [750, 1000, 1250],
    'colsample_bytree': [0.7],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'reg_alpha': [1, 1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'subsample': [0.8]}

gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=5, 
        n_jobs=-1, 
        scoring='neg_mean_squared_error',
        verbose=3)
    
fitted_model = gs.fit(X_train, y_train)

pred = fitted_model.predict(X_test)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


KeyboardInterrupt: 

In [41]:
print(np.sqrt(-gs.best_score_))
print(gs.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [42]:
r2_score(y_test, pred)

0.057753949196254206

In [43]:
model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.8, learning_rate = 0.01,
                max_depth = 3, alpha = 10, n_estimators = 1000, reg_alpha=1, reg_lambda=1, subsample=0.7)

In [44]:
model.fit(X_train,y_train)

preds = model.predict(X_test)

In [45]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.158502


In [46]:
r2_score(y_test, preds)

0.06206727468041584

In [47]:
train_pred = model.predict(X_train)

In [48]:
r2_score(y_train, train_pred)

0.38081287164963795

## Getting More Data

In [49]:
features = pd.read_csv('features.csv', index_col=0, low_memory=False, header=[0,1,2])

In [50]:
features

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.063965,0.000000,1.716724,0.069330
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,1.951819,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.028320,0.003906,0.955388,0.012385
155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,-0.492753,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.027832,0.002441,1.283060,0.019059
155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,0.120333,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.037109,0.003418,0.828569,0.017904
155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,-0.787229,...,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.038086,0.004883,1.818740,0.020133


In [51]:
echonest = pd.read_csv('echonest.csv', index_col=0, low_memory=False,header=[0,1,2])

In [52]:
echonest.columns = echonest.columns.droplevel(0).droplevel(0)

In [53]:
columns_i_want = [ ('chroma_stft', 'mean', '01'),
                   ('chroma_stft', 'mean', '02'), ('chroma_stft', 'mean', '03'),
                   ('chroma_stft', 'mean', '04'), ('chroma_stft', 'mean', '05'),
                   ('chroma_stft', 'mean', '06'), ('chroma_stft', 'mean', '07'),
                   ('chroma_stft', 'mean', '08'), ('chroma_stft', 'mean', '09'),
                   ('chroma_stft', 'mean', '10'), ('chroma_stft', 'mean', '11'),
                   ('chroma_stft', 'mean', '12'),
                   ('chroma_stft', 'std', '01'),
                   ('chroma_stft', 'std', '02'), ('chroma_stft', 'std', '03'),
                   ('chroma_stft', 'std', '04'), ('chroma_stft', 'std', '05'),
                   ('chroma_stft', 'std', '06'), ('chroma_stft', 'std', '07'),
                   ('chroma_stft', 'std', '08'), ('chroma_stft', 'std', '09'),
                   ('chroma_stft', 'std', '10'), ('chroma_stft', 'std', '11'),
                   ('chroma_stft', 'std', '12'),
                   ('mfcc', 'mean', '01'),
                   ('mfcc', 'mean', '02'), ('mfcc', 'mean', '03'),
                   ('mfcc', 'mean', '04'), ('mfcc', 'mean', '05'),
                   ('mfcc', 'mean', '06'), ('mfcc', 'mean', '07'),
                   ('mfcc', 'mean', '08'), ('mfcc', 'mean', '09'),
                   ('mfcc', 'mean', '10'), ('mfcc', 'mean', '11'),
                   ('mfcc', 'mean', '12'), ('mfcc', 'mean', '13'),
                   ('mfcc', 'mean', '14'), ('mfcc', 'mean', '15'),
                   ('mfcc', 'mean', '16'), ('mfcc', 'mean', '17'),
                   ('mfcc', 'mean', '18'), ('mfcc', 'mean', '19'),
                   ('mfcc', 'mean', '20'),
                   ('mfcc', 'std', '01'),
                   ('mfcc', 'std', '02'), ('mfcc', 'std', '03'),
                   ('mfcc', 'std', '04'), ('mfcc', 'std', '05'),
                   ('mfcc', 'std', '06'), ('mfcc', 'std', '07'),
                   ('mfcc', 'std', '08'), ('mfcc', 'std', '09'),
                   ('mfcc', 'std', '10'), ('mfcc', 'std', '11'),
                   ('mfcc', 'std', '12'), ('mfcc', 'std', '13'),
                   ('mfcc', 'std', '14'), ('mfcc', 'std', '15'),
                   ('mfcc', 'std', '16'), ('mfcc', 'std', '17'),
                   ('mfcc', 'std', '18'), ('mfcc', 'std', '19'),
                   ('mfcc', 'std', '20'),
                   ('spectral_centroid', 'mean', '01'),('spectral_centroid', 'std', '01'),
                   ('spectral_rolloff', 'mean', '01'),('spectral_rolloff', 'std', '01') ]


In [54]:
df = features[columns_i_want]
df.columns = df.columns.droplevel([1,2])
df.columns = pd.io.parsers.ParserBase({'names':df.columns})._maybe_dedup_names(df.columns)

In [55]:
df

Unnamed: 0_level_0,chroma_stft,chroma_stft.1,chroma_stft.2,chroma_stft.3,chroma_stft.4,chroma_stft.5,chroma_stft.6,chroma_stft.7,chroma_stft.8,chroma_stft.9,...,mfcc.34,mfcc.35,mfcc.36,mfcc.37,mfcc.38,mfcc.39,spectral_centroid,spectral_centroid.1,spectral_rolloff,spectral_rolloff.1
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.696063,0.447832,0.355580,0.349723,0.340518,0.266161,0.291201,0.430514,0.376956,0.380832,...,8.520863,8.560472,7.651871,7.246555,7.077188,7.391859,1639.583252,719.770508,3267.804688,1300.729736
3,0.449602,0.409319,0.477329,0.461481,0.390022,0.409889,0.485767,0.581830,0.506545,0.509018,...,9.600357,7.222888,8.398293,7.285423,7.417791,8.777440,1763.012451,972.758423,3514.619629,1650.357788
5,0.481160,0.658690,0.451551,0.368825,0.334513,0.324101,0.340564,0.354422,0.414781,0.417633,...,8.201844,7.780963,7.132692,7.539753,8.452527,7.334442,1292.958130,665.319275,2773.931885,1323.465210
10,0.388109,0.655152,0.452144,0.440090,0.355828,0.410883,0.611984,0.421226,0.452064,0.369691,...,7.071393,7.270959,7.051070,6.928591,6.430473,6.186294,1360.028687,668.700806,2603.491943,1524.401245
20,0.346976,0.366958,0.521544,0.360614,0.388255,0.463070,0.514895,0.576036,0.478274,0.554791,...,7.019398,6.983841,6.813648,7.520811,7.098001,7.032246,1732.971802,481.929871,4201.346191,1495.297119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,0.550126,0.622485,0.772728,0.651459,0.509336,0.415154,0.420334,0.418557,0.385257,0.421122,...,5.923375,5.586200,5.516872,5.755764,5.105072,5.019556,845.493652,264.716766,1732.244019,672.831116
155317,0.649968,0.615353,0.614999,0.587333,0.575341,0.525431,0.539207,0.609268,0.581010,0.565043,...,6.019901,5.757353,5.280114,5.674062,5.541949,5.266131,884.712708,412.968170,1860.772095,1039.222412
155318,0.598612,0.632105,0.695757,0.622193,0.577816,0.525048,0.540999,0.529734,0.489411,0.494412,...,6.311563,5.789986,5.466107,5.607614,5.685448,5.449005,1037.896973,368.312073,2176.805664,932.816101
155319,0.673499,0.601752,0.626450,0.609281,0.623840,0.614608,0.556207,0.527547,0.450272,0.410265,...,5.887583,5.110033,5.407447,5.317993,5.187994,5.260817,1119.052979,296.829010,2363.244385,758.186890


In [56]:
df2 = echonest.loc[:,'acousticness':'valence']

In [57]:
df2 = df2.join(df, how='left')

In [58]:
df2

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,chroma_stft,chroma_stft.1,...,mfcc.34,mfcc.35,mfcc.36,mfcc.37,mfcc.38,mfcc.39,spectral_centroid,spectral_centroid.1,spectral_rolloff,spectral_rolloff.1
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661,0.696063,0.447832,...,8.520863,8.560472,7.651871,7.246555,7.077188,7.391859,1639.583252,719.770508,3267.804688,1300.729736
3,0.374408,0.528643,0.817461,0.001851,0.105880,0.461818,126.957,0.269240,0.449602,0.409319,...,9.600357,7.222888,8.398293,7.285423,7.417791,8.777440,1763.012451,972.758423,3514.619629,1650.357788
5,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661,0.481160,0.658690,...,8.201844,7.780963,7.132692,7.539753,8.452527,7.334442,1292.958130,665.319275,2773.931885,1323.465210
10,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590,0.388109,0.655152,...,7.071393,7.270959,7.051070,6.928591,6.430473,6.186294,1360.028687,668.700806,2603.491943,1524.401245
134,0.452217,0.513238,0.560410,0.019443,0.096567,0.525519,114.290,0.894072,0.438062,0.452516,...,7.518894,6.755280,6.344675,6.761541,7.134986,6.803034,1257.696289,718.104797,2462.616943,1406.506592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124857,0.007592,0.790364,0.719288,0.853114,0.720715,0.082550,141.332,0.890461,0.493745,0.484312,...,7.062073,6.832367,7.267533,6.730730,7.210190,6.294291,1449.438232,727.547119,3333.072510,1652.505615
124862,0.041498,0.843077,0.536496,0.865151,0.547949,0.074001,101.975,0.476845,0.650595,0.621334,...,7.132393,6.556796,6.960447,6.391834,5.994080,6.122604,1245.709229,869.086243,2678.192627,1974.007446
124863,0.000124,0.609686,0.895136,0.846624,0.632903,0.051517,129.996,0.496667,0.683781,0.673807,...,6.734865,5.771911,6.377217,5.380946,5.837944,5.381021,1404.620483,561.415588,3147.964111,1349.083984
124864,0.327576,0.574426,0.548327,0.452867,0.075928,0.033388,142.009,0.569274,0.373205,0.344921,...,7.998732,8.148147,8.191489,7.981343,8.157024,7.822321,1358.554321,641.450989,2985.802002,1425.535034


In [59]:
features = list(df2.columns.values[0:8])

In [60]:
X = df2.loc[:, 'chroma_stft':]

In [61]:
X.columns

Index(['chroma_stft', 'chroma_stft.1', 'chroma_stft.2', 'chroma_stft.3',
       'chroma_stft.4', 'chroma_stft.5', 'chroma_stft.6', 'chroma_stft.7',
       'chroma_stft.8', 'chroma_stft.9', 'chroma_stft.10', 'chroma_stft.11',
       'chroma_stft.12', 'chroma_stft.13', 'chroma_stft.14', 'chroma_stft.15',
       'chroma_stft.16', 'chroma_stft.17', 'chroma_stft.18', 'chroma_stft.19',
       'chroma_stft.20', 'chroma_stft.21', 'chroma_stft.22', 'chroma_stft.23',
       'mfcc', 'mfcc.1', 'mfcc.2', 'mfcc.3', 'mfcc.4', 'mfcc.5', 'mfcc.6',
       'mfcc.7', 'mfcc.8', 'mfcc.9', 'mfcc.10', 'mfcc.11', 'mfcc.12',
       'mfcc.13', 'mfcc.14', 'mfcc.15', 'mfcc.16', 'mfcc.17', 'mfcc.18',
       'mfcc.19', 'mfcc.20', 'mfcc.21', 'mfcc.22', 'mfcc.23', 'mfcc.24',
       'mfcc.25', 'mfcc.26', 'mfcc.27', 'mfcc.28', 'mfcc.29', 'mfcc.30',
       'mfcc.31', 'mfcc.32', 'mfcc.33', 'mfcc.34', 'mfcc.35', 'mfcc.36',
       'mfcc.37', 'mfcc.38', 'mfcc.39', 'spectral_centroid',
       'spectral_centroid.1', 'spectral_

In [62]:
y = df2[['acousticness']]

In [63]:
y

Unnamed: 0_level_0,acousticness
track_id,Unnamed: 1_level_1
2,0.416675
3,0.374408
5,0.043567
10,0.951670
134,0.452217
...,...
124857,0.007592
124862,0.041498
124863,0.000124
124864,0.327576


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.8, learning_rate = 0.01,
                max_depth = 12, n_estimators = 1000, reg_alpha=1, reg_lambda=1, subsample=0.7, min_child_weight = 10)

In [66]:
model.fit(X_train,y_train)

preds = model.predict(X_test)

In [67]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.212727


In [68]:
r2_score(y_test, preds)

0.6879793950505915

In [69]:
train_pred = model.predict(X_train)

In [70]:
r2_score(y_train, train_pred)

0.972045692868202

In [71]:
features

['acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'liveness',
 'speechiness',
 'tempo',
 'valence']

In [72]:
for feature in features:
    X = df2.loc[:, 'chroma_stft':]
    y = df2[[feature]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.8, learning_rate = 0.01,
                max_depth = 12, n_estimators = 1000, reg_alpha=1, reg_lambda=1, subsample=0.7, min_child_weight = 10)
    model.fit(X_train,y_train)

    preds = model.predict(X_test)
    train_pred = model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2_train = r2_score(y_train, train_pred)
    r2_test = r2_score(y_test, preds)
    
    print(feature)
    print("RMSE: %f" % (rmse))
    print('R2 Train: %f' % (r2_train))
    print('R2 Test: %f' % (r2_test))
    print()

acousticness
RMSE: 0.212727
R2 Train: 0.972046
R2 Test: 0.687979

danceability
RMSE: 0.145457
R2 Train: 0.909429
R2 Test: 0.406299

energy
RMSE: 0.145319
R2 Train: 0.957484
R2 Test: 0.725942

instrumentalness
RMSE: 0.272067
R2 Train: 0.953448
R2 Test: 0.433948

liveness
RMSE: 0.140322
R2 Train: 0.856913
R2 Test: 0.200388

speechiness
RMSE: 0.106196
R2 Train: 0.894216
R2 Test: 0.483842

tempo
RMSE: 34.452502
R2 Train: 0.929732
R2 Test: 0.028187

valence
RMSE: 0.243638
R2 Train: 0.920874
R2 Test: 0.232675



In [73]:
meta = pd.read_csv('meta_final.csv', index_col=0)

In [74]:
meta.rename(columns={'Track ID': 'track_id', 'tempo': 'tempo2'}, inplace = True)

In [75]:
meta

Unnamed: 0,track_id,length,zero_crossings,tempo2,spec_cent_mean,spec_cent_var,spectral_rolloff_mean,spectral_rolloff_var,chroma_mean0,chroma_var0,...,mfccs_mean15,mfccs_var15,mfccs_mean16,mfccs_var16,mfccs_mean17,mfccs_var17,mfccs_mean18,mfccs_var18,mfccs_mean19,mfccs_var19
0,2,660984,108342,161.0,0.365532,0.026789,6214.647549,2.281126e+06,0.734531,0.093535,...,1.253358,49.994880,-8.481487,53.345783,-2.040749,52.196274,-2.946624,51.77792,0.094077,40.441700
0,5,661560,66268,99.0,0.325568,0.032698,5437.441882,2.535670e+06,0.444994,0.075107,...,0.919858,66.263405,-4.126342,44.906055,-0.580666,44.026913,-1.600445,65.26340,1.350323,64.534930
0,10,660984,98404,112.0,0.310059,0.019171,4315.856035,6.900174e+05,0.280939,0.022039,...,-0.911099,42.735588,-3.207996,32.780640,3.043154,40.873420,-1.036512,30.22485,3.882601,29.444992
0,140,660984,29326,108.0,0.237486,0.032545,3863.175065,5.928252e+06,0.243959,0.082215,...,1.239001,43.232227,-5.205992,38.285038,-0.754437,39.603607,-6.319377,47.34824,-1.172379,44.371730
0,141,660984,40948,103.0,0.202888,0.019671,3315.637916,2.762245e+06,0.160931,0.059965,...,5.053833,99.240820,-0.170090,91.179344,3.087437,62.452350,-8.665928,70.66697,-1.098168,89.260025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,154308,660984,49268,65.0,0.292088,0.025934,3267.884577,2.954812e+06,0.403677,0.080006,...,3.838140,90.520256,4.203661,88.265230,2.577975,108.629930,1.229812,69.14249,-3.231690,65.682460
0,154309,660984,74250,96.0,0.272921,0.065196,4931.345368,9.547138e+06,0.246180,0.091449,...,-7.702279,40.592216,-10.535363,65.548470,1.828215,104.435350,7.517279,172.51152,8.652144,166.639270
0,154413,661560,25470,112.0,0.205912,0.020532,2706.338933,4.799625e+06,0.564777,0.114388,...,9.926634,178.120440,-0.730949,152.066010,-0.874068,254.400250,2.435402,206.16464,9.237540,209.552810
0,154414,660984,59693,99.0,0.279766,0.033045,4629.363461,3.583306e+06,0.230055,0.064261,...,-3.166733,45.321280,-7.242399,54.071130,-1.181526,68.359040,-7.297320,76.31981,-0.967366,86.632650


In [76]:
features = pd.read_csv('track_features_final.csv', index_col=0)

In [77]:
features

Unnamed: 0,track_id,title,artist,genre,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,2,Food,AWOL,Hip-Hop,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661
1,5,This World,AWOL,Hip-Hop,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661
2,10,Freeway,Kurt Vile,Pop,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590
3,140,Queen Of The Wires,Alec K. Redfearn & the Eyesores,Folk,0.376312,0.734079,0.265685,0.669581,0.085995,0.039068,107.952,0.609991
4,141,Ohio,Alec K. Redfearn & the Eyesores,Folk,0.963657,0.435933,0.075632,0.345493,0.105686,0.026658,33.477,0.163950
...,...,...,...,...,...,...,...,...,...,...,...,...
3277,154306,Yummy,Fleslit,Hip-Hop,0.008100,0.707000,0.354000,0.903000,0.109000,0.040600,100.015,0.069500
3278,154307,12:01 AM,Fleslit,Hip-Hop,0.059100,0.844000,0.346000,0.826000,0.054200,0.076500,130.012,0.234000
3279,154308,Devil Of Miami,Fleslit,Hip-Hop,0.005560,0.574000,0.417000,0.768000,0.363000,0.044400,139.979,0.272000
3280,154309,A1 Symphony,Fleslit,Hip-Hop,0.411000,0.706000,0.654000,0.851000,0.053900,0.514000,193.879,0.118000


In [78]:
df3 = features.merge(meta, on='track_id')

In [79]:
df3

Unnamed: 0,track_id,title,artist,genre,acousticness,danceability,energy,instrumentalness,liveness,speechiness,...,mfccs_mean15,mfccs_var15,mfccs_mean16,mfccs_var16,mfccs_mean17,mfccs_var17,mfccs_mean18,mfccs_var18,mfccs_mean19,mfccs_var19
0,2,Food,AWOL,Hip-Hop,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,...,1.253358,49.994880,-8.481487,53.345783,-2.040749,52.196274,-2.946624,51.777920,0.094077,40.441700
1,5,This World,AWOL,Hip-Hop,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,...,0.919858,66.263405,-4.126342,44.906055,-0.580666,44.026913,-1.600445,65.263400,1.350323,64.534930
2,10,Freeway,Kurt Vile,Pop,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,...,-0.911099,42.735588,-3.207996,32.780640,3.043154,40.873420,-1.036512,30.224850,3.882601,29.444992
3,140,Queen Of The Wires,Alec K. Redfearn & the Eyesores,Folk,0.376312,0.734079,0.265685,0.669581,0.085995,0.039068,...,1.239001,43.232227,-5.205992,38.285038,-0.754437,39.603607,-6.319377,47.348240,-1.172379,44.371730
4,141,Ohio,Alec K. Redfearn & the Eyesores,Folk,0.963657,0.435933,0.075632,0.345493,0.105686,0.026658,...,5.053833,99.240820,-0.170090,91.179344,3.087437,62.452350,-8.665928,70.666970,-1.098168,89.260025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,154306,Yummy,Fleslit,Hip-Hop,0.008100,0.707000,0.354000,0.903000,0.109000,0.040600,...,5.531777,60.408886,0.578954,34.085102,6.731422,32.125755,3.618689,43.405476,9.467090,79.778854
3275,154307,12:01 AM,Fleslit,Hip-Hop,0.059100,0.844000,0.346000,0.826000,0.054200,0.076500,...,-0.953383,97.223320,-5.637122,50.896816,6.983463,95.540886,2.458356,151.581600,2.188587,110.020600
3276,154308,Devil Of Miami,Fleslit,Hip-Hop,0.005560,0.574000,0.417000,0.768000,0.363000,0.044400,...,3.838140,90.520256,4.203661,88.265230,2.577975,108.629930,1.229812,69.142490,-3.231690,65.682460
3277,154309,A1 Symphony,Fleslit,Hip-Hop,0.411000,0.706000,0.654000,0.851000,0.053900,0.514000,...,-7.702279,40.592216,-10.535363,65.548470,1.828215,104.435350,7.517279,172.511520,8.652144,166.639270


In [80]:
testing_X = df3[['chroma_mean0', 'chroma_mean1', 'chroma_mean2', 'chroma_mean3', 
'chroma_mean4', 'chroma_mean5', 'chroma_mean6', 'chroma_mean7', 
'chroma_mean8', 'chroma_mean9', 'chroma_mean10', 'chroma_mean11', 
'chroma_var0', 'chroma_var1', 'chroma_var2', 'chroma_var3', 
'chroma_var4', 'chroma_var5', 'chroma_var6', 'chroma_var7', 
'chroma_var8', 'chroma_var9', 'chroma_var10', 'chroma_var11',
'mfccs_mean0', 'mfccs_mean1', 'mfccs_mean2', 'mfccs_mean3', 
'mfccs_mean4', 'mfccs_mean5', 'mfccs_mean6', 'mfccs_mean7', 
'mfccs_mean8', 'mfccs_mean9', 'mfccs_mean10', 'mfccs_mean11', 
'mfccs_mean12', 'mfccs_mean13', 'mfccs_mean14', 'mfccs_mean15', 
'mfccs_mean16', 'mfccs_mean17', 'mfccs_mean18', 'mfccs_mean19', 
'mfccs_var0', 'mfccs_var1', 'mfccs_var2', 'mfccs_var3', 
'mfccs_var4', 'mfccs_var5', 'mfccs_var6', 'mfccs_var7', 
'mfccs_var8', 'mfccs_var9', 'mfccs_var10', 'mfccs_var11', 
'mfccs_var12', 'mfccs_var13', 'mfccs_var14', 'mfccs_var15', 
'mfccs_var16', 'mfccs_var17', 'mfccs_var18', 'mfccs_var19', 
'spec_cent_mean', 'spec_cent_var', 'spectral_rolloff_mean', 'spectral_rolloff_var']]

In [81]:
testing_X[['chroma_var0', 'chroma_var1', 'chroma_var2', 'chroma_var3', 
'chroma_var4', 'chroma_var5', 'chroma_var6', 'chroma_var7', 
'chroma_var8', 'chroma_var9', 'chroma_var10', 'chroma_var11',
'mfccs_var0', 'mfccs_var1', 'mfccs_var2', 'mfccs_var3', 
'mfccs_var4', 'mfccs_var5', 'mfccs_var6', 'mfccs_var7', 
'mfccs_var8', 'mfccs_var9', 'mfccs_var10', 'mfccs_var11', 
'mfccs_var12', 'mfccs_var13', 'mfccs_var14', 'mfccs_var15', 
'mfccs_var16', 'mfccs_var17', 'mfccs_var18', 'mfccs_var19', 
'spec_cent_var', 'spectral_rolloff_var']] = testing_X[['chroma_var0', 'chroma_var1', 'chroma_var2', 'chroma_var3', 
'chroma_var4', 'chroma_var5', 'chroma_var6', 'chroma_var7', 
'chroma_var8', 'chroma_var9', 'chroma_var10', 'chroma_var11',
'mfccs_var0', 'mfccs_var1', 'mfccs_var2', 'mfccs_var3', 
'mfccs_var4', 'mfccs_var5', 'mfccs_var6', 'mfccs_var7', 
'mfccs_var8', 'mfccs_var9', 'mfccs_var10', 'mfccs_var11', 
'mfccs_var12', 'mfccs_var13', 'mfccs_var14', 'mfccs_var15', 
'mfccs_var16', 'mfccs_var17', 'mfccs_var18', 'mfccs_var19', 
'spec_cent_var', 'spectral_rolloff_var']]**(1/2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [82]:
testing_y = df3[['acousticness']]

In [83]:
preds = model.predict(testing_X)
rmse = np.sqrt(mean_squared_error(testing_y, preds))
r2_test = r2_score(testing_y, preds)

In [84]:
r2_test

-0.11819774350823664