# Data preparation

## Import data

In [736]:
import pandas as pd
import numpy as np

In [737]:
songs = pd.read_json("MasterSongList.json")
songs.head(3)

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089
1,{'$oid': '52fdfb3d0b9398049f3cbc8e'},Native,OneRepublic,"[6, 0.7457039999999999, 0.11995499999999999, 1...",[energetic],[2012],[pop],"[lately, i, ve, been, i, ve, been, losing, sle...",[happy],Counting Stars,energetic,http://images.musicnet.com/albums/081/851/887/...,5839.0,[energy boost],hT_nvWreIhg,1020297206
2,{'$oid': '52fdfb420b9398049f3d3ea5'},Party Rock Anthem,LMFAO,"[5, 0.709932, 0.231455, 130.03, 0.121740999999...","[energetic, energetic, energetic, energetic]",[],[],"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]",Party Rock Anthem,housework,http://images.musicnet.com/albums/049/414/127/...,52379.0,"[energy boost, pleasing a crowd, housework, dr...",KQ6zr6kCPj8,971128436


## Edit the genre

In [738]:
songs['genres'] = songs['genres'].apply(''.join)
songs.shape

(36733, 16)

In [739]:
def split_first_genre(genre):
    if len(genre) > 0:
        return genre.split(':')[0]
    else:
        return genre

songs['genres'] = songs['genres'].apply(split_first_genre)

In [740]:
popular_songs = songs[songs['yt_views'] > 1000]
popular_songs.shape

(34267, 16)

## Audio features

In [741]:
audio_features_headers = ['key', 'energy', 'liveliness', 'tempo', 'speechiness', 'acousticness', 'instrumentalness', 'time_signature', 'duration', 'loudness', 'valence', 'danceability', 'mode', 'time_signature_confidence', 'tempo_confidence', 'key_confidence', 'mode_confidence']
audio_features_list = popular_songs['audio_features'].tolist()
df_full_features = pd.DataFrame(audio_features_list, columns=audio_features_headers)
df_full_features['genres'] = popular_songs['genres']
df_full_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton


In [742]:
df_features_filtered = df_full_features[(df_full_features['genres'] == 'dance') | (df_full_features['genres'] == 'jazz') | (df_full_features['genres'] == 'rock') | (df_full_features['genres'] == 'rap')]
df_features_filtered.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
20,0.0,0.616411,0.171423,130.009,0.059577,0.058936,6.5e-05,1.0,4.0,284.41333,-7.443,0.476111,0.78976,0.499,0.489,0.708,1.0,dance
25,11.0,0.398953,0.17064,77.821,0.033343,0.007855,0.457147,1.0,4.0,548.03156,-19.753,0.243535,0.198917,0.901,0.676,0.362,0.996,rock
36,7.0,0.728367,0.84481,112.328,0.307629,0.00975,2.2e-05,1.0,4.0,207.15057,-14.511,0.652412,0.691151,0.844,0.384,1.0,1.0,dance
47,7.0,0.906388,0.130576,127.438,0.122818,0.000353,0.0012,0.0,4.0,210.42667,-5.856,0.318454,0.418888,0.407,0.602,0.603,0.889,rock


In [743]:
df_features_filtered['genres'].value_counts()

rock     7174
rap      2855
dance    2285
jazz     1973
Name: genres, dtype: int64

## NaN values

In [744]:
# Let's split the dataset by genres

jazz_songs = df_features_filtered[df_features_filtered['genres'] == 'jazz']
jazz_songs.name = 'jazz_songs'
rock_songs = df_features_filtered[df_features_filtered['genres'] == 'rock']
rock_songs.name = 'rock_songs'
dance_songs = df_features_filtered[df_features_filtered['genres'] == 'dance']
dance_songs.name = 'dance_songs'
rap_songs = df_features_filtered[df_features_filtered['genres'] == 'rap']
rap_songs.name = 'rap_songs'

In [745]:
def checknan(x):
    return np.isnan(x)

Let's check if the full row is always empty when containing NaN or if some NaN are orphans

In [746]:
rows_null = df_features_filtered.isnull().sum(axis=1)
for index, value in enumerate(rows_null):
    if value < 17 and value != 0:
        print(df_features_filtered.iloc[index])

key                                 5
energy                       0.948036
liveliness                   0.509308
tempo                         152.047
speechiness                       NaN
acousticness                 0.087414
instrumentalness             0.094004
time_signature                      1
duration                            4
loudness                      791.813
valence                        -6.355
danceability                 0.188733
mode                          0.20622
time_signature_confidence           0
tempo_confidence                 0.08
key_confidence                      0
mode_confidence                 0.963
genres                           rock
Name: 870, dtype: object
key                                 0
energy                       0.390561
liveliness                   0.135333
tempo                          77.343
speechiness                       NaN
acousticness                 0.385108
instrumentalness                3e-06
time_signature           

We notice there are only 3 rows containing only 1 NaN, all the other rows containing NaN are completely empty. The orphan NaNs are only present in the 'speechiness' column. We will select this column to analyze the number NaN present in each genres dataset and decide whether we need to drop them or not.

Let's count how many lines have NaN values foreach genres dataset. Based on above analysis we will focis on the 'speechiness' column

Since we require 1,500 rows for each genre we will drop the NaN rows whenever the number of non-empty rows is superior to 1,500. In case there are too many empty rows, we will use the median of each column to fill in the empty rows and select then randomly 1,500 rows.

In [747]:
genres_df = [jazz_songs, rock_songs, dance_songs, rap_songs]

In [748]:
for i in genres_df:
    genres_nan = i['speechiness'].apply(checknan)
    print(i.name)
    print(genres_nan.value_counts())

jazz_songs
False    1677
True      296
Name: speechiness, dtype: int64
rock_songs
False    6245
True      929
Name: speechiness, dtype: int64
dance_songs
False    1938
True      347
Name: speechiness, dtype: int64
rap_songs
False    2411
True      444
Name: speechiness, dtype: int64


We notice all datasets have more than 1,500 non-empty rows. Let's drop the NaN rows

In [749]:
for i in genres_df:
    i.dropna(axis=0, how='any', inplace=True)
    print(i.name)
    print(i.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


jazz_songs
(1677, 18)
rock_songs
(6245, 18)
dance_songs
(1938, 18)
rap_songs
(2411, 18)


## Balance dataframe

We need to select randomly 1,500 songs per genre

In [750]:
jazz_sample = jazz_songs.sample(n=1500, random_state=101)
jazz_sample.shape

(1500, 18)

In [751]:
rock_sample = rock_songs.sample(n=1500, random_state=101)
rock_sample.shape

(1500, 18)

In [752]:
dance_sample = dance_songs.sample(n=1500, random_state=101)
dance_sample.shape

(1500, 18)

In [753]:
rap_sample = rap_songs.sample(n=1500, random_state=101)
rap_sample.shape

(1500, 18)

## Concatenate and randomize the dataframe

In [754]:
final_df = pd.concat([jazz_sample, rock_sample, dance_sample, rap_sample])
final_df['genres'].value_counts()

rock     1500
jazz     1500
rap      1500
dance    1500
Name: genres, dtype: int64

In [755]:
final_df = final_df.sample(frac=1, random_state=101).reset_index(drop=True)
final_df.head(10)

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,8.0,0.831914,0.056928,106.245,0.232162,0.072076,0.0,1.0,4.0,254.6,-5.984,0.555338,0.837518,0.555,0.408,0.664,1.0,rap
1,7.0,0.250138,0.136842,96.105,0.028679,0.823098,0.0,1.0,4.0,153.98667,-9.814,0.577877,0.506832,0.741,0.655,0.526,0.855,rock
2,8.0,0.326645,0.843938,123.949,0.492508,0.016599,0.0,0.0,4.0,275.01714,-10.846,0.568852,0.934398,0.562,0.418,0.704,0.813,rap
3,7.0,0.873642,0.281888,97.94,0.041479,0.327251,0.264088,0.0,4.0,234.78857,-10.682,0.224093,0.600045,0.0,0.064,0.167,0.983,rap
4,6.0,0.916239,0.092331,124.731,0.046529,1.4e-05,0.003689,1.0,4.0,212.01333,-4.157,0.572196,0.581201,0.377,0.444,0.755,0.988,rock
5,2.0,0.783636,0.580456,115.402,0.189523,0.156515,2.1e-05,1.0,4.0,140.49333,-15.285,0.197368,0.330199,0.435,0.326,0.322,0.846,rock
6,2.0,0.79982,0.102666,190.142,0.053073,0.001846,0.001744,1.0,4.0,400.29333,-8.121,0.670951,0.481638,0.227,0.191,0.216,1.0,dance
7,10.0,0.77038,0.346163,118.058,0.142171,0.601119,8e-06,0.0,4.0,156.46667,-6.388,0.741407,0.881608,0.159,0.295,0.088,1.0,dance
8,1.0,0.85968,0.205867,126.032,0.034965,0.080888,0.867131,0.0,4.0,201.30399,-7.121,0.493057,0.603445,0.124,0.399,0.95,0.773,dance
9,9.0,0.712624,0.105198,98.025,0.04338,0.002306,0.851865,0.0,4.0,461.33333,-7.881,0.278169,0.530937,0.336,0.355,0.034,1.0,dance


# Logistic Regression

## LR original

In [756]:
from sklearn.linear_model import LogisticRegression

In [757]:
lr_original = LogisticRegression(solver='saga', multi_class='multinomial')

In [758]:
final_features = final_df.drop('genres', axis=1)
final_labels = final_df['genres']

In [759]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
final_features_scaled = standard_scaler.fit_transform(final_features)

In [760]:
from sklearn.model_selection import train_test_split
X = final_features_scaled
y = final_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [761]:
lr_original.fit(X_train, y_train)
lr_predictions = lr_original.predict(X_test)

In [762]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, lr_predictions))
print(classification_report(list(y_test), list(lr_predictions)))

[[322  12  67  48]
 [ 13 404   5  45]
 [ 68  19 338  24]
 [ 47  54  14 320]]
             precision    recall  f1-score   support

      dance       0.72      0.72      0.72       449
       jazz       0.83      0.87      0.85       467
        rap       0.80      0.75      0.77       449
       rock       0.73      0.74      0.73       435

avg / total       0.77      0.77      0.77      1800



## LR GridSearchCV

In [763]:
from sklearn.model_selection import GridSearchCV
param_grid = {'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'], 'multi_class':['ovr', 'multinomial']}

In [764]:
grid = GridSearchCV(lr_original, param_grid, verbose=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] multi_class=ovr, solver=newton-cg ...............................
[CV]  multi_class=ovr, solver=newton-cg, score=0.7574893009985735, total=   0.1s
[CV] multi_class=ovr, solver=newton-cg ...............................
[CV]  multi_class=ovr, solver=newton-cg, score=0.7555396711937098, total=   0.2s
[CV] multi_class=ovr, solver=newton-cg ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV]  multi_class=ovr, solver=newton-cg, score=0.771265189421015, total=   0.1s
[CV] multi_class=ovr, solver=sag .....................................
[CV]  multi_class=ovr, solver=sag, score=0.7574893009985735, total=   0.3s
[CV] multi_class=ovr, solver=sag .....................................
[CV]  multi_class=ovr, solver=sag, score=0.7555396711937098, total=   0.3s
[CV] multi_class=ovr, solver=sag .....................................
[CV]  multi_class=ovr, solver=sag, score=0.771265189421015, total=   0.2s
[CV] multi_class=ovr, solver=saga ....................................




[CV]  multi_class=ovr, solver=saga, score=0.7574893009985735, total=   0.6s
[CV] multi_class=ovr, solver=saga ....................................
[CV]  multi_class=ovr, solver=saga, score=0.7555396711937098, total=   0.5s
[CV] multi_class=ovr, solver=saga ....................................
[CV]  multi_class=ovr, solver=saga, score=0.771265189421015, total=   0.1s
[CV] multi_class=ovr, solver=lbfgs ...................................
[CV]  multi_class=ovr, solver=lbfgs, score=0.7574893009985735, total=   0.1s
[CV] multi_class=ovr, solver=lbfgs ...................................
[CV]  multi_class=ovr, solver=lbfgs, score=0.7555396711937098, total=   0.1s
[CV] multi_class=ovr, solver=lbfgs ...................................
[CV]  multi_class=ovr, solver=lbfgs, score=0.771265189421015, total=   0.1s
[CV] multi_class=multinomial, solver=newton-cg .......................
[CV]  multi_class=multinomial, solver=newton-cg, score=0.760342368045649, total=   0.2s
[CV] multi_class=multinomial,

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    4.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'], 'multi_class': ['ovr', 'multinomial']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [765]:
grid.best_params_

{'multi_class': 'multinomial', 'solver': 'newton-cg'}

## LR best 

In [766]:
lr_best = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr_best.fit(X_train, y_train)
lr_predictions2 = lr_best.predict(X_test)
print(confusion_matrix(y_test, lr_predictions2))
print(classification_report(list(y_test), list(lr_predictions2)))

[[322  12  67  48]
 [ 13 404   5  45]
 [ 68  19 338  24]
 [ 47  54  14 320]]
             precision    recall  f1-score   support

      dance       0.72      0.72      0.72       449
       jazz       0.83      0.87      0.85       467
        rap       0.80      0.75      0.77       449
       rock       0.73      0.74      0.73       435

avg / total       0.77      0.77      0.77      1800



We notice that we get the exact same results by changing solver parameter in our logistic regression model. Looks like this parameter has only little influence for this data

## LR SelectFromModel

In [767]:
from sklearn.feature_selection import SelectFromModel

In [768]:
lr_sfm = SelectFromModel(lr_best, prefit=True)
X_train_new = lr_sfm.transform(X_train)

In [769]:
X_train_new.shape

(4200, 6)

Only 6 features selected

In [770]:
X_test_new = lr_sfm.transform(X_test)
X_test_new.shape

(1800, 6)

In [771]:
X_test_new_index = lr_sfm.get_support()
X_test_new_name = final_features.columns[X_test_new_index]
print(X_test_new_name)

Index(['energy', 'speechiness', 'acousticness', 'danceability', 'mode',
       'key_confidence'],
      dtype='object')


In [772]:
lr_opti = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr_opti.fit(X_train_new, y_train)
lr_predictions3 = lr_opti.predict(X_test_new)
print(confusion_matrix(y_test, lr_predictions3))
print(classification_report(list(y_test), list(lr_predictions3)))

[[314   9  70  56]
 [ 11 393   6  57]
 [ 88  19 308  34]
 [ 62  60  15 298]]
             precision    recall  f1-score   support

      dance       0.66      0.70      0.68       449
       jazz       0.82      0.84      0.83       467
        rap       0.77      0.69      0.73       449
       rock       0.67      0.69      0.68       435

avg / total       0.73      0.73      0.73      1800



We can notice the accuracy score decreased after feature selection compared to the previous runs. Problably need to add a threshold. Let's try with the median

In [773]:
lr_sfm = SelectFromModel(lr_best, threshold='median', prefit=True)
X_train_new2 = lr_sfm.transform(X_train)
X_train_new2.shape

(4200, 9)

In [774]:
X_test_new2 = lr_sfm.transform(X_test)
X_test_new2.shape

(1800, 9)

We notice that by adding the median threshold the model selected more features

In [775]:
X_test_new_index2 = lr_sfm.get_support()
X_test_new_name2 = final_features.columns[X_test_new_index2]
print(X_test_new_name2)

Index(['energy', 'speechiness', 'acousticness', 'instrumentalness', 'loudness',
       'valence', 'danceability', 'mode', 'key_confidence'],
      dtype='object')


In [776]:
lr_opti2 = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr_opti2.fit(X_train_new2, y_train)
lr_predictions4 = lr_opti2.predict(X_test_new2)
print(confusion_matrix(y_test, lr_predictions4))
print(classification_report(list(y_test), list(lr_predictions4)))

[[314  10  68  57]
 [ 18 392   4  53]
 [ 70  18 325  36]
 [ 44  57  14 320]]
             precision    recall  f1-score   support

      dance       0.70      0.70      0.70       449
       jazz       0.82      0.84      0.83       467
        rap       0.79      0.72      0.76       449
       rock       0.69      0.74      0.71       435

avg / total       0.75      0.75      0.75      1800



The score is slightly higher than previously although still lower than the model including all features. However we now have around half the number of initial features, meaning a lower computation time

## LR RFE

In [777]:
from sklearn.feature_selection import RFE

In [778]:
lr_rfe = LogisticRegression(solver='newton-cg', multi_class='multinomial')
rfe_model = RFE(lr_rfe, 9)

In [779]:
rfe_model = rfe_model.fit(X_train, y_train)

In [780]:
print(rfe_model.support_)
print(rfe_model.ranking_)

[False  True False False  True  True  True False False  True  True  True
  True False False  True False]
[8 1 9 4 1 1 1 2 5 1 1 1 1 7 3 1 6]


In [781]:
selected_features = final_features.columns[rfe_model.support_]
print(selected_features)

Index(['energy', 'speechiness', 'acousticness', 'instrumentalness', 'loudness',
       'valence', 'danceability', 'mode', 'key_confidence'],
      dtype='object')


We notice the selected features are the sames as used in SelectFromModel with median threshold

In [782]:
rfe_predictions = rfe_model.predict(X_test)

In [783]:
print(confusion_matrix(y_test, rfe_predictions))
print(classification_report(list(y_test), list(rfe_predictions)))

[[314  10  68  57]
 [ 18 392   4  53]
 [ 70  18 325  36]
 [ 44  57  14 320]]
             precision    recall  f1-score   support

      dance       0.70      0.70      0.70       449
       jazz       0.82      0.84      0.83       467
        rap       0.79      0.72      0.76       449
       rock       0.69      0.74      0.71       435

avg / total       0.75      0.75      0.75      1800



We found the exact same results as in SelectFromModel with the median threshold. Let's also compare with only 6 features (same as SelectFromModel without the median threshold)

In [784]:
lr_rfe = LogisticRegression(solver='newton-cg', multi_class='multinomial')
rfe_model2 = RFE(lr_rfe, 6)
rfe_model2 = rfe_model.fit(X_train, y_train)

In [785]:
print(rfe_model2.support_)
print(rfe_model2.ranking_)

[False  True False False  True  True  True False False  True  True  True
  True False False  True False]
[8 1 9 4 1 1 1 2 5 1 1 1 1 7 3 1 6]


In [786]:
selected_features2 = final_features.columns[rfe_model2.support_]
print(selected_features2)

Index(['energy', 'speechiness', 'acousticness', 'instrumentalness', 'loudness',
       'valence', 'danceability', 'mode', 'key_confidence'],
      dtype='object')


Once again, same features as in SelectFromModel

In [787]:
rfe_predictions2 = rfe_model2.predict(X_test)

In [788]:
print(confusion_matrix(y_test, rfe_predictions2))
print(classification_report(list(y_test), list(rfe_predictions2)))

[[314  10  68  57]
 [ 18 392   4  53]
 [ 70  18 325  36]
 [ 44  57  14 320]]
             precision    recall  f1-score   support

      dance       0.70      0.70      0.70       449
       jazz       0.82      0.84      0.83       467
        rap       0.79      0.72      0.76       449
       rock       0.69      0.74      0.71       435

avg / total       0.75      0.75      0.75      1800



Let's continue the study with Support Vector Machine

# Support Vector Machine

## SVC original

In [789]:
from sklearn.svm import SVC
svc_original = SVC(C=1, gamma=1)

In [790]:
svc_original.fit(X_train, y_train)
svc_predictions = svc_original.predict(X_test)

In [791]:
print(confusion_matrix(y_test, svc_predictions))
print(classification_report(list(y_test), list(svc_predictions)))

[[194 147  50  58]
 [  5 437   4  21]
 [ 25 164 231  29]
 [ 27 208   8 192]]
             precision    recall  f1-score   support

      dance       0.77      0.43      0.55       449
       jazz       0.46      0.94      0.61       467
        rap       0.79      0.51      0.62       449
       rock       0.64      0.44      0.52       435

avg / total       0.66      0.59      0.58      1800



## SVC GridSearchCV

In [792]:
param_grid = {'C': [0.1,1, 10], 'gamma': [1,0.1,0.01,0.001]}

In [793]:
grid = GridSearchCV(svc_original, param_grid, verbose=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ........ C=0.1, gamma=1, score=0.25320970042796004, total=   0.9s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV] ......... C=0.1, gamma=1, score=0.2537526804860615, total=   0.9s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV] ......... C=0.1, gamma=1, score=0.2537526804860615, total=   0.7s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7510699001426534, total=   0.5s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7455325232308792, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7562544674767692, total=   0.6s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7560627674750356, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7455325232308792, total=   0.4s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7605432451751251, total=   0.6s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   33.6s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [794]:
grid.best_params_

{'C': 10, 'gamma': 0.01}

## SVC best

In [795]:
svc_best = SVC(C=10, gamma=0.01)
svc_best.fit(X_train, y_train)
svc_predictions2 = svc_best.predict(X_test)
print(confusion_matrix(y_test, svc_predictions2))
print(classification_report(list(y_test), list(svc_predictions2)))

[[328  14  60  47]
 [ 14 418   4  31]
 [ 58  20 351  20]
 [ 48  47  16 324]]
             precision    recall  f1-score   support

      dance       0.73      0.73      0.73       449
       jazz       0.84      0.90      0.87       467
        rap       0.81      0.78      0.80       449
       rock       0.77      0.74      0.76       435

avg / total       0.79      0.79      0.79      1800



The score increased substantially by optimizing the parameters on the SVC model. Let's try to optimize the number of parameters by using SelectKBest

## SVC SelectKBest

In [796]:
from sklearn.feature_selection import SelectKBest

Let's use the number of parameters found earlier with SelectFromModel (9 features)

In [797]:
selector = SelectKBest(k=9)

In [798]:
X_new = selector.fit_transform(X, y)
X_new.shape

(6000, 9)

In [799]:
final_features.columns[selector.get_support(indices=True)].tolist()

['energy',
 'tempo',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'valence',
 'mode',
 'key_confidence',
 'mode_confidence']

We notice that the 9 features differ from the ones selected with SelectedFromModel

In [800]:
X = X_new
y = final_labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [801]:
svc_opti = SVC(C=10, gamma=0.01)
svc_opti.fit(X_train, y_train)
svc_predictions3 = svc_opti.predict(X_test)
print(confusion_matrix(y_test, svc_predictions3))
print(classification_report(list(y_test), list(svc_predictions3)))

[[319  13  61  56]
 [ 12 406   9  40]
 [ 61  20 344  24]
 [ 54  68  15 298]]
             precision    recall  f1-score   support

      dance       0.72      0.71      0.71       449
       jazz       0.80      0.87      0.83       467
        rap       0.80      0.77      0.78       449
       rock       0.71      0.69      0.70       435

avg / total       0.76      0.76      0.76      1800



We notice that the accuracy is lower than before adjusting the parameters, however it costs less in calculation power. We can also notice this model is slightly more accurate than the Logistic Regression ones

Let's now look at the Random Forest model

# Random Forest

## RFC original

In [802]:
from sklearn.ensemble import RandomForestClassifier

In [803]:
rfc_original = RandomForestClassifier(n_estimators=5, min_samples_split=2, max_features='log2')

In [804]:
X = final_features
#No need to use the scaled features
y = final_labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [805]:
rfc_original.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [806]:
rfc_predictions = rfc_original.predict(X_test)
print(confusion_matrix(y_test, rfc_predictions))
print(classification_report(list(y_test), list(rfc_predictions)))

[[320  16  64  49]
 [ 22 392  10  43]
 [ 63  24 341  21]
 [ 67  67  29 272]]
             precision    recall  f1-score   support

      dance       0.68      0.71      0.69       449
       jazz       0.79      0.84      0.81       467
        rap       0.77      0.76      0.76       449
       rock       0.71      0.63      0.66       435

avg / total       0.74      0.74      0.73      1800



Let's now look at the parameter optimization with GridSearchCV

## RFC GridSearchCV

In [807]:
param_grid = {'n_estimators': [5, 10, 100], 'min_samples_split': [2, 3, 4, 5, 10], 'max_features': ['sqrt', 'log2', 'auto']}

In [808]:
grid = GridSearchCV(rfc_original, param_grid, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:   54.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 100], 'min_samples_split': [2, 3, 4, 5, 10], 'max_features': ['sqrt', 'log2', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [809]:
grid.best_params_

{'max_features': 'log2', 'min_samples_split': 4, 'n_estimators': 100}

## RFC best

In [810]:
rfc_best = RandomForestClassifier(n_estimators=100, min_samples_split=3, max_features='log2')
rfc_best.fit(X_train, y_train)
rfc_predictions2 = rfc_best.predict(X_test)
print(confusion_matrix(y_test, rfc_predictions2))
print(classification_report(list(y_test), list(rfc_predictions2)))

[[327  12  62  48]
 [ 16 405   9  37]
 [ 37  21 364  27]
 [ 39  55  16 325]]
             precision    recall  f1-score   support

      dance       0.78      0.73      0.75       449
       jazz       0.82      0.87      0.84       467
        rap       0.81      0.81      0.81       449
       rock       0.74      0.75      0.75       435

avg / total       0.79      0.79      0.79      1800



We notice that the accuracy score increased compared to the previous model. We can also notice that this score is similar to the SVC score after parameter optimization

Let's now try to optimize the features

## RFC SelectFromModel

In [811]:
importances = pd.DataFrame(data={'feature': final_features.columns, 'importance': np.round(rfc_best.feature_importances_,3)})
importances

Unnamed: 0,feature,importance
0,key,0.018
1,energy,0.083
2,liveliness,0.029
3,tempo,0.072
4,speechiness,0.127
5,acousticness,0.162
6,instrumentalness,0.063
7,time_signature,0.008
8,duration,0.004
9,loudness,0.054


In [812]:
from numpy import sort
thresholds = sort(rfc_best.feature_importances_)
thresholds

array([0.00379726, 0.00775926, 0.01763303, 0.02735475, 0.02880325,
       0.03258681, 0.03379083, 0.03885615, 0.05388322, 0.05580026,
       0.06293102, 0.06666359, 0.07190449, 0.08346614, 0.12556962,
       0.12749537, 0.16170495])

In [813]:
from sklearn.metrics import accuracy_score
for i in thresholds:
    rfc_sfm = SelectFromModel(rfc_best, threshold=i, prefit=True)
    select_X_train = rfc_sfm.transform(X_train)
    rfc2 = RandomForestClassifier()
    rfc2.fit(select_X_train, y_train)
    select_X_test = rfc_sfm.transform(X_test)
    predictions = rfc2.predict(select_X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(i)
    print(select_X_train.shape[1])
    print(accuracy)
    print("")

0.0037972553896849447
17
0.7733333333333333

0.00775926333750653
16
0.7572222222222222

0.017633027299059672
15
0.75

0.027354749106531778
14
0.76

0.028803248549640396
13
0.7544444444444445

0.03258680880369927
12
0.7533333333333333

0.03379083251868591
11
0.77

0.038856151818175755
10
0.7577777777777778

0.05388321779723584
9
0.7594444444444445

0.055800257608282236
8
0.7572222222222222

0.06293102402486409
7
0.7494444444444445

0.06666359330320056
6
0.735

0.07190449301916034
5
0.7194444444444444

0.08346613505840575
4
0.6911111111111111

0.1255696210167754
3
0.6705555555555556

0.12749536707519343
2
0.5894444444444444

0.16170495427389817
1
0.4633333333333333



The above analysis seems to be in agreement with our present studies: 9 features shoudl provide a good accuracy with only a few tradeoffs

In [814]:
rfc_sfm2 = SelectFromModel(rfc_best, threshold=0.0533, prefit=True)
X_train_new = rfc_sfm2.transform(X_train)
X_train_new.shape

(4200, 9)

In [815]:
X_test_new = rfc_sfm2.transform(X_test)
X_test_new.shape

(1800, 9)

In [816]:
X_test_new_index = rfc_sfm2.get_support()
X_test_new_name = final_features.columns[X_test_new_index]
print(X_test_new_name)

Index(['energy', 'tempo', 'speechiness', 'acousticness', 'instrumentalness',
       'loudness', 'valence', 'mode', 'key_confidence'],
      dtype='object')


We can notice the selected features are different from 2 previous models

In [817]:
rfc_opti = RandomForestClassifier(n_estimators=100, min_samples_split=3, max_features='log2')
rfc_opti.fit(X_train_new, y_train)
rfc_predictions3 = rfc_opti.predict(X_test_new)
print(confusion_matrix(y_test, rfc_predictions3))
print(classification_report(list(y_test), list(rfc_predictions3)))

[[323  11  60  55]
 [ 15 400  10  42]
 [ 49  20 356  24]
 [ 44  52  16 323]]
             precision    recall  f1-score   support

      dance       0.75      0.72      0.73       449
       jazz       0.83      0.86      0.84       467
        rap       0.81      0.79      0.80       449
       rock       0.73      0.74      0.73       435

avg / total       0.78      0.78      0.78      1800



Again the score is slightly lower than just before however the number of features is lower. So there is only a small accuracy tradeoff for lowert computation time

Last step, let's now compare with RFE

## RFC RFE

In [818]:
rfc_rfe = RandomForestClassifier(n_estimators=100, min_samples_split=3, max_features='log2')
rfe_model = RFE(rfc_rfe, 9)

In [819]:
rfe_model = rfe_model.fit(X_train, y_train)

In [820]:
print(rfe_model.support_)
print(rfe_model.ranking_)

[False  True False  True  True  True  True False False  True  True False
  True False False  True False]
[7 1 5 1 1 1 1 8 9 1 1 2 1 3 4 1 6]


In [821]:
selected_features = final_features.columns[rfe_model.support_]
print(selected_features)

Index(['energy', 'tempo', 'speechiness', 'acousticness', 'instrumentalness',
       'loudness', 'valence', 'mode', 'key_confidence'],
      dtype='object')


The features selected are the same as in the above model

In [822]:
rfe_predictions = rfe_model.predict(X_test)

In [823]:
print(confusion_matrix(y_test, rfe_predictions))
print(classification_report(list(y_test), list(rfe_predictions)))

[[327  12  57  53]
 [ 13 397  13  44]
 [ 40  20 359  30]
 [ 45  56  17 317]]
             precision    recall  f1-score   support

      dance       0.77      0.73      0.75       449
       jazz       0.82      0.85      0.83       467
        rap       0.80      0.80      0.80       449
       rock       0.71      0.73      0.72       435

avg / total       0.78      0.78      0.78      1800



Similar results to above. Again, compared to the study without feature selection the score is slightly lower which is only a small tradeoff as the computation is cheaper.

I unfortunately do not have enough time to explore the mood column or to look for the right number of features for the SVC method, but this would be interesting to explore