# Data preparation

## Import the data

In [43]:
import pandas as pd

In [44]:
songs = pd.read_json('MasterSongList.json')
songs.head()

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089
1,{'$oid': '52fdfb3d0b9398049f3cbc8e'},Native,OneRepublic,"[6, 0.7457039999999999, 0.11995499999999999, 1...",[energetic],[2012],[pop],"[lately, i, ve, been, i, ve, been, losing, sle...",[happy],Counting Stars,energetic,http://images.musicnet.com/albums/081/851/887/...,5839.0,[energy boost],hT_nvWreIhg,1020297206
2,{'$oid': '52fdfb420b9398049f3d3ea5'},Party Rock Anthem,LMFAO,"[5, 0.709932, 0.231455, 130.03, 0.121740999999...","[energetic, energetic, energetic, energetic]",[],[],"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]",Party Rock Anthem,housework,http://images.musicnet.com/albums/049/414/127/...,52379.0,"[energy boost, pleasing a crowd, housework, dr...",KQ6zr6kCPj8,971128436
3,{'$oid': '52fdfb410b9398049f3d1eac'},Gentleman,PSY,"[3, 0.705822, 0.053292, 126.009, 0.126016, 0.0...","[party, party, party, party, party, party]",[2010s],[dance],"[alagamun, lan, weh, wakun, heya, hanun, gon, ...","[happy, energetic, celebratory]",Gentleman,energetic,http://images.musicnet.com/albums/082/950/461/...,12353.0,"[driving in the left lane, energy boost, girls...",ASO_zypdnsQ,892096527
4,{'$oid': '52fdfb400b9398049f3d0b19'},On The Floor,Jennifer Lopez,"[3, 0.741757, 0.07277399999999999, 129.985, 0....","[party, party]",[2000s],[reggaeton],"[j, lo, the, other, side, out, my, mine, it, s...",[energetic],On The Floor,work out,http://images.musicnet.com/albums/050/131/765/...,29502.0,"[working out: cardio, dance party: sweaty]",t4H_Zoh7G5A,873285189


## Clean the genres column

Change genres from list to strings

In [45]:
songs2 = songs.copy()
songs2['genres'] = songs2['genres'].apply(''.join)

In [46]:
songs2['genres'][0:10]

0          pop
1          pop
2             
3        dance
4    reggaeton
5             
6    r&b: soul
7          pop
8             
9          pop
Name: genres, dtype: object

In [47]:
view_genres = songs2['genres'].unique()
view_genres.sort()
for i in view_genres:
    print(i)


bluegrass
blues & blues rock
children's
christian
classical
country
country: classic country
country: contemporary country
dance
dance: disco & nu disco
dance: house & techno
dubstep & drum 'n' bass
easy listening
electronica
film scores
folk
funk
hawaiian 
indie: indie electronic
indie: indie folk & americana
indie: indie pop
indie: indie rock
int'l: african
int'l: asian
int'l: brazilian
int'l: jamaican
int'l: mediterranean
international/world
jazz
jazz: vocal jazz
latin
latin: puerto rican
latin: salsa
latin: tropical
nature sounds
oldies
pop
pop: classic pop
pop: dance pop
pop: soft pop
r&b
r&b: classic r&b
r&b: contemporary r&b
r&b: soul
rap
rap: classic mainstream rap
rap: old school rap
rap: today's mainstream rap
rap: underground & alternative rap
reggae & ska
reggaeton
rock
rock: classic alternative & punk
rock: classic rock
rock: contemporary alternative
rock: emo/pop-punk
rock: hard rock
rock: metal
rock: modern rock
rock: rockabilly
showtunes
singer-songwriter


Split the genres by ":" to keep only the first one

In [48]:
def split_first_genre(genre):
    if len(genre) > 0:
        return genre.split(':')[0]
    else:
        return genre

songs2['genres'] = songs2['genres'].apply(split_first_genre)

In [49]:
view_genres = songs2['genres'].unique()
view_genres.sort()
for i in view_genres:
    print(i)


bluegrass
blues & blues rock
children's
christian
classical
country
dance
dubstep & drum 'n' bass
easy listening
electronica
film scores
folk
funk
hawaiian 
indie
int'l
international/world
jazz
latin
nature sounds
oldies
pop
r&b
rap
reggae & ska
reggaeton
rock
showtunes
singer-songwriter


Let's keep the cleaned genres column, we will use it later on

In [50]:
genres_col = songs2['genres']

## Work with the audio features column

In [51]:
songs2['audio_features'][0:10]

0    [11, 0.912744, 0.083704, 132.069, 0.293137, 0....
1    [6, 0.7457039999999999, 0.11995499999999999, 1...
2    [5, 0.709932, 0.231455, 130.03, 0.121740999999...
3    [3, 0.705822, 0.053292, 126.009, 0.126016, 0.0...
4    [3, 0.741757, 0.07277399999999999, 129.985, 0....
5    [8, 0.733856, 0.093043, 174.952, 0.05813699999...
6    [8, 0.7773749999999999, 0.054104, 104.946, 0.0...
7    [7, 0.585564, 0.10829699999999999, 120.014, 0....
8    [0, 0.418212, 0.105322, 129.054, 0.045461, 0.5...
9    [4, 0.81403, 0.079196, 124.991, 0.07244, 0.005...
Name: audio_features, dtype: object

In [52]:
audio_features_headers = ['key', 'energy', 'liveliness', 'tempo', 'speechiness', 'acousticness', 'instrumentalness', 'time_signature', 'duration', 'loudness', 'valence', 'danceability', 'mode', 'time_signature_confidence', 'tempo_confidence', 'key_confidence', 'mode_confidence']

Make a list of list of the features

In [53]:
audio_features_list = songs2['audio_features'].tolist()

Convert the list to a dataframe

In [54]:
audio_features_df = pd.DataFrame(audio_features_list, columns=audio_features_headers)
audio_features_df.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [55]:
audio_features_df['genres'] = songs2['genres']
audio_features_df.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton


Let's look at the NaN values in the dataframe

In [56]:
import numpy as np
def checknan(x):
    return np.isnan(x)

In [57]:
cols = audio_features_df.columns
cols = cols.drop('genres')
#for i in cols:
#    nans = audio_features_df[i].apply(checknan)
#    print(audio_features_df[nans])

By looking at the above data, it seems like rows including NaN values actually have all their features as NaN (I removed the printed table to improve readability). Let's remove those rows from the dataframe

In [58]:
audio_features_df.shape

(36733, 18)

In [59]:
audio_features_df = audio_features_df.dropna(axis=0, how='any')
audio_features_df.shape

(30296, 18)

## Select 2 genres and balance dataframe

Let's have a look at the main genres

In [60]:
audio_features_df['genres'].value_counts()[0:10]

rock                 6435
                     3135
rap                  2452
r&b                  2344
dance                2000
jazz                 1889
indie                1834
electronica          1249
country              1075
singer-songwriter    1034
Name: genres, dtype: int64

I wish to pick rap and jazz

In [61]:
rap_songs = audio_features_df[audio_features_df['genres'] == 'rap']
rap_songs.shape

(2452, 18)

In [62]:
jazz_songs = audio_features_df[audio_features_df['genres'] == 'jazz']
jazz_songs.shape

(1889, 18)

We can notice there are more rap songs. Let's select a random sample of the same size as jazz songs

In [63]:
sample_rap_songs = rap_songs.sample(n=len(jazz_songs), random_state=101)
sample_rap_songs.shape

(1889, 18)

We can now concatenate the 2 dataframes

In [64]:
final_df = pd.concat([jazz_songs, sample_rap_songs])
final_df['genres'].value_counts()

rap     1889
jazz    1889
Name: genres, dtype: int64

Right now jazz and rap songs are grouped, let's randomize the dataframe and reset the index

In [65]:
final_df = final_df.sample(frac=1, random_state=101).reset_index(drop=True)
final_df.head(10)

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,1.0,0.870437,0.048915,104.09,0.038828,0.431946,0.844602,1.0,4.0,420.46667,-9.688,0.9685,0.718223,0.308,0.437,0.777,1.0,jazz
1,9.0,0.286595,0.102147,110.859,0.039914,0.804336,0.044933,1.0,5.0,183.96426,-16.628,0.474371,0.535849,0.0,0.24,0.833,0.47,jazz
2,2.0,0.195476,0.118042,85.615,0.030825,0.915787,0.875622,0.0,4.0,451.30358,-14.029,0.239448,0.380462,0.143,0.454,0.384,0.831,jazz
3,11.0,0.697546,0.161159,85.869,0.225095,0.023335,0.005098,1.0,4.0,297.57333,-9.025,0.796561,0.698103,0.49,0.458,0.499,1.0,rap
4,7.0,0.011078,0.152958,106.613,0.04309,0.977958,1.4e-05,1.0,4.0,168.8,-21.842,0.196808,0.508681,0.548,0.49,0.062,0.902,jazz
5,3.0,0.085161,0.13433,119.862,0.046687,0.954111,0.874544,1.0,4.0,277.98667,-19.729,0.18425,0.501791,0.307,0.419,0.105,1.0,jazz
6,9.0,0.657858,0.353692,184.509,0.342888,0.108643,1e-06,0.0,4.0,202.84952,-11.11,0.749261,0.582267,0.304,0.31,0.049,1.0,rap
7,7.0,0.057456,0.188454,117.915,0.038652,0.935244,0.014336,1.0,4.0,542.86667,-18.923,0.24466,0.584098,0.529,0.577,0.293,0.967,jazz
8,7.0,0.680885,0.049152,136.936,0.047375,0.623225,0.621041,1.0,4.0,146.12,-9.135,0.964715,0.65495,0.712,0.652,0.965,1.0,jazz
9,11.0,0.090816,0.138365,115.504,0.0626,0.79559,0.0,1.0,4.0,289.53333,-17.926,0.1223,0.517401,0.595,0.613,0.018,0.936,jazz


# k Nearest Neighbors

## Train Test Split method

In [66]:
from sklearn.neighbors import KNeighborsClassifier

In [67]:
knn = KNeighborsClassifier(n_neighbors=3)

In [68]:
X = final_df.drop(['genres'], axis=1)
y = final_df['genres']

Let's scale and center the features

In [69]:
from sklearn.preprocessing import StandardScaler

In [70]:
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
X = np.round(X, decimals=2)
X[0:3]

array([[-1.19,  1.38, -0.91, -0.29, -0.76, -0.02,  1.76,  0.84,  0.19,
         1.39,  0.24,  1.76,  0.62, -0.44,  0.08,  1.41,  0.6 ],
       [ 1.06, -0.92, -0.61, -0.07, -0.76,  0.98, -0.52,  0.84,  2.7 ,
        -0.69, -1.06, -0.34, -0.46, -1.76, -1.07,  1.61, -1.82],
       [-0.91, -1.28, -0.53, -0.9 , -0.82,  1.28,  1.84, -1.2 ,  0.19,
         1.67, -0.57, -1.34, -1.38, -1.15,  0.18, -0.  , -0.17]])

We now need to create training and testing partitions: 30% of the data will be used as testing data and the rest as training

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

We can now fit the model on the training data

In [73]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

Now make predictions on the genre for the X_test data

In [74]:
predictions = knn.predict(X_test)

To evaluate the accuracy f our model, let's compare the predicted data made from X_test (predictions) with the real results: y_train

Let's have a look at the confusion matrix

In [75]:
from nltk import ConfusionMatrix

In [76]:
print(ConfusionMatrix(list(y_test), list(predictions)))

     |   j     |
     |   a   r |
     |   z   a |
     |   z   p |
-----+---------+
jazz |<541> 36 |
 rap |  50<507>|
-----+---------+
(row = reference; col = test)



We can already notice a lot fo true positives and true negatives. Let's look at precision, recall and f1-scores

In [77]:
from sklearn.metrics import classification_report

In [78]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

       jazz       0.92      0.94      0.93       577
        rap       0.93      0.91      0.92       557

avg / total       0.92      0.92      0.92      1134



# 10-fold cross-validation

To make sure the split did not have an influence on the results, let's use cross validation with 10 folds

In [79]:
from sklearn.cross_validation import cross_val_score

In [80]:
scores = cross_val_score(knn, X, y, cv=10)
print(scores)
print(np.round(np.mean(scores), decimals=2))

[0.92857143 0.91798942 0.92592593 0.94179894 0.8968254  0.92328042
 0.93650794 0.95238095 0.91798942 0.92819149]
0.93


In [81]:
precision_scores = cross_val_score(knn, X, y, cv=10, scoring='precision_macro')
print(precision_scores)
print(np.round(np.mean(precision_scores), decimals=2))

[0.92858343 0.91809476 0.92827586 0.94224464 0.89817416 0.9267329
 0.9369483  0.95258367 0.91800112 0.92830055]
0.93


In [82]:
recall_scores = cross_val_score(knn, X, y, cv=10, scoring='recall_macro')
print(recall_scores)
print(np.round(np.mean(recall_scores), decimals=2))

[0.92857143 0.91798942 0.92592593 0.94179894 0.8968254  0.92328042
 0.93650794 0.95238095 0.91798942 0.92819149]
0.93


In [83]:
f1_scores = cross_val_score(knn, X, y, cv=10, scoring='f1_macro')
print(f1_scores)
print(np.round(np.mean(f1_scores), decimals=2))

[0.92857093 0.91798425 0.92582418 0.94178427 0.89673795 0.92312493
 0.93649194 0.95237562 0.91798884 0.92818692]
0.93


Similar scores with Cross validation