In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())
import json
import pandas as pd
import pprint

In [2]:
import numpy as np
from plotly import express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

In [3]:
# note : the current databases are built with the following columns 
# ["danceability","energy","loudness","speechiness","acousticness",
#    "instrumentalness","liveness","valence","tempo","id","duration_ms"]
# we could add the mode 

## Clustering
Note : we could add the PCA

In [5]:
data=pd.read_csv("audio_features.csv", sep=',')
data.drop(columns="Unnamed: 0", inplace=True)
data

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms
0,0.670,0.750,-4.895,0.1730,0.194000,0.000000,0.3690,0.526,77.045,6hZPajczr6rZV0BoiVaL9G,229738
1,0.631,0.743,-4.353,0.0285,0.123000,0.000000,0.1310,0.709,134.961,32s2XKjyXifmlGfU2tkT8y,178187
2,0.395,0.988,-1.936,0.1630,0.000748,0.000536,0.2410,0.327,152.974,7qNC8YKkXrzdKvcpL3xra6,165280
3,0.603,0.457,-5.384,0.2970,0.526000,0.000000,0.0905,0.455,73.724,5IvdLxVB8gwwcgnEkXSSW2,242972
4,0.547,0.758,-4.780,0.0285,0.307000,0.000001,0.1860,0.308,87.928,1iQ1B8aUtGMdsezQhCTgMQ,195960
...,...,...,...,...,...,...,...,...,...,...,...
1086,0.659,0.760,-2.564,0.1510,0.237000,0.000000,0.1480,0.742,119.849,0Yc1PcFl82lC2w3Bu4fXGP,175800
1087,0.616,0.810,-6.190,0.3740,0.090200,0.000000,0.4250,0.703,121.024,1ztFzSOgRMVTMOE8r0Jmlp,176692
1088,0.653,0.835,-5.769,0.2920,0.174000,0.000000,0.3060,0.515,135.041,6CZpEC9kyyCJWWZKPnviXY,233267
1089,0.633,0.813,-5.025,0.1540,0.118000,0.000000,0.1330,0.782,178.097,6L4okhXpRc3yFh5lhGWI9X,154547


In [6]:
data.set_index("id", inplace=True)
data

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6hZPajczr6rZV0BoiVaL9G,0.670,0.750,-4.895,0.1730,0.194000,0.000000,0.3690,0.526,77.045,229738
32s2XKjyXifmlGfU2tkT8y,0.631,0.743,-4.353,0.0285,0.123000,0.000000,0.1310,0.709,134.961,178187
7qNC8YKkXrzdKvcpL3xra6,0.395,0.988,-1.936,0.1630,0.000748,0.000536,0.2410,0.327,152.974,165280
5IvdLxVB8gwwcgnEkXSSW2,0.603,0.457,-5.384,0.2970,0.526000,0.000000,0.0905,0.455,73.724,242972
1iQ1B8aUtGMdsezQhCTgMQ,0.547,0.758,-4.780,0.0285,0.307000,0.000001,0.1860,0.308,87.928,195960
...,...,...,...,...,...,...,...,...,...,...
0Yc1PcFl82lC2w3Bu4fXGP,0.659,0.760,-2.564,0.1510,0.237000,0.000000,0.1480,0.742,119.849,175800
1ztFzSOgRMVTMOE8r0Jmlp,0.616,0.810,-6.190,0.3740,0.090200,0.000000,0.4250,0.703,121.024,176692
6CZpEC9kyyCJWWZKPnviXY,0.653,0.835,-5.769,0.2920,0.174000,0.000000,0.3060,0.515,135.041,233267
6L4okhXpRc3yFh5lhGWI9X,0.633,0.813,-5.025,0.1540,0.118000,0.000000,0.1330,0.782,178.097,154547


In [7]:
scaler = StandardScaler()
data_norm = scaler.fit_transform(data)
# data_norm is an array - eq. to X_normalized 
data_norm

array([[ 0.62162706,  0.35199228,  0.68758062, ...,  0.07576189,
        -1.45445037, -0.09934238],
       [ 0.4081261 ,  0.32418231,  0.79541023, ...,  0.78457574,
         0.47232335, -0.40650055],
       [-0.88382847,  1.29753113,  1.2762666 , ..., -0.69502477,
         1.07158738, -0.48340479],
       ...,
       [ 0.52856254,  0.68968473,  0.51370041, ...,  0.03315559,
         0.47498483, -0.07831542],
       [ 0.41907486,  0.60228197,  0.66171743, ...,  1.06732662,
         1.90738985, -0.54735561],
       [ 1.63986244,  0.12951255,  0.78745232, ..., -0.06755021,
         0.04379286,  0.46342839]])

## Clustering (without PCA)

In [8]:
kmeans = KMeans(n_clusters=6, n_init=10)

In [9]:
kmeans.fit(data_norm)
# try with .fit_predict

In [10]:
kmeans.inertia_

5944.683195205784

In [11]:
kmeans.cluster_centers_

array([[ 0.65154716,  0.33869186,  0.38451625, -0.24977   , -0.39730036,
        -0.26032714, -0.30249499,  0.75105337, -0.06599114, -0.05086   ],
       [-0.57539789,  0.59388509,  0.46737072, -0.15676245, -0.60652302,
        -0.04044426, -0.08581113, -0.65403866,  0.41011739, -0.01362988],
       [-0.7580154 ,  0.05828498, -0.20890927, -0.04842371,  0.15997867,
        -0.08097867,  3.31583904, -0.47806465,  0.14876897,  0.1853851 ],
       [-1.97862559, -2.03539076, -2.44510268, -0.5312354 ,  1.98142189,
         1.91770675, -0.33393136, -1.57308069, -0.95160488,  1.38465655],
       [ 0.02883406, -1.25092429, -1.01262679, -0.38145969,  1.4051398 ,
         0.29833967, -0.25966677,  0.0354593 , -0.24199891, -0.28487474],
       [ 0.73960003,  0.0315142 ,  0.21849001,  2.44894509, -0.12008817,
        -0.37644748,  0.2281959 ,  0.20860445, -0.18573661, -0.13332289]])

In [12]:
kmeans.n_iter_

19

In [13]:
kmeans.labels_

array([0, 0, 1, ..., 5, 0, 0], dtype=int32)

In [14]:
len(kmeans.labels_)

1091

In [18]:
#for c in np.unique(cluster_ids):
#    plt.scatter(miley_norm[cluster_ids == c, 0], miley_norm[cluster_ids == c, 1], marker='.', s=12)

In [15]:
# i want to merge the labels with the original dataframe 
data["cluster"]=kmeans.labels_
data 

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6hZPajczr6rZV0BoiVaL9G,0.670,0.750,-4.895,0.1730,0.194000,0.000000,0.3690,0.526,77.045,229738,0
32s2XKjyXifmlGfU2tkT8y,0.631,0.743,-4.353,0.0285,0.123000,0.000000,0.1310,0.709,134.961,178187,0
7qNC8YKkXrzdKvcpL3xra6,0.395,0.988,-1.936,0.1630,0.000748,0.000536,0.2410,0.327,152.974,165280,1
5IvdLxVB8gwwcgnEkXSSW2,0.603,0.457,-5.384,0.2970,0.526000,0.000000,0.0905,0.455,73.724,242972,5
1iQ1B8aUtGMdsezQhCTgMQ,0.547,0.758,-4.780,0.0285,0.307000,0.000001,0.1860,0.308,87.928,195960,1
...,...,...,...,...,...,...,...,...,...,...,...
0Yc1PcFl82lC2w3Bu4fXGP,0.659,0.760,-2.564,0.1510,0.237000,0.000000,0.1480,0.742,119.849,175800,0
1ztFzSOgRMVTMOE8r0Jmlp,0.616,0.810,-6.190,0.3740,0.090200,0.000000,0.4250,0.703,121.024,176692,5
6CZpEC9kyyCJWWZKPnviXY,0.653,0.835,-5.769,0.2920,0.174000,0.000000,0.3060,0.515,135.041,233267,5
6L4okhXpRc3yFh5lhGWI9X,0.633,0.813,-5.025,0.1540,0.118000,0.000000,0.1330,0.782,178.097,154547,0


In [16]:
# and then I want to create dataframes with the songs from each of the clusters
C0_songs=pd.DataFrame(data.loc[data["cluster"]==0])
C1_songs=pd.DataFrame(data.loc[data["cluster"]==1])
C2_songs=pd.DataFrame(data.loc[data["cluster"]==2])
C3_songs=pd.DataFrame(data.loc[data["cluster"]==3])
C4_songs=pd.DataFrame(data.loc[data["cluster"]==4])
C5_songs=pd.DataFrame(data.loc[data["cluster"]==5])
len(C0_songs), len(C1_songs), len (C2_songs), len (C3_songs), len (C4_songs), len(C5_songs)

(391, 317, 56, 55, 172, 100)

In [17]:
#in order to extract the means of their song measures 
C0_means=pd.DataFrame(C0_songs.describe())
C0_means.drop(columns="cluster", inplace=True)
cluster_0_means=C0_means.iloc[1:2:1]
cluster_0_means

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
mean,0.675465,0.746652,-6.418338,0.070283,0.120643,0.062687,0.156162,0.700345,118.780054,237874.900256


In [18]:
C1_means=pd.DataFrame(C1_songs.describe())
C1_means.drop(columns="cluster", inplace=True)
cluster_1_means=C1_means.iloc[1:2:1]
cluster_1_means

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
mean,0.451341,0.810886,-6.001874,0.079841,0.054624,0.124045,0.196828,0.337582,133.09118,244123.309148


In [19]:
C2_means=pd.DataFrame(C2_songs.describe())
C2_means.drop(columns="cluster", inplace=True)
cluster_2_means=C2_means.iloc[1:2:1]
cluster_2_means

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
mean,0.417982,0.676071,-9.401161,0.090975,0.296487,0.112734,0.835232,0.383014,125.235429,277524.410714


In [20]:
C3_means=pd.DataFrame(C3_songs.describe())
C3_means.drop(columns="cluster", inplace=True)
cluster_3_means=C3_means.iloc[1:2:1]
cluster_3_means

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
mean,0.195015,0.149076,-20.641273,0.041356,0.871227,0.67046,0.150262,0.100305,92.1598,478800.654545


In [21]:
C4_means=pd.DataFrame(C4_songs.describe())
C4_means.drop(columns="cluster", inplace=True)
cluster_4_means=C4_means.iloc[1:2:1]
cluster_4_means

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
mean,0.561715,0.346533,-13.441006,0.056749,0.689387,0.218581,0.164199,0.515595,113.489517,198599.715116


In [22]:
C5_means=pd.DataFrame(C5_songs.describe())
C5_means.drop(columns="cluster", inplace=True)
cluster_5_means=C5_means.iloc[1:2:1]
cluster_5_means

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
mean,0.69155,0.669333,-7.25286,0.34763,0.208115,0.030284,0.255759,0.560297,115.18068,224034.98


In [23]:
cluster_means_all= pd.concat([cluster_0_means, cluster_1_means, cluster_2_means, cluster_3_means, cluster_4_means, cluster_5_means])
cluster_means_all.reset_index(inplace=True)
cluster_means_all.drop(columns="index", inplace=True)
cluster_means_all

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.675465,0.746652,-6.418338,0.070283,0.120643,0.062687,0.156162,0.700345,118.780054,237874.900256
1,0.451341,0.810886,-6.001874,0.079841,0.054624,0.124045,0.196828,0.337582,133.09118,244123.309148
2,0.417982,0.676071,-9.401161,0.090975,0.296487,0.112734,0.835232,0.383014,125.235429,277524.410714
3,0.195015,0.149076,-20.641273,0.041356,0.871227,0.67046,0.150262,0.100305,92.1598,478800.654545
4,0.561715,0.346533,-13.441006,0.056749,0.689387,0.218581,0.164199,0.515595,113.489517,198599.715116
5,0.69155,0.669333,-7.25286,0.34763,0.208115,0.030284,0.255759,0.560297,115.18068,224034.98


In [24]:
cluster_means_all.to_csv("cluster_means_all.csv", sep=';', index=False)

## get the input & compare

In [25]:
# finally we will compare the input song to the means of the different clusters and pick the one with the lowest difference to return a song. 
table_to_norm=pd.read_csv("cluster_means_all.csv", sep=';')
table_to_norm

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.675465,0.746652,-6.418338,0.070283,0.120643,0.062687,0.156162,0.700345,118.780054,237874.900256
1,0.451341,0.810886,-6.001874,0.079841,0.054624,0.124045,0.196828,0.337582,133.09118,244123.309148
2,0.417982,0.676071,-9.401161,0.090975,0.296487,0.112734,0.835232,0.383014,125.235429,277524.410714
3,0.195015,0.149076,-20.641273,0.041356,0.871227,0.67046,0.150262,0.100305,92.1598,478800.654545
4,0.561715,0.346533,-13.441006,0.056749,0.689387,0.218581,0.164199,0.515595,113.489517,198599.715116
5,0.69155,0.669333,-7.25286,0.34763,0.208115,0.030284,0.255759,0.560297,115.18068,224034.98


In [26]:
def input_track():
    user_track = input('Enter a track name')
    result = sp.search(q=user_track, limit=10)
    track_id = result['tracks']['items'][0]['id']
    track_features = pd.DataFrame(sp.audio_features(track_id))
    track_features.drop(columns=['id','analysis_url','track_href','uri','type', 'key', 'mode', 'time_signature'], axis = 1, inplace=True)
    return track_features

In [27]:
new_song2=input_track()

In [28]:
new_song2

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.785,0.745,-4.876,0.0559,0.29,6e-06,0.0507,0.809,121.973,170573


In [30]:
calculation= pd.concat([table_to_norm, new_song2])
measures_for_song= scaler.fit_transform(calculation)
measures_for_song

array([[ 0.7276834 ,  0.67780376,  0.63773797, -0.35903033, -0.8574018 ,
        -0.52276413, -0.42229079,  0.97267121,  0.13894972, -0.25266882],
       [-0.47380552,  0.95921067,  0.71820679, -0.2632307 , -1.09241579,
        -0.23490195, -0.25440144, -0.67805617,  1.34404284, -0.18625654],
       [-0.65263417,  0.36859324,  0.06139916, -0.15163962, -0.23142581,
        -0.28796804,  2.38123938, -0.47131874,  0.68253572,  0.16875279],
       [-1.84791967, -1.94014672, -2.11040674, -0.64894532,  1.8145506 ,
         2.32863376, -0.44664808, -1.75776329, -2.10265476,  2.30805182],
       [ 0.11789008, -1.07509916, -0.71917664, -0.49467335,  1.16722899,
         0.2086209 , -0.38910695,  0.13197843, -0.30654903, -0.67011186],
       [ 0.81390939,  0.33907254,  0.47649219,  2.42070021, -0.54601707,
        -0.67478439, -0.01110489,  0.33539246, -0.16414176, -0.39976879],
       [ 1.31487649,  0.67056567,  0.93574727, -0.50318087, -0.25451912,
        -0.81683616, -0.85768723,  1.46709611

In [31]:
len(measures_for_song)

7

In [39]:
def get_cluster():
    diff_clusters_dict={}
    for i in range (0,len(measures_for_song)-1):
        diff_clusters_dict[i]= abs(sum(measures_for_song[6]-measures_for_song[i]))
        selected=min(diff_clusters_dict, key=diff_clusters_dict.get)
    return selected


In [40]:
selected_cluster=get_cluster()
selected_cluster

2

In [42]:
# get a random song from the cluster identified above and play it. 
ids_dict={
    0:list(C0_songs.index), 
    1:list(C1_songs.index),
    2:list(C2_songs.index),
    3:list(C3_songs.index),
    4:list(C4_songs.index), 
    5:list(C5_songs.index)
    }

In [43]:
import random
track_to_play=random.choice(ids_dict[selected_cluster])

In [44]:
from IPython.display import IFrame
def play_song(track_id):
    return IFrame(src="https://open.spotify.com/embed/track/"+track_id,
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media",
      )

In [45]:
play_song(track_to_play)

### Improvements 
* play the input track
* add the mode column
* do a PCA before the kmeans
* identify ideal number of clusters (elbow)
* more songs 
* shorten code :) 