In [2]:
import pandas as pd

songs = pd.read_csv('../data/df_audio_features_1000.csv')
songs.head()

Unnamed: 0,name,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,time_signature,id,html
0,All Shook Up,Elvis Presley,0.624,0.468,10,-12.162,1,0.132,0.881,6e-06,0.144,0.952,74.139,,117080,4,5ueyLj6e6oVaTY0KQ6yLaA,https://open.spotify.com/track/5ueyLj6e6oVaTY0...
1,I've Got You Under My Skin - Remastered 1998,Frank Sinatra,0.585,0.247,1,-12.612,1,0.04,0.452,9e-06,0.107,0.591,127.15,,223760,4,3aEJMh1cXKEjgh52claxQp,https://open.spotify.com/track/3aEJMh1cXKEjgh5...
2,Smoke Gets In Your Eyes,The Platters,0.29,0.227,3,-13.06,1,0.0311,0.944,7.9e-05,0.617,0.224,114.278,,157293,4,307XEC1IUwUs9ojlEFwH7f,https://open.spotify.com/track/307XEC1IUwUs9oj...
3,"What'd I Say, Pt. 1 & 2",Ray Charles,0.54,0.681,4,-5.44,1,0.0508,0.808,0.0,0.162,0.794,88.385,,307053,4,5yQ9iMZXGcr5rlO4hoLsP4,https://open.spotify.com/track/5yQ9iMZXGcr5rlO...
4,Dream A Little Dream Of Me,Ella Fitzgerald,0.455,0.167,0,-13.613,1,0.0739,0.918,0.0,0.173,0.404,76.118,,185067,4,3vFVS2WYHDG4KkWCNecvpn,https://open.spotify.com/track/3vFVS2WYHDG4KkW...


In [4]:
songs_rdy = (
    songs
# createa column that contians name, song and id
    .assign(
        name_song_id = lambda x: x['name'] + ' - ' + x['artist'] + ' - ' + x['id']
    )
    .set_index('name_song_id')
    # select only numeric columns
    .select_dtypes('number')
    # drop columns key, mode, time_signature, duration_ms, type
    .drop(columns=['key', 'mode', 'time_signature', 'duration_ms', 'type'])
    # select only acousticness, danceability
    .loc[:, ['acousticness', 'danceability']]
)
songs_rdy.head()

Unnamed: 0_level_0,acousticness,danceability
name_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1
All Shook Up - Elvis Presley - 5ueyLj6e6oVaTY0KQ6yLaA,0.881,0.624
I've Got You Under My Skin - Remastered 1998 - Frank Sinatra - 3aEJMh1cXKEjgh52claxQp,0.452,0.585
Smoke Gets In Your Eyes - The Platters - 307XEC1IUwUs9ojlEFwH7f,0.944,0.29
"What'd I Say, Pt. 1 & 2 - Ray Charles - 5yQ9iMZXGcr5rlO4hoLsP4",0.808,0.54
Dream A Little Dream Of Me - Ella Fitzgerald - 3vFVS2WYHDG4KkWCNecvpn,0.918,0.455


In [7]:
from sklearn.cluster import KMeans

# create 4 clusters
kmeans = KMeans(n_clusters=4)

# fit the model
kmeans.fit(songs_rdy)

# create a column with the cluster labels
songs_rdy['cluster'] = kmeans.labels_

songs_rdy.head()



Unnamed: 0_level_0,acousticness,danceability,cluster
name_song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All Shook Up - Elvis Presley - 5ueyLj6e6oVaTY0KQ6yLaA,0.881,0.624,3
I've Got You Under My Skin - Remastered 1998 - Frank Sinatra - 3aEJMh1cXKEjgh52claxQp,0.452,0.585,3
Smoke Gets In Your Eyes - The Platters - 307XEC1IUwUs9ojlEFwH7f,0.944,0.29,2
"What'd I Say, Pt. 1 & 2 - Ray Charles - 5yQ9iMZXGcr5rlO4hoLsP4",0.808,0.54,3
Dream A Little Dream Of Me - Ella Fitzgerald - 3vFVS2WYHDG4KkWCNecvpn,0.918,0.455,2


In [16]:
songs_rdy_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=songs_rdy.columns).drop(columns='cluster')
songs_rdy_centroids = songs_rdy_centroids.rename(columns=lambda x: x + '_centroid')
songs_rdy_centroids = songs_rdy_centroids.reset_index().rename(columns={'index': 'cluster'})

# rename {col} to {col}_centroid
songs_rdy_centroids

Unnamed: 0,cluster,acousticness_centroid,danceability_centroid
0,0,0.079484,0.474363
1,1,0.15257,0.758049
2,2,0.918274,0.266273
3,3,0.710003,0.582652


In [28]:
songs_rdy_with_centroids = (
songs_rdy
.merge(songs_rdy_centroids, on='cluster', how='left')
)
songs_rdy_with_centroids['songs_name'] = songs_rdy.index
songs_rdy_with_centroids = songs_rdy_with_centroids.set_index('songs_name')
songs_rdy_with_centroids.head()

Unnamed: 0_level_0,acousticness,danceability,cluster,acousticness_centroid,danceability_centroid
songs_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
All Shook Up - Elvis Presley - 5ueyLj6e6oVaTY0KQ6yLaA,0.881,0.624,3,0.710003,0.582652
I've Got You Under My Skin - Remastered 1998 - Frank Sinatra - 3aEJMh1cXKEjgh52claxQp,0.452,0.585,3,0.710003,0.582652
Smoke Gets In Your Eyes - The Platters - 307XEC1IUwUs9ojlEFwH7f,0.944,0.29,2,0.918274,0.266273
"What'd I Say, Pt. 1 & 2 - Ray Charles - 5yQ9iMZXGcr5rlO4hoLsP4",0.808,0.54,3,0.710003,0.582652
Dream A Little Dream Of Me - Ella Fitzgerald - 3vFVS2WYHDG4KkWCNecvpn,0.918,0.455,2,0.918274,0.266273


In [30]:
(
songs_rdy_with_centroids
    .assign(
        distance = lambda x: (x['acousticness'] - x['acousticness_centroid'])**2 + (x['danceability'] - x['danceability_centroid'])**2
    )
    .groupby('cluster')
        # get top 5 songs with the smallest distance
    .apply(lambda x: x.nsmallest(5, 'distance'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acousticness,danceability,cluster,acousticness_centroid,danceability_centroid,distance
cluster,songs_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Band On The Run - Remastered 2010 - Wings - 1H4idkmruFoJBg1DvUv2tY,0.0832,0.479,0,0.079484,0.474363,3.5e-05
0,Cool Out - Leroy Hutson - 489alJQIWUauqzKTMuXRjm,0.0708,0.453,0,0.079484,0.474363,0.000532
0,Because the Night - Patti Smith - 0lIoY4ZQsdn5QzhraM9o9u,0.0542,0.466,0,0.079484,0.474363,0.000709
0,Hustlin' - Rick Ross - 3UaqMekEh1kbVu2A6DwiQs,0.0849,0.502,0,0.079484,0.474363,0.000793
0,Heart Attack - Demi Lovato - 1V6gIisPpYqgFeWbMLI0bA,0.0738,0.504,0,0.079484,0.474363,0.000911
1,LA CANCIÓN - J Balvin - 0fea68AdmYNygeTGI4RC18,0.152,0.754,1,0.15257,0.758049,1.7e-05
1,Que Va - Alex Sensation - 6J1R5wtPXiHiwiPstOSI56,0.148,0.766,1,0.15257,0.758049,8.4e-05
1,Guerrera - DELLAFUENTE - 3Q6zROQvMXRtR9NYfDpNWL,0.165,0.76,1,0.15257,0.758049,0.000158
1,Mayores - Becky G - 2AY1UAimvTqjJC8vDJsOyy,0.145,0.745,1,0.15257,0.758049,0.000228
1,Jerry Sprunger (with T-Pain) - Tory Lanez - 6JsHgIkMYHcbNtKfsal2Mg,0.137,0.76,1,0.15257,0.758049,0.000246


Core concepts: 

* Distances - Food ratings
* Kmeans to create clusters and how it works
* Evaluation of custers - inertia and silhouette score 