## First, import datasets

In [32]:
import pandas as pd

In [33]:
songs_1000 = pd.read_csv("./songs_1000.csv", index_col=0)

In [34]:
hits_now = pd.read_csv("./hits.csv",index_col=0)

In [35]:
hits_old = pd.read_csv("./old_hits.csv",index_col=0)

## Cluster songs_1000

In [36]:
songs_1000.head()

Unnamed: 0,Title,Artist,Energy,Tempo,Danceability,Instrumentalness,Valence
0,Imagine - Remastered 2010,John Lennon,0.257,75.752,0.547,0.183,0.169
1,A Whiter Shade Of Pale,Procol Harum,0.66,149.813,0.249,0.0026,0.435
2,My Sweet Lord,George Harrison,0.685,120.965,0.537,0.0,0.542
3,God Only Knows - Remastered,The Beach Boys,0.487,117.072,0.521,0.0,0.483
4,Bridge Over Troubled Water,Simon & Garfunkel,0.206,79.764,0.149,0.000649,0.264


#### Make a selection of the columns to cluster

In [37]:
songs_1000_toclusters = songs_1000.iloc[:,2:7]

In [38]:
songs_1000_toclusters.head()

Unnamed: 0,Energy,Tempo,Danceability,Instrumentalness,Valence
0,0.257,75.752,0.547,0.183,0.169
1,0.66,149.813,0.249,0.0026,0.435
2,0.685,120.965,0.537,0.0,0.542
3,0.487,117.072,0.521,0.0,0.483
4,0.206,79.764,0.149,0.000649,0.264


#### Do the clustering 

In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
scaler = StandardScaler()


In [41]:
# is to equilibrate the values of every column among each other
scaler.fit(songs_1000_toclusters)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [42]:
#it has not changed
songs_1000_toclusters

Unnamed: 0,Energy,Tempo,Danceability,Instrumentalness,Valence
0,0.257,75.752,0.547,0.183000,0.169
1,0.660,149.813,0.249,0.002600,0.435
2,0.685,120.965,0.537,0.000000,0.542
3,0.487,117.072,0.521,0.000000,0.483
4,0.206,79.764,0.149,0.000649,0.264
...,...,...,...,...,...
995,0.713,102.037,0.706,0.000000,0.544
996,0.563,121.808,0.800,0.000000,0.961
997,0.660,84.345,0.664,0.000000,0.873
998,0.627,150.273,0.336,0.000000,0.350


In [43]:
tracks_scaled = scaler.transform(songs_1000_toclusters)

In [44]:
tracks_scaled_df = pd.DataFrame(tracks_scaled)

In [45]:
tracks_scaled_df.head()

Unnamed: 0,0,1,2,3,4
0,-1.478364,-1.60997,-0.04601,1.00462,-1.653894
1,0.378061,1.041801,-2.021643,-0.261404,-0.604205
2,0.493223,0.008892,-0.112306,-0.279651,-0.181962
3,-0.418866,-0.130498,-0.21838,-0.279651,-0.414788
4,-1.713296,-1.46632,-2.684607,-0.275096,-1.279005


In [48]:
tracks_scaled_df.mean(axis=0)

0   -4.110046e-16
1   -1.076916e-16
2    1.706343e-16
3   -1.122435e-16
4   -3.992362e-16
dtype: float64

#### KMeans

In [26]:
from sklearn.cluster import KMeans

In [49]:
kmeans = KMeans()

In [50]:
kmeans.fit(tracks_scaled)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [51]:
 clusters = kmeans.predict(tracks_scaled)

In [52]:
clusters

array([3, 5, 1, 1, 3, 3, 3, 7, 2, 7, 3, 2, 7, 5, 1, 5, 0, 7, 5, 7, 0, 4,
       7, 3, 1, 3, 5, 4, 0, 6, 7, 1, 0, 6, 3, 0, 5, 6, 2, 0, 3, 7, 4, 3,
       3, 1, 0, 7, 3, 6, 3, 3, 3, 1, 0, 2, 7, 6, 2, 7, 2, 1, 3, 6, 1, 5,
       5, 0, 0, 1, 1, 6, 3, 6, 4, 3, 1, 3, 2, 3, 1, 0, 2, 6, 5, 0, 7, 4,
       3, 7, 5, 0, 3, 1, 3, 0, 5, 5, 3, 6, 3, 4, 1, 5, 1, 4, 3, 3, 1, 2,
       6, 6, 0, 7, 6, 6, 2, 0, 6, 4, 0, 3, 6, 4, 5, 2, 7, 4, 5, 0, 7, 5,
       6, 1, 7, 6, 3, 3, 2, 2, 0, 6, 3, 5, 1, 3, 6, 7, 6, 3, 0, 7, 7, 1,
       0, 6, 0, 2, 0, 5, 6, 1, 3, 7, 1, 5, 0, 0, 6, 1, 6, 2, 6, 3, 0, 6,
       3, 0, 6, 1, 5, 1, 0, 7, 6, 3, 0, 4, 3, 1, 4, 5, 6, 7, 5, 3, 5, 5,
       3, 2, 3, 4, 3, 3, 5, 3, 6, 2, 5, 0, 1, 3, 3, 6, 2, 0, 6, 6, 2, 1,
       3, 2, 5, 7, 1, 3, 6, 1, 0, 0, 3, 5, 4, 0, 7, 6, 6, 6, 5, 0, 1, 3,
       7, 0, 2, 2, 0, 7, 2, 2, 3, 7, 6, 5, 1, 6, 5, 3, 3, 6, 0, 1, 7, 5,
       2, 6, 3, 3, 7, 1, 5, 2, 3, 2, 6, 6, 2, 7, 6, 0, 6, 2, 1, 6, 0, 5,
       4, 3, 6, 3, 2, 0, 6, 3, 0, 6, 2, 5, 6, 3, 5,

In [53]:
songs_1000["Cluster"] = clusters

In [54]:
songs_1000.head()

Unnamed: 0,Title,Artist,Energy,Tempo,Danceability,Instrumentalness,Valence,Cluster
0,Imagine - Remastered 2010,John Lennon,0.257,75.752,0.547,0.183,0.169,3
1,A Whiter Shade Of Pale,Procol Harum,0.66,149.813,0.249,0.0026,0.435,5
2,My Sweet Lord,George Harrison,0.685,120.965,0.537,0.0,0.542,1
3,God Only Knows - Remastered,The Beach Boys,0.487,117.072,0.521,0.0,0.483,1
4,Bridge Over Troubled Water,Simon & Garfunkel,0.206,79.764,0.149,0.000649,0.264,3


In [55]:
songs_1000.to_csv("./songs_1000_clusters.csv")

### Conseguir el cluster de una cancion random

In [58]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [59]:
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(
    client_id="320d32b1cdff443d9f8498b16f9cf9ea",
    client_secret="290b630ee62d4ce19b3b244984190fa6"))

In [61]:
feat = get_features("rabiosa")

In [65]:
feat_scaled = pd.DataFrame(scaler.transform(feat))

In [63]:
feat

Unnamed: 0,energy,tempo,danceabilty,instrumentalness,valence
0,0.823,133.039,0.834,0,0.763


In [66]:
feat_scaled

Unnamed: 0,0,1,2,3,4
0,1.128922,0.441204,1.856697,-0.279651,0.690148


In [67]:
feat_cluster = kmeans.predict(feat_scaled)

In [69]:
feat_cluster[0]

6

In [70]:
# este funciona
def get_features(song):
    
    song = spotify.search(q=song, limit=1)
    song_URI = song["tracks"]["items"][0]["uri"]
    
    features = spotify.audio_features(song_URI)[0]
    feat = pd.DataFrame({"energy":features["energy"],
           "tempo":features["tempo"],
           "danceabilty":features["danceability"],
           "instrumentalness":features["instrumentalness"],
           "valence":features["valence"]},index=[0])
    
    feat_scaled = pd.DataFrame(scaler.transform(feat))
    feat_cluster = kmeans.predict(feat_scaled)
    return feat_cluster[0]

In [71]:
get_features("rabiosa")

6

### Devolver una canción del mismo cluster

In [79]:
one_cluster = songs_1000[songs_1000["Cluster"]==3]

In [104]:
title = one_cluster.sample()["Title"].values[0]

In [110]:
artist = songs_1000[songs_1000["Title"]==title]["Artist"].values[0]

In [106]:
print("This is: " + title + "by "+ artist)

This is: Imagine - Remastered 2010by Morris Albert


In [105]:
a

'Imagine - Remastered 2010'

In [111]:
artist

'John Lennon'

In [117]:
def get_features(song):
    
    song = spotify.search(q=song, limit=1)
    song_URI = song["tracks"]["items"][0]["uri"]
    
    features = spotify.audio_features(song_URI)[0]
    feat = pd.DataFrame({"energy":features["energy"],
           "tempo":features["tempo"],
           "danceabilty":features["danceability"],
           "instrumentalness":features["instrumentalness"],
           "valence":features["valence"]},index=[0])
    
    feat_scaled = pd.DataFrame(scaler.transform(feat))
    feat_cluster = kmeans.predict(feat_scaled)
    cluster = feat_cluster[0]
    
    one_cluster = songs_1000[songs_1000["Cluster"]==cluster]
    title = one_cluster.sample()["Title"].values[0]
    artist = songs_1000[songs_1000["Title"]==title]["Artist"].values[0]
    print("This is: " + title + " by "+ artist)

In [118]:
get_features("mamma mia")

This is: In My Life - Remastered by The Beatles
