In [38]:
# Import libraries 

import pandas as pd
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import random

In [53]:
import requests
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import config

In [189]:
# Initialize SpotiPy with user credentias

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=config.client_id,
                                                           client_secret=config.client_secret))

In [2]:
# Read file with collection of songs and audio features 

song_lst = pd.read_pickle("100kfeatslist.pkl")
song_lst = [song for song in song_lst if song]
df_101k_songs = pd.DataFrame(song_lst)

In [49]:
# Check the dataframe columns and its content. 
#(The track id (column="id") is given, but not the track name or artist)

df_101k_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.117,0.108,10,-26.922,1,0.0514,0.471,0.942,0.0785,0.0533,168.9,audio_features,0saVGOHWPnGXus321u3c1i,spotify:track:0saVGOHWPnGXus321u3c1i,https://api.spotify.com/v1/tracks/0saVGOHWPnGX...,https://api.spotify.com/v1/audio-analysis/0saV...,236347,4
1,0.0663,0.0906,2,-26.77,0,0.0457,0.81,0.645,0.112,0.0427,57.02,audio_features,0tQjYkNljPf8hhP7RdseSb,spotify:track:0tQjYkNljPf8hhP7RdseSb,https://api.spotify.com/v1/tracks/0tQjYkNljPf8...,https://api.spotify.com/v1/audio-analysis/0tQj...,121627,4
2,0.399,0.219,10,-22.531,0,0.0351,0.911,0.91,0.0915,0.127,139.034,audio_features,78U33gGujgyj1393Dr8uWO,spotify:track:78U33gGujgyj1393Dr8uWO,https://api.spotify.com/v1/tracks/78U33gGujgyj...,https://api.spotify.com/v1/audio-analysis/78U3...,178653,4
3,0.239,0.0358,2,-24.548,0,0.0414,0.962,0.923,0.0841,0.0529,116.784,audio_features,7MqzOAWxhXEoRyivx1cfdF,spotify:track:7MqzOAWxhXEoRyivx1cfdF,https://api.spotify.com/v1/tracks/7MqzOAWxhXEo...,https://api.spotify.com/v1/audio-analysis/7Mqz...,143040,4
4,0.0937,0.0496,8,-25.55,1,0.0401,0.93,0.883,0.056,0.0319,79.851,audio_features,7LlnIsRxZD5w2GmGD1u3l1,spotify:track:7LlnIsRxZD5w2GmGD1u3l1,https://api.spotify.com/v1/tracks/7LlnIsRxZD5w...,https://api.spotify.com/v1/audio-analysis/7Lln...,374200,4


In [19]:
# Create dataframe with audio feature columns only, as only these values are needed for the clustering 

X = pd.DataFrame(data=df_101k_songs, columns=["danceability","energy","key","loudness","mode","speechiness","acousticness","instrumentalness","liveness","valence","tempo"])
X

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.1170,0.1080,10,-26.922,1,0.0514,0.4710,0.942000,0.0785,0.0533,168.900
1,0.0663,0.0906,2,-26.770,0,0.0457,0.8100,0.645000,0.1120,0.0427,57.020
2,0.3990,0.2190,10,-22.531,0,0.0351,0.9110,0.910000,0.0915,0.1270,139.034
3,0.2390,0.0358,2,-24.548,0,0.0414,0.9620,0.923000,0.0841,0.0529,116.784
4,0.0937,0.0496,8,-25.550,1,0.0401,0.9300,0.883000,0.0560,0.0319,79.851
...,...,...,...,...,...,...,...,...,...,...,...
101381,0.7910,0.6080,6,-10.428,0,0.2880,0.0556,0.000074,0.3610,0.6210,111.518
101382,0.7890,0.4270,11,-12.633,1,0.1390,0.1340,0.883000,0.2880,0.8710,96.885
101383,0.6020,0.7980,4,-10.125,0,0.0703,0.0159,0.407000,0.3760,0.5040,197.143
101384,0.9200,0.6460,1,-8.735,1,0.2850,0.0430,0.000012,0.3070,0.8310,93.794


#### Scaling Audio Features for Clustering 

In [None]:
# We see that the scales of the variables differ, e.g. the scale of "tempo" is bigger than of "valence". 
# K-Means is a distance based algorithm
# The scales need to be normalized

In [20]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns)
display(X.head())
print()
display(X_scaled_df.head())

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.117,0.108,10,-26.922,1,0.0514,0.471,0.942,0.0785,0.0533,168.9
1,0.0663,0.0906,2,-26.77,0,0.0457,0.81,0.645,0.112,0.0427,57.02
2,0.399,0.219,10,-22.531,0,0.0351,0.911,0.91,0.0915,0.127,139.034
3,0.239,0.0358,2,-24.548,0,0.0414,0.962,0.923,0.0841,0.0529,116.784
4,0.0937,0.0496,8,-25.55,1,0.0401,0.93,0.883,0.056,0.0319,79.851





Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,-2.512843,-2.00628,1.332823,-3.367687,0.704451,-0.31393,0.52744,2.731378,-0.694355,-1.720542,1.642484
1,-2.796854,-2.076035,-0.907348,-3.338833,-1.419545,-0.370596,1.572196,1.718088,-0.49019,-1.761196,-2.150826
2,-0.933137,-1.561294,1.332823,-2.534144,-1.419545,-0.475976,1.883465,2.622202,-0.615127,-1.437883,0.629872
3,-1.829424,-2.295722,-0.907348,-2.917031,-1.419545,-0.413345,2.040641,2.666555,-0.660226,-1.722076,-0.124518
4,-2.643365,-2.240399,0.772781,-3.107241,0.704451,-0.426269,1.942021,2.530085,-0.83148,-1.802617,-1.376737


#### Clustering with K-Means

In [195]:
# The number of cluster is manually set to 40. The song collection is huge and there should be a lot of different music genres
# K-means picks randomly the initial centroids
# Random seed is being used in order to reproduce te results. It is set in random_state

In [23]:
kmeans = KMeans(n_clusters=40, random_state=1233)
kmeans.fit(X_scaled_df)

KMeans(n_clusters=40, random_state=1233)

In [24]:
kmeans.labels_

array([25, 21, 21, ..., 14, 32, 33])

In [25]:
# See the clusters and how many observations each cluster has. 
# The distribution is okay - besides cluster nr.23, it is way smaller than the other clusters 

clusters = kmeans.predict(X_scaled_df)

pd.Series(clusters).value_counts().sort_index()

0     2644
1     3462
2     2033
3     3866
4     1507
5     1665
6     2469
7     3689
8     3896
9     2378
10    2977
11    1861
12    1522
13    2923
14    2685
15    2210
16    2361
17    3502
18    2786
19    2333
20    1685
21    1506
22    5280
23     615
24    2233
25    1183
26    3277
27    2632
28    2107
29    3207
30    2863
31    1057
32    2142
33    1920
34    4327
35    1769
36    2020
37    1970
38    3344
39    3480
dtype: int64

In [26]:
# Add the cluster number for each track as additionl column to dataframe 

#X_df = pd.DataFrame(X)
X["cluster"] = clusters
X.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,cluster
0,0.117,0.108,10,-26.922,1,0.0514,0.471,0.942,0.0785,0.0533,168.9,25
1,0.0663,0.0906,2,-26.77,0,0.0457,0.81,0.645,0.112,0.0427,57.02,21
2,0.399,0.219,10,-22.531,0,0.0351,0.911,0.91,0.0915,0.127,139.034,21
3,0.239,0.0358,2,-24.548,0,0.0414,0.962,0.923,0.0841,0.0529,116.784,21
4,0.0937,0.0496,8,-25.55,1,0.0401,0.93,0.883,0.056,0.0319,79.851,2


In [28]:
X["id"] = df_101k_songs["id"]
X

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,cluster,id
0,0.1170,0.1080,10,-26.922,1,0.0514,0.4710,0.942000,0.0785,0.0533,168.900,25,0saVGOHWPnGXus321u3c1i
1,0.0663,0.0906,2,-26.770,0,0.0457,0.8100,0.645000,0.1120,0.0427,57.020,21,0tQjYkNljPf8hhP7RdseSb
2,0.3990,0.2190,10,-22.531,0,0.0351,0.9110,0.910000,0.0915,0.1270,139.034,21,78U33gGujgyj1393Dr8uWO
3,0.2390,0.0358,2,-24.548,0,0.0414,0.9620,0.923000,0.0841,0.0529,116.784,21,7MqzOAWxhXEoRyivx1cfdF
4,0.0937,0.0496,8,-25.550,1,0.0401,0.9300,0.883000,0.0560,0.0319,79.851,2,7LlnIsRxZD5w2GmGD1u3l1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
101381,0.7910,0.6080,6,-10.428,0,0.2880,0.0556,0.000074,0.3610,0.6210,111.518,33,6tYX8lbMl5aH3p3Cnew6mG
101382,0.7890,0.4270,11,-12.633,1,0.1390,0.1340,0.883000,0.2880,0.8710,96.885,19,0wkaBWFqLAvhnLosIsgSp1
101383,0.6020,0.7980,4,-10.125,0,0.0703,0.0159,0.407000,0.3760,0.5040,197.143,14,1X2O4ObuDdTZAsAVB6zg5t
101384,0.9200,0.6460,1,-8.735,1,0.2850,0.0430,0.000012,0.3070,0.8310,93.794,32,6zceR6vUKRuEwVYEODIhz2


In [188]:
# Save the dataframe with the cluster column 

X.to_csv("X.csv")

In [29]:
# New dataframe, with only the columns that are needed for user interaction 

lookup_df = pd.DataFrame(data=X, columns=["cluster", "id"])
lookup_df

Unnamed: 0,cluster,id
0,25,0saVGOHWPnGXus321u3c1i
1,21,0tQjYkNljPf8hhP7RdseSb
2,21,78U33gGujgyj1393Dr8uWO
3,21,7MqzOAWxhXEoRyivx1cfdF
4,2,7LlnIsRxZD5w2GmGD1u3l1
...,...,...
101381,33,6tYX8lbMl5aH3p3Cnew6mG
101382,19,0wkaBWFqLAvhnLosIsgSp1
101383,14,1X2O4ObuDdTZAsAVB6zg5t
101384,32,6zceR6vUKRuEwVYEODIhz2


#### Saving with Pickle 

In [32]:
import pickle

# Saving the:
#scaler = StandardScaler()
#model = KMeans()

with open("Model/scaler.pickle", "wb") as f:
    pickle.dump(scaler,f)

with open("Model/kmeans.pickle", "wb") as f:
    pickle.dump(kmeans,f)

In [33]:
def load(filename = "filename.pickle"): 
    try: 
        with open(filename, "rb") as f: 
            return pickle.load(f) 
    except FileNotFoundError: 
        print("File not found!") 

In [34]:
scaler2 = load("Model/scaler.pickle")
scaler2

StandardScaler()

In [35]:
kmeans2 = load("Model/kmeans.pickle")
kmeans2

KMeans(n_clusters=40, random_state=1233)

#### Getting User Input 

In [72]:
def get_track_data(track_id):
    meta = sp.track(track_id)
    track_details = {"name":meta['name'], "album":meta['album']['name'], 
                    "artist":meta['album']['artists'][0]['name']}
    return track_details

In [194]:
# The user is providing the track´s id

get_track_data("1i3fTnSg2X03MWLEt411Eb")

{'name': "when the party's over",
 'album': 'ESTATE 2021 HITS',
 'artist': 'Various Artists'}

In [65]:
# Get information about the track on Spotify

results = sp.search(q="uri:1i3fTnSg2X03MWLEt411Eb")
results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=uri%3A1i3fTnSg2X03MWLEt411Eb&type=track&offset=0&limit=10',
  'items': [],
  'limit': 10,
  'next': None,
  'offset': 0,
  'previous': None,
  'total': 0}}

#### Computing User Input 

In [167]:
# Show audio features for the track id, the user gave as input

user_choice=pd.DataFrame(data=sp.audio_features("7LlnIsRxZD5w2GmGD1u3l1"),columns=["danceability","energy","key","loudness","mode","speechiness","acousticness","instrumentalness","liveness","valence","tempo"])
test

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.0937,0.0496,8,-25.55,1,0.0401,0.93,0.883,0.056,0.0319,79.851


In [168]:
# Scale the audio features of the track
# Save scaled input as dataframe 

scaler2.fit(user_choice)
user_choice_scaled = scaler.transform(user_choice)
user_choice_scaled_df = pd.DataFrame(user_choice_scaled, columns = user_choice.columns)
display(user_choice.head())
print()
display(user_choice_scaled_df.head())

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.0937,0.0496,8,-25.55,1,0.0401,0.93,0.883,0.056,0.0319,79.851





Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,-2.643365,-2.240399,0.772781,-3.107241,0.704451,-0.426269,1.942021,2.530085,-0.83148,-1.802617,-1.376737


In [170]:
# Use the trained K-means model to predict the cluster, the input track belongs to
# The user´s track fits to cluster nr.2, in regard of the audio features

user_cluster = kmeans2.predict(user_choice_scaled_df)
print(user_cluster)

[2]


In [177]:
# Randomly choose another song from the same cluster

lookup_df[lookup_df["cluster"]==2]["id"].sample().iloc[-1]

'5H14dJbhKeaU1ffTMmIGJi'

In [197]:
# Return the randomly choosen song as a recommendation for the user

recommended_song_from_cluster=get_track_data("5H14dJbhKeaU1ffTMmIGJi")["name"]
print("You may also like the song:"+ " " + recommended_song_from_cluster)

You may also like the song: Observations From A Faraway Place


#### Other


In [None]:
recommend_song()

In [141]:
result=sp.search(q="track:Happy")
type(result)

dict

In [143]:
result["tracks"]["items"][0]["artists"][0]["name"]

'Thomas Rhett'

In [92]:
type(result["tracks"])

dict

In [155]:
for i in range(0,10):
    print(result["tracks"]["items"][i]["name"])

Die A Happy Man
I Hope You’re Happy Now
Happy - From "Despicable Me 2"
Happy Pills
Happy Anywhere (feat. Gwen Stefani)
Happy Together
Happy?
Happy Hour
Happy Now
Don't Worry Be Happy


In [150]:
result["tracks"]["items"][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

#### User Interaction Flow

In [36]:
df = pd.read_csv("artist_song_list.csv")

In [45]:
def recommend_song():
    
    requested_song = input("Tell me your favorite song: ")
        
    if requested_song in list(df["song"]):
        choice = df["song"].iloc[random.randint(0,len(df))]
        return "Excellent choice! You will also like"  + " " + choice
    else:
        return "This song is not hot"