# Imports

In [1]:
import pandas as pd
import numpy as np

from time import sleep
from random import randint

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
import sys
sys.path.insert(1, '../Src/Lib')
from config import *

In [3]:
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering

from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
import pickle
import yaml

# Spotify API

## Token

In [5]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= Client_ID, client_secret= Client_Secret))

## Not Hot

### Request

#### Function to get the data

There seems to be a problem with some items that are not songs and make the function stop, we had to call it several times setting the offset 1 item after the stop.

In [6]:
def analyze_playlist(creator, playlist_id, pages=5, start=0):
    
    playlist_columns = ["Song", "Artist", "ID", "Link", "danceability", "energy", "key", "loudness", "mode", "speechiness", "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature"]
    
    playlist_df = pd.DataFrame(columns = playlist_columns)
    
    playlist = sp.user_playlist_tracks(creator, playlist_id, offset=start)
    nap = randint(1,2)
    for i in range(pages):
        print("Page",i+1)
        try:
            tracks = playlist["items"]
            for track in tracks:
                playlist_features = {}
                playlist_features["Song"] = track["track"]["name"]
                playlist_features["Artist"] = track["track"]["album"]["artists"][0]["name"]
                playlist_features["ID"] = track["track"]["id"]
                playlist_features["Link"] = track["track"]["external_urls"]["spotify"]
        
                audio_features = sp.audio_features(playlist_features["ID"])[0]
                for feature in playlist_columns[4:]:
                    playlist_features[feature] = audio_features[feature]
        
                track_df = pd.DataFrame(playlist_features, index = [0])
                playlist_df = pd.concat([playlist_df, track_df], ignore_index = True)
            playlist = sp.next(playlist)
            
        except:
            continue
        
        print("Sleeping for",nap,"seconds")
        sleep(nap)
    
    playlist_df.columns = ["Song", "Artist", "ID", "Link", "Danceability", "Energy", "Key", "Loudness", "Mode", "Speechiness", "Instrumentalness", "Liveness", "Valence", "Tempo", "Duration", "Time Signature"]
        
    return playlist_df

In [7]:
not_hot_0 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 5)
not_hot_0["ID"].nunique()

Page 1
Sleeping for 1 seconds
Page 2
Sleeping for 1 seconds
Page 3
Sleeping for 1 seconds
Page 4
Sleeping for 1 seconds
Page 5


467

In [8]:
skip1 = not_hot_0["ID"].nunique() + 1
skip1

468

In [9]:
not_hot_1 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 10, skip1)
not_hot_1["ID"].nunique()

Page 1
Sleeping for 1 seconds
Page 2
Sleeping for 1 seconds
Page 3
Sleeping for 1 seconds
Page 4
Sleeping for 1 seconds
Page 5
Sleeping for 1 seconds
Page 6
Sleeping for 1 seconds
Page 7
Sleeping for 1 seconds
Page 8
Sleeping for 1 seconds
Page 9
Sleeping for 1 seconds
Page 10
Sleeping for 1 seconds


1000

In [10]:
skip2 = skip1 + not_hot_1["ID"].nunique() + 1
skip2

1469

In [11]:
not_hot_2 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 10, skip2)
not_hot_2["ID"].nunique()

Page 1
Sleeping for 2 seconds
Page 2
Sleeping for 2 seconds
Page 3
Sleeping for 2 seconds
Page 4
Sleeping for 2 seconds
Page 5
Sleeping for 2 seconds
Page 6
Sleeping for 2 seconds
Page 7
Sleeping for 2 seconds
Page 8
Sleeping for 2 seconds
Page 9
Sleeping for 2 seconds
Page 10
Sleeping for 2 seconds


1000

In [12]:
skip3 = skip2 + not_hot_2["ID"].nunique() + 1
skip3

2470

In [13]:
not_hot_3 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 2, skip3)
not_hot_3["ID"].nunique()

Page 1
Sleeping for 2 seconds
Page 2


188

In [14]:
skip4 = skip3 + not_hot_3["ID"].nunique() + 1
skip4

2659

In [15]:
not_hot_4 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 1, skip4)
not_hot_4["ID"].nunique()

Page 1


12

In [16]:
skip5 = skip4 + not_hot_4["ID"].nunique() + 1
skip5

2672

In [17]:
not_hot_5 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 4, skip5)
not_hot_5["ID"].nunique()

Page 1
Sleeping for 2 seconds
Page 2
Sleeping for 2 seconds
Page 3
Sleeping for 2 seconds
Page 4


307

In [18]:
skip6 = skip5 + not_hot_5["ID"].nunique() + 1
skip6

2980

In [19]:
not_hot_6 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 5, skip6)
not_hot_6["ID"].nunique()

Page 1
Sleeping for 2 seconds
Page 2
Sleeping for 2 seconds
Page 3
Sleeping for 2 seconds
Page 4
Sleeping for 2 seconds
Page 5


407

In [20]:
skip7 = skip6 + not_hot_6["ID"].nunique() + 1
skip7

3388

In [21]:
not_hot_7 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 1, skip7)
not_hot_7["ID"].nunique()

Page 1


82

In [22]:
skip8 = skip7 + not_hot_7["ID"].nunique() + 1
skip8

3471

In [23]:
not_hot_8 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 2, skip8)
not_hot_8["ID"].nunique()

Page 1
Sleeping for 1 seconds
Page 2


169

In [24]:
skip9 = skip8 + not_hot_8["ID"].nunique() + 1
skip9

3641

In [25]:
not_hot_9 = analyze_playlist("acclaimed music", "1G8IpkZKobrIlXcVPoSIuf", 1, skip9)
not_hot_9["ID"].nunique()

Page 1


43

### Concat and save

In [26]:
not_hot = pd.concat([not_hot_0, not_hot_1, not_hot_2, not_hot_3, not_hot_4, not_hot_5, not_hot_6, not_hot_7, not_hot_8, not_hot_9]).reset_index(drop=True)
not_hot

Unnamed: 0,Song,Artist,ID,Link,Danceability,Energy,Key,Loudness,Mode,Speechiness,Instrumentalness,Liveness,Valence,Tempo,Duration,Time Signature
0,Like a Rolling Stone,Bob Dylan,3AhXZa8sUQht0UEdBJgpGc,https://open.spotify.com/track/3AhXZa8sUQht0UE...,0.482,0.721,0,-6.839,1,0.0321,0,0.1890,0.557,95.263,369600,4
1,Smells Like Teen Spirit,Nirvana,3oTlkzk1OtrhH8wBAduVEi,https://open.spotify.com/track/3oTlkzk1OtrhH8w...,0.485,0.863,1,-9.027,1,0.0495,0.0162,0.1380,0.767,116.835,300977,4
2,A Day In The Life - Remastered,The Beatles,3ZFBeIyP41HhnALjxWy1pR,https://open.spotify.com/track/3ZFBeIyP41HhnAL...,0.364,0.457,4,-14.162,0,0.0675,0.000106,0.9220,0.175,163.219,337413,4
3,Good Vibrations (Mono),The Beach Boys,5Qt4Cc66g24QWwGP3YYV9y,https://open.spotify.com/track/5Qt4Cc66g24QWwG...,0.398,0.413,1,-10.934,1,0.0388,0.000025,0.0891,0.331,133.574,219147,4
4,Johnny B Goode,Chuck Berry,7MH2ZclofPlTrZOkPzZKhK,https://open.spotify.com/track/7MH2ZclofPlTrZO...,0.518,0.756,10,-10.851,1,0.0915,0.000062,0.3170,0.968,166.429,160893,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3670,I Don't Like,Chief Keef,1h6kgem1ai8vUgO1rZOwfB,https://open.spotify.com/track/1h6kgem1ai8vUgO...,0.742,0.844,2,-4.622,1,0.0479,0,0.0659,0.416,131.986,293840,4
3671,"Happy - From ""Despicable Me 2""",Pharrell Williams,5b88tNINg4Q4nrRbrCXUmg,https://open.spotify.com/track/5b88tNINg4Q4nrR...,0.647,0.822,5,-4.662,0,0.1830,0,0.0908,0.962,160.019,232720,4
3672,Express Yourself - Remastered Version,Charles Wright & The Watts 103rd Street Rhythm...,6gQZKkphKIMxZgca5r7ImA,https://open.spotify.com/track/6gQZKkphKIMxZgc...,0.461,0.818,8,-8.482,1,0.1250,0,0.2660,0.721,92.305,229653,4
3673,Call Me The Breeze,J.J. Cale,0D4JiHE8NkSQa5ZdZkYYLw,https://open.spotify.com/track/0D4JiHE8NkSQa5Z...,0.815,0.415,11,-15.573,1,0.0384,0.012,0.0943,0.817,91.539,157617,4


In [27]:
not_hot["ID"].nunique()

3675

In [28]:
not_hot.to_csv("Not_Hot.csv")

## Hot 100 Dataset

In [30]:
hot_100 = pd.read_csv("../Data/Raw/Hot_100.csv", index_col = 0)

### Cleaning

SpotiPy doesn't like colabs, so we are getting only the main artist and also getting rid of "'" as it doesn't like them either.

In [37]:
hot_100["Artists"] = [artist[0] for artist in [artist.split("Featuring") for artist in hot_100["Artists"]]]
hot_100["Artists"] = [artist[0] for artist in [artist.split(",") for artist in hot_100["Artists"]]]
hot_100["Artists"] = [artist[0] for artist in [artist.split("&") for artist in hot_100["Artists"]]]
hot_100["Artists"] = [artist[0] for artist in [artist.split("With") for artist in hot_100["Artists"]]]
hot_100["Songs"] = [song.replace("'","") for song in hot_100["Songs"]]

### Slicing

We divide the Dataframe so we don't get banned.

In [38]:
def slice_in_4(df):
    rows = df.shape[0]
    df1 = df.iloc[:int((rows/4))]
    df2 = df.iloc[int(rows/4):int((rows/2))]
    df3 = df.iloc[int(rows/2):int(((rows/4)*3))]
    df4 = df.iloc[int(((rows/4)*3)):]
    return df1, df2, df3, df4

hot_100_0, hot_100_1, hot_100_2, hot_100_3 = slice_in_4(hot_100)

### Function

This asks for data using the song and artist combo to be accurate.

In [39]:
def get_audio_features(df):
    
    playlist_columns = ["Song", "Artist", "ID", "Link", "danceability", "energy", "key", "loudness", "mode", "speechiness", "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature"]
    playlist_df = pd.DataFrame(columns = playlist_columns)
    songs = df["Song"].tolist()
    artists = df["Artist"].tolist()

    for song, artist in zip(songs, artists):
        
        print("Getting info for song",song,"by",artist)
        track = sp.search(q="artist:" + artist + " track:" + song, type="track")
        playlist_features = {}
        playlist_features["Song"] = track["tracks"]["items"][0]["name"]
        playlist_features["Artist"] = track["tracks"]["items"][0]["artists"][0]["name"]
        playlist_features["ID"] = track["tracks"]["items"][0]["id"]
        playlist_features["Link"] = track["tracks"]["items"][0]["external_urls"]["spotify"]
        
        audio_features = sp.audio_features(playlist_features["ID"])[0]
        for feature in playlist_columns[4:]:
                playlist_features[feature] = audio_features[feature]
                    
        track_df = pd.DataFrame(playlist_features, index = [0])
        playlist_df = pd.concat([playlist_df, track_df], ignore_index = True)
        
        nap = randint(2,3)    
        print("Sleeping for",nap,"seconds")
        sleep(nap)
            
    playlist_df.columns = ["Song", "Artist", "ID", "Link", "Danceability", "Energy", "Key", "Loudness", "Mode", "Speechiness", "Instrumentalness", "Liveness", "Valence", "Tempo", "Duration", "Time Signature"]
    
    return playlist_df

In [45]:
hot_100_0_data = get_audio_features(hot_100_0)

Getting info for song BREAK MY SOUL by Beyoncé
Sleeping for 2 seconds
Getting info for song As It Was by Harry Styles
Sleeping for 2 seconds
Getting info for song About Damn Time by Lizzo
Sleeping for 3 seconds
Getting info for song Running Up That Hill (A Deal With God) - 2018 Remaster by Kate Bush
Sleeping for 2 seconds
Getting info for song STAYING ALIVE (feat. Drake & Lil Baby) by DJ Khaled
Sleeping for 3 seconds
Getting info for song Bad Habit by Steve Lacy
Sleeping for 2 seconds
Getting info for song WAIT FOR U (feat. Drake & Tems) by Future
Sleeping for 2 seconds
Getting info for song Sunroof by Nicky Youre
Sleeping for 3 seconds
Getting info for song First Class by Jack Harlow
Sleeping for 2 seconds
Getting info for song Bad Decisions (with BTS & Snoop Dogg) by benny blanco
Sleeping for 3 seconds
Getting info for song Me Porto Bonito by Bad Bunny
Sleeping for 2 seconds
Getting info for song Late Night Talking by Harry Styles
Sleeping for 2 seconds
Getting info for song I Like Y

IndexError: list index out of range

In [None]:
hot_100_1_data = get_audio_features(hot_100_1)

In [None]:
hot_100_2_data= get_audio_features(hot_100_2)

In [None]:
hot_100_3_data = get_audio_features(hot_100_3)

### Concat and save

In [46]:
hot_100_data = pd.concat([hot_100_0_data, hot_100_1_data, hot_100_2_data, hot_100_3_data]).reset_index(drop=True)
hot_100_data


NameError: name 'hot_100_0_data' is not defined

In [None]:
hot_100_data.to_csv("Hot_100_Data.csv")

# Models

## Label row and concat

In [42]:
hot_100 = pd.read_csv("../Data/Clean/Hot_100_Data.csv", index_col = 0)
hot_100.head()

Unnamed: 0,Song,Artist,ID,Link,Danceability,Energy,Key,Loudness,Mode,Speechiness,Instrumentalness,Liveness,Valence,Tempo,Duration,Time Signature
0,BREAK MY SOUL,Beyoncé,2KukL7UlQ8TdvpaA7bY3ZJ,https://open.spotify.com/track/2KukL7UlQ8Tdvpa...,0.687,0.887,1,-5.04,0,0.0826,2e-06,0.27,0.853,114.941,278282,4
1,As It Was,Harry Styles,4LRPiXqCikLlN15c3yImP7,https://open.spotify.com/track/4LRPiXqCikLlN15...,0.52,0.731,6,-5.338,0,0.0557,0.00101,0.311,0.662,173.93,167303,4
2,About Damn Time,Lizzo,1PckUlxKqWQs3RlWXVBLw3,https://open.spotify.com/track/1PckUlxKqWQs3Rl...,0.836,0.743,10,-6.305,0,0.0656,0.0,0.335,0.722,108.966,191822,4
3,Running Up That Hill (A Deal With God) - 2018 ...,Kate Bush,29d0nY7TzCoi22XBqDQkiP,https://open.spotify.com/track/29d0nY7TzCoi22X...,0.625,0.533,10,-11.903,0,0.0596,0.00266,0.0546,0.139,108.296,300840,4
4,STAYING ALIVE (feat. Drake & Lil Baby),DJ Khaled,0g2Bbgy7P41pFjMWJuzsf4,https://open.spotify.com/track/0g2Bbgy7P41pFjM...,0.718,0.461,4,-7.676,0,0.0797,0.0,0.283,0.181,129.953,178176,4


In [None]:
hot_100["Label"] = "H"
hot_100.head()

In [None]:
not_hot = pd.read_csv("../Data/Clean/Not_Hot.csv", index_col = 0)
not_hot.head()

In [None]:
not_hot["Label"] = "N"
not_hot.head()

In [None]:
songs_db = pd.concat([hot_100, not_hot]).reset_index(drop=True)
songs_db

In [None]:
songs_db.to_csv("../Data/Clean/Songs_DB.csv")

## Column dropping

We decided to drop columns ```Energy```, ```Mode```, ```Time Signature``` and ```Duration```, due to having correlations to other columns, class imbalance or not being relevant.

In [None]:
songs_data = songs_db.drop(columns=["Song", "Artist", "ID", "Link", "Label", "Energy", "Mode", "Time Signature", "Duration", "Key", "Liveness"])
songs_data.head()

In [None]:
songs_data.to_csv("../Data/Clean/Songs_Data.csv")

## Transformer

In [None]:
transformer = PowerTransformer()
transformer.fit(songs_data)

filename = "../Transformers/Power_Transformer.pkl"
with open(filename, "wb") as file:
    pickle.dump(transformer, file)

songs_data_pt = transformer.transform(songs_data)
songs_data_pt = pd.DataFrame(songs_data_pt, columns = songs_data.columns)
songs_data_pt.head()

## Scaler

In [None]:
scaler = StandardScaler()
scaler.fit(songs_data_pt.to_numpy())

filename = "../Scalers/Standard_Scaler.pkl"
with open(filename, "wb") as file:
    pickle.dump(scaler, file)

songs_data_pt_ss = scaler.transform(songs_data_pt.to_numpy())
songs_data_pt_ss = pd.DataFrame(songs_data_pt_ss, columns = songs_data_pt.columns)
songs_data_pt_ss.head()

## K-Means

In [None]:
K = range(10, 31)
inertia = []
silhouette = []

for k in K:
    print("Training a K-Means model with {} neighbours".format(k))
    print()
    kmeans = KMeans(n_clusters=k, random_state=22)
    kmeans.fit(songs_data_pt_ss.to_numpy())
    filename = "../Models/KM_" + str(k) + ".pkl"
    with open(filename, "wb") as file:
        pickle.dump(kmeans,file)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(songs_data_pt_ss, kmeans.predict(songs_data_pt_ss.to_numpy())))

In [None]:
fig, ax = plt.subplots(1,2,figsize=(16,8))
ax[0].plot(K, inertia, 'bx-')
ax[0].set_xlabel('K')
ax[0].set_ylabel('Inertia')
ax[0].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[0].set_title('Elbow Method')

ax[1].plot(K, silhouette, 'bx-')
ax[1].set_xlabel('K')
ax[1].set_ylabel('Silhouette')
ax[1].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[1].set_title('Silhouette Method')

In [None]:
songs_db["Cluster"] = KMeans(n_clusters=28, random_state=22).fit(songs_data_pt_ss).predict(songs_data_pt_ss)
display(songs_db["Cluster"][songs_db["Label"] == "H"].value_counts())
display(songs_db["Cluster"][songs_db["Label"] == "N"].value_counts())

In [None]:
songs_db.head()

## DBSCAN

In [None]:
d = distance_matrix(songs_data_pt_ss,songs_data_pt_ss)

In [None]:
d2 = np.sort(d)[:,1:]

In [None]:
f = d2[:,2]

In [None]:
f_final = np.sort(f)[::-1]
f1_final = f_final[:500]
f2_final = f_final[:200]
f3_final = f_final[:50]

In [None]:
fig, ax = plt.subplots(3,1, figsize=(10,10))

ax[0].scatter(range(1,len(f1_final)+1), f1_final)
ax[0].set_xlabel("Points")
ax[0].set_ylabel("Distance")
ax[0].grid(visible=True)

ax[1].scatter(range(1,len(f2_final)+1), f2_final)
ax[1].set_xlabel("Points")
ax[1].set_ylabel("Distance")
ax[1].grid(visible=True)

ax[2].scatter(range(1,len(f3_final)+1), f3_final)
ax[2].set_xlabel("Points")
ax[2].set_ylabel("Distance")
ax[2].grid(visible=True)

In [None]:
f_final[4]

In [None]:
dbscan = DBSCAN(eps=f_final[4], min_samples=5)
yhat = dbscan.fit_predict(songs_data_pt_ss)
clusters = np.unique(yhat)
np.unique(yhat)

In [None]:
for index, cluster in enumerate(list(clusters)):
	row_ix = np.where(yhat == cluster)
	plt.scatter(songs_data.to_numpy()[row_ix, 0], songs_data.to_numpy()[row_ix, 1])
    
plt.title("Clusters detected by DBSCAN, blue dots are considered Noise")
plt.show()

## Gaussian Mixtures

In [None]:
N = range(10, 31)
covariance_types = ["full", "tied", "diag", "spherical"]
silhouette_f = []
silhouette_t = []
silhouette_d = []
silhouette_s = []

for covariance in covariance_types:
    for n in N:
        print("Gaussian Mixture with {} components and covariance type = {}".format(n, covariance))
        print()
        
        gm = GaussianMixture(n_components = n, covariance_type = covariance, random_state=22)
        gm.fit(songs_data_pt_ss)
        
        name = ""
        
        if covariance == "full":
            name = "F"
            silhouette_f.append(silhouette_score(songs_data_pt_ss, gm.predict(songs_data_pt_ss)))
            
        elif covariance == "tied":
            name = "T"
            silhouette_t.append(silhouette_score(songs_data_pt_ss, gm.predict(songs_data_pt_ss)))
            
        elif covariance == "diag":
            name = "D"
            silhouette_d.append(silhouette_score(songs_data_pt_ss, gm.predict(songs_data_pt_ss)))
            
        else:
            name = "S"
            silhouette_s.append(silhouette_score(songs_data_pt_ss, gm.predict(songs_data_pt_ss)))
            
        filename = "../Models/GM_"+str(n)+"_"+name+".pkl"
        with open (filename, "wb") as file:
            pickle.dump(gm, file)

In [None]:
fig, ax = plt.subplots(2,2,figsize=(16,8))

ax[0,0].plot(N, silhouette_f, 'bx-')
ax[0,0].set_xlabel('N')
ax[0,0].set_ylabel('Silhouette')
ax[0,0].set_xticks(np.arange(min(N), max(N)+1, 1.0))
ax[0,0].set_title('Silhouette for covariance type = Full')

ax[0,1].plot(N, silhouette_t, 'bx-')
ax[0,1].set_xlabel('N')
ax[0,1].set_ylabel('Silhouette')
ax[0,1].set_xticks(np.arange(min(N), max(N)+1, 1.0))
ax[0,1].set_title('Silhouette for covariance type = Tied')

ax[1,0].plot(N, silhouette_d, 'bx-')
ax[1,0].set_xlabel('N')
ax[1,0].set_ylabel('Silhouette')
ax[1,0].set_xticks(np.arange(min(N), max(N)+1, 1.0))
ax[1,0].set_title('Silhouette for covariance type = Diag')

ax[1,1].plot(N, silhouette_s, 'bx-')
ax[1,1].set_xlabel('N')
ax[1,1].set_ylabel('Silhouette')
ax[1,1].set_xticks(np.arange(min(N), max(N)+1, 1.0))
ax[1,1].set_title('Silhouette for covariance type = Spherical')

In [None]:
songs_db["Cluster"] = GaussianMixture(n_components = 20, covariance_type="spherical", random_state=22).fit(songs_data_pt_ss).predict(songs_data_pt_ss)
display(songs_db["Cluster"][songs_db["Label"] == "H"].value_counts())
display(songs_db["Cluster"][songs_db["Label"] == "N"].value_counts())

## Agglomerative Clustering

In [None]:
N = range(10, 31)
silhouette = []

for n in N:
    print("Agglomerative Clustering with {} components".format(n))
    print()
        
    aglo = AgglomerativeClustering(distance_threshold=None, n_clusters=n)
    silhouette.append(silhouette_score(songs_data_pt_ss, aglo.fit_predict(songs_data_pt_ss)))

    filename = "../Models/AC_"+str(n)+".pkl"
    with open (filename, "wb") as file:
        pickle.dump(aglo, file)

In [None]:
fig, ax = plt.subplots(1,1,figsize=(16,8))

ax.plot(N, silhouette, 'bx-')
ax.set_xlabel('N')
ax.set_ylabel('Silhouette')
ax.set_xticks(np.arange(min(N), max(N)+1, 1.0))
ax.set_title('Silhouette for Agglomerative Clustering')

In [None]:
songs_db["Cluster"] = AgglomerativeClustering(distance_threshold=None, n_clusters=28).fit_predict(songs_data_pt_ss)
songs_db["Cluster"].value_counts()

## Choosing cluster method

We decided to choose the K-Means with k = 7 due to having the least class imbalance among the 3 models that worked for us, and also a nice amount of clusters.

In [None]:
songs_db.head()

In [None]:
songs_db.to_csv("../Data/Clean/Songs_DB_Clusters.csv")

In [None]:
songs_db[songs_db["Cluster"] == 1].head(50)