# Making clusters

#### Importing libraries

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from random import randint
from time import sleep
from itertools import islice
from pandas import json_normalize

# Reading out the data and creating a dataframe

In [2]:
# Read the 100 CSV parts (./Data/part1.csv ... ./Data/part100.csv) into a list of frames.
df_list = [
    pd.read_csv('./Data/part'+str(part_no)+'.csv')
    for part_no in range(1,101)
]

In [3]:
# Stack the partial tables into one frame. ignore_index=True gives the result
# a fresh 0..n-1 index; without it every part keeps its own row labels, so the
# combined frame carries duplicated index values that break label-based
# lookups and alignment with frames derived from it later on.
df_songs = pd.concat(df_list, ignore_index=True)

In [4]:
# Preview the first three rows of the combined song table.
df_songs.head(3)

Unnamed: 0,uri,title,artist_name,artist_id,album_id,album_name,length,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:7zgqtptZvhf8GEmdsM2vp2,...Ready For It?,Taylor Swift,06HL4z0CvFAxyc27GXpf02,0HG8fMDhvN2tH5uPHFsyZP,...Ready For It?,208198,False,0,0.615,0.779,2,-6.454,1,0.135,0.0665,0.0,0.155,0.453,160.0
1,spotify:track:4Vxu50qVrQcycjRyJQaZLC,Life Changes,Thomas Rhett,6x2LnllRG5uGarZMsD4iO8,4w5Jvreahp3yvLqc4vCr9I,Life Changes,190226,False,63,0.687,0.845,7,-4.37,1,0.0576,0.1,0.0,0.0452,0.809,87.972
2,spotify:track:6b8Be6ljOzmkOmFslEb23P,24K Magic,Bruno Mars,0du5cEVh5yTK9QJze8zA0C,4PgleR09JVnm3zY1fW3XBA,24K Magic,225983,False,81,0.818,0.803,1,-4.282,1,0.0797,0.034,0.0,0.153,0.632,106.97


In [5]:
# (rows, columns) of the combined song table.
df_songs.shape

(10000, 20)

# Preparing the dataframe

#### More imports

In [None]:
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
from matplotlib.lines import Line2D
from sklearn.cluster import KMeans

#### Reducing columns

In [None]:
# List every available column before choosing the clustering features.
df_songs.columns

Most of the columns we have are not needed for the clustering process, so we keep only the audio features.

In [None]:
# Audio-feature columns used as clustering input; identifiers, titles,
# length, explicit flag and popularity are left out.
audio_feature_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]
df_prep = df_songs[audio_feature_columns]
df_prep.columns

#### Cleaning

In [None]:
# Count the missing values in each audio-feature column.
df_prep.isna().sum()

In [None]:
# Inspect the dtype of each audio-feature column.
df_prep.dtypes

There is nothing to clean

#### Scaling

In [None]:
# Create the scaler, then let it learn the per-column statistics of df_prep.
# The bare name on the last line displays the fitted scaler.
scaler = StandardScaler()
scaler.fit(df_prep)
scaler

In [None]:
# Apply the fitted scaler, then wrap the result in a DataFrame that reuses
# the feature column names of df_prep.
normalized = scaler.transform(df_prep)
df_normalized = pd.DataFrame(data=normalized, columns=list(df_prep.columns))

In [None]:
# Show the first five rows of the scaled feature table.
df_normalized.head(5)

# Clustering with KMeans

In [None]:
# Initialize KMeans with 10 clusters. The seed is fixed (same value as in the
# elbow search further down) so that centroid initialisation, and with it the
# cluster labels and the inertia, is reproducible across kernel restarts.
kmeans = cluster.KMeans(n_clusters=10, random_state=1234)

In [None]:
# Fit the KMeans model on the standardized audio features.
kmeans.fit(df_normalized)

In [None]:
# Assign each song (row of df_normalized) to a cluster of the fitted model.
pred = kmeans.predict(df_normalized)

In [None]:
# Number of predicted cluster labels; should equal the number of songs.
len(pred)

In [None]:
# Distinct cluster labels that were assigned (separated into 10 clusters)
pd.Series(pred).unique()

# Evaluation

In [None]:
# Inertia of the fitted model; lower values mean tighter clusters.
kmeans.inertia_

# Choosing k

In [None]:
# We try out different values for K, since grouping music in just 2 categories is just not
# useful we start off with 10
K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(df_normalized)
    inertia.append(kmeans.inertia_)

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')
plt.show()

We don't have a clear elbow point here, so we try to improve the result with PCA.

# Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA without limiting the number of components, so that the explained
# variance of every component can be inspected below.
pca = PCA()
pca.fit(df_normalized)

#### Inspecting the components

In [None]:
# Share of the total variance explained by each component, rounded to two decimals.
pca.explained_variance_ratio_.round(2)

In [None]:
# Combined share of variance explained by the first eight components.
pca.explained_variance_ratio_[:8].sum()

Apparently we need many of the components in order not to lose too much information. Nevertheless, we try modeling again with the reduced dimensions.

# Modeling with reduced dimensions

#### Setting PCA to 85%

In [None]:
# Refit PCA (rebinding `pca`), this time keeping just enough components to
# explain 85 % of the variance.
pca = PCA(n_components=0.85)
pca.fit(df_normalized)

In [None]:
# Project the standardized features onto the retained principal components.
# NOTE(review): despite the df_ prefix this is presumably a NumPy array, not a DataFrame - confirm.
df_normalized_pca = pca.transform(df_normalized)

#### Model

In [None]:
K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(df_normalized_pca)
    inertia.append(kmeans.inertia_)

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')
plt.show()

The curve looks a little more defined, but I would still apply K = 10.

#### Comparing

In [None]:
# Baseline: 10 clusters on the full standardized feature set. Seeded so the
# inertia shown here is reproducible when the notebook is re-run.
kmeans = cluster.KMeans(n_clusters=10, random_state=1234)
kmeans.fit(df_normalized)
kmeans.inertia_

In [None]:
# 10 clusters on the PCA-reduced features. Seeded so that the cluster ids of
# the model that gets pickled later stay the same across re-runs.
kmeans_pca = KMeans(n_clusters=10, random_state=1234)
kmeans_pca.fit(df_normalized_pca)
kmeans_pca.inertia_

# Correction

After integrating our KMeans model into our recommender, I could see that the recommended songs feel very random. My guess is that k = 10 looks good technically but is not suitable for this use case.
We try again with a higher value of k.

In [None]:
# 30 clusters on the PCA-reduced features. Seeded so that the cluster ids of
# the model that gets pickled later stay the same across re-runs.
kmeans_pca_30 = KMeans(n_clusters=30, random_state=1234)
kmeans_pca_30.fit(df_normalized_pca)
kmeans_pca_30.inertia_

#### We try 45 classes

In [None]:
# 45 clusters on the PCA-reduced features. Seeded so that the cluster ids of
# the model that gets pickled later stay the same across re-runs.
kmeans_pca_45 = KMeans(n_clusters=45, random_state=1234)
kmeans_pca_45.fit(df_normalized_pca)
kmeans_pca_45.inertia_

# Saving the classifier
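We save the models fitted on the PCA-transformed data because of their lower inertia. (Keep in mind that inertia is measured in the space the model was fitted in, so part of the reduction simply comes from the dimensions that PCA dropped.)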

In [None]:
import pickle

In [None]:
# Persist the 10-cluster PCA model. The context manager closes the file
# handle even if pickling raises, so the file is always flushed to disk.
with open('kmeans10.p', 'wb') as model_file:
    pickle.dump(kmeans_pca, model_file)

In [None]:
# Persist the 30-cluster PCA model. The context manager closes the file
# handle even if pickling raises, so the file is always flushed to disk.
with open('kmeans30.p', 'wb') as model_file:
    pickle.dump(kmeans_pca_30, model_file)

In [None]:
# Persist the 45-cluster PCA model. The context manager closes the file
# handle even if pickling raises, so the file is always flushed to disk.
with open('kmeans45.p', 'wb') as model_file:
    pickle.dump(kmeans_pca_45, model_file)

In [None]:
# Persist the fitted PCA so new songs can be projected the same way before
# clustering. The context manager closes the file handle even if pickling raises.
with open('pca.p', 'wb') as pca_file:
    pickle.dump(pca, pca_file)

In [None]:
# Persist the fitted scaler so new songs can be standardized the same way.
# The context manager closes the file handle even if pickling raises.
with open('scaler.p', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)