In [1]:
# Import Statements
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Assign %pwd to working_dir
working_dir = %pwd

In [3]:
# Reading Pre Processed Data File
spotify = pd.read_csv((working_dir)+"/Data/spotify-pre_processed.csv", delimiter = ",")

In [4]:
pca_names = np.array(
['mode_0',
 'acousticness',
 'explicit_0',
 'instrumentalness',
 'key',
 'valence',
 'popularity',
 'liveness',
 'popularity'])

In [5]:
data_for_clustering = spotify[pca_names]

### Performing exhaustive search for the optimal K value

In [6]:
# try different values of K to see which K might be the best for K-means
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn import metrics

# Storing K and Silhouette Values for sorting later
df_silhouette_values = pd.DataFrame(columns=['K','Silhouette Values'])

for k in range(3, 50):
    kmeans = KMeans(n_clusters=k, max_iter=500)
    kmeans.fit(data_for_clustering)
    clusters = kmeans.predict(data_for_clustering)
    cluster_df = pd.DataFrame(clusters, columns=["Cluster"])

    spotify_silhouettes = metrics.silhouette_samples(data_for_clustering, cluster_df.to_numpy().ravel())
    print(spotify_silhouettes[:20])
    print("\nMean Silhouette Value: ", spotify_silhouettes.mean())
    print('K = ',k)
    
    df_silhouette_values = df_silhouette_values.append({'K': k, 'Silhouette Values': spotify_silhouettes.mean() }, ignore_index=True)

[0.20843374 0.27364901 0.27380139 0.08847232 0.19540075 0.37498273
 0.38801549 0.38159028 0.32748531 0.03257381 0.20181465 0.29907067
 0.32050407 0.15735702 0.31152153 0.32988927 0.30281849 0.31373351
 0.31299063 0.18914225]

Mean Silhouette Value:  0.22606823152592015
K =  3
[0.16952759 0.24970205 0.35807947 0.02374915 0.21537075 0.23243813
 0.48128407 0.31892027 0.27636922 0.10930472 0.16638381 0.22851017
 0.25504978 0.13906323 0.33823835 0.37825711 0.30895121 0.21021888
 0.13379315 0.15784305]

Mean Silhouette Value:  0.21724444301914264
K =  4
[ 0.35466559  0.14669009  0.33520181  0.13837096  0.1073196   0.32753948
  0.44624678  0.26973783  0.34574795  0.20971215  0.38065015  0.29018383
  0.32632559  0.08606708  0.29313613  0.33521031  0.27379586  0.28351711
 -0.02248808  0.41264962]

Mean Silhouette Value:  0.22766393734604795
K =  5
[ 0.320925    0.07867747  0.33840139  0.10701758  0.11438723  0.32474031
  0.45967793  0.28264636  0.34418394  0.18310585  0.3505237   0.28812254
  0

[ 0.10450075  0.05282192  0.18550732  0.25877201  0.24671959  0.06515822
  0.05049253  0.298956    0.16781293  0.29617741  0.04219723  0.1684433
  0.14065445  0.10669294  0.41988601  0.41222437  0.38155355  0.08558918
 -0.01166071  0.11641803]

Mean Silhouette Value:  0.2077798328677678
K =  32
[ 0.15233177  0.07483415  0.19817323  0.25898222  0.38318404  0.06279791
  0.05505772  0.25553378  0.16017974  0.29484866  0.03627849  0.15971265
  0.13021889 -0.02212122  0.34680849  0.37220543  0.37531607  0.09768771
  0.03283847  0.18370689]

Mean Silhouette Value:  0.20985221733297943
K =  33
[ 0.03802397  0.20818474  0.32985166  0.25857053  0.38097852  0.06065197
 -0.02149043  0.28375922  0.1550808   0.29543124  0.02801693  0.15489163
  0.13024748 -0.02548388  0.24469062  0.32673577  0.37787513  0.09748382
  0.12693081  0.15407528]

Mean Silhouette Value:  0.21016291173664065
K =  34
[ 0.12637992  0.26405108  0.20498924  0.25935411  0.38097852  0.06222168
  0.07008117  0.27260554  0.1567056

In [7]:
df_silhouette_values = df_silhouette_values.sort_values('Silhouette Values', ascending=False).nlargest(50, 'Silhouette Values')
df_silhouette_values

Unnamed: 0,K,Silhouette Values
10,13.0,0.242631
3,6.0,0.241541
8,11.0,0.237546
11,14.0,0.235171
6,9.0,0.234827
12,15.0,0.231603
9,12.0,0.230541
2,5.0,0.227664
13,16.0,0.226723
0,3.0,0.226068


In [23]:
optimal_k = int(df_silhouette_values.K.iloc[0])
print('Optimal K value appears to be',optimal_k,'.')

Optimal K value appears to be 13 .


## Writing Silhouette Values to disk.

In [46]:
df_silhouette_values = df_silhouette_values[['K','Silhouette Values']]
df_silhouette_values.to_csv((working_dir)+"/Data/spotify-kmeans-silhouette_values.csv",index = False)