In [4]:
# Import Statements
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

# KMeans Clustering - With PCA

In [5]:
# Store the current working directory as a plain string. os.getcwd() is the
# pure-Python equivalent of the IPython-only `%pwd` magic, so this cell also
# runs when the notebook is exported to a regular .py script.
working_dir = os.getcwd()

In [6]:
# Load the PCA feature table from the Data folder under working_dir.
pca_csv_path = working_dir + "/Data/spotify-pca.csv"
spotify = pd.read_csv(pca_csv_path, sep=",")

In [7]:
# Column names of the six features used as K-means inputs.
pca_names = np.asarray([
    'mode_0', 'acousticness', 'explicit_0',
    'instrumentalness', 'key', 'valence',
])

In [8]:
# Keep only the selected feature columns as the clustering input.
data_for_clustering = spotify.loc[:, pca_names]

### Performing exhaustive search for the optimal K value

In [9]:
# Try every K from 3 through 20 and score each clustering by its per-sample
# silhouette values, so the K with the best mean silhouette can be picked later.
# KMeans and metrics come from the import cell at the top of the notebook.

# Fixed seed so that re-running the search reproduces the same clusterings.
RANDOM_STATE = 42

# One record per K; the DataFrame is built once after the loop instead of
# being grown row by row (DataFrame.append was removed in pandas 2.0).
silhouette_records = []

for k in range(3, 21):
    kmeans = KMeans(n_clusters=k, max_iter=500, random_state=RANDOM_STATE)
    # fit_predict fits the model and returns the label of every training row.
    clusters = kmeans.fit_predict(data_for_clustering)

    spotify_silhouettes = metrics.silhouette_samples(data_for_clustering, clusters)
    silhouette_mean = spotify_silhouettes.mean()
    print(spotify_silhouettes[:20])
    print("\nMean Silhouette Value: ", silhouette_mean)
    print('K = ',k)

    silhouette_records.append({
        'K': k,
        'Silhouette Values': spotify_silhouettes,
        'Silhouette Mean': silhouette_mean,
    })

# Storing K and Silhouette Values for sorting later
df_silhouette_values = pd.DataFrame(
    silhouette_records,
    columns=['K', 'Silhouette Values', 'Silhouette Mean'],
)

[0.26427766 0.34028793 0.29227394 0.14195452 0.20926535 0.37093779
 0.40483117 0.38915462 0.31081375 0.12711734 0.27977531 0.27980631
 0.3047757  0.31113494 0.29979872 0.32308749 0.30054641 0.29279668
 0.28278382 0.26849683]

Mean Silhouette Value:  0.26727154297297634
K =  3
[ 0.21570664  0.29049923  0.39892645  0.1062708   0.1987046   0.36628428
  0.53010467  0.35240176  0.37461633  0.14358829  0.23495029  0.3077633
  0.36523961  0.27209661  0.36425293  0.41978495  0.33813308  0.30413462
 -0.08190376  0.24298039]

Mean Silhouette Value:  0.2687557507282339
K =  4
[ 0.2545575   0.32419337  0.3797917   0.06362764  0.25742861  0.37205963
  0.5175149   0.36066013  0.37901774  0.10058099  0.27370139  0.31067384
  0.36971218  0.33606822  0.35902858  0.40065028  0.34421517  0.30737733
 -0.08823515  0.28463024]

Mean Silhouette Value:  0.29200734041582244
K =  5
[ 0.346566   -0.00208442  0.37938087  0.06522617  0.20676732  0.37035341
  0.51725482  0.36163773  0.37781989  0.10228324  0.352085

In [10]:
# Rank the candidate K values by mean silhouette, best first. nlargest already
# returns its rows in descending order of the column, so a separate
# sort_values call beforehand is unnecessary.
df_silhouette_values = df_silhouette_values.nlargest(50, 'Silhouette Mean')
df_silhouette_values

Unnamed: 0,K,Silhouette Values,Silhouette Mean
10,13,"[0.5043133969549626, 0.12129033611852735, 0.45...",0.325729
11,14,"[0.5041760651787965, 0.12082550614254375, 0.44...",0.322958
12,15,"[0.5041677375375535, 0.12082837722410662, 0.45...",0.320459
8,11,"[0.5027327944517269, 0.171960102762028, 0.4540...",0.3164
13,16,"[0.10999961368428665, 0.06828125808006094, 0.1...",0.313898
16,19,"[0.12398628113536839, 0.06323588640800475, 0.2...",0.312377
9,12,"[0.5027327944517269, 0.171960102762028, 0.4549...",0.310722
14,17,"[0.5041807665896382, 0.12088230678792537, 0.24...",0.310606
17,20,"[0.12452963488841297, 0.06281076180394472, 0.1...",0.30918
15,18,"[0.12408768859075439, 0.0635879679059271, 0.23...",0.307994


In [11]:
# The frame is ordered best-first by the previous cell, so the first row
# carries the K with the highest mean silhouette.
top_ranked_k = df_silhouette_values['K'].iloc[0]
optimal_k = int(top_ranked_k)
print('Optimal K value appears to be', optimal_k, '.')

Optimal K value appears to be 13 .


## Writing Silhouette Values to disk.

In [12]:
# Saved Silhouette Values, Just In Case We Need To Plot Them Later.
df_silhouette_values = df_silhouette_values[['K','Silhouette Values', 'Silhouette Mean']]
df_silhouette_values.to_csv((working_dir)+"/Data/spotify-kmeans-silhouette_values.csv",index = False)

### We calculated the Silhouette values for K = 3 through 20 (`range(3, 21)`).  After 2 hours, we were finally able to confirm that the optimal K value is 13

In [14]:
# Reading KMeans Silhouette Values From File, so the optimal K can be
# recovered without repeating the search.
silhouette_csv_path = working_dir + "/Data/spotify-kmeans-silhouette_values.csv"
df_silhouette_values = pd.read_csv(silhouette_csv_path, delimiter=",")

# Select the K with the highest mean silhouette explicitly rather than
# trusting that the first row of the file happens to be the best one.
best_row = df_silhouette_values['Silhouette Mean'].idxmax()
optimal_k = int(df_silhouette_values.loc[best_row, 'K'])
print('Optimal K value appears to be',optimal_k,'.')

Optimal K value appears to be 13 .


### Doing some initial clustering exploration with the 6 features discovered in our PCA and with the optimal K

In [15]:
# Select the pca_names columns of the dataset as the input for K-means.
data_for_clustering = spotify.loc[:, pca_names]

In [16]:
# Fit the final K-means model at the chosen K. KMeans comes from the import
# cell at the top of the notebook. A fixed random_state makes the cluster
# labels (which are saved to disk further down) reproducible across runs.
kmeans = KMeans(n_clusters=optimal_k, max_iter=500, random_state=42)

# fit_predict fits the model and returns the cluster label of every row.
clusters = kmeans.fit_predict(data_for_clustering)
cluster_df = pd.DataFrame(clusters, columns=["Cluster"])

### Summary Of Centroid Values For Each Cluster

In [17]:
# Tabulate the fitted centroids: one row per cluster, one column per feature.
# Floats are displayed with two decimals and a thousands separator.
pd.set_option('display.float_format', '{:,.2f}'.format)
centroids = pd.DataFrame(data=kmeans.cluster_centers_, columns=pca_names)
centroids

Unnamed: 0,mode_0,acousticness,explicit_0,instrumentalness,key,valence
0,-0.0,0.09,1.0,0.77,0.43,0.45
1,-0.0,0.8,1.0,0.03,0.73,0.54
2,1.0,0.84,1.0,0.04,0.48,0.5
3,-0.0,0.17,1.0,0.03,0.14,0.62
4,-0.0,0.82,1.0,0.04,0.18,0.48
5,-0.0,0.93,1.0,0.83,0.45,0.4
6,1.0,0.07,1.0,0.79,0.61,0.44
7,-0.0,0.16,1.0,0.02,0.71,0.61
8,1.0,0.16,1.0,0.03,0.29,0.58
9,1.0,0.92,1.0,0.82,0.47,0.39


### Summary Of Cluster Sizes

In [18]:
def cluster_sizes(clusters):
    """Count how many instances were assigned to each cluster label.

    Args:
        clusters: Array-like of cluster labels, one entry per instance.

    Returns:
        dict mapping each distinct label (in ascending order) to the number
        of instances carrying that label. Empty input gives an empty dict.
    """
    # Count the labels directly so the result depends only on the argument,
    # not on the global `spotify` frame having the same length as `clusters`.
    labels, counts = np.unique(clusters, return_counts=True)
    return {label: int(count) for label, count in zip(labels, counts)}

In [19]:
# Report the number of rows in each cluster, followed by the overall total.
size = cluster_sizes(clusters)

for label, count in size.items():
    print("Size of Cluster", label, "= ", count)

cluster_total = sum(size.values())
print('\nTotal:              ',cluster_total)

Size of Cluster 0 =  7290
Size of Cluster 1 =  23732
Size of Cluster 2 =  14037
Size of Cluster 3 =  20718
Size of Cluster 4 =  23099
Size of Cluster 5 =  16989
Size of Cluster 6 =  4760
Size of Cluster 7 =  23589
Size of Cluster 8 =  10617
Size of Cluster 9 =  8014
Size of Cluster 10 =  7071
Size of Cluster 11 =  9692
Size of Cluster 12 =  4781

Total:               174389


### Adding class labels to the Spotify dataset.

In [20]:
# Preview the per-row cluster assignments. (The earlier
# `cluster_df.to_numpy().ravel()` line was dropped: its result was discarded,
# so it had no effect.)
cluster_df

Unnamed: 0,Cluster
0,2
1,2
2,4
3,0
4,12
...,...
174384,8
174385,4
174386,9
174387,10


In [23]:
# Add the cluster each row belongs to as a new column of the whole dataset.
# The "Cluster" Series is selected explicitly rather than assigning the whole
# one-column DataFrame, so the assignment does not rely on pandas unwrapping
# a DataFrame into a single column.
spotify['cluster'] = cluster_df['Cluster']
spotify

Unnamed: 0,artists,id,name,mode_0,acousticness,explicit_0,instrumentalness,key,valence,cluster
0,['Mamie Smith'],0cS0A1fUEUd1EW3FcF8AEI,Keep A Song In Your Soul,1,0.99,1,0.00,0.45,0.63,2
1,"[""Screamin' Jay Hawkins""]",0hbkKFIJm7Z05H8Zl9w30f,I Put A Spell On You,1,0.64,1,0.03,0.45,0.95,2
2,['Mamie Smith'],11m7laMUgmOKqI3oYzuhne,Golfing Papa,0,0.99,1,0.00,0.00,0.69,4
3,['Oscar Velazquez'],19Lc5SfJJ5O1oaxY0fpwfh,True House Music - Xavier Santos & Carlos Gomi...,0,0.00,1,0.80,0.18,0.04,0
4,['Mixe'],2hJjbsLCytGsnAHfdsLejp,Xuniverxe,1,0.29,0,0.00,0.91,0.30,12
...,...,...,...,...,...,...,...,...,...,...
174384,"['DJ Combo', 'Sander-7', 'Tony T']",46LhBf6TvYjZU2SMvGZAbn,The One,1,0.01,1,0.00,0.55,0.19,8
174385,['Alessia Cara'],7tue2Wemjd0FZzRtDrQFZd,A Little More,0,0.80,1,0.00,0.36,0.23,4
174386,['Roger Fly'],48Qj61hOdYmUCFJbpQ29Ob,Together,1,0.81,1,0.92,0.36,0.71,9
174387,['Taylor Swift'],1gcyHQpBQ1lfXGdhZmWrHP,champagne problems,0,0.92,0,0.00,0.00,0.32,10


### Saving newly created class labels to file, spotify-classlabels-kmeans.csv.

In [24]:
# Write the dataset, including its new cluster column, to disk so the labels
# can be reused without rerunning K-means.
labels_csv_path = working_dir + "/Data/spotify-classlabels-kmeans.csv"
spotify.to_csv(labels_csv_path, index=False)