In [1]:
# Import Statements
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Capture the current working directory; the data paths below are built
# relative to it. os.getcwd() yields the same string as the IPython `%pwd`
# magic but is plain Python, so this cell also works when the notebook is
# exported to a script or checked by a linter.
import os

working_dir = os.getcwd()

In [3]:
# Load the pre-processed Spotify dataset from the Data/ folder under working_dir
pre_processed_path = working_dir + "/Data/spotify-pre_processed.csv"
spotify = pd.read_csv(pre_processed_path, sep=",")
spotify

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,name,popularity,speechiness,tempo,valence,year,explicit_0,explicit_1,mode_0,mode_1
0,0.991000,['Mamie Smith'],0.598,0.030637,0.224,0cS0A1fUEUd1EW3FcF8AEI,0.000522,0.454545,0.3790,0.741868,Keep A Song In Your Soul,0.12,0.0936,0.615900,0.6340,1920s,1,0,1,0
1,0.643000,"[""Screamin' Jay Hawkins""]",0.852,0.027237,0.517,0hbkKFIJm7Z05H8Zl9w30f,0.026400,0.454545,0.0809,0.825918,I Put A Spell On You,0.07,0.0534,0.356823,0.9500,1920s,1,0,1,0
2,0.993000,['Mamie Smith'],0.647,0.029792,0.186,11m7laMUgmOKqI3oYzuhne,0.000018,0.000000,0.5190,0.750168,Golfing Papa,0.04,0.1740,0.400810,0.6890,1920s,1,0,0,1
3,0.000173,['Oscar Velazquez'],0.730,0.078215,0.798,19Lc5SfJJ5O1oaxY0fpwfh,0.801000,0.181818,0.1280,0.825135,True House Music - Xavier Santos & Carlos Gomi...,0.17,0.0425,0.525640,0.0422,1920s,1,0,0,1
4,0.295000,['Mixe'],0.704,0.030054,0.707,2hJjbsLCytGsnAHfdsLejp,0.000246,0.909091,0.4020,0.845102,Xuniverxe,0.02,0.0768,0.501324,0.2990,1920s,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174384,0.009170,"['DJ Combo', 'Sander-7', 'Tony T']",0.792,0.026752,0.866,46LhBf6TvYjZU2SMvGZAbn,0.000060,0.545455,0.1780,0.859933,The One,0.00,0.0356,0.517324,0.1860,2020s,1,0,1,0
174385,0.795000,['Alessia Cara'],0.429,0.026209,0.211,7tue2Wemjd0FZzRtDrQFZd,0.000000,0.363636,0.1960,0.756949,A Little More,0.00,0.0360,0.388942,0.2280,2020s,1,0,0,1
174386,0.806000,['Roger Fly'],0.671,0.039977,0.589,48Qj61hOdYmUCFJbpQ29Ob,0.920000,0.363636,0.1130,0.745549,Together,0.00,0.0282,0.443757,0.7140,2020s,1,0,1,0
174387,0.920000,['Taylor Swift'],0.462,0.044824,0.240,1gcyHQpBQ1lfXGdhZmWrHP,0.000000,0.000000,0.1130,0.750497,champagne problems,0.69,0.0377,0.703549,0.3200,2020s,0,1,0,1


In [4]:
# Names of the numeric columns, used as the clustering features.
# select_dtypes reads them straight from the column dtypes instead of
# computing a full describe() summary of the data just to get its labels.
# NOTE(review): describe() may also keep datetime columns in some pandas
# versions; none appear in this dataset's displayed columns.
non_pca_names = np.array(spotify.select_dtypes(include="number").columns)
non_pca_names

array(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'popularity',
       'speechiness', 'tempo', 'valence', 'explicit_0', 'explicit_1',
       'mode_0', 'mode_1'], dtype=object)

In [5]:
# Restrict the clustering input to the numeric feature columns
data_for_clustering = spotify.loc[:, non_pca_names]

### Performing exhaustive search for the optimal K value

In [None]:
# Try different values of K to see which K might be the best for K-means:
# fit a model per candidate K and score it with per-sample silhouette values.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn import metrics

# One record per K, collected in a plain list and converted to a DataFrame
# once after the loop. (DataFrame.append was removed in pandas 2.0, and
# appending row by row re-copies the frame on every iteration.)
silhouette_records = []

for k in range(3,21):
    # random_state makes the search reproducible between runs; n_init=10
    # states the number of restarts explicitly instead of relying on the
    # library default.
    kmeans = KMeans(n_clusters=k, max_iter=500, n_init=10, random_state=42)
    # fit_predict fits the model and returns the label of every row in one call
    clusters = kmeans.fit_predict(data_for_clustering)
    cluster_df = pd.DataFrame(clusters, columns=["Cluster"])

    # The label array is passed directly; no round trip through cluster_df needed
    spotify_silhouettes = metrics.silhouette_samples(data_for_clustering, clusters)
    silhouette_mean = spotify_silhouettes.mean()
    print(spotify_silhouettes[:20])
    print("\nMean Silhouette Value: ", silhouette_mean)
    print('K = ',k)

    silhouette_records.append({'K': k, 'Silhouette Values': spotify_silhouettes, 'Silhouette Mean': silhouette_mean})

# Storing K and Silhouette Values for sorting later
df_silhouette_values = pd.DataFrame(silhouette_records, columns=['K','Silhouette Values','Silhouette Mean'])

[0.32491168 0.37797116 0.28125591 0.14657901 0.2201213  0.37278143
 0.36234336 0.37663317 0.32410729 0.10842662 0.25378334 0.27602565
 0.29870697 0.26526979 0.27333595 0.28636967 0.30005237 0.25589705
 0.2634182  0.33359271]

Mean Silhouette Value:  0.25198357009315403
K =  3
[0.36286317 0.08450712 0.28091655 0.14642971 0.15460239 0.37292541
 0.36210528 0.37656634 0.32423277 0.10825167 0.27212918 0.27608676
 0.29881156 0.26527415 0.27293635 0.28590977 0.29993376 0.2558843
 0.26319222 0.37012308]

Mean Silhouette Value:  0.22827000622237326
K =  4


In [None]:
# Rank the candidate K values from best to worst mean silhouette score.
# sort_values alone gives the complete descending ranking, so the extra
# nlargest(50, ...) pass with its arbitrary row limit is not needed.
df_silhouette_values = df_silhouette_values.sort_values('Silhouette Mean', ascending=False)
df_silhouette_values

In [None]:
# The first row is taken as the best K (assumes the frame was sorted by
# mean silhouette, best first, before this cell runs).
optimal_k = int(df_silhouette_values["K"].iloc[0])
print('Optimal K value appears to be', optimal_k, '.')

## Writing Silhouette Values to disk.

In [None]:
# Save the silhouette results, just in case we need to plot them later
# without repeating the search.
import json

df_silhouette_values = df_silhouette_values[['K','Silhouette Values', 'Silhouette Mean']]

# Each 'Silhouette Values' cell holds a whole array. Written as-is, to_csv
# stores the array's str() form, which numpy abbreviates with "..." for long
# arrays, so the individual values could not be read back. Serialise every
# array as a JSON list instead (recover it with json.loads). The 'K' and
# 'Silhouette Mean' columns are written unchanged.
silhouette_export = df_silhouette_values.assign(**{
    'Silhouette Values': df_silhouette_values['Silhouette Values'].map(
        lambda values: json.dumps(np.asarray(values).tolist())
    )
})
silhouette_export.to_csv((working_dir)+"/Data/spotify-kmeans-silhouette_values-without_pca.csv",index = False)

# KMeans Clustering

### We calculated the silhouette values for K = 3 through 20. After 2 hours, we were finally able to confirm that the optimal K value is 13.

In [None]:
# Reading KMeans Silhouette Values From File, so the slow search above does
# not have to be re-run.
df_silhouette_values = pd.read_csv((working_dir)+"/Data/spotify-kmeans-silhouette_values-without_pca.csv", delimiter = ",")

# Select the K with the highest mean silhouette explicitly rather than
# trusting that the first row of the file happens to be the best one.
best_row = df_silhouette_values['Silhouette Mean'].idxmax()
optimal_k = int(df_silhouette_values.loc[best_row, 'K'])
print('Optimal K value appears to be',optimal_k,'.')

### Doing some initial clustering exploration with the non-PCA feature set and with the optimal K

In [None]:
# Passing the feature columns for KMeans processing. `pca_names` is not
# defined anywhere in this notebook as shown (it is the without-PCA variant),
# so the numeric feature list non_pca_names is used.
data_for_clustering = spotify[non_pca_names]

In [None]:
from sklearn.cluster import KMeans

# Final clustering with the chosen K. random_state makes the labels (and the
# class-label file written from them) reproducible between runs; n_init=10
# states the number of restarts explicitly instead of relying on the
# library default.
kmeans = KMeans(n_clusters=optimal_k, max_iter=500, n_init=10, random_state=42)
# fit_predict fits the model and returns each row's cluster label in one call
clusters = kmeans.fit_predict(data_for_clustering)
cluster_df = pd.DataFrame(clusters, columns=["Cluster"])

### Summary Of Centroid Values For Each Cluster

In [None]:
# Summary Of Centroid Values For Each Cluster
pd.options.display.float_format='{:,.2f}'.format
# Label the centroid coordinates with the columns of the frame the model was
# fitted on; `pca_names` is not defined in this notebook as shown.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=data_for_clustering.columns)
centroids

### Summary Of Cluster Sizes

In [None]:
def cluster_sizes(clusters):
    """Count how many instances carry each cluster label.

    Args:
        clusters: array-like of cluster labels, one entry per instance.

    Returns:
        dict mapping each distinct label to the number of instances
        that have it. Empty input gives an empty dict.
    """
    # Count straight from the labels. This avoids indexing the global
    # `spotify` frame with a boolean mask once per cluster, and it also
    # accepts plain lists as well as numpy arrays.
    cluster_labels, counts = np.unique(np.asarray(clusters), return_counts=True)
    return dict(zip(cluster_labels.tolist(), counts.tolist()))

In [None]:
# Report how many rows landed in each cluster, followed by the overall total
size = cluster_sizes(clusters)

for label, count in size.items():
    print("Size of Cluster", label, "= ", count)

cluster_total = sum(size.values())

print('\nTotal:              ',cluster_total)

### Adding class labels to the Spotify dataset.

In [None]:
# Preview the cluster label assigned to each row
cluster_df

In [None]:
# Attach each row's cluster label to the full dataset as a new 'cluster' column
spotify['cluster'] = cluster_df['Cluster']
spotify

### Saving newly created class labels to file, spotify-classlabels-kmeans-without_pca.csv.

In [None]:
# Persist the labelled dataset so that K-means does not have to be
# re-run every time the class labels are needed.
labelled_path = working_dir + "/Data/spotify-classlabels-kmeans-without_pca.csv"
spotify.to_csv(labelled_path, index=False)
