In [16]:
# Import Statements
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
# Assign %pwd to working_dir
working_dir = %pwd

In [23]:
# Reading PCA Features From Disk
spotify = pd.read_csv((working_dir)+"/Data/spotify-pca.csv", delimiter = ",")

In [27]:
spotify

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence
0,['Mamie Smith'],0cS0A1fUEUd1EW3FcF8AEI,Keep A Song In Your Soul,1920s,1,0.991000,0,0.000522,0.454545,0.6340
1,"[""Screamin' Jay Hawkins""]",0hbkKFIJm7Z05H8Zl9w30f,I Put A Spell On You,1920s,1,0.643000,0,0.026400,0.454545,0.9500
2,['Mamie Smith'],11m7laMUgmOKqI3oYzuhne,Golfing Papa,1920s,0,0.993000,0,0.000018,0.000000,0.6890
3,['Oscar Velazquez'],19Lc5SfJJ5O1oaxY0fpwfh,True House Music - Xavier Santos & Carlos Gomi...,1920s,0,0.000173,0,0.801000,0.181818,0.0422
4,['Mixe'],2hJjbsLCytGsnAHfdsLejp,Xuniverxe,1920s,1,0.295000,1,0.000246,0.909091,0.2990
...,...,...,...,...,...,...,...,...,...,...
174384,"['DJ Combo', 'Sander-7', 'Tony T']",46LhBf6TvYjZU2SMvGZAbn,The One,2020s,1,0.009170,0,0.000060,0.545455,0.1860
174385,['Alessia Cara'],7tue2Wemjd0FZzRtDrQFZd,A Little More,2020s,0,0.795000,0,0.000000,0.363636,0.2280
174386,['Roger Fly'],48Qj61hOdYmUCFJbpQ29Ob,Together,2020s,1,0.806000,0,0.920000,0.363636,0.7140
174387,['Taylor Swift'],1gcyHQpBQ1lfXGdhZmWrHP,champagne problems,2020s,0,0.920000,1,0.000000,0.000000,0.3200


In [26]:
pca_names = np.array([spotify.describe().columns])
pca_names

array([['mode_0', 'acousticness', 'explicit_1', 'instrumentalness',
        'key', 'valence']], dtype=object)

In [22]:
data_for_clustering = spotify[pca_names]

### Performing exhaustive search for the optimal K value

In [None]:
# try different values of K to see which K might be the best for K-means
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn import metrics

# Storing K and Silhouette Values for sorting later
df_silhouette_values = pd.DataFrame(columns=['K','Silhouette Values','Silhouette Mean'])

for k in range(3,21):
    kmeans = KMeans(n_clusters=k, max_iter=500)
    kmeans.fit(data_for_clustering)
    clusters = kmeans.predict(data_for_clustering)
    cluster_df = pd.DataFrame(clusters, columns=["Cluster"])

    spotify_silhouettes = metrics.silhouette_samples(data_for_clustering, cluster_df.to_numpy().ravel())
    print(spotify_silhouettes[:20])
    print("\nMean Silhouette Value: ", spotify_silhouettes.mean())
    print('K = ',k)
    
    df_silhouette_values = df_silhouette_values.append({'K': k, 'Silhouette Values': spotify_silhouettes, 'Silhouette Mean':spotify_silhouettes.mean() }, ignore_index=True)

In [None]:
df_silhouette_values = df_silhouette_values.sort_values('Silhouette Mean', ascending=False).nlargest(50, 'Silhouette Mean')
df_silhouette_values

In [None]:
optimal_k = int(df_silhouette_values.K.iloc[0])
print('Optimal K value appears to be',optimal_k,'.')

## Writing Silhouette Values to disk.

In [None]:
# Saved Silhouette Values, Just In Case We Need To Plot Them Later.
df_silhouette_values = df_silhouette_values[['K','Silhouette Values', 'Silhouette Mean']]
df_silhouette_values.to_csv((working_dir)+"/Data/spotify-kmeans-silhouette_values.csv",index = False)

# *****Everything Below Can Be Copied To Final Project Notebook*****

# KMeans Clustering
### The bulk of the work was performed on separate Notebook, Final Project - KMeans.ipynb.  

### We caclulated the Silhouette values for K = {3:21}.  After 2 hours, we were finally able to confirm that the optimal K value is 13

In [None]:
# Reading KMeans Silhouette Values From File
df_silhouette_values = pd.read_csv((working_dir)+"/Data/spotify-kmeans-silhouette_values.csv", delimiter = ",")

optimal_k = int(df_silhouette_values.K.iloc[0])
print('Optimal K value appears to be',optimal_k,'.')

### Doing some initial clustering exploration with the 6 features discovered in our PCA and with the optimal K

In [None]:
# Passing pca_names for KMeans processing
data_for_clustering = spotify[pca_names]

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=optimal_k, max_iter=500)
kmeans.fit(data_for_clustering)
clusters = kmeans.predict(data_for_clustering)
cluster_df = pd.DataFrame(clusters, columns=["Cluster"])

### Summary Of Centroid Values For Each Cluster

In [None]:
# Summary Of Centroid Values For Each Cluster
pd.options.display.float_format='{:,.2f}'.format
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=pca_names)
centroids

### Summary Of Cluster Sizes

In [None]:
# Function: cluster_sizes
# Accepts:  cluster labels
# Returns:  size of cluster labels
def cluster_sizes(clusters):
    # clusters is an array of cluster labels
    # for each instance in the data

    size = {}
    cluster_labels = np.unique(clusters)
    n_clusters = cluster_labels.shape[0]

    for c in cluster_labels:
        size[c] = len(spotify[clusters == c])
    return size

In [None]:
# Summary Of Cluster Sizes
size = cluster_sizes(clusters)
cluster_total = 0

for c in size.keys():
    print("Size of Cluster", c, "= ", size[c])
    cluster_total += size[c]

print('\nTotal:              ',cluster_total)

### Adding class labels to the Spotify dataset.

In [None]:
cluster_df.to_numpy().ravel()
cluster_df

In [None]:
# adding the what cluster the row belongs to for the whole dataset
spotify['cluster'] = cluster_df
spotify

### Saving newly created class labels to file, spotify-classlabels-kmeans.csv.

In [None]:
# saving labels to file, so that we don't
# have to rerun K-means all the time.
spotify.to_csv((working_dir)+"/Data/spotify-classlabels-kmeans.csv", index = False)

### Inspecting Some Rows, TBD?

In [None]:
#inspecting some rows
pd.set_option('display.max_columns', 500)

#spotify.loc[spotify['cluster'] == 0].sample(n=20) # seems to be mid to high-energy poppy
spotify.loc[spotify['cluster'] == 1].sample(n=20) # seems to be mellow instrumental
#spotify.loc[spotify['cluster'] == 6].sample(n=20) # seems to be a more easy-listening collection