# Optimize clustering into playlists with 5000 songs

# Import libraries and data

In [None]:
# import sys
# !{sys.executable} -m pip install nbformat

In [None]:
import random

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, PowerTransformer
import matplotlib.pyplot as plt
import seaborn as sns

# This will ensure the outputs of the .transform() method are pandas data frames
from sklearn import set_config
set_config(transform_output='pandas')

In [None]:
# Read the song table, strip surrounding whitespace from the header names
# and index every row by its (name, artist) pair.
songs = (
    pd.read_csv("../data/3_spotify_5000_songs.csv")
    .rename(columns=str.strip)
    .set_index(["name", "artist"])
)

# Columns removed before any distance computation.
# Further candidates that are currently kept:
# "time_signature", "duration_ms", "tempo", "mode", "key", "loudness"
dropped_columns = ["id", "html", "type", "Unnamed: 0"]
songs_df = songs.drop(columns=dropped_columns)
songs_df.head()

In [None]:
# Correlation matrix restricted to the numeric feature columns
numeric_features = songs_df.select_dtypes(include='number')
corr = numeric_features.corr()

# Show the matrix as a heatmap without cell annotations
sns.heatmap(data=corr, annot=False, cmap="vlag");

---
# Feature Selection

In [None]:
# TODO: feature selection is not implemented yet

---
# Try various scaling and transformation algorithms

Try out the different scalers and transformers on the Spotify data and compare the results. Which scaler do you feel had the greatest impact? And, maybe, which scaler didn't help at all?

In [None]:
# Baseline: song-to-song distances computed on the unscaled features.
# Rows and columns are both labelled with the (name, artist) index.
raw_distance_matrix = pairwise_distances(songs_df)
songs_distances = pd.DataFrame(
    data=raw_distance_matrix,
    index=songs_df.index,
    columns=songs_df.index,
)

## Min Max

In [None]:
# Fit and apply the MinMax scaler in a single call
# (calling .fit() and .transform() separately would also work).
scaler = MinMaxScaler()
songs_minmax = scaler.fit_transform(songs_df)

# Song-to-song distances on the MinMax-scaled features
minmax_distance_matrix = pairwise_distances(songs_minmax)
songs_minmax_distances = pd.DataFrame(
    data=minmax_distance_matrix,
    index=songs_df.index,
    columns=songs_df.index,
)

## Standard Scaler

In [None]:
# Fit and apply the Standard scaler in a single call
scaler = StandardScaler()
songs_standard = scaler.fit_transform(songs_df)

# Song-to-song distances on the standardized features
standard_distance_matrix = pairwise_distances(songs_standard)
songs_standard_distances = pd.DataFrame(
    data=standard_distance_matrix,
    index=songs_df.index,
    columns=songs_df.index,
)


## Robust Scaler

In [None]:
# Fit and apply the Robust scaler in a single call
scaler = RobustScaler()
songs_robust = scaler.fit_transform(songs_df)

# Song-to-song distances on the robust-scaled features
robust_distance_matrix = pairwise_distances(songs_robust)
songs_robust_distances = pd.DataFrame(
    data=robust_distance_matrix,
    index=songs_df.index,
    columns=songs_df.index,
)

## Quantile Transformer

In [None]:
# Fit and apply a QuantileTransformer (default settings) in a single call
scaler = QuantileTransformer()
songs_quantile = scaler.fit_transform(songs_df)

# Song-to-song distances on the quantile-transformed features
quantile_distance_matrix = pairwise_distances(songs_quantile)
songs_quantile_distances = pd.DataFrame(
    data=quantile_distance_matrix,
    index=songs_df.index,
    columns=songs_df.index,
)

## Power transformer

In [None]:
# Fit and apply a PowerTransformer (default settings) in a single call
scaler = PowerTransformer()
songs_power = scaler.fit_transform(songs_df)

# Song-to-song distances on the power-transformed features
power_distance_matrix = pairwise_distances(songs_power)
songs_power_distances = pd.DataFrame(
    data=power_distance_matrix,
    index=songs_df.index,
    columns=songs_df.index,
)

## Plot all scalings

In [None]:
# List the available column names; one of them is picked as `column_name`
# in the histogram cell below.
songs_df.columns

In [None]:
# Histograms: distribution of one feature under every scaling / transformation
# choose feature here
column_name = 'tempo'

fig, ax = plt.subplots(3, 2, figsize=(20, 20))

# One entry per panel: (frame, number of bins, (row, col) in the grid, title suffix).
# The quantile panel uses 12 bins, all other panels use 10.
histogram_panels = [
    (songs_df,       10, (0, 0), 'without scaling'),
    (songs_minmax,   10, (0, 1), 'with MinMax scaling'),
    (songs_robust,   10, (1, 0), 'with Robust scaling'),
    (songs_standard, 10, (1, 1), 'with Standard scaling'),
    (songs_quantile, 12, (2, 0), 'with Quantile transforming'),
    (songs_power,    10, (2, 1), 'with Power transforming'),
]

for frame, n_bins, (row, col), title_suffix in histogram_panels:
    panel = ax[row, col]
    sns.histplot(data=frame.loc[:, column_name], bins=n_bins, kde=True, ax=panel)
    panel.set_title(f'Distribution of {column_name} {title_suffix}')

plt.show()

In [None]:
# Heatmaps of the pairwise distances (only for 10 sampled songs)

# Draw the sample positions from the real size of the distance matrix:
# a hard-coded upper bound can exceed the number of songs, in which case
# .iloc raises an IndexError. The fixed seed keeps the figure identical
# across re-runs; change it to inspect a different set of songs.
n_songs = len(songs_distances)
samples = random.Random(42).sample(range(n_songs), min(10, n_songs))

fig, ax = plt.subplots(3, 2, figsize=(15, 20))

# One entry per panel: (distance matrix, (row, col) in the grid, title)
heatmap_panels = [
    (songs_distances,          (0, 0), 'NOT in the same 1-10 scaling'),
    (songs_minmax_distances,   (0, 1), 'MinMax scaled'),
    (songs_robust_distances,   (1, 0), 'Robust scaled'),
    (songs_standard_distances, (1, 1), 'Standard scaled'),
    (songs_quantile_distances, (2, 0), 'Quantile transformed'),
    (songs_power_distances,    (2, 1), 'Power transformed'),
]

for distances, (row, col), panel_title in heatmap_panels:
    panel = ax[row, col]
    sns.heatmap(distances.iloc[samples, samples], ax=panel, linewidths=.2)
    # Hide the tick labels on both axes
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_title(panel_title)


---
# Try numbers of clusters
Each playlist should have between 50 and 250 songs. For a dataset with roughly 5000 songs, that means between 20 and 100 clusters.

In [None]:
# Fit K-Means for every number of clusters from 2 to max_k on the
# MinMax-scaled features and collect, per k:
#   - inertia and silhouette score
#   - min / max / mean number of songs per cluster
#   - the label of every song and the cluster centroids
scaled_audio_features = songs_minmax.copy()
inertia_list = []
n_cluster_list = []
silhouette_list = []
songs_Labels_list = []
cluster_centers_list = []
min_count = []
max_count = []
mean_count = []

max_k = 100
for i in range(2, max_k + 1):
    # K-Means starts from randomly initialised centroids; a fixed seed makes
    # the labels and all curves below reproducible across re-runs.
    myKMeans = KMeans(n_clusters=i, random_state=42)
    myKMeans.fit(scaled_audio_features)
    n_cluster_list.append(i)
    # Store the inertia unrounded: rounding to whole numbers flattens the
    # small differences between neighbouring k on scaled data.
    inertia_list.append(myKMeans.inertia_)
    silhouette_list.append(silhouette_score(scaled_audio_features, myKMeans.labels_))

    # Number of songs in each cluster (only the counts are needed)
    counts = np.unique(myKMeans.labels_, return_counts=True)[1]
    min_count.append(counts.min())
    max_count.append(counts.max())
    mean_count.append(counts.mean())

    songs_Labels_list.append(myKMeans.labels_)
    cluster_centers_list.append(myKMeans.cluster_centers_)

# One row per tested number of clusters
clusters_df = pd.DataFrame({'n_clusters': n_cluster_list,
                            'inertia': inertia_list,
                            'silhouette': silhouette_list,
                            'min': min_count,
                            'max': max_count,
                            'mean': mean_count,
                            'labels': songs_Labels_list,
                            'centroids': cluster_centers_list})
clusters_df

In [None]:
# Quick pandas line plot: inertia against the number of clusters
clusters_df.plot.line(x='n_clusters', y='inertia');

In [None]:
# Seaborn version of the inertia plot (adapted from an example)
# Switch the Seaborn theme to darkgrid
sns.set_theme(style='darkgrid')

# Line plot of the inertia scores, one marker per tested number of clusters
inertia_grid = sns.relplot(
    data=clusters_df,
    x='n_clusters',
    y='inertia',
    kind='line',
    marker='o',
    height=8,
    aspect=2,
)
# Title of the plot
inertia_grid.set(title=f"Inertia score from 2 to {max_k} clusters")
# Axis labels
inertia_grid.set_axis_labels("Number of clusters", "Inertia score");

In [None]:
# Quick pandas line plot: silhouette score against the number of clusters
clusters_df.plot.line(x='n_clusters', y='silhouette');

In [None]:
# Plot the average number of songs per playlist, together with the min and max
sns.set_theme(style='darkgrid')

# Reshape to long form so that the three statistics can be drawn as separate
# lines: one row per (n_clusters, statistic) pair.
playlist_sizes = clusters_df.melt(id_vars='n_clusters',
                                  value_vars=['min', 'mean', 'max'],
                                  var_name='statistic',
                                  value_name='songs')

(
# Create a line plot with one line per statistic
sns.relplot(data=playlist_sizes,
            y = 'songs',
            x = 'n_clusters',
            hue = 'statistic',
            kind = 'line',
            marker = 'o',
            height = 8,
            aspect = 2)
# Set the title of the plot
.set(title=f"Number of songs per playlist from 2 to {max_k} clusters")
# Set the axis labels
.set_axis_labels("Number of clusters", "Songs per playlist")
);

---
# Evaluate Clusters

In [None]:
# Select one clustering from the sweep stored in clusters_df.
# NOTE(review): 50 is only a placeholder; pick the final value from the
# inertia, silhouette and playlist-size plots. It must be one of the
# n_clusters values that were actually fitted.
chosen_k = 50
clusters = clusters_df.loc[clusters_df["n_clusters"] == chosen_k, "labels"].iloc[0]

# Attach the cluster output to a copy of the scaled features, so re-running
# this cell never adds a "cluster" column to the data used for fitting.
scaled_features_df = scaled_audio_features.copy()
scaled_features_df["cluster"] = clusters

# Mean of every feature within each cluster
scaled_features_df.groupby(by="cluster").mean()

---
# Suggestions beyond the algorithm
* include songs liked by user
* include popular songs
* -> songs could be in more than one playlist
* include curated lists!!!

Possible data sources: either the Spotify API or a Kaggle dataset.