# Clustering the songs

### Import the necessary libraries

In [None]:
import os
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

### Load the data and set `song_name` and `artist` as the index
### Drop unnecessary columns (features / dimensions)

In [None]:
# Small dataset (10 songs).
# Read the CSV with artist + song_name as a two-level index, then remove the
# columns that are not wanted as clustering dimensions.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_10_songs\df_audio_features_10.csv"
songs_data_df = pd.read_csv(path, index_col=["artist", "song_name"])
songs_data_clean_df = songs_data_df.drop(
    columns=["duration_ms", "time_signature", "id", "html"],
)

In [None]:
# Medium dataset (1000 songs).
# Read the CSV with artist + name as a two-level index, then remove the
# columns that are not wanted as clustering dimensions.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_1000_songs\df_audio_features_1000.csv"
songs_data_df = pd.read_csv(path, index_col=["artist", "name"])
songs_data_clean_df = songs_data_df.drop(
    columns=["type", "duration_ms", "time_signature", "id", "html"],
)

In [None]:
# Big dataset (5000 songs).
# Read the CSV with artist + name as a two-level index, then remove the
# columns that are not wanted as clustering dimensions.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_5000_songs\df_audio_features_5000.csv"
songs_data_df = pd.read_csv(path, index_col=["artist", "name"])
songs_data_clean_df = songs_data_df.drop(
    columns=["type", "duration_ms", "time_signature", "id", "html"],
)

In [None]:
# Display the cleaned frame (whichever dataset cell was run last) as the cell output.
songs_data_clean_df

### Features to look at:

In [None]:
# Column names used as the horizontal (x) and vertical (y) axes of every
# scatter plot below.
x, y = "energy", "danceability"

### Plot the cleaned data for the two features defined above

In [None]:
# Scatter of the unscaled data for the two selected features.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=x, y=y, data=songs_data_clean_df, ax=ax)
ax.set_title(f'Looking for a relation of {x} and {y}')
plt.show()

# Data Scaling (SciKit-Learn)

In [None]:
# Min-max scaling: fit the scaler on the cleaned data and transform it, then
# wrap the resulting array in a frame that keeps the original row index and
# column labels.
mima_scaler = MinMaxScaler()
song_sk_pre_mima = mima_scaler.fit_transform(songs_data_clean_df)
song_sk_pre_mima_df = pd.DataFrame(
    data=song_sk_pre_mima,
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.columns,
)
song_sk_pre_mima_df

In [None]:
# Robust scaling: fit the scaler on the cleaned data and transform it, then
# wrap the resulting array in a frame that keeps the original row index and
# column labels.
robu_scaler = RobustScaler()
song_sk_pre_robu = robu_scaler.fit_transform(songs_data_clean_df)
song_sk_pre_robu_df = pd.DataFrame(
    data=song_sk_pre_robu,
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.columns,
)
song_sk_pre_robu_df

In [None]:
# Standard scaling: fit the scaler on the cleaned data and transform it, then
# wrap the resulting array in a frame that keeps the original row index and
# column labels.
stand_scaler = StandardScaler()
song_sk_pre_stand = stand_scaler.fit_transform(songs_data_clean_df)
song_sk_pre_stand_df = pd.DataFrame(
    data=song_sk_pre_stand,
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.columns,
)
song_sk_pre_stand_df

In [None]:
# Quantile transformation: fit the transformer on the cleaned data and
# transform it, then wrap the resulting array in a frame that keeps the
# original row index and column labels.
quat_transformer = QuantileTransformer()
song_sk_pre_quat = quat_transformer.fit_transform(songs_data_clean_df)
song_sk_pre_quat_df = pd.DataFrame(
    data=song_sk_pre_quat,
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.columns,
)
song_sk_pre_quat_df

In [None]:
# Same scatter as for the raw data, this time on the min-max scaled frame.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=x, y=y, data=song_sk_pre_mima_df, ax=ax)
ax.set_title(f'Looking for a relation of {x} and {y}')
plt.show()

# Data Clustering

In [None]:
# number of clusters
# KMeans refuses to fit when n_clusters exceeds the number of samples, which
# is what happens with the small 10-song dataset and a fixed value of 30.
# Cap the requested count at the number of loaded songs; for the larger
# datasets this still yields 30.
n_clust = min(30, len(songs_data_clean_df))

In [None]:
# clustering based on min-max scaler
# Fit on the feature columns only. This cell writes a "cluster" column into
# the same frame it fits on, so on a re-run the previous labels would
# otherwise be fed back into KMeans as an extra feature.
mima_features = song_sk_pre_mima_df.drop(columns="cluster", errors="ignore")
song_kmeans = KMeans(n_clusters=n_clust)
song_kmeans.fit(mima_features)
clusters = song_kmeans.predict(mima_features)
song_sk_pre_mima_df["cluster"] = clusters

In [None]:
# clustering based on robust scaler
# Fit on the feature columns only. This cell writes a "cluster" column into
# the same frame it fits on, so on a re-run the previous labels would
# otherwise be fed back into KMeans as an extra feature.
robu_features = song_sk_pre_robu_df.drop(columns="cluster", errors="ignore")
song_kmeans = KMeans(n_clusters=n_clust)
song_kmeans.fit(robu_features)
clusters = song_kmeans.predict(robu_features)
song_sk_pre_robu_df["cluster"] = clusters

In [None]:
# clustering based on standard scaler
# Fit on the feature columns only. This cell writes a "cluster" column into
# the same frame it fits on, so on a re-run the previous labels would
# otherwise be fed back into KMeans as an extra feature.
stand_features = song_sk_pre_stand_df.drop(columns="cluster", errors="ignore")
song_kmeans = KMeans(n_clusters=n_clust)
song_kmeans.fit(stand_features)
clusters = song_kmeans.predict(stand_features)
song_sk_pre_stand_df["cluster"] = clusters

In [None]:
# clustering based on quantile transformation
# Fit on the feature columns only. This cell writes a "cluster" column into
# the same frame it fits on, so on a re-run the previous labels would
# otherwise be fed back into KMeans as an extra feature.
quat_features = song_sk_pre_quat_df.drop(columns="cluster", errors="ignore")
song_kmeans = KMeans(n_clusters=n_clust)
song_kmeans.fit(quat_features)
clusters = song_kmeans.predict(quat_features)
song_sk_pre_quat_df["cluster"] = clusters

In [None]:
# Display the min-max scaled frame; it carries a "cluster" column once its clustering cell has run.
song_sk_pre_mima_df

In [None]:
# Display the robust-scaled frame; it carries a "cluster" column once its clustering cell has run.
song_sk_pre_robu_df

In [None]:
# Display the standard-scaled frame; it carries a "cluster" column once its clustering cell has run.
song_sk_pre_stand_df

In [None]:
# Display the quantile-transformed frame; it carries a "cluster" column once its clustering cell has run.
song_sk_pre_quat_df

In [None]:
# Display the centroid coordinates of the model currently bound to `song_kmeans`.
# NOTE(review): every clustering cell rebinds `song_kmeans`; run top to bottom
# this is the model fitted on the quantile-transformed frame.
song_kmeans.cluster_centers_

In [None]:
# Centroid table for the KMeans model currently bound to `song_kmeans`.
# The column labels are taken from the model itself (the names of the columns
# it was fitted on) instead of a hand-written list. A fixed list that includes
# "cluster" only has the right length when the model was fitted on a frame
# that already contained a "cluster" column; otherwise assigning it raises a
# length-mismatch error.
cl_pos = pd.DataFrame(
    song_kmeans.cluster_centers_,
    columns=song_kmeans.feature_names_in_,
)
cl_pos.head(1)

In [None]:
# Overlay the centroids on the data they were computed from.
# `cl_pos` holds the centroids of `song_kmeans`, and every clustering cell
# rebinds that name; run top to bottom, the surviving model is the one fitted
# on the quantile-transformed frame. Centroids only line up with points from
# the same feature space, so that frame is plotted rather than the min-max one.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=song_sk_pre_quat_df, x=x, y=y)
sns.scatterplot(data=cl_pos, x=x, y=y, color='red', s=250)
plt.title(f'Looking for a relation of {x} and {y}')
plt.show()

In [None]:
# create plot dataframe
# `song_kmeans` is rebound by every clustering cell; run top to bottom, the
# model left in it is the one fitted on the quantile-transformed frame. Its
# labels are therefore attached to a copy of that frame, which keeps points,
# labels and centroids in one feature space and leaves the min-max frame's
# own "cluster" column untouched.
plot_df = song_sk_pre_quat_df.assign(cluster=song_kmeans.labels_)

# size and plot titles
fig, ax = plt.subplots(figsize=(10, 8))
plt.title('Visualising clusters')
plt.xlabel(f'{x} normalised')
plt.ylabel(f'{y} normalised')

# scatter plot: songs coloured by cluster, centroids drawn on top in red
sns.scatterplot(data=plot_df, x=x, y=y, hue='cluster', palette='Set2', s=75)
sns.scatterplot(data=cl_pos, x=x, y=y, color='red', s=250)
plt.show()

In [None]:
# Per-cluster mean of every column in the min-max scaled frame.
mima_by_cluster = song_sk_pre_mima_df.groupby("cluster")
mima_by_cluster.mean()

In [None]:
# Put the centroid coordinates of the current model into a frame
# (one row per cluster, default integer row and column labels).
centroids = song_kmeans.cluster_centers_
centr_df = pd.DataFrame(data=centroids)
centr_df

In [None]:
# Square matrix of Euclidean distances between every pair of centroids,
# labelled on both axes with the centroid row labels.
centroid_distances = pairwise_distances(centr_df, metric="euclidean")
eucl_centr_df = pd.DataFrame(
    centroid_distances,
    index=centr_df.index,
    columns=centr_df.index,
)

In [None]:
# Heatmap of the centroid-to-centroid distance matrix on a 12x8 figure.
# The trailing semicolon suppresses the textual cell output.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(eucl_centr_df, ax=heatmap_ax);