# Project 6 Moosic

agreed upon features: `danceability`, `energy`, `acousticness`, `instrumentalness`, `valence`, `tempo`

agreed upon scaler: `MinMaxScaler`

### Import necessary libraries and modules

In [None]:
import os
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Load the datasets, set `song_name` and `artist` as the index

In [None]:
# Small dataset (10 songs): read the CSV and index it by (artist, song_name).
# NOTE(review): the path is absolute and machine-specific; it has to be adapted
# before this cell can run anywhere else.
# The later load cells rebind `songs_data_df`, so only the last load cell that
# was executed determines which dataset the rest of the notebook sees.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_10_songs\df_audio_features_10.csv"
songs_data_df = pd.read_csv(path, index_col=["artist","song_name"])

In [None]:
# Medium dataset (1000 songs): read the CSV and index it by (artist, name).
# NOTE(review): the path is absolute and machine-specific; adapt before running.
# Rebinds `path` and `songs_data_df`, replacing whatever an earlier load cell set.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_1000_songs\df_audio_features_1000.csv"
songs_data_df = pd.read_csv(path, index_col=["artist","name"])

In [None]:
# Big dataset (5000 songs): read the CSV and index it by (artist, name).
# NOTE(review): the path is absolute and machine-specific; adapt before running.
# Rebinds `path` and `songs_data_df`, replacing whatever an earlier load cell set.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_5000_songs\df_audio_features_5000.csv"
songs_data_df = pd.read_csv(path, index_col=["artist","name"])

In [None]:
# Preview the first rows of whichever dataset was loaded last
songs_data_df.head()

### Drop unnecessary columns (see above)

In [None]:
# Small dataset: remove every column that is not used for clustering
unused_columns = ["loudness", "duration_ms", "time_signature", "id", "html", "speechiness", "mode", "key"]
songs_data_clean_df = songs_data_df.drop(columns=unused_columns)

In [None]:
# Medium and big dataset: remove every column that is not used for clustering
unused_columns = ["liveness", "loudness", "type", "duration_ms", "time_signature", "id", "html", "speechiness", "mode", "key"]
songs_data_clean_df = songs_data_df.drop(columns=unused_columns)

In [None]:
# Preview the frame after the unused columns were dropped
songs_data_clean_df.head()

### Features to look at:

In [None]:
# Feature pair shown on the horizontal (x) and vertical (y) axis of the 2D plots
x, y = "energy", "danceability"

### Plot the cleaned data in relation to the above defined features

In [None]:
# Scatter the two selected features of the cleaned (unscaled) data
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=songs_data_clean_df, x=x, y=y, ax=ax)
ax.set_title(f'Looking for a relation of {x} and {y}')
plt.show()

### Scale the dataset

In [None]:
# Rescale every feature with the min-max scaler and wrap the resulting array
# in a DataFrame that keeps the original index and column labels
mima_scaler = MinMaxScaler()
song_sk_pre_mima = mima_scaler.fit_transform(songs_data_clean_df)
song_sk_pre_mima_df = pd.DataFrame(
    data=song_sk_pre_mima,
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.columns,
)
song_sk_pre_mima_df

## KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Number of clusters; also reused further down as the upper bound (`k_max`)
# of the inertia and silhouette sweeps
n_clust = 16

In [None]:
# Cluster on the min-max scaled audio features only.
# The feature columns are selected explicitly so that the "cluster" column
# written below is never fed back into the model when this cell is re-run
# (that would add a 7th feature and break the 6-column centroid frame).
feature_cols = songs_data_clean_df.columns
# Fixed seed: later cells refer to specific cluster ids (e.g. 14, 15), which is
# only meaningful if the cluster numbering is reproducible between runs.
song_kmeans = KMeans(n_clusters=n_clust, random_state=42)
clusters = song_kmeans.fit_predict(song_sk_pre_mima_df[feature_cols])
song_sk_pre_mima_df["cluster"] = clusters

In [None]:
# Display the scaled frame, which now also carries the "cluster" column
song_sk_pre_mima_df

In [None]:
# Look at up to 10 random songs of cluster 14. The sample size is capped at the
# cluster size because `sample(10)` raises a ValueError for smaller clusters.
cluster_14_songs = song_sk_pre_mima_df.loc[song_sk_pre_mima_df["cluster"] == 14]
cluster_14_songs.sample(min(10, len(cluster_14_songs)))

### Plot the centroids in the scattered dataset

In [None]:
# Centroid coordinates of the fitted KMeans model
song_kmeans.cluster_centers_

In [None]:
# Centroid positions as a DataFrame. The column labels are taken from the
# fitted model instead of a hard-coded list, so they always match the number
# and order of the features the model was actually trained on.
cl_pos = pd.DataFrame(song_kmeans.cluster_centers_, columns=song_kmeans.feature_names_in_)

In [None]:
# First row of the scaled frame (quick look at its columns)
song_sk_pre_mima_df.head(1)

In [None]:
# Songs (default colour) with the KMeans centroids overlaid as large red dots
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=song_sk_pre_mima_df, x=x, y=y, ax=ax)
sns.scatterplot(data=cl_pos, x=x, y=y, color='red', s=250, ax=ax)
ax.set_title(f'Looking for a relation of {x} and {y}')
plt.show()

### 2D Plot the centroids in the scattered dataset with the clusters in different colours

In [None]:
# Make sure the "cluster" column reflects the labels of the current model
song_sk_pre_mima_df["cluster"] = song_kmeans.labels_

fig, ax = plt.subplots(figsize=(10, 8))
# 'tab20' offers 20 distinct colours. The previous 'Set2' palette only has 8,
# so with 16 clusters two different clusters were drawn in the same colour.
sns.scatterplot(data=song_sk_pre_mima_df, x=x, y=y, hue='cluster', palette='tab20', s=75, ax=ax)
# Centroids on top, as large red dots
sns.scatterplot(data=cl_pos, x=x, y=y, color='red', s=250, ax=ax)
ax.set_title('Visualising clusters')
ax.set_xlabel(f'{x} normalised')
ax.set_ylabel(f'{y} normalised')
plt.show()

### 3D Plot for centroids in scattered dataset with clusters in different colours (NOT ADAPTED)

In [None]:
import matplotlib.pyplot as plt
import random

# Third feature for the depth axis; x and y are the pair chosen for the 2D plots
z_feature = "valence"

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(projection='3d')

# One scatter call per cluster so that every cluster gets its own colour and
# legend entry (the previous version referenced an undefined `plot_data` frame
# and columns that do not exist in the song data).
for cluster_id, members in song_sk_pre_mima_df.groupby('cluster'):
    ax.scatter(members[x], members[y], members[z_feature], label=f'Cluster {cluster_id}')

# Centroids as large red dots
ax.scatter(cl_pos[x], cl_pos[y], cl_pos[z_feature], color='red', s=250, label='Centroids')

ax.set_xlabel(f'{x} normalised')
ax.set_ylabel(f'{y} normalised')
ax.set_zlabel(f'{z_feature} normalised')
ax.set_title('Visualising clusters in 3D')
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1.0))
plt.show()

### Group by cluster and plot heatmap

In [None]:
# Average of every column per KMeans cluster
song_sk_pre_mima_df.groupby("cluster").mean()

In [None]:
# Centroid coordinates of the KMeans model as an (unlabelled) DataFrame
centroids = song_kmeans.cluster_centers_
centr_df = pd.DataFrame(data=centroids)
centr_df

In [None]:
# Pairwise distances between all centroids, labelled by centroid index on
# both axes (pairwise_distances is called with its default metric)
centroid_distances = pairwise_distances(centr_df)
eucl_centr_df = pd.DataFrame(centroid_distances, index=centr_df.index, columns=centr_df.index)

In [None]:
# Heatmap of the centroid-to-centroid distances
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(eucl_centr_df, ax=heatmap_ax);

### Calculate inertia

In [None]:
# Inertia of the model with `n_clust` clusters.
# Fit on the scaled feature columns only: at this point the frame also holds
# the "cluster" label column, and clustering on previously assigned labels
# would distort the inertia.
song_kmeans = KMeans(n_clusters=n_clust, random_state=42)
song_kmeans.fit(song_sk_pre_mima_df[songs_data_clean_df.columns])
song_kmeans.inertia_

In [None]:
# Inertia for k = 1 .. k_max - 1 (elbow method)
k_max = n_clust
inertia_list = []

# Only the scaled audio features are clustered; label columns that earlier
# cells added to the frame must not act as features.
scaled_features = song_sk_pre_mima_df[songs_data_clean_df.columns]

for i in range(1, k_max):
    song_kmeans = KMeans(n_clusters=i, random_state=42)
    song_kmeans.fit(scaled_features)
    inertia_list.append(round(song_kmeans.inertia_))

In [None]:
# Rounded inertia values collected by the loop above, one per tested k
inertia_list

In [None]:
# The x positions and the title are derived from the collected results, so they
# always agree with the k values that were actually evaluated (the sweep stops
# one short of k_max, which the previous title did not reflect).
k_values = range(1, len(inertia_list) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=k_values, y=inertia_list, marker='o', ax=ax)
ax.set_title(f'Inertia score evolution from 1 cluster to {len(inertia_list)} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

### The silhouette coefficient

In [None]:
# Score the labels in the same (min-max scaled) feature space the model was
# fitted in. Scoring them against the unscaled data lets the feature with the
# largest raw range (tempo) dominate the distances and makes the score meaningless.
silhouette_score(song_sk_pre_mima_df[songs_data_clean_df.columns], song_kmeans.labels_)

The silhouette coefficient can vary between -1 and +1:

+1 --> the instance is well inside its own cluster and far from other clusters.

 0 --> the instance is close to a cluster boundary.

-1 --> the instance may have been assigned to the wrong cluster.

In [None]:
# Silhouette score for k = 2 .. k_max - 1
k_max = n_clust
sil_score = []

# Cluster and score on the min-max scaled features (the agreed-upon scaler),
# not on the raw values, so that all features contribute comparably.
scaled_features = song_sk_pre_mima_df[songs_data_clean_df.columns]

for k in range(2, k_max):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(scaled_features)
    sil_score.append(silhouette_score(scaled_features, labels))

In [None]:
# Silhouette scores collected by the loop above, one per tested k
sil_score

In [None]:
# x positions and title follow the collected results (the sweep starts at k=2
# and stops one short of k_max, which the previous title did not reflect)
k_values = range(2, len(sil_score) + 2)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=k_values, y=sil_score, marker='o', ax=ax)
ax.set_title(f'Silhouette score evolution from 2 clusters to {len(sil_score) + 1} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette score')
plt.show();

### Radar / Spider-Plot

In [None]:
# Source: https://www.python-graph-gallery.com/391-radar-chart-with-several-individuals
# Radar chart comparing the centroids of three selected clusters ("playlists").

from math import pi

# One row per centroid; the former index becomes an explicit 'cluster' column
radar_df = cl_pos.reset_index().rename(columns={'index':'cluster'})

# ------- PART 1: Create background

# Every column except the leading 'cluster' column is one axis of the chart
categories = radar_df.columns.tolist()[1:]
N = len(categories)

# Angle of each axis; the first angle is repeated so the polygon closes
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Initialise the spider plot
ax = plt.subplot(111, polar=True)

# First axis on top, axes ordered clockwise
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# One labelled axis per variable
plt.xticks(angles[:-1], categories)

# Radial tick labels along the first axis
ax.set_rlabel_position(0)

# pick three playlists
pl1 = 3
pl2 = 9
pl3 = 15

# ------- PART 2: Add plots

# More than three groups would make the chart unreadable, so exactly the three
# selected playlists are drawn.
for playlist in (pl1, pl2, pl3):
    values = radar_df.loc[playlist].drop('cluster').values.flatten().tolist()
    values += values[:1]  # repeat the first value to close the polygon
    (outline,) = ax.plot(angles, values, linewidth=1, label=f"Playlist {playlist}")
    # Fill with the outline's own colour so line, area and legend entry match
    ax.fill(angles, values, color=outline.get_color(), alpha=0.1)

# Add legend
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Show the graph
plt.show()

In [None]:
# Look at up to 10 random songs of cluster 15. The sample size is capped at the
# cluster size because `sample(10)` raises a ValueError for smaller clusters.
cluster_15_songs = song_sk_pre_mima_df.loc[song_sk_pre_mima_df["cluster"] == 15]
cluster_15_songs.sample(min(10, len(cluster_15_songs)))

## BisectingKMeans

K-means and bisecting k-means are both clustering algorithms used in unsupervised learning. The main difference between them is that bisecting k-means is more efficient when it comes to computation time. In each bisecting step of bisecting k-means, only the data points of one cluster and two centroids are involved in the computation. Thus, the computation time is reduced. While k-means is known to yield clusters of varied sizes, bisecting k-means results in clusters of comparable sizes.

In [None]:
from sklearn.cluster import BisectingKMeans

In [None]:
# Number of clusters for BisectingKMeans; also reused further down as the
# upper bound (`k_max`) of the inertia and silhouette sweeps
n_clust = 16

In [None]:
# Cluster on the min-max scaled audio features only.
# By now the frame also contains the KMeans "cluster" column; fitting on the
# whole frame would treat those labels as a 7th feature and produce centroids
# that no longer match the six feature names used for `cl_pos`.
feature_cols = songs_data_clean_df.columns
song_bskmean = BisectingKMeans(n_clusters=n_clust, bisecting_strategy='biggest_inertia', random_state=42)
# fit_predict fits once and returns the training labels directly
bskmean_clusters = song_bskmean.fit_predict(song_sk_pre_mima_df[feature_cols])
song_sk_pre_mima_df["bskmean_cluster"] = bskmean_clusters

### Plot the centroids in the scattered dataset

In [None]:
# Centroid positions of the BisectingKMeans model as a DataFrame. The column
# labels come from the fitted model, so they always match the number and order
# of the features it was trained on (a hard-coded list of six names raises a
# length-mismatch error as soon as the model saw any additional column).
cl_pos = pd.DataFrame(song_bskmean.cluster_centers_, columns=song_bskmean.feature_names_in_)

In [None]:
# First row of the scaled frame (quick look at its columns)
song_sk_pre_mima_df.head(1) 

In [None]:
# Songs (default colour) with the BisectingKMeans centroids as large red dots
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=song_sk_pre_mima_df, x=x, y=y, ax=ax)
sns.scatterplot(data=cl_pos, x=x, y=y, color='red', s=250, ax=ax)
ax.set_title(f'Looking for a relation of {x} and {y}')
plt.show()

### Plot the centroids in the scattered dataset with the clusters in different colours

In [None]:
# Make sure the "bskmean_cluster" column reflects the labels of the current model
song_sk_pre_mima_df["bskmean_cluster"] = song_bskmean.labels_

fig, ax = plt.subplots(figsize=(10, 8))
# 'tab20' offers 20 distinct colours. The previous 'Set2' palette only has 8,
# so with 16 clusters two different clusters were drawn in the same colour.
sns.scatterplot(data=song_sk_pre_mima_df, x=x, y=y, hue='bskmean_cluster', palette='tab20', s=75, ax=ax)
# Centroids on top, as large red dots
sns.scatterplot(data=cl_pos, x=x, y=y, color='red', s=250, ax=ax)
ax.set_title('Visualising clusters')
ax.set_xlabel(f'{x} normalised')
ax.set_ylabel(f'{y} normalised')
plt.show()

### Group by cluster and plot heatmap

In [None]:
# Mean of each audio feature per BisectingKMeans cluster. Restricted to the
# feature columns: averaging the label columns of the other clustering runs
# (e.g. "cluster") produces numbers without any meaning.
song_sk_pre_mima_df.groupby("bskmean_cluster")[list(songs_data_clean_df.columns)].mean()

In [None]:
# Centroid coordinates of the BisectingKMeans model as an (unlabelled) DataFrame
centroids = song_bskmean.cluster_centers_
centr_df = pd.DataFrame(data=centroids)
centr_df

In [None]:
# Pairwise distances between all centroids, labelled by centroid index on
# both axes (pairwise_distances is called with its default metric)
centroid_distances = pairwise_distances(centr_df)
eucl_centr_df = pd.DataFrame(centroid_distances, index=centr_df.index, columns=centr_df.index)

In [None]:
# Heatmap of the centroid-to-centroid distances
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(eucl_centr_df, ax=heatmap_ax);

In [None]:
# Non-null counts per BisectingKMeans cluster, largest cluster first
# (ordered by the count of the 'energy' column)
bskmean_counts = song_sk_pre_mima_df.groupby('bskmean_cluster').count()
bskmean_counts.sort_values('energy', ascending=False)

In [None]:
# Summary statistics of the scaled frame (label columns included)
song_sk_pre_mima_df.describe()

### Calculate Inertia

In [None]:
# Inertia of the BisectingKMeans model with `n_clust` clusters.
# Fit on the scaled feature columns only: the frame also holds the "cluster"
# and "bskmean_cluster" label columns, which must not act as features.
song_bskmeans = BisectingKMeans(n_clusters=n_clust, random_state=42)
song_bskmeans.fit(song_sk_pre_mima_df[songs_data_clean_df.columns])
song_bskmeans.inertia_

In [None]:
# Inertia for k = 1 .. k_max - 1 (elbow method)
k_max = n_clust
inertia_list = []

# Only the scaled audio features are clustered; label columns that earlier
# cells added to the frame must not act as features.
scaled_features = song_sk_pre_mima_df[songs_data_clean_df.columns]

for i in range(1, k_max):
    song_bskmeans = BisectingKMeans(n_clusters=i, random_state=42)
    song_bskmeans.fit(scaled_features)
    inertia_list.append(round(song_bskmeans.inertia_))

In [None]:
# The x positions and the title are derived from the collected results, so they
# always agree with the k values that were actually evaluated (the sweep stops
# one short of k_max, which the previous title did not reflect).
k_values = range(1, len(inertia_list) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=k_values, y=inertia_list, marker='o', ax=ax)
ax.set_title(f'Inertia score evolution from 1 cluster to {len(inertia_list)} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

### The Silhouette Coefficient

In [None]:
# Silhouette score for k = 2 .. k_max - 1
k_max = n_clust
sil_score = []

# Cluster and score on the min-max scaled features (the agreed-upon scaler),
# not on the raw values, so that all features contribute comparably.
scaled_features = song_sk_pre_mima_df[songs_data_clean_df.columns]

for k in range(2, k_max):
    labels = BisectingKMeans(n_clusters=k, random_state=42).fit(scaled_features).labels_
    sil_score.append(silhouette_score(scaled_features, labels))

In [None]:
# Silhouette scores collected by the loop above, one per tested k
sil_score

In [None]:
# x positions and title follow the collected results (the sweep starts at k=2
# and stops one short of k_max, which the previous title did not reflect)
k_values = range(2, len(sil_score) + 2)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=k_values, y=sil_score, marker='o', ax=ax)
ax.set_title(f'Silhouette score evolution from 2 clusters to {len(sil_score) + 1} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette score')
plt.show();

### Radar / Spider-Plot

In [None]:
# Source: https://www.python-graph-gallery.com/391-radar-chart-with-several-individuals
# Radar chart comparing the centroids of three selected clusters ("playlists").

from math import pi

# One row per centroid; the former index becomes an explicit 'bskmean_cluster' column
radar_df = cl_pos.reset_index().rename(columns={'index':'bskmean_cluster'})

# ------- PART 1: Create background

# Every column except the leading 'bskmean_cluster' column is one axis of the chart
categories = radar_df.columns.tolist()[1:]
N = len(categories)

# Angle of each axis; the first angle is repeated so the polygon closes
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Initialise the spider plot
ax = plt.subplot(111, polar=True)

# First axis on top, axes ordered clockwise
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# One labelled axis per variable
plt.xticks(angles[:-1], categories)

# Radial tick labels along the first axis
ax.set_rlabel_position(0)

# pick three playlists
pl1 = 1
pl2 = 7
pl3 = 13

# ------- PART 2: Add plots

# More than three groups would make the chart unreadable, so exactly the three
# selected playlists are drawn.
for playlist in (pl1, pl2, pl3):
    values = radar_df.loc[playlist].drop('bskmean_cluster').values.flatten().tolist()
    values += values[:1]  # repeat the first value to close the polygon
    (outline,) = ax.plot(angles, values, linewidth=1, label=f"Playlist {playlist}")
    # Fill with the outline's own colour so line, area and legend entry match
    ax.fill(angles, values, color=outline.get_color(), alpha=0.1)

# Add legend
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Show the graph
plt.show()

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Density-based clustering on the min-max scaled audio features only.
# The frame also holds the integer label columns of the earlier clustering
# runs; with eps=0.33 those would dominate every distance, so they are excluded.
song_dbscan = DBSCAN(eps=0.33, min_samples=2)
# fit_predict fits the model once and returns its labels (the previous version
# fitted the same model twice)
dbscan_clusters = song_dbscan.fit_predict(song_sk_pre_mima_df[songs_data_clean_df.columns])
song_sk_pre_mima_df["dbscan_cluster"] = dbscan_clusters

In [None]:
# Display the scaled frame, which now also carries the "dbscan_cluster" column
song_sk_pre_mima_df

In [None]:
# Non-null count per column for the songs assigned to DBSCAN cluster 1
in_dbscan_cluster_1 = song_sk_pre_mima_df["dbscan_cluster"] == 1
song_sk_pre_mima_df.loc[in_dbscan_cluster_1].count()

In [None]:
# Non-null counts per DBSCAN cluster, largest cluster first
# (ordered by the count of the 'energy' column)
dbscan_counts = song_sk_pre_mima_df.groupby('dbscan_cluster').count()
dbscan_counts.sort_values('energy', ascending=False)