In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

### 6.1.&nbsp; Explore the 10 songs dataset

Import the 10 songs dataset into a new notebook and explore it like we did at the beginning of this notebook. Do you agree with the liveliness scores that certain songs have been given? Do you think the top 3 most danceable songs are the most danceable songs in the list? Ask questions of the data and see where it leads you.

If you're not familiar with any of the songs, look them up on YouTube or Spotify and listen to at least 30 seconds of each song to get a feel for them. Even if you're not a music expert we can all tell the difference between gangster rap and classical, right? Look at the scores given and see if they make sense with **how these songs make you feel**.

In [None]:
# Google Colab only: uncomment the lines below to upload the CSV from your
# machine instead of reading it from ../data. The frame is stored in
# `songs_df` because that is the name the following cells read.
# from google.colab import files
# uploaded = files.upload()
# songs_df = pd.read_csv('2_spotify_10_songs.csv')
# songs_df.columns = songs_df.columns.str.strip()
# songs_df

In [None]:
# Read the 10-song CSV and strip leading/trailing whitespace from every
# column header, so later cells can select columns by their plain names.
songs_df = (
    pd.read_csv("../data/2_spotify_10_songs.csv")
    .rename(columns=str.strip)
)
songs_df

In [None]:
# Build a ydata-profiling report for the songs DataFrame.
report = ProfileReport(songs_df)
# Leaving `report` as the last expression makes the notebook display it.
report

### 6.2.&nbsp; Explore the dataset in 2 dimensions
Change the code above in the section `Clustering in 2 dimensions` and re-cluster the data using different columns. Explore what happens with different pairs of columns. Why do songs change tables (clusters)? How different does the data look each time?

### 6.3.&nbsp; Use KMeans to group the 10 songs dataset

Play around with the clustering algorithm and get familiar with it. What happens when you change the number of clusters? What number of clusters produces the best output? What happens when you change the random seed?

In [None]:
# Pick the two audio features to cluster on; change these to explore
# other pairs of columns.
dim1 = 'energy'
dim2 = 'valence'

# Keep only the song name plus the two chosen features, and use the song
# name as the index so that only numeric feature columns remain.
two_features_df = (
    songs_df
    .loc[:, ['song_name', dim1, dim2]]
    .set_index('song_name')
)

In [None]:
# own hyperparameters
num_clusters = 3
my_state = 123
my_init = 'k-means++'  # alternative: 'random'
# NOTE: despite its name, my_iter is passed to `n_init` below (how many
# times KMeans is re-run with different initial centroids), not `max_iter`.
my_iter = 100

# initialize the model
my_2d_kmeans = KMeans(n_clusters=num_clusters,  # we'll explore how to choose the number of clusters in a later notebook
                      random_state=my_state,
                      init=my_init,
                      n_init=my_iter,
                      verbose=1)

# fit the model to the feature columns only.
# This cell adds a "table" column to two_features_df further down; dropping
# it here (if present) keeps the cell idempotent, so re-running it does not
# cluster on the previous run's labels as an extra feature.
my_2d_kmeans.fit(two_features_df.drop(columns="table", errors="ignore"))

# obtain the cluster output
table = my_2d_kmeans.labels_

# attach the cluster output to our DataFrame
two_features_df["table"] = table

two_features_df.sort_values(by="table")

In [None]:
# Display the fitted model's inertia multiplied by its number of clusters.
# NOTE(review): presumably a rough penalised score for comparing runs with
# different num_clusters — confirm the intent.
my_2d_kmeans.inertia_ * my_2d_kmeans.n_clusters

#### Plot solution 

In [None]:
# Plot the data points
%matplotlib inline
plt.scatter(x = two_features_df.iloc[:, 0],
            y = two_features_df.iloc[:, 1],
            c = my_2d_kmeans.labels_,
            cmap = 'viridis')

# Plot the cluster centers
plt.scatter(x = my_2d_kmeans.cluster_centers_[:, 0],
            y = my_2d_kmeans.cluster_centers_[:, 1],
            c = 'red',
            marker = 'x',
            s = 100)

# Annotate the data points
for idx, row in two_features_df.iterrows():
     plt.annotate(idx[:15], (row[dim1], row[dim2]), xytext=(5, 0), textcoords='offset points')

# # Add labels and title
plt.title('KMeans Clustering')
plt.xlabel(two_features_df.columns[0])
plt.ylabel(two_features_df.columns[1])

# Display the plot
plt.show()

#### Full dataset

In [None]:
# Audio features used for the full clustering. Loudness and tempo are left
# out for now (check songs_df.columns for their exact column names).
full_feature_columns = [
    'danceability',
    'energy',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
]

# Keep the song name plus the selected features and index by song name.
songs_clusters = (
    songs_df
    .loc[:, ['song_name'] + full_feature_columns]
    .set_index('song_name')
)

In [None]:
# initialize the model
my_full_kmeans = KMeans(n_clusters=3,  # we'll explore how to choose the number of clusters in a later notebook
                        random_state=42)

# fit the model to the feature columns only.
# This cell adds a "table" column to songs_clusters further down; dropping
# it here (if present) keeps the cell idempotent, so re-running it does not
# cluster on the previous run's labels as an extra feature.
my_full_kmeans.fit(songs_clusters.drop(columns="table", errors="ignore"))

# obtain the cluster output
table = my_full_kmeans.labels_

# attach the cluster output to our DataFrame
songs_clusters["table"] = table

songs_clusters.sort_values(by="table")

### 6.4.&nbsp; Familiarise yourself with the documentation for scikit-learn

We'll be using scikit-learn a lot in the coming weeks, so it's a good idea to start familiarising yourself with its documentation now. The documentation is very well written, so be sure to check it out. Start with the [KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) class that we used in this notebook. Play around with it and read up on some of the parameters that we didn't use.

### 6.5. [Optional bonus] See if you can learn to plot in 3 dimensions
Matplotlib allows you to visualise your data in 3D. Get creative and explore how to create a 3D scatter plot using different combinations of three columns from the songs DataFrame. Once you have your data, experiment with clustering and assigning distinct colours to each cluster.

Compare the resulting clusters to those obtained using a 2D scatter plot. Does adding a third dimension significantly alter how the data points group together?

For an even more immersive experience, consider using Plotly, a Python library that enables interactive 3D scatter plots. Hovering your mouse over a data point in these plots reveals valuable information, such as the row index and cluster number.

In [None]:
# Pick the three features to cluster and plot in 3D.
# Other candidates: 'energy', 'speechiness', 'acousticness',
# 'instrumentalness', 'valence', loudness, 'tempo'
# (check songs_df.columns for the exact spelling of the loudness column).
dim1 = 'danceability'
dim2 = 'tempo'
dim3 = 'energy'

# Keep the song name plus the three chosen features and index by song name.
three_features_df = (
    songs_df
    .loc[:, ['song_name', dim1, dim2, dim3]]
    .set_index('song_name')
)

In [None]:
# initialize the model
three_features_kmeans = KMeans(n_clusters=3,  # we'll explore how to choose the number of clusters in a later notebook
                               random_state=42)

# fit the model to the feature columns only.
# This cell adds a "table" column to three_features_df further down;
# dropping it here (if present) keeps the cell idempotent, so re-running it
# does not cluster on the previous run's labels as an extra feature.
# NOTE(review): the features are used unscaled. If 'tempo' is on a much
# larger numeric range than the other columns (presumably BPM vs. 0-1
# scores — confirm), it will dominate the distance computation.
three_features_kmeans.fit(three_features_df.drop(columns="table", errors="ignore"))

# obtain the cluster output
table = three_features_kmeans.labels_

# attach the cluster output to our DataFrame
three_features_df["table"] = table

three_features_df.sort_values(by="table")

In [None]:
%matplotlib inline
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(xs = three_features_df.iloc[:, 0],
            ys = three_features_df.iloc[:, 1],
            zs = three_features_df.iloc[:, 2],
            c = three_features_kmeans.labels_,
            cmap = 'viridis')

# Plot the cluster centers
ax.scatter(xs = three_features_kmeans.cluster_centers_[:, 0],
            ys = three_features_kmeans.cluster_centers_[:, 1],
            zs = three_features_kmeans.cluster_centers_[:, 2],
            c = 'red',
            marker = 'x',
            s = 100)

# Annotate the data points
for idx, row in three_features_df.iterrows():
     ax.text(row[dim1], row[dim2], row[dim3], idx[:15])

# # Add labels and title
plt.title('KMeans Clustering')
ax.set_xlabel(three_features_df.columns[0])
ax.set_ylabel(three_features_df.columns[1])
ax.set_zlabel(three_features_df.columns[2])

plt.show()