# Explore available features from Spotify with 10 example songs

# Import libraries

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# Import Data

In [None]:
# Use this cell instead of the next one when running the notebook in Google Colab:
# from google.colab import files
# uploaded = files.upload()
# songs_df = pd.read_csv('2_spotify_10_songs.csv')
# songs_df.columns = songs_df.columns.str.strip()
# songs_df

In [None]:
# Load the 10-song sample and trim leading/trailing whitespace from every
# column name in one pass (presumably the CSV header carries stray spaces).
songs_df = pd.read_csv("../data/2_spotify_10_songs.csv").rename(columns=str.strip)

# Last expression of the cell: the notebook renders the frame.
songs_df

## Get an initial overview of the features and how they are related

In [None]:
# Build a ydata-profiling report over the whole songs_df frame.
# NOTE(review): presumably this covers per-column statistics and the
# relationships between features, as the section heading suggests; see the
# ydata-profiling documentation for the exact contents.
report = ProfileReport(songs_df)
# Leave the report as the cell's last expression so the notebook displays it.
report

## K-Means clustering

## Two features, 3 clusters

In [None]:
# --- configuration ---------------------------------------------------------
# the two audio features that span the clustering space
dim1 = 'energy'
dim2 = 'valence'

# hyperparameters picked by hand
num_clusters = 3
my_state = 123

# --- data ------------------------------------------------------------------
# index the songs by name and keep only the two feature columns; the copy
# keeps songs_df untouched when the label column is added further down
two_features_df = songs_df.set_index('song_name')[[dim1, dim2]].copy()

# --- model -----------------------------------------------------------------
# (how to choose the number of clusters is explored in a later notebook)
my_2d_kmeans = KMeans(n_clusters=num_clusters, random_state=my_state)

# fit on the two feature columns and read back one cluster label per song
table = my_2d_kmeans.fit_predict(two_features_df)

# store the labels next to the features they were computed from
two_features_df["table"] = table

# show the songs grouped by cluster label
two_features_df.sort_values(by="table")

#### Plot clustering

In [None]:
# Plot the data points
%matplotlib inline
plt.scatter(x = two_features_df.iloc[:, 0],
            y = two_features_df.iloc[:, 1],
            c = my_2d_kmeans.labels_,
            cmap = 'viridis')

# Plot the cluster centers
plt.scatter(x = my_2d_kmeans.cluster_centers_[:, 0],
            y = my_2d_kmeans.cluster_centers_[:, 1],
            c = 'red',
            marker = 'x',
            s = 100)

# Annotate the data points
for idx, row in two_features_df.iterrows():
     plt.annotate(idx[:15], (row[dim1], row[dim2]), xytext=(5, 0), textcoords='offset points')

# # Add labels and title
plt.title('KMeans Clustering')
plt.xlabel(two_features_df.columns[0])
plt.ylabel(two_features_df.columns[1])

# Display the plot
plt.show()

## Three features, 3 clusters

In [None]:
# --- configuration ---------------------------------------------------------
# the three audio features that span the clustering space
dim1 = 'danceability'
dim2 = 'tempo'
dim3 = 'energy'

# hyperparameters picked by hand
num_clusters = 3
my_state = 123

# --- data ------------------------------------------------------------------
# index the songs by name and keep only the three feature columns; the copy
# keeps songs_df untouched when the label column is added further down
three_features_df = songs_df.set_index('song_name')[[dim1, dim2, dim3]].copy()

# --- model -----------------------------------------------------------------
# (how to choose the number of clusters is explored in a later notebook)
# NOTE(review): the features go into KMeans unscaled; if tempo spans a much
# wider numeric range than the other two, it will dominate the distance
# computation. Confirm this is intended at this stage of the course.
three_features_kmeans = KMeans(n_clusters=num_clusters, random_state=my_state)

# fit on the three feature columns and read back one cluster label per song
table = three_features_kmeans.fit_predict(three_features_df)

# store the labels next to the features they were computed from
three_features_df["table"] = table

# show the songs grouped by cluster label
three_features_df.sort_values(by="table")

### Plot

In [None]:
%matplotlib inline
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(xs = three_features_df.iloc[:, 0],
            ys = three_features_df.iloc[:, 1],
            zs = three_features_df.iloc[:, 2],
            c = three_features_kmeans.labels_,
            cmap = 'viridis')

# Plot the cluster centers
ax.scatter(xs = three_features_kmeans.cluster_centers_[:, 0],
            ys = three_features_kmeans.cluster_centers_[:, 1],
            zs = three_features_kmeans.cluster_centers_[:, 2],
            c = 'red',
            marker = 'x',
            s = 100)

# Annotate the data points
for idx, row in three_features_df.iterrows():
     ax.text(row[dim1], row[dim2], row[dim3], idx[:15])

# # Add labels and title
plt.title('KMeans Clustering')
ax.set_xlabel(three_features_df.columns[0])
ax.set_ylabel(three_features_df.columns[1])
ax.set_zlabel(three_features_df.columns[2])

plt.show()

## All features

In [None]:
# audio features used for the full clustering, in the order the model sees them
audio_features = [
    'danceability', 'energy',
    'speechiness', 'acousticness',
    'instrumentalness', 'valence', 'loudness',
    'tempo',
]

# index the songs by name and keep only the audio feature columns; the copy
# keeps songs_df untouched when the label column is added further down
songs_clusters = songs_df.set_index('song_name')[audio_features].copy()

# (how to choose the number of clusters is explored in a later notebook)
# NOTE(review): the features go into KMeans unscaled; if loudness and tempo
# span a much wider numeric range than the others, they will dominate the
# distance computation. Confirm this is intended at this stage of the course.
my_full_kmeans = KMeans(n_clusters=3, random_state=42)

# fit on the feature columns and read back one cluster label per song
table = my_full_kmeans.fit_predict(songs_clusters)

# store the labels next to the features they were computed from
songs_clusters["table"] = table

# show the songs grouped by cluster label
songs_clusters.sort_values(by="table")