In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
import hvplot.pandas

In [2]:
# Load the pre-cleaned Spotify dataset from the Resources folder
file_path = Path("Resources/spotify_clean.csv")
df_spotify = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_spotify.head(n=5)

Unnamed: 0,track_name,track_artist,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,2019-06-14,0.642049,1.201614,0.1732,1.367123,1,-0.481362,-0.333898,...,0,0,0,0,0,0,0,0,0,0
1,Memories - Dillon Francis Remix,Maroon 5,2019-12-13,0.490412,0.643317,1.557627,0.585766,1,-0.688642,-0.46867,...,0,0,0,0,0,0,0,0,0,0
2,All the Time - Don Diablo Remix,Zara Larsson,2019-07-05,0.138889,1.284529,-1.211227,1.10009,0,-0.324422,-0.436799,...,0,0,0,0,0,0,0,0,0,0
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,2019-07-19,0.435271,1.279002,0.450085,0.984309,1,-0.050024,-0.667642,...,0,0,0,0,0,0,0,0,0,0
4,Someone You Loved - Future Humans Remix,Lewis Capaldi,2019-03-05,-0.033426,0.742815,-1.211227,0.685151,1,-0.70246,-0.432701,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Identify each row by its (track name, artist) pair instead of a row number
song_id_columns = ['track_name', 'track_artist']
df_spotify = df_spotify.set_index(keys=song_id_columns)
df_spotify.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
track_name,track_artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
I Don't Care (with Justin Bieber) - Loud Luxury Remix,Ed Sheeran,2019-06-14,0.642049,1.201614,0.1732,1.367123,1,-0.481362,-0.333898,-0.377953,-0.80923,...,0,0,0,0,0,0,0,0,0,0
Memories - Dillon Francis Remix,Maroon 5,2019-12-13,0.490412,0.643317,1.557627,0.585766,1,-0.688642,-0.46867,-0.359177,1.081061,...,0,0,0,0,0,0,0,0,0,0
All the Time - Don Diablo Remix,Zara Larsson,2019-07-05,0.138889,1.284529,-1.211227,1.10009,0,-0.324422,-0.436799,-0.377849,-0.519562,...,0,0,0,0,0,0,0,0,0,0
Call You Mine - Keanu Silva Remix,The Chainsmokers,2019-07-19,0.435271,1.279002,0.450085,0.984309,1,-0.050024,-0.667642,-0.377911,0.089582,...,0,0,0,0,0,0,0,0,0,0
Someone You Loved - Future Humans Remix,Lewis Capaldi,2019-03-05,-0.033426,0.742815,-1.211227,0.685151,1,-0.70246,-0.432701,-0.377953,-0.692585,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Build the feature table by leaving out the release-date column
new_df = df_spotify.drop(labels=['track_album_release_date'], axis="columns")

In [5]:
# Instantiate PCA with 2 components
pca = PCA(n_components=2)

# Fit the model and project the features in a single step.
# fit_transform() fits the estimator itself, so a separate pca.fit() call
# beforehand would only repeat the same decomposition.
spotify_pca = pca.fit_transform(new_df)

# NOTE(review): the explained-variance ratio and the PCA1 loadings displayed
# further down show the first component is almost entirely `duration_ms`
# (loading ~1.0, ~99.99996% of the variance). That column looks unscaled
# relative to the other features -- consider standardizing it before PCA.
# TODO confirm against the cleaning step that produced spotify_clean.csv.

# Preview the first five projected rows
spotify_pca[:5]

array([[-3.10458130e+04,  4.13766162e-01],
       [-6.31998130e+04, -2.16296710e+01],
       [-4.91838132e+04,  2.27942169e+00],
       [-5.67068126e+04,  5.75415439e-01],
       [-3.67478132e+04,  2.25938352e+00]])

In [6]:
# Share of the total variance explained by each of the two fitted components
pca.explained_variance_ratio_

array([9.99999624e-01, 2.02233390e-07])

In [7]:
# Tabulate the component loadings: one row per principal component,
# one column per input feature
pca_components_df = pd.DataFrame(
    data=pca.components_,
    index=['PCA1', 'PCA2'],
    columns=new_df.columns,
)
pca_components_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
PCA1,-2e-06,2.107771e-07,2.530257e-07,-2e-06,1.295113e-07,-1e-06,-1e-06,1e-06,1.02593e-07,-5.385846e-07,...,4.754846e-07,1.730798e-07,-2.592675e-07,-1.652175e-07,4.341059e-07,-6.203786e-08,3.025205e-07,-2.733649e-07,-2.299975e-07,3.168128e-08
PCA2,-0.006911,0.005709261,-0.0004982636,0.003441,0.0002570329,0.001659,-0.004288,0.001035,0.0008478074,-0.0009938068,...,-0.0003480299,0.0001666946,0.0001411523,0.0001192763,0.000468009,-7.8503e-05,-0.0001120471,0.0004841974,-0.0002369383,-0.0001714913


In [8]:
# Show the per-feature loadings of the first principal component
pca_components_df.loc['PCA1']

danceability                                  -1.619151e-06
energy                                         2.107771e-07
key                                            2.530257e-07
loudness                                      -1.922974e-06
mode                                           1.295113e-07
speechiness                                   -1.494667e-06
acousticness                                  -1.363471e-06
instrumentalness                               1.056852e-06
liveness                                       1.025930e-07
valence                                       -5.385846e-07
tempo                                         -6.348112e-07
duration_ms                                    1.000000e+00
track_popularity                              -5.999550e-05
bangers                                       -2.059185e-07
playlist_genre_edm                            -1.675472e-07
playlist_genre_latin                          -3.919188e-07
playlist_genre_pop                      

In [9]:
# Wrap the projected coordinates in a DataFrame with one labelled column
# per principal component
spotify_pca_df = pd.DataFrame(data=spotify_pca, columns=["PCA1", "PCA2"])

# Preview the first rows of the PCA DataFrame
spotify_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-31045.812982,0.413766
1,-63199.812967,-21.629671
2,-49183.813189,2.279422
3,-56706.812574,0.575415
4,-36747.81315,2.259384


In [10]:
# Build a 3-cluster K-means estimator with a fixed seed
model = KMeans(n_clusters=3, random_state=0)

# Learn the centroids from the two PCA coordinates
model.fit(spotify_pca_df)

# Assign every row to one of the fitted clusters
k_3 = model.predict(spotify_pca_df)

# Attach the labels on a new DataFrame so the PCA frame itself stays unchanged
spotify_pca_predictions_df = spotify_pca_df.assign(song_clusters=k_3)

spotify_pca_predictions_df.head()



Unnamed: 0,PCA1,PCA2,song_clusters
0,-31045.812982,0.413766,0
1,-63199.812967,-21.629671,0
2,-49183.813189,2.279422,0
3,-56706.812574,0.575415,0
4,-36747.81315,2.259384,0


In [11]:
# Scatter the songs in PCA space, grouped by their K-means cluster label
spotify_pca_predictions_df.hvplot.scatter(x="PCA1", y="PCA2", by="song_clusters")


In [12]:
# Cluster the full feature table (no PCA) with K-means using 3 clusters.
# KMeans is already imported in the notebook's import cell, so it is not
# re-imported here. Note that this rebinds `model`, replacing the estimator
# that was fitted on the PCA coordinates earlier.
model = KMeans(n_clusters=3, random_state=1)

# Fit the model on the feature DataFrame
model.fit(new_df)

# Assign each song to one of the fitted clusters
song_clusters = model.predict(new_df)

# Display the cluster label array
song_clusters



array([1, 1, 1, ..., 1, 0, 0])

In [13]:
# Attach the K-means labels to the feature table as a new 'k_clusters' column.
# NOTE(review): this mutates new_df in place, so any cell that fits on new_df
# (the PCA and KMeans cells above) would treat 'k_clusters' as an input
# feature if re-run after this cell without first rebuilding new_df.
new_df['k_clusters'] = song_clusters
new_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary,k_clusters
track_name,track_artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
I Don't Care (with Justin Bieber) - Loud Luxury Remix,Ed Sheeran,0.642049,1.201614,0.1732,1.367123,1,-0.481362,-0.333898,-0.377953,-0.80923,0.031908,...,0,0,0,0,0,0,0,0,0,1
Memories - Dillon Francis Remix,Maroon 5,0.490412,0.643317,1.557627,0.585766,1,-0.688642,-0.46867,-0.359177,1.081061,0.782522,...,0,0,0,0,0,0,0,0,0,1
All the Time - Don Diablo Remix,Zara Larsson,0.138889,1.284529,-1.211227,1.10009,0,-0.324422,-0.436799,-0.377849,-0.519562,0.439384,...,0,0,0,0,0,0,0,0,0,1
Call You Mine - Keanu Silva Remix,The Chainsmokers,0.435271,1.279002,0.450085,0.984309,1,-0.050024,-0.667642,-0.377911,0.089582,-1.001795,...,0,0,0,0,0,0,0,0,0,1
Someone You Loved - Future Humans Remix,Lewis Capaldi,-0.033426,0.742815,-1.211227,0.685151,1,-0.70246,-0.432701,-0.377953,-0.692585,0.919777,...,0,0,0,0,0,0,0,0,0,1
