# Clustering Tracks

## Processing Data

In [1]:
# Import libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path

In [2]:
# Load the per-track audio features from the CSV file into a pandas DataFrame
tracks_csv_path = Path("Resources/tracks_feature.csv")
tracks_features_df = pd.read_csv(tracks_csv_path)

# Preview the first rows of the loaded data
tracks_features_df.head()

Unnamed: 0,track_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,spotify:track:6KaZC1JpPRq9CNZ3X8jBCM,0.591,0.721,2,-7.96,1,0.0308,0.0197,0.000244,0.117,0.434,94.053,246933,4
1,spotify:track:7hiqCnFcOBFAqZSf3AOY8a,0.399,0.955,11,-3.662,0,0.0822,0.0026,0.376,0.425,0.679,156.583,165549,4
2,spotify:track:1SxVwlqUaz6PdQSqjclahn,0.573,0.915,1,-3.658,1,0.0532,0.809,0.304,0.324,0.758,105.323,230000,4
3,spotify:track:1XGX6lvmT464sVpJKTF3aV,0.492,0.801,2,-7.679,1,0.0357,0.00896,0.00012,0.187,0.491,74.969,229600,4
4,spotify:track:317d1IqYa8JhV8BGVDtgwZ,0.219,0.367,0,-9.696,0,0.0285,0.0726,6e-06,0.0832,0.107,148.622,274547,4


## Scaling

In [3]:
# Import libraries and dependencies
from sklearn.preprocessing import StandardScaler

In [4]:
# Separate the numeric audio features from the 'track_uri' identifier once,
# instead of re-dropping the column (a full copy of the frame) for each call.
track_feature_values = tracks_features_df.drop(columns='track_uri')

# Creating `scaler`: StandardScaler standardises each column to zero mean and
# unit variance, so features on large scales (e.g. duration_ms, tempo) do not
# dominate the distance computations used by the clustering further down.
scaler = StandardScaler()

# fit_transform learns the per-column statistics and applies them in one pass;
# `scaler` remains fitted afterwards and can be reused on new data.
scaled_tracks = scaler.fit_transform(track_feature_values)

In [5]:
# Wrap the scaled array back into a DataFrame, reusing the original row index
# and the feature column labels (every column except 'track_uri')
feature_columns = tracks_features_df.columns.drop('track_uri')
scaled_tracks_df = pd.DataFrame(
    data=scaled_tracks,
    index=tracks_features_df.index,
    columns=feature_columns,
)
# Display `scaled_tracks_df`
scaled_tracks_df


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.219920,0.511650,-0.916348,0.302226,0.725791,-0.507261,-0.945325,-0.631304,-0.485287,-0.154460,-0.866982,-0.003276,0.254497
1,-0.820554,1.391278,1.612534,1.066076,-1.377808,-0.060998,-0.993576,0.444108,1.137242,0.753029,1.223033,-0.523731,0.254497
2,0.122375,1.240915,-1.197335,1.066787,0.725791,-0.312781,1.281859,0.238045,0.605179,1.045648,-0.490291,-0.111564,0.254497
3,-0.316574,0.812378,-0.916348,0.352166,0.725791,-0.464718,-0.975630,-0.631659,-0.116530,0.056670,-1.504849,-0.114122,0.254497
4,-1.795998,-0.819069,-1.478322,-0.006299,-1.377808,-0.527230,-0.796056,-0.631987,-0.663344,-1.365681,0.956943,0.173316,0.254497
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214987,-0.159420,-0.533378,0.488586,0.201280,-1.377808,-0.245060,1.183098,-0.631424,-0.912518,-0.021115,0.342606,-0.032233,0.254497
2214988,0.447523,1.346169,-1.197335,1.279876,0.725791,0.310598,-0.945889,-0.628969,-0.776604,1.708671,0.085307,-0.273569,0.254497
2214989,0.030250,0.492855,0.207599,-0.136925,-1.377808,-0.017587,-1.000506,-0.631978,0.657858,1.023424,0.923151,-0.145074,0.254497
2214990,-1.682196,-1.059651,-0.916348,-0.704747,0.725791,-0.519416,1.081517,0.713136,-0.559038,-1.457171,-0.688096,-0.624529,0.254497


## Finding K Value
* Elbow Method

In [6]:
# Import libraries and dependencies
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import hvplot.pandas
import time

In [7]:
# Fit one KMeans model per candidate cluster count (k = 1 through 9) and record
# each model's inertia for the elbow curve.
# perf_counter is a monotonic clock, so the reported elapsed time cannot be
# skewed by system clock adjustments while this long-running cell executes.
start_time = time.perf_counter()

inertias = []
k = list(range(1, 10))  # range's upper bound is exclusive: 1, 2, ..., 9

for i in k:
    # Fixed n_init and random_state keep the inertias reproducible across runs
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(scaled_tracks_df)
    inertias.append(kmeans.inertia_)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 59.529577016830444 seconds ---


In [8]:
# Pair each candidate k with its inertia in `elbow_df`, ready for plotting
elbow_df = pd.DataFrame({"k": k, "inertia": inertias})
# Display `elbow_df`
elbow_df

Unnamed: 0,k,inertia
0,1,28794900.0
1,2,23816410.0
2,3,22040790.0
3,4,20689960.0
4,5,19672090.0
5,6,18680940.0
6,7,17856340.0
7,8,16936340.0
8,9,16322870.0


In [9]:
# Draw inertia against k as a line chart, with one x-axis tick per candidate k
elbow_plot_options = {
    "x": "k",
    "y": "inertia",
    "title": "Elbow Curve",
    "xticks": k,
}
elbow_df.hvplot.line(**elbow_plot_options)

## Clustering 

In [10]:
# Initialize the K-Means model with n_clusters=8.
# n_init and random_state are set explicitly (same values as the elbow search)
# so the cluster labels are reproducible after a kernel restart; without a
# seed, the numeric label given to each cluster can change from run to run.
model = KMeans(n_clusters=8, n_init=10, random_state=42)

# Fit the model on scaled_tracks_df and obtain each track's cluster label in
# one step; `model` stays fitted and can still be used to predict new data.
tracks_clusters = model.fit_predict(scaled_tracks_df)

# View the tracks clusters
print(tracks_clusters)



[1 2 1 ... 2 5 3]


In [11]:
# Build a labelled copy of the scaled features: `assign` returns a new
# DataFrame with the extra "track_cluster" column, leaving `scaled_tracks_df`
# itself unchanged
clustered_scaled_tracks_df = scaled_tracks_df.assign(track_cluster=tracks_clusters)

# Preview the first rows of the labelled frame
clustered_scaled_tracks_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,track_cluster
0,0.21992,0.51165,-0.916348,0.302226,0.725791,-0.507261,-0.945325,-0.631304,-0.485287,-0.15446,-0.866982,-0.003276,0.254497,1
1,-0.820554,1.391278,1.612534,1.066076,-1.377808,-0.060998,-0.993576,0.444108,1.137242,0.753029,1.223033,-0.523731,0.254497,2
2,0.122375,1.240915,-1.197335,1.066787,0.725791,-0.312781,1.281859,0.238045,0.605179,1.045648,-0.490291,-0.111564,0.254497,1
3,-0.316574,0.812378,-0.916348,0.352166,0.725791,-0.464718,-0.97563,-0.631659,-0.11653,0.05667,-1.504849,-0.114122,0.254497,1
4,-1.795998,-0.819069,-1.478322,-0.006299,-1.377808,-0.52723,-0.796056,-0.631987,-0.663344,-1.365681,0.956943,0.173316,0.254497,3


## Visualization Of Clustering

In [12]:
# Import libraries and dependencies
from sklearn.manifold import TSNE
import seaborn as sns

In [13]:
# Work on an independent (deep) copy so that columns added for plotting do not
# end up in `clustered_scaled_tracks_df`
visual_cluster_df = clustered_scaled_tracks_df.copy(deep=True)

In [14]:
# Create a TSNE model.
# t-SNE is stochastic, so a fixed random_state is needed for the embedding
# (and therefore the plot) to be reproducible between runs.
m = TSNE(learning_rate=50, random_state=42)

In [None]:
# Fit t-SNE on the scaled features and keep the embedded coordinates.
# NOTE(review): scaled_tracks_df holds roughly 2.2 million rows; t-SNE is
# expected to be very slow at this size - consider fitting on a sample. Confirm.
tsne_features = m.fit_transform(X=scaled_tracks_df)

In [None]:
# Check out the first 3 rows of the coordinates of the data
# (rows 0, 1 and 2: slices are zero-based and exclude the end index)
tsne_features[:3, :]

In [None]:
# Store the two embedding axes as plotting columns: column 0 of the t-SNE
# output becomes "X" and column 1 becomes "y"
for axis_position, column_name in enumerate(("X", "y")):
    visual_cluster_df[column_name] = tsne_features[:, axis_position]

In [None]:
# Scatter the two t-SNE coordinates, colouring each point by its cluster label
fig, ax = plt.subplots()
sns.scatterplot(
    data=visual_cluster_df,
    x="X",
    y="y",
    hue='track_cluster',
    palette='Spectral',
    ax=ax,
)
plt.show()

## Clustering Input Playlist

In [None]:
# # Process the input playlist data
# # NOTE(review): this cell is disabled. Before re-enabling it, note that the
# # key 'duration_ms_y' below does not match the 'duration_ms' column that
# # `scaled_tracks_df` (and therefore `model`) was fitted with - confirm the
# # intended column name, otherwise prediction will likely fail or misalign.
# scaled_input_playlist_df = pd.DataFrame(data={
#     'danceability':0.07272,
#     'energy':-0.21237,
#     'key':-0.103686,
#     'loudness':0.274964,
#     'mode':0.876177,
#     'speechiness':0.152517,
#     'acousticness':-0.31441,
#     'instrumentalness':-0.377864,
#     'liveness':-0.536152,
#     'valence':0.210339,
#     'tempo':0.465768,
#     'duration_ms_y':-0.290816,
#     'time_signature':0.582697
# },index=[0])
# # Check out the dataframe
# scaled_input_playlist_df

In [None]:
# # Cluster the new input playlist
# input_playlist_cluster = model.predict(scaled_input_playlist_df)
# input_playlist_cluster[0]

## Recommend User's Song

In [None]:
# # Label the tracks in `tracks_features_df`
# tracks_features_df["cluster"]=clustered_scaled_tracks_df["track_cluster"]
# # Get all the tracks of the wanted cluster
# recommend_tracks_df = tracks_features_df[tracks_features_df["cluster"]==input_playlist_cluster[0]]
# recommend_tracks_df

## Save The Model

In [None]:
import pickle

In [None]:
# # save the song classification model as a pickle file
# model_pkl_file = "song_model.pkl"  

# with open(model_pkl_file, 'wb') as file:  
#     pickle.dump(model, file)

In [None]:
# # save the nearest neighbor model as a pickle file
# # NOTE(review): this cell is disabled, and `neigh` is not defined in any
# # visible cell of this notebook - create or load the nearest-neighbor model
# # before re-enabling this cell.
# neigh_pkl_file = "song_neighbor.pkl"  

# with open(neigh_pkl_file, 'wb') as file:  
#     pickle.dump(neigh, file)
