Baseline Clustering

In [1]:
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import psycopg2.extras

root_path = Path().resolve().parent.parent
sys.path.append(str(root_path))

from src.dataset.connection import connect_to_spotify_dataset
from src.dataset.connection import select

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn.functional as F

In [2]:
connection = connect_to_spotify_dataset() # Add db Password
if not connection:
    print("Connection Error")
tds = select(connection, "SELECT * FROM track_data") 

Connection successful
Connection closed.


In [3]:
columns = [
    "id", "name", "popularity", "duration_ms", "explicit", "artists", "id_artists", "release_date",
    "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", 
    "instrumentalness", "liveness", "valence", "tempo", "time_signature","decade", "mood_index", "emotion_index", "party_index", "chill_index"
]
# Create the DataFrame
df = pd.DataFrame(tds, columns=columns)
#df = df.iloc[:10000]
print(df.dtypes)
df.head()

id                   object
name                 object
popularity            int64
duration_ms           int64
explicit               bool
artists              object
id_artists           object
release_date         object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                   bool
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
decade                int64
mood_index          float64
emotion_index       float64
party_index         float64
chill_index         float64
dtype: object


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,decade,mood_index,emotion_index,party_index,chill_index
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,False,[Uli],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,0.744,0.151,0.127,104.851,3,1920,0.346,4.1857,0.545,0.6264
1,2B9d7LBGJvQEKiLEAexCOP,Toda Alma - Remasterizado,0,180253,False,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.545,0.268,...,0.92,0.112,0.648,126.614,4,1920,0.5411,7.2483,0.4065,0.8898
2,3fKqEoaJZODF7Xhh7qH4Wv,Capítulo 2.10 - Banquero Anarquista,0,106800,False,[Fernando Pessoa],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.702,0.2,...,0.0,0.36,0.773,81.152,4,1920,0.6371,7.2689,0.451,0.7052
3,3h8ioTTWfrC25hrHwQQpLc,All Or Nothing at All,0,164320,False,"[Dick Haymes, Harry James, His Orchestra]","[3BiJGZsyX9sJchTqcSA7Su, 5MpELOfAiq7aIBTij30ph...",1922,0.237,0.555,...,0.000262,0.366,0.292,92.213,3,1920,0.3281,2.2666,0.396,0.6652
4,3rcerSJNOkfgB9CELqcRHL,Capítulo 1.5 - Banquero Anarquista,0,108200,False,[Fernando Pessoa],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.773,0.155,...,0.0,0.13,0.494,52.032,4,1920,0.5099,7.601,0.464,0.6926


In [4]:
df_baseline = df[['duration_ms', 'tempo', 'decade', 'popularity', 'key', 'emotion_index', 'mode', 'chill_index']]
df_baseline.head()

Unnamed: 0,duration_ms,tempo,decade,popularity,key,emotion_index,mode,chill_index
0,126903,104.851,1920,6,0,4.1857,True,0.6264
1,180253,126.614,1920,0,1,7.2483,True,0.8898
2,106800,81.152,1920,0,7,7.2689,True,0.7052
3,164320,92.213,1920,0,2,2.2666,False,0.6652
4,108200,52.032,1920,0,9,7.601,False,0.6926


In [5]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_baseline)

In [6]:
df_scaled.size

4204696

In [None]:
from src.utils import distributed_silhouette
distributed_silhouette = distributed_silhouette.DistributedSilhouette()



In [12]:
dbi_random = davies_bouldin_score(X, random_labels)

print(f"Random Clustering - \nSilhouette Score: {silhouette_random}, \nDavies-Bouldin Index: {dbi_random}")

Random Clustering - 
Silhouette Score: -0.004608494131524032, 
Davies-Bouldin Index: 491.22270561847176


In [None]:
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score

X= df_scaled
# Assume X is your dataset
n_clusters = 8  # or any number of clusters you're using for KMeans
random_labels = np.random.randint(0, n_clusters, size=len(X))

# Evaluate random clustering
partitions, label_partitions = distributed_silhouette.split_data(df_scaled, random_labels)

silhouette_random = distributed_silhouette.compute_fast_silhouette(partitions, label_partitions)

dbi_random = davies_bouldin_score(X, random_labels)

print(f"Random Clustering - \nSilhouette Score: {silhouette_random}, \nDavies-Bouldin Index: {dbi_random}")


Random Clustering - 

Silhouette Score: -0.004608494131524032

Davies-Bouldin Index: 491.22270561847176

GMM - Gaussian Mixture Model

In [None]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=8, random_state=42)
labels_gmm = gmm.fit_predict(X)

partitions, label_partitions = distributed_silhouette.split_data(X, labels_gmm)
silhouette_gmm = distributed_silhouette.compute_fast_silhouette(partitions, label_partitions)
dbi_gmm = davies_bouldin_score(X, labels_gmm)
print(f"Random Clustering - \nSilhouette Score: {silhouette_gmm}, \nDavies-Bouldin Index: {dbi_gmm}")



Setting up the droplet clones...
We are ready to compute the score in parallel!
Calculating...
Done! Cleaning up droplets...
Destroyed droplet: silhouette-clone-0
Destroyed droplet: silhouette-clone-1
Destroyed droplet: silhouette-clone-2
Destroyed droplet: silhouette-clone-3
Destroyed droplet: silhouette-clone-4
Destroyed droplet: silhouette-clone-5
Destroyed droplet: silhouette-clone-6
Destroyed droplet: silhouette-clone-7
Destroyed droplet: silhouette-clone-8
Destroyed droplet: silhouette-clone-9
Droplets destroyed, you are safe to terminate your code!
Random Clustering - 
Silhouette Score: 0.08833424194399461, 
Davies-Bouldin Index: 491.22270561847176


Gaussian Mixture Model - 


Silhouette Score: 0.08833424194399461


Davies-Bouldin Index: 2.0470611600249145

Null Model (Permutation Test)

In [None]:
X_permuted = np.random.permutation(X)
kmeans_perm = KMeans(n_clusters=8, random_state=42)
labels_perm = kmeans_perm.fit_predict(X_permuted)

partitions, label_partitions = distributed_silhouette.split_data(X, labels_perm)
silhouette_perm = distributed_silhouette.compute_fast_silhouette(partitions, label_partitions)
dbi_perm = davies_bouldin_score(X, labels_perm)
print(f"Random Clustering - \nSilhouette Score: {silhouette_perm}, \nDavies-Bouldin Index: {dbi_perm}")


Setting up the droplet clones...
We are ready to compute the score in parallel!
Calculating...
Done! Cleaning up droplets...
Destroyed droplet: silhouette-clone-0
Destroyed droplet: silhouette-clone-1
Destroyed droplet: silhouette-clone-2
Destroyed droplet: silhouette-clone-3
Destroyed droplet: silhouette-clone-4
Destroyed droplet: silhouette-clone-5
Destroyed droplet: silhouette-clone-6
Destroyed droplet: silhouette-clone-7
Destroyed droplet: silhouette-clone-8
Destroyed droplet: silhouette-clone-9
Droplets destroyed, you are safe to terminate your code!
Random Clustering - 
Silhouette Score: -0.005457596000588488, 
Davies-Bouldin Index: 491.22270561847176


Null Model - 

Silhouette Score: -0.005457596000588488, 

Davies-Bouldin Index: 587.1649645919061