Baseline Clustering

In [1]:
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import psycopg2.extras

root_path = Path().resolve().parent.parent
sys.path.append(str(root_path))

from src.dataset.connection import connect_to_spotify_dataset
from src.dataset.connection import select

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn.functional as F

In [2]:
connection = connect_to_spotify_dataset() # Add db Password
if not connection:
    print("Connection Error")
tds = select(connection, "SELECT * FROM track_data LIMIT 10") 

Connection successful
Connection closed.


In [3]:
columns = [
    "id", "name", "popularity", "duration_ms", "explicit", "artists", "id_artists", "release_date",
    "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", 
    "instrumentalness", "liveness", "valence", "tempo", "time_signature","decade", "mood_index", "emotion_index", "party_index", "chill_index"
]
# Create the DataFrame
df = pd.DataFrame(tds, columns=columns)
df = df.iloc[:10000]
print(df.dtypes)
df.head()

id                   object
name                 object
popularity            int64
duration_ms           int64
explicit               bool
artists              object
id_artists           object
release_date         object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                   bool
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
decade                int64
mood_index          float64
emotion_index       float64
party_index         float64
chill_index         float64
dtype: object


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,decade,mood_index,emotion_index,party_index,chill_index
0,1mC0LEmjbCXjs8jjnYIjHw,Yo También Era Dichoso - Instrumental (Remaste...,0,170933,False,[Francisco Canaro],[2maQMqxNnlRrBrS1oAsrX9],1927-09-11,0.808,0.398,...,0.328,0.173,0.668,114.891,4,1920,0.656,4.3715,0.603,0.8024
1,21010f0ucoaLEEn4khQKZb,T for Texas,1,208000,False,[Jimmie Rodgers],[0Y0ZlznP6vzTS1qAm5dvTN],1927,0.571,0.718,...,0.0,0.309,0.565,78.849,4,1920,0.5974,1.612,0.6445,0.6774
2,2fPzoKtGjXFDgzIOSAfrEq,Chapter 1 & Chapter 2.1 - Sprawy Sherlocka Hol...,0,128038,False,[Arthur Conan Doyle],[2AcBhTO6Q3zzUrfPKmskp3],1927-07-29,0.703,0.321,...,0.0,0.31,0.803,87.746,3,1920,0.6766,5.6921,0.512,0.7738
3,3B6gBlf64fMQY3sbkyFENu,Purple Daze,0,192332,False,[Cellular Project],[6OOyYaxY66YrMwusS1PIr6],1927,0.344,0.857,...,0.874,0.125,0.158,140.669,4,1920,0.3536,1.9526,0.6005,0.2312
4,3KD6IGlsy0OmvQ5EZVSGwf,Iris: Inno al sole,0,231987,False,"[Pietro Mascagni, Staatskapelle Berlin]","[3Z5fRknMBBNfCw6pkgR9S8, 7vEPPI71V8dEHtEhPMAxWT]",1927-04-01,0.188,0.139,...,0.902,0.817,0.0387,74.178,3,1920,0.10355,5.63778,0.1635,0.9312


In [4]:
df_baseline = df[['duration_ms', 'tempo', 'decade', 'popularity', 'key', 'emotion_index', 'mode', 'chill_index']]
df_baseline.head()

Unnamed: 0,duration_ms,tempo,decade,popularity,key,emotion_index,mode,chill_index
0,170933,114.891,1920,0,4,4.3715,False,0.8024
1,208000,78.849,1920,1,3,1.612,True,0.6774
2,128038,87.746,1920,0,6,5.6921,False,0.7738
3,192332,140.669,1920,0,3,1.9526,True,0.2312
4,231987,74.178,1920,0,2,5.63778,True,0.9312


In [5]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_baseline)

In [9]:
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score

X= df_scaled
# Assume X is your dataset
n_clusters = 3  # or any number of clusters you're using for KMeans
random_labels = np.random.randint(0, n_clusters, size=len(X))

# Evaluate random clustering
silhouette_random = silhouette_score(X, random_labels)
dbi_random = davies_bouldin_score(X, random_labels)

print(f"Random Clustering - \nSilhouette Score: {silhouette_random}, \nDavies-Bouldin Index: {dbi_random}")


Random Clustering - 
Silhouette Score: -0.02833587528930409, 
Davies-Bouldin Index: 1.904177224468597


GMM - Gaussian Mixture Model

In [14]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=3, random_state=42)
labels_gmm = gmm.fit_predict(X)

silhouette_gmm = silhouette_score(X, labels_gmm)
dbi_gmm = davies_bouldin_score(X, random_labels)
print(f"Random Clustering - \nSilhouette Score: {silhouette_gmm}, \nDavies-Bouldin Index: {dbi_gmm}")



Random Clustering - 
Silhouette Score: 0.258765131909637, 
Davies-Bouldin Index: 1.904177224468597


Null Model (Permutation Test)

In [15]:
X_permuted = np.random.permutation(X)
kmeans_perm = KMeans(n_clusters=3, random_state=42)
labels_perm = kmeans_perm.fit_predict(X_permuted)

silhouette_perm = silhouette_score(X_permuted, labels_perm)
dbi_perm = davies_bouldin_score(X, random_labels)
print(f"Random Clustering - \nSilhouette Score: {silhouette_perm}, \nDavies-Bouldin Index: {dbi_perm}")


Random Clustering - 
Silhouette Score: 0.11078902985876213, 
Davies-Bouldin Index: 1.904177224468597
