In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cdist
import numpy as np
import os

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'


### Loading mixed playlist and Datasets as dictionaries

In [76]:
# Load the datasets
mixed_playlist = pd.read_csv('data/mixed_playlist.csv')

In [12]:
# Loading Users Dataset
# Base directory where the folders with CSV files are located
base_dir = 'data/recovered_data/'

# Initialize a dictionary to store the DataFrames with the filename as the key
dataframes = {}

# Loop through each folder in the base directory
for folder_name in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder_name)
    
    # Check if the path is actually a directory
    if os.path.isdir(folder_path):
        
        # Loop through each file in the folder
        for file_name in os.listdir(folder_path):
            
            # Construct the full file path
            file_path = os.path.join(folder_path, file_name)
            
            # Check if the file is a CSV
            if file_name.endswith('.csv'):
                
                # Read the CSV file and store the DataFrame
                df = pd.read_csv(file_path)
                
                # Use the filename without extension as the dictionary key
                dataframe_key = os.path.splitext(file_name)[0]
                dataframes[dataframe_key] = df

# At this point, you have a dictionary with each DataFrame accessible by its key
# For example, to access the DataFrame for 'user_alpha_2016', you would use:
# dataframes['user_alpha_2016']

In [77]:
# Exploring recovered data
# dataframes['user_alpha_2016']

In [78]:
mixed_playlist

Unnamed: 0,name,album,artist,release_date,length,popularity,acousticness,danceability,energy,instrumentalness,...,speechiness,tempo,valence,time_signature,key,mode,uri,release_year,top_year,user
0,"Variations on a Polish Theme, Op. 10: No. 5 An...","Szymanowski: Piano Works, Vol. 2",Karol Szymanowski,06/12/1996,76933,53,0.996000,0.329,0.00695,0.866000,...,0.0448,70.295,0.238,4,11,0,spotify:track:3bcdLMrAxrfn5dxInjIdI2,1996,unknown,unknown
1,Je vous trouve un charme fou - En duo avec Gaë...,Il suffit d'y croire (Version deluxe),Hoshi,2018-11-30,172626,62,0.622000,0.615,0.59900,0.000008,...,0.2530,86.976,0.626,4,1,1,spotify:track:0C2yaSWVgCUiiqPyYxSOkd,2018,2022,delta
2,Me Gusta,On ira où ?,DTF,2019-10-11,175269,72,0.413000,0.834,0.73400,0.000040,...,0.3410,89.989,0.356,4,6,0,spotify:track:6P3FBaZfUjeWYExU2ShaPZ,2019,2020,gamma
3,L’amour en Solitaire,Petite Amie (Deluxe),Juliette Armanet,2018-02-02,175266,0,0.404000,0.797,0.50600,0.000153,...,0.0327,128.027,0.539,4,5,0,spotify:track:2tn51grfchxArwPXeXkoX5,2018,2018,gamma
4,Goodnight Moon,Volta,Boogie Belgique,2016-09-23,264735,53,0.061600,0.788,0.67500,0.711000,...,0.0318,107.993,0.525,4,7,0,spotify:track:2rvo9Ddv18aRV0OJldhWTf,2016,2020,alpha
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3894,My Funny Valentine,Undercurrent,Bill Evans,1962,324133,41,0.992000,0.529,0.13300,0.922000,...,0.0575,115.924,0.585,4,0,0,spotify:track:5jIvud0zWXDpINxFqXfbpv,1962,2020,beta
3895,Petit homme,Earthquake (Edition Deluxe),Kolinga,2019-04-05,185506,0,0.942000,0.417,0.19600,0.001700,...,0.0871,76.774,0.505,3,4,0,spotify:track:6ejfdgCwtSp7g2kl9RMAT1,2019,2020,delta
3896,Neo-Tokyo - Dance With The Dead Remix,Dreams of Neo-Tokyo (Deluxe Edition),Scandroid,2017-03-31,352105,51,0.000016,0.467,0.95400,0.908000,...,0.0414,122.004,0.061,4,0,0,spotify:track:44K1GCbNLf1NHlCaLaYONk,2017,2021,dzeta
3897,Fool,Fool,Antis,2017-12-15,154826,16,0.045300,0.589,0.79200,0.008480,...,0.0569,182.051,0.658,4,6,1,spotify:track:4osf9MVY2OuJOoYXfTDzPP,2017,2018,beta


### Prepare mixed_playlist data to being processed

In [79]:
# To display categories "Users" that exists in the column user
num_categories = mixed_playlist.user.unique()
num_categories


array(['unknown', 'delta', 'gamma', 'alpha', 'dzeta', 'beta', 'epsilon'],
      dtype=object)

##### Converting categories "users" in numbers that serves as index

'unknown' = 0
 'delta' = 1
'gamma' = 2 
'alpha' = 3
 'dzeta' = 4
 'beta' = 5
'epsilon' = 6

In [80]:
mapping = {'unknown': 0, 'delta': 1, 'gamma': 2, 'alpha': 3, 'dzeta': 4, 'beta': 5, 'epsilon': 6}
mixed_playlist['user'] = mixed_playlist['user'].replace(mapping)
mixed_playlist.user

0       0
1       1
2       2
3       2
4       3
       ..
3894    5
3895    1
3896    4
3897    5
3898    0
Name: user, Length: 3899, dtype: int64

In [87]:
# Replace 'unknown' values in 'top_year' column with 0
mixed_playlist['top_year'] = mixed_playlist['top_year'].replace('unknown', 0)

mixed_playlist.top_year

0          0
1       2022
2       2020
3       2018
4       2020
        ... 
3894    2020
3895    2020
3896    2021
3897    2018
3898       0
Name: top_year, Length: 3890, dtype: object

In [88]:
mixed_playlist.info()
mixed_playlist.dropna(inplace=True)

# display the descriptive statistics of numeric variables
mixed_playlist.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3890 entries, 0 to 3898
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3890 non-null   object 
 1   album             3890 non-null   object 
 2   artist            3890 non-null   object 
 3   release_date      3890 non-null   object 
 4   length            3890 non-null   int64  
 5   popularity        3890 non-null   int64  
 6   acousticness      3890 non-null   float64
 7   danceability      3890 non-null   float64
 8   instrumentalness  3890 non-null   float64
 9   liveness          3890 non-null   float64
 10  speechiness       3890 non-null   float64
 11  tempo             3890 non-null   float64
 12  valence           3890 non-null   float64
 13  time_signature    3890 non-null   int64  
 14  key               3890 non-null   int64  
 15  mode              3890 non-null   int64  
 16  uri               3890 non-null   object 


Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,user
count,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0
mean,244418.7,32.005398,0.42512,0.554521,0.296768,0.167134,0.085179,115.826347,0.384641,3.873779,5.29563,0.466838,2012.748072,3.357326
std,104574.7,24.920088,0.363021,0.192029,0.386456,0.143656,0.092698,28.662454,0.254691,0.465588,3.507038,0.498963,10.418784,1.844656
min,31053.0,0.0,1e-06,0.0,0.0,0.0179,0.0,0.0,0.0,0.0,0.0,0.0,1957.0,0.0
25%,188033.2,1.0,0.058025,0.42425,4e-06,0.0945,0.0368,94.54675,0.161,4.0,2.0,0.0,2012.0,2.0
50%,224248.5,34.0,0.339,0.575,0.00573,0.112,0.047,115.1515,0.355,4.0,5.0,0.0,2016.0,3.0
75%,274490.0,51.0,0.793,0.701,0.773,0.172,0.086775,131.98675,0.572,4.0,8.0,1.0,2018.0,5.0
max,1921683.0,91.0,0.996,0.971,0.995,0.991,0.952,209.596,0.981,5.0,11.0,1.0,2022.0,6.0


In [89]:
# display the correlation matrix between numeric variables
corr = mixed_playlist.corr(numeric_only = True)
corr.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,user
length,1.0,-0.095678,0.00843,-0.12779,0.168193,-0.012691,-0.034805,-0.003102,-0.196543,-0.02302,-0.026732,0.018647,-0.134922,-0.002029
popularity,-0.095678,1.0,-0.031579,0.059827,-0.130565,-0.053067,0.039192,0.022633,0.034257,-0.013903,0.004236,-0.008805,0.018607,-0.114075
acousticness,0.00843,-0.031579,1.0,-0.385711,0.163068,-0.118148,-0.163287,-0.186696,-0.270082,-0.225935,-0.041198,-0.014634,-0.194986,-0.165433
danceability,-0.12779,0.059827,-0.385711,1.0,-0.371521,0.007763,0.250873,0.084809,0.573579,0.242775,0.024329,-0.015994,0.12288,-0.080037
instrumentalness,0.168193,-0.130565,0.163068,-0.371521,1.0,-0.091973,-0.256769,-0.035079,-0.40729,-0.113255,-0.000996,0.004675,0.012262,0.171177
liveness,-0.012691,-0.053067,-0.118148,0.007763,-0.091973,1.0,0.111733,0.037593,0.115369,-0.01328,0.008491,0.00652,-0.025289,-0.031247
speechiness,-0.034805,0.039192,-0.163287,0.250873,-0.256769,0.111733,1.0,0.036513,0.250839,0.067924,0.017841,-0.033367,0.097793,-0.095971
tempo,-0.003102,0.022633,-0.186696,0.084809,-0.035079,0.037593,0.036513,1.0,0.089352,0.006864,0.019338,-0.057536,0.038141,0.01311
valence,-0.196543,0.034257,-0.270082,0.573579,-0.40729,0.115369,0.250839,0.089352,1.0,0.155021,0.032526,-0.040391,-0.032345,-0.11288
time_signature,-0.02302,-0.013903,-0.225935,0.242775,-0.113255,-0.01328,0.067924,0.006864,0.155021,1.0,0.017347,0.013523,0.07216,0.042049


In [90]:
correlation_matrix = mixed_playlist.corr()
# You can set a threshold for correlation values and remove features with high correlation
threshold = 0.7  # Adjust this threshold as needed
highly_correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_features.add(colname)

mixed_playlist.drop(highly_correlated_features, axis=1, inplace=True)





In [91]:
mixed_playlist

Unnamed: 0,name,album,artist,release_date,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,uri,release_year,top_year,user
0,"Variations on a Polish Theme, Op. 10: No. 5 An...","Szymanowski: Piano Works, Vol. 2",Karol Szymanowski,06/12/1996,76933,53,0.996000,0.329,0.866000,0.0906,0.0448,70.295,0.238,4,11,0,spotify:track:3bcdLMrAxrfn5dxInjIdI2,1996,0,0
1,Je vous trouve un charme fou - En duo avec Gaë...,Il suffit d'y croire (Version deluxe),Hoshi,2018-11-30,172626,62,0.622000,0.615,0.000008,0.1920,0.2530,86.976,0.626,4,1,1,spotify:track:0C2yaSWVgCUiiqPyYxSOkd,2018,2022,1
2,Me Gusta,On ira où ?,DTF,2019-10-11,175269,72,0.413000,0.834,0.000040,0.1130,0.3410,89.989,0.356,4,6,0,spotify:track:6P3FBaZfUjeWYExU2ShaPZ,2019,2020,2
3,L’amour en Solitaire,Petite Amie (Deluxe),Juliette Armanet,2018-02-02,175266,0,0.404000,0.797,0.000153,0.2550,0.0327,128.027,0.539,4,5,0,spotify:track:2tn51grfchxArwPXeXkoX5,2018,2018,2
4,Goodnight Moon,Volta,Boogie Belgique,2016-09-23,264735,53,0.061600,0.788,0.711000,0.1000,0.0318,107.993,0.525,4,7,0,spotify:track:2rvo9Ddv18aRV0OJldhWTf,2016,2020,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3894,My Funny Valentine,Undercurrent,Bill Evans,1962,324133,41,0.992000,0.529,0.922000,0.1100,0.0575,115.924,0.585,4,0,0,spotify:track:5jIvud0zWXDpINxFqXfbpv,1962,2020,5
3895,Petit homme,Earthquake (Edition Deluxe),Kolinga,2019-04-05,185506,0,0.942000,0.417,0.001700,0.1250,0.0871,76.774,0.505,3,4,0,spotify:track:6ejfdgCwtSp7g2kl9RMAT1,2019,2020,1
3896,Neo-Tokyo - Dance With The Dead Remix,Dreams of Neo-Tokyo (Deluxe Edition),Scandroid,2017-03-31,352105,51,0.000016,0.467,0.908000,0.0856,0.0414,122.004,0.061,4,0,0,spotify:track:44K1GCbNLf1NHlCaLaYONk,2017,2021,4
3897,Fool,Fool,Antis,2017-12-15,154826,16,0.045300,0.589,0.008480,0.1020,0.0569,182.051,0.658,4,6,1,spotify:track:4osf9MVY2OuJOoYXfTDzPP,2017,2018,5


In [92]:
# It is considered Irrelevant features every description or string that does not contribute in understanding the nature of the song
irrelevant_features = ['name', 'album', 'artist','release_date','uri']  # Modify this list with your irrelevant feature names
mixed_playlist.drop(irrelevant_features, axis=1, inplace=True)


In [93]:
# Feauture Scaling

scaler = MinMaxScaler()
mixed_playlist_scaled = scaler.fit_transform(mixed_playlist)
mixed_playlist = pd.DataFrame(mixed_playlist_scaled, columns=mixed_playlist.columns)

mixed_playlist


Unnamed: 0,length,popularity,acousticness,danceability,instrumentalness,liveness,speechiness,tempo,valence,time_signature,key,mode,release_year,top_year,user
0,0.024267,0.582418,1.000000,0.338826,0.870352,0.074710,0.047059,0.335383,0.242610,0.8,1.000000,0.0,0.600000,0.000000,0.000000
1,0.074881,0.681319,0.624497,0.633368,0.000008,0.178913,0.265756,0.414970,0.638124,0.8,0.090909,1.0,0.938462,1.000000,0.166667
2,0.076279,0.791209,0.414658,0.858908,0.000040,0.097729,0.358193,0.429345,0.362895,0.8,0.545455,0.0,0.953846,0.999011,0.333333
3,0.076278,0.000000,0.405622,0.820803,0.000154,0.243654,0.034349,0.610827,0.549439,0.8,0.454545,0.0,0.938462,0.998022,0.333333
4,0.123600,0.582418,0.061846,0.811535,0.714573,0.084370,0.033403,0.515244,0.535168,0.8,0.636364,0.0,0.907692,0.999011,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3885,0.155017,0.450549,0.995984,0.544799,0.926633,0.094646,0.060399,0.553083,0.596330,0.8,0.000000,0.0,0.076923,0.999011,0.833333
3886,0.081694,0.000000,0.945783,0.429454,0.001709,0.110061,0.091492,0.366295,0.514781,0.6,0.363636,0.0,0.953846,0.999011,0.166667
3887,0.169812,0.560440,0.000015,0.480947,0.912563,0.069571,0.043487,0.582091,0.062181,0.8,0.000000,0.0,0.923077,0.999505,0.666667
3888,0.065467,0.175824,0.045481,0.606591,0.008523,0.086425,0.059769,0.868581,0.670744,0.8,0.545455,1.0,0.923077,0.998022,0.833333


In [95]:
# K-Means clustering
X = mixed_playlist.values
model = KMeans(
    n_clusters = 6, # 6 clusters referred to 6 users 'delta': 1, 'gamma': 2, 'alpha': 3, 'dzeta': 4, 'beta': 5, 'epsilon': 6
    n_init = 'auto', # multiple attempt with different initial centroids
    random_state = 1, # fix ramdom state to guarantee reproducibility
    verbose = 1 # output log
)
model.fit(X)

Initialization complete
Iteration 0, inertia 3548.7558486529265.
Iteration 1, inertia 2296.255249186367.
Iteration 2, inertia 2178.2526012553294.
Iteration 3, inertia 2134.0134837180403.
Iteration 4, inertia 2113.306456790444.
Iteration 5, inertia 2099.979010165711.
Iteration 6, inertia 2094.286963700861.
Iteration 7, inertia 2092.852732574386.
Iteration 8, inertia 2092.427159946258.
Iteration 9, inertia 2092.352060016224.
Iteration 10, inertia 2092.269091746777.
Iteration 11, inertia 2092.2402103933073.
Converged at iteration 11: center shift 3.251393255717938e-06 within tolerance 6.816383810380061e-06.


In [96]:
# obtain the cluster centers
model.cluster_centers_

array([[ 1.07387768e-01,  3.64806482e-01,  7.17242490e-01,
         5.58804011e-01,  4.92461782e-02,  1.54129107e-01,
         9.16826600e-02,  5.56580255e-01,  4.03577765e-01,
         7.57585644e-01,  4.77383954e-01, -1.66533454e-16,
         8.27330907e-01,  9.54594082e-01,  3.41761827e-01],
       [ 1.27333722e-01,  2.84910212e-01,  1.10252973e-01,
         5.70759966e-01,  7.99528128e-01,  1.55922360e-01,
         6.58648673e-02,  5.82160689e-01,  2.88763643e-01,
         7.87195122e-01,  4.69235033e-01,  1.00000000e+00,
         9.05581614e-01,  9.86548877e-01,  6.60569106e-01],
       [ 1.27378935e-01,  3.12969123e-01,  5.26431331e-01,
         4.51871467e-01,  8.39522494e-01,  1.28344219e-01,
         5.46526455e-02,  5.50011691e-01,  2.19827919e-01,
         7.58478605e-01,  5.10012966e-01, -3.88578059e-16,
         8.62245520e-01,  9.52943924e-01,  6.55573164e-01],
       [ 1.03438068e-01,  3.77121674e-01,  1.49085573e-01,
         6.78173415e-01,  6.64341340e-02,  1.69559091