In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csgraph
from scipy.linalg import eigh

In [2]:
# Load the dataset
songs_df = pd.read_csv('./songs.csv')

# Display the first few rows of the dataset to understand its structure
songs_df.head()

Unnamed: 0,artist_familiarity,artist_ hotttnesss,artist_num_songs,release,duration,energy,pitches,timbre,loudness,danceability
0,0.570278,0.344414,-0.380229,0.033102,0.212251,-0.295284,-0.178672,-0.367908,-0.105688,-0.31591
1,-0.179399,-0.306438,-0.27876,0.102305,0.730888,0.270387,-0.039409,-0.320095,-0.271154,0.029891
2,-0.172195,-0.589682,0.261041,-0.133294,-0.429221,-0.270769,-0.355272,-0.109027,-0.032178,-0.374202
3,0.623788,-0.190461,0.161531,0.07132,0.201448,-0.342355,-0.288166,0.490152,-0.110037,0.224162
4,0.192972,-0.298097,0.088428,0.472626,0.463764,0.342086,0.110808,0.017268,0.454363,-0.302626


In [5]:
from scipy.spatial.distance import pdist, squareform

# Calculate the pairwise Euclidean distances between all songs
distance_matrix = pdist(songs_df.values, metric='euclidean')

# Convert the distances into a square matrix form
square_distance_matrix = squareform(distance_matrix)

# Generate the adjacency matrix where 1 indicates a distance smaller than 1, and 0 otherwise
adjacency_matrix = (square_distance_matrix < 1).astype(int)

# View the shape to confirm it's a 1000x1000 matrix and preview the matrix
adjacency_matrix.shape, adjacency_matrix[:5, :5]  # Display only a small portion to get a sense of the matrix

((1000, 1000),
 array([[1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]]))

In [7]:
# Step 2: Compute the degree matrix D
degree_matrix = np.diag(adjacency_matrix.sum(axis=1))

# Step 3: Compute the unnormalized Laplacian L
laplacian_matrix = degree_matrix - adjacency_matrix

# Step 4: Compute the normalized Laplacian
# Using the symmetric normalized Laplacian formula: L_sym = D^(-1/2) * L * D^(-1/2)
D_inv_sqrt = np.diag(1 / np.sqrt(degree_matrix.diagonal()))
normalized_laplacian_matrix = D_inv_sqrt @ laplacian_matrix @ D_inv_sqrt

# Step 5: Compute the eigenvector v corresponding to the second smallest eigenvalue of L
# We use eigh, which is designed for Hermitian matrices, which the Laplacian is.
eigenvalues, eigenvectors = eigh(normalized_laplacian_matrix)

# The eigenvector corresponding to the second smallest eigenvalue (since they are sorted in ascending order)
v = eigenvectors[:, 1]

# Step 6: Let x be the list of elements from the eigenvector v
x = v

# Step 7: Cluster each xi using the indicator function I
clusters = np.where(x >= 0, 1, 2)

# Report the final cluster labels for the first 5 songs
clusters[:5]

array([2, 1, 2, 2, 1])

In [9]:
# Split the original features into the two clusters based on the labels obtained
cluster_1 = songs_df[clusters == 1]
cluster_2 = songs_df[clusters == 2]

# Calculate the mean feature values across all samples in each cluster
mean_cluster_1 = cluster_1.mean()
mean_cluster_2 = cluster_2.mean()

# Calculate the difference between the mean features of the two clusters
mean_difference = (mean_cluster_1 - mean_cluster_2).abs()

# Identify the top 3 features with the highest absolute mean difference
top_3_features = mean_difference.nlargest(3).index.tolist()
top_3_features_values = mean_difference.nlargest(3).values.tolist()

top_3_features, top_3_features_values

(['energy', 'danceability', 'pitches'],
 [0.6555892018012586, 0.07532449523470011, 0.03411942206564113])