In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Audio features fed to the clustering model. Defined once so the scaler, the
# plots and the cluster-centre table always refer to the same columns.
FEATURE_COLUMNS = ['danceability', 'energy', 'valence', 'tempo', 'acousticness']
N_CLUSTERS = 5


def _remove_legend(ax):
    """Remove the legend from *ax* if one was drawn."""
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()


# load data
df = pd.read_csv('data/top_tracks_audio_features.csv')

# KMeans rejects NaN input, so keep only rows where every feature is present.
# .copy() makes the result an independent frame, so adding the 'cluster'
# column below does not trigger SettingWithCopyWarning.
df = df.dropna(subset=FEATURE_COLUMNS).copy()
if len(df) < N_CLUSTERS:
    raise ValueError(
        f'Need at least {N_CLUSTERS} rows with complete audio features '
        f'to fit {N_CLUSTERS} clusters, got {len(df)}'
    )

# standardize data: the features live on very different scales (tempo is in
# BPM, the others are 0-1), and k-means is distance based.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[FEATURE_COLUMNS])

# k-means clustering
# n_init is pinned explicitly: its default changed between scikit-learn
# releases, so leaving it implicit yields version-dependent clusters (and a
# FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_features)

# visualize clusters with pairplot
sns.pairplot(df, vars=FEATURE_COLUMNS, hue='cluster', palette='viridis')
plt.suptitle('Pairplot with Clusters', y=1.02)
plt.show()

# cluster distribution
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to hue. dodge=False keeps one full-width bar per
# cluster, and the (redundant) legend is removed afterwards, which works on
# both old and new seaborn releases.
plt.figure(figsize=(8, 6))
count_ax = sns.countplot(x='cluster', hue='cluster', data=df, palette='viridis', dodge=False)
_remove_legend(count_ax)
plt.title('Cluster Distribution')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()

# scatter plot for two key features with clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x='energy', y='danceability', hue='cluster', data=df, palette='viridis')
plt.title('Scatter Plot of Energy vs Danceability with Clusters')
plt.show()

# analyze and visualize cluster centers
# `centers` holds the centroids converted back to the original units.
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=FEATURE_COLUMNS)
# Colour the cells by the standardized centroids: in original units tempo
# dominates the colour range and the 0-1 features become indistinguishable.
# The annotations still show the original-scale values, formatted with '.2f'
# because the default '.2g' renders tempo as e.g. '1.2e+02'.
scaled_centers = pd.DataFrame(kmeans.cluster_centers_, columns=FEATURE_COLUMNS)
plt.figure(figsize=(10, 6))
sns.heatmap(
    scaled_centers.T,
    annot=centers.T.to_numpy(),
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    cbar=True,
    cbar_kws={'label': 'Standardized value (z-score)'},
)
plt.title('Cluster Centers (Original Scale)')
plt.xlabel('Cluster')
plt.show()

# visualize clusters using violin plots
# Same hue/legend handling as the count plot above in this block.
plt.figure(figsize=(12, 8))
violin_ax = sns.violinplot(x='cluster', y='energy', hue='cluster', data=df, palette='viridis', dodge=False)
_remove_legend(violin_ax)
plt.title('Violin Plot for Energy by Cluster')
plt.show()