# Model Evaluation and Interpretation
## Amazon Music Clustering Project

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from math import pi

# Set plot style: use seaborn's "whitegrid" style for the plots in this notebook
sns.set(style="whitegrid")

### 1. Load Data and Model

In [None]:
# Load processed data and model
# Scaled feature matrix: the input to the evaluation metrics.
X_scaled_df = pd.read_csv('../data/processed/scaled_features.csv')
# Dataset that carries the 'cluster' column used for cluster profiling.
df = pd.read_csv('../data/processed/clustered_data.csv')

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load model files that come from a trusted source.
with open('../models/kmeans_model.pkl', 'rb') as f:
    kmeans = pickle.load(f)

# Labels stored on the fitted model.
# NOTE(review): assumes the model was fit on the rows of scaled_features.csv,
# in the same order — TODO confirm, otherwise labels and rows are misaligned.
labels = kmeans.labels_

### 2. Quantitative Evaluation Metrics

In [None]:
# Compute the three internal validation metrics in one pass; each scorer takes
# the same (features, labels) pair.
sil_score, db_score, ch_score = (
    score_fn(X_scaled_df, labels)
    for score_fn in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
)

# Report every metric on its own line, rounded to four decimals.
print(
    f"Silhouette Score: {sil_score:.4f}",
    f"Davies-Bouldin Index: {db_score:.4f}",
    f"Calinski-Harabasz Score: {ch_score:.4f}",
    sep="\n",
)

**Interpretation:**
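- **Silhouette Score**: Ranges from -1 to 1. The value reported here is the mean over all samples; a higher score indicates that objects are, on average, well matched to their own cluster and poorly matched to neighboring clusters, while values near 0 indicate overlapping clusters.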
- **Davies-Bouldin Index**: Defined as the average similarity of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. The minimum value is 0, and lower values indicate better clustering.
- **Calinski-Harabasz Index**: Also known as the Variance Ratio Criterion: the ratio of between-cluster dispersion to within-cluster dispersion. Higher values indicate better-defined clusters.

### 3. Cluster Profiling (Radar Charts)

In [None]:
# Select features for the radar chart. The original feature values are min-max
# normalized to [0, 1] purely for visualization, so every axis shares one scale.
radar_features = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']

# Normalize features for visualization (vectorized over all columns at once).
feature_min = df[radar_features].min()
feature_range = df[radar_features].max() - feature_min
# A constant column has zero range; dividing by it would yield NaN (0/0) and
# break that axis of the chart. Divide by 1 instead so the column maps to 0.
feature_range = feature_range.replace(0, 1)
df_normalized = (df[radar_features] - feature_min) / feature_range

# Average the normalized features per cluster: one row per cluster.
df_normalized['cluster'] = df['cluster']
cluster_means = df_normalized.groupby('cluster').mean()

def plot_radar_chart(means, features):
    """Draw a radar (spider) chart with one polygon per row of ``means``.

    Args:
        means: DataFrame with one row per cluster and one column per feature.
            The row index value is used in the legend label. The radial axis
            is fixed to the range 0-1, so values outside it are clipped from
            view.
        features: Names of the columns of ``means`` to plot, in the order the
            axes are laid out around the circle.

    Raises:
        ValueError: If ``features`` is empty.
        KeyError: If a name in ``features`` is not a column of ``means``.
    """
    n_axes = len(features)
    if n_axes == 0:
        raise ValueError("features must contain at least one column name")

    # Spread the axes evenly around the circle, then repeat the first angle
    # so that each polygon is drawn as a closed shape.
    angles = [idx / float(n_axes) * 2 * pi for idx in range(n_axes)]
    angles += angles[:1]

    # Initialise the spider plot
    plt.figure(figsize=(10, 10))
    ax = plt.subplot(111, polar=True)

    # Draw one axis per variable and label it with the feature name
    plt.xticks(angles[:-1], features)

    # Draw the radial tick labels
    ax.set_rlabel_position(0)
    plt.yticks([0.2, 0.4, 0.6, 0.8], ["0.2", "0.4", "0.6", "0.8"], color="grey", size=7)
    plt.ylim(0, 1)

    # Select exactly the requested columns, in the requested order, so the
    # values always line up with the axis labels even when `means` carries
    # extra or differently ordered columns.
    for cluster_id, row in means.loc[:, list(features)].iterrows():
        values = row.tolist()
        values += values[:1]  # close the polygon
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=f'Cluster {cluster_id}')
        ax.fill(angles, values, alpha=0.1)

    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.title('Cluster Profiles (Radar Chart)')
    plt.show()

# Draw the per-cluster feature means on a single radar chart
plot_radar_chart(cluster_means, radar_features)

### 4. Cluster Interpretation

Based on the feature distributions and radar charts, we can interpret the clusters as follows (these are example interpretations; adjust them based on the actual results):

- **Cluster 0**: *High Energy / Dance*
    - Characterized by high energy and danceability.
    - Likely pop, dance, or upbeat tracks.

- **Cluster 1**: *Acoustic / Chill*
    - High acousticness, low energy.
    - Likely ballads, acoustic covers, or classical.

- **Cluster 2**: *Instrumental*
    - High instrumentalness, low speechiness.
    - Likely soundtracks, classical, or study music.

- **Cluster 3**: *Lyrical / Vocal*
    - High speechiness or valence.
    - Likely rap, hip-hop, or vocal-heavy tracks.

*(Note: The actual interpretation depends on the specific K-Means result and random seed used)*