# VAE Music Clustering - Exploratory Analysis

This notebook provides an interactive exploration of:
1. Dataset statistics
2. Loading pretrained models (no training is performed in this notebook)
3. Latent space visualization
4. Clustering evaluation

In [None]:
# Imports
import sys
from pathlib import Path

sys.path.append('..')  # Add project root to path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader

from src import (
    VAE, CVAE, SimpleAutoencoder,
    SongDataset, SongDatasetWithLabels,
    extract_latent_features, apply_kmeans, reduce_tsne,
    compute_all_metrics, compare_methods
)
from config import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots drawn below.
sns.set_style('whitegrid')

## 1. Dataset Exploration

In [None]:
# Read the dataset index and summarise how many samples each language has.
df = pd.read_csv(DATASET_INDEX)
language_counts = df['language'].value_counts()

print(f"Total samples: {len(df)}")
print("\nLanguage distribution:")
print(language_counts)

# Bar chart of the per-language sample counts.
lang_fig, lang_ax = plt.subplots(figsize=(8, 5))
language_counts.plot(kind='bar', ax=lang_ax)
lang_ax.set_title('Language Distribution')
lang_ax.set_ylabel('Count')
# Keep the category names horizontal so they stay readable.
plt.setp(lang_ax.get_xticklabels(), rotation=0)
plt.show()

## 2. Load Pretrained Models

In [None]:
# Load the pretrained VAE onto the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# NOTE(review): LATENT_DIM is passed as `hidden_dim` here, while the autoencoder
# cell passes it as `latent_dim` -- confirm this matches the VAE constructor.
vae_model = VAE(input_dim=INPUT_DIM, hidden_dim=LATENT_DIM).to(device)

# The checkpoint is consumed as a state dict, so restrict unpickling to tensors
# and plain containers instead of allowing arbitrary pickled objects
# (`weights_only` requires PyTorch >= 1.13).
vae_state = torch.load('../vae_model.pth', map_location=device, weights_only=True)
vae_model.load_state_dict(vae_state)
vae_model.eval()  # evaluation mode: the model is only used for feature extraction below

print("VAE model loaded successfully!")

## 3. Extract Latent Features

In [None]:
# Build the song dataset and wrap it in a loader.
dataset = SongDataset(DATA_DIR, seq_len=SEQ_LEN)
# shuffle=False keeps a fixed sample order; the model-comparison cell reuses
# this loader and pairs its features with the `labels` extracted here.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

print(f"Dataset size: {len(dataset)} samples")

# Run the VAE over the whole loader to collect latent features and their labels.
latents, labels = extract_latent_features(
    vae_model,
    dataloader,
    device,
)
print(f"Latent features shape: {latents.shape}")
print(f"Labels shape: {labels.shape}")

## 4. Clustering Analysis

In [None]:
# Cluster the VAE latent vectors with K-Means, seeded for reproducibility.
cluster_labels = apply_kmeans(
    latents,
    n_clusters=K_CLUSTERS,
    random_state=RANDOM_SEED,
)

# Score the clustering; the helper receives the ground-truth labels,
# the cluster assignments and the latent vectors.
metrics = compute_all_metrics(labels, cluster_labels, latents)

print("\nClustering Metrics:")
# One row per metric: name padded with dots to 30 characters,
# value right-aligned in 10 characters with 4 decimals.
for metric_name, score in metrics.items():
    print(f"{metric_name:.<30} {score:>10.4f}")

## 5. t-SNE Visualization

In [None]:
# Project the latent vectors to 2D with t-SNE (seeded for reproducibility).
embedding = reduce_tsne(latents, n_components=2, random_state=RANDOM_SEED)

# Side-by-side scatter plots of the same embedding:
# left coloured by ground-truth label, right by K-Means cluster assignment.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# The ground-truth title assumes label 0 = English and 1 = Bangla, the same
# convention the sample-MFCC cell uses; 'coolwarm' maps low values to blue
# and high values to red.
panels = [
    (axes[0], labels, 'coolwarm', 'Ground Truth (Blue=English, Red=Bangla)'),
    (axes[1], cluster_labels, 'viridis', 'K-Means Clusters'),
]
for panel_ax, point_colors, panel_cmap, panel_title in panels:
    panel_ax.scatter(
        embedding[:, 0], embedding[:, 1],
        c=point_colors, cmap=panel_cmap, s=5, alpha=0.7,
    )
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('t-SNE Dimension 1')
    panel_ax.set_ylabel('t-SNE Dimension 2')

plt.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists; otherwise this cell fails on a fresh checkout.
tsne_path = Path('../results/latent_visualization/notebook_tsne.png')
tsne_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(tsne_path, dpi=150)
plt.show()

## 6. Compare with Other Models

In [None]:
# Load the pretrained baseline autoencoder on the same device as the VAE.
ae_model = SimpleAutoencoder(input_dim=INPUT_DIM, latent_dim=LATENT_DIM).to(device)
# The checkpoint is consumed as a state dict, so restrict unpickling to tensors
# and plain containers (`weights_only` requires PyTorch >= 1.13).
ae_state = torch.load('../autoencoder_model.pth', map_location=device, weights_only=True)
ae_model.load_state_dict(ae_state)
ae_model.eval()

# Extract autoencoder features with the same loader; its labels are discarded
# and the VAE run's `labels` are reused for scoring.
ae_latents, _ = extract_latent_features(ae_model, dataloader, device)
ae_clusters = apply_kmeans(ae_latents, n_clusters=K_CLUSTERS, random_state=RANDOM_SEED)
ae_metrics = {
    **compute_all_metrics(labels, ae_clusters, ae_latents),
    'Method': 'Autoencoder',
}

# Tag a *copy* of the VAE metrics instead of mutating `metrics` in place, so the
# dict produced by the clustering cell keeps only numeric values.
vae_metrics = {**metrics, 'Method': 'VAE'}

# Compare
compare_methods([vae_metrics, ae_metrics])

## 7. Investigate Sample MFCCs

In [None]:
# Pull one (features, label) pair from the dataset and show it as a heatmap.
sample_idx = 0
mfcc, label = dataset[sample_idx]

# Label 0 is shown as English, any other value as Bangla.
language_name = "English" if label == 0 else "Bangla"

mfcc_fig, mfcc_ax = plt.subplots(figsize=(12, 6))
# NOTE(review): the axis labels assume rows are MFCC coefficients and columns
# are time frames -- confirm against the layout SongDataset returns.
mfcc_image = mfcc_ax.imshow(mfcc.numpy(), aspect='auto', cmap='viridis', origin='lower')
mfcc_fig.colorbar(mfcc_image, ax=mfcc_ax, label='MFCC Value')
mfcc_ax.set_title(f'Sample MFCC (Language: {language_name})')
mfcc_ax.set_xlabel('Time Frame')
mfcc_ax.set_ylabel('MFCC Coefficient')
plt.tight_layout()
plt.show()

## 8. Summary

This notebook demonstrated:
- ✅ Dataset exploration and balance verification
- ✅ Loading pretrained VAE models
- ✅ Extracting latent features
- ✅ Clustering evaluation
- ✅ t-SNE visualization
- ✅ Model comparison
- ✅ Sample MFCC inspection

**Key Finding:** Language clustering from audio alone is challenging, with modest performance metrics indicating the need for improved features or larger datasets.