# Audio Cluster

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import SpectralClustering, DBSCAN, KMeans

In [None]:
# File locations used by this notebook: two input CSVs and the CSV written
# at the end.
feature_path, voices_path, output_path = (
    "../data/features.csv",
    "../data/voices.csv",
    "../data/voice_cluster.csv",
)

In [None]:
# Load both tables the same way, using the clip_id column as the row index
# so the two frames can later be aligned on it.
features, voices = (
    pd.read_csv(csv_path, index_col="clip_id")
    for csv_path in (feature_path, voices_path)
)
# Last expression of the cell: shows (rows, columns) of each table.
features.shape, voices.shape

In [None]:
# Standardize every feature column with StandardScaler. fit_transform returns
# a bare array, so wrap it in a DataFrame again to keep the original row
# index and column labels.
scaler = StandardScaler()
features_scaled = pd.DataFrame(
    data=scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

In [None]:
# Join the voice records with their standardized features on the row index;
# the inner join keeps only index values present in both tables.
data = pd.merge(
    voices,
    features_scaled,
    how="inner",
    left_index=True,
    right_index=True,
)

# Fold age group 90 into age group 80.
is_age_90 = data["voice_age_group"] == 90
data.loc[is_age_90, "voice_age_group"] = 80

# Last expression of the cell: shows (rows, columns) of the merged table.
data.shape

In [None]:
# Size of the smallest age group; it bounds how many rows can be drawn from
# every group without replacement.
min_count = data["voice_age_group"].value_counts().min()

# Balance the groups by drawing the same number of clips from each one.
# Keep the existing cap of 100 per group, but never request more rows than
# the smallest group holds: sampling without replacement would raise
# ValueError in that case.
samples_per_group = min(100, min_count)

# GroupBy.sample keeps every column (including the grouping column) and the
# original row index, so no apply/lambda is needed. The fixed seed makes the
# draw reproducible, matching the random_state=42 used for KMeans and t-SNE
# later in this notebook.
data = data.groupby("voice_age_group").sample(
    n=samples_per_group,
    random_state=42,
)

# Verify the balance
print(f"minimum count per group: {min_count}")
print(data["voice_age_group"].value_counts())

# Restrict the feature matrix to the sampled rows (same feature columns).
features_scaled = data[features_scaled.columns]

In [None]:
# Partition the feature rows into 10 clusters; the fixed seed keeps the
# cluster labels repeatable between runs. `cluster` holds one label per row
# of features_scaled.
kmeans = KMeans(
    random_state=42,
    n_clusters=10,
)
cluster = kmeans.fit_predict(features_scaled)

In [None]:
# Embed the feature rows into three t-SNE dimensions (seeded for
# repeatability). `embeddings` has one row per row of features_scaled and
# one column per embedding dimension.
tsne = TSNE(
    perplexity=20,
    n_components=3,
    random_state=42,
)
embeddings = tsne.fit_transform(features_scaled)

In [None]:
# Put the cluster label in the first column of the table.
data.insert(loc=0, column="cluster", value=cluster)

# Append one column per embedding dimension, named x, y and z in order.
for axis, axis_name in enumerate(("x", "y", "z")):
    data[axis_name] = embeddings[:, axis]

In [None]:
# Write the table (with the cluster and x/y/z columns) to disk; the row
# index is written under the header "clip_id".
data.to_csv(
    path_or_buf=output_path,
    index_label="clip_id",
)