# Audio Cluster

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.cluster import OPTICS
import dash
from dash import dcc, html
import plotly.express as px
from dash.dependencies import Input, Output

In [None]:
# Input tables, given relative to the notebook's working directory.
voices_path = "../data/voices.csv"      # per-clip voice metadata
feature_path = "../data/features.csv"   # per-clip feature table

In [None]:
# Load both tables the same way: each is indexed by its "clip_id" column so
# that they can later be aligned on that index.
features, voices = (
    pd.read_csv(csv_path, index_col="clip_id")
    for csv_path in (feature_path, voices_path)
)

In [None]:
# Number of clips per value of the "voice_gender" column.
voices["voice_gender"].value_counts()

In [None]:
# Number of clips per value of the "voice_age_group" column.
voices["voice_age_group"].value_counts()

In [None]:
# Inner-join the two tables on their shared index, keeping only the clips
# that appear in both.
data = features.merge(voices, how="inner", left_index=True, right_index=True)

In [None]:
# Fold the 90 age group into the 80 group, so that the label 80 stands for
# ">= 80" from here on.
is_ninety = data["voice_age_group"].eq(90)
data.loc[is_ninety, "voice_age_group"] = 80

In [None]:
# Down-sample every age group to the size of the smallest one so that all
# groups contribute the same number of clips to the later steps.
min_size = data['voice_age_group'].value_counts().min()

# GroupBy.sample draws `min_size` rows per group and returns them with all
# original columns, including 'voice_age_group'. The previous
# groupby(...).apply(lambda x: x.sample(...)) relied on apply passing the
# grouping column to the lambda, which pandas 2.2 deprecates; without it the
# `voices.columns` selection below would fail with a KeyError.
balanced_sample = (data
                   .groupby('voice_age_group')
                   .sample(n=min_size, random_state=42))

# Split the balanced rows back into the two original column sets; from here
# on `features` and `voices` refer to the balanced subset only.
features = balanced_sample[features.columns]
voices = balanced_sample[voices.columns]

In [None]:
# Standardize every feature column, then wrap the resulting array back into
# a DataFrame that carries the original row index and column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
features_scaled = pd.DataFrame(
    scaled_values,
    columns=features.columns,
    index=features.index,
)

In [None]:
# UMAP configuration: project the standardized features down to three
# dimensions using Euclidean distance, printing progress while fitting.
umap_settings = {
    "n_components": 3,
    "n_neighbors": 100,
    "min_dist": 0.1,
    "metric": 'euclidean',
    "verbose": True,
}
reducer = UMAP(**umap_settings)

# One row of 3-D coordinates per row of `features_scaled`.
embedding = reducer.fit_transform(features_scaled)

In [None]:
# Cluster the standardized features (not the UMAP embedding) with OPTICS and
# take one label per row; scikit-learn labels noise points as -1.
#
# The former `xi=0.03` argument was dropped: scikit-learn only uses `xi` when
# cluster_method='xi', so with the 'dbscan' extraction it had no effect on
# the labels and merely suggested a tuning knob that does nothing.
clusters = OPTICS(
    min_samples=15,           # Higher than scikit-learn's default of 5
    metric='cosine',          # Keep for audio
    cluster_method='dbscan',  # More consistent than 'xi'
    eps=0.45,                 # Cut-off for the 'dbscan' extraction; try 0.3-0.6 range
    n_jobs=-1                 # Use all available CPU cores
).fit_predict(features_scaled)

In [None]:
# Assemble one row per clip: the three embedding coordinates, the cluster
# label, and (via a left join on the clip_id index) the voice metadata.
coordinates = pd.DataFrame(
    embedding[:, :3],
    columns=['x', 'y', 'z'],
    index=features_scaled.index,
).rename_axis('clip_id')
coordinates['cluster'] = clusters

results = coordinates.join(voices, how='left')
results.shape

In [None]:
# Number of clips assigned to each cluster label.
results["cluster"].value_counts()

In [None]:
app = dash.Dash(__name__)

# Build the 3-D scatter from the `results` table so the graph actually shows
# the embedding; previously the Graph had no figure and therefore no points
# to hover over. The cluster label is cast to str so that Plotly treats it as
# a category (one colour per cluster) instead of a continuous colour scale.
# NOTE(review): play_audio expects the audio filename as the third
# custom-data value of each point. SOURCE does not show which column holds
# it, so no `custom_data` is passed here yet — add
# `custom_data=[..., ..., <filename column>]` once that column is known.
scatter_fig = px.scatter_3d(
    results.assign(cluster=results['cluster'].astype(str)),
    x='x',
    y='y',
    z='z',
    color='cluster',
)

app.layout = html.Div([
    dcc.Graph(id='3d-scatter', figure=scatter_fig),
    html.Audio(id='audio-player', controls=True)
])

@app.callback(
    Output('audio-player', 'src'),
    [Input('3d-scatter', 'hoverData')]
)
def play_audio(hoverData):
    """Return the audio URL for the point currently hovered in the scatter.

    Args:
        hoverData: Value of the graph's ``hoverData`` property; falsy when
            nothing is hovered.

    Returns:
        ``"/audio_files/<filename>"`` where ``<filename>`` is the third
        custom-data value of the first hovered point, or ``""`` when nothing
        is hovered or the point carries no such value.
    """
    if not hoverData:
        return ""
    try:
        filename = hoverData['points'][0]['customdata'][2]
    except (KeyError, IndexError, TypeError):
        # The hovered point has no (or too little) custom data attached;
        # clear the player instead of raising inside the callback.
        return ""
    # NOTE(review): nothing visible in this file serves the /audio_files/
    # route — confirm the backend (or a Flask route on app.server) provides it.
    return f"/audio_files/{filename}"  # Serve from your backend

if __name__ == '__main__':
    app.run(debug=True)