.\.venv\Scripts\Activate.ps1      

Cell 0: Setup and Imports

In [None]:

import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
from typing import Dict, List, Optional, Tuple, Union
import random
import json
import ast
import re
from tqdm import tqdm
from fastdtw import fastdtw
from sklearn.cluster import AgglomerativeClustering
import IPython.display as ipd
# Directory appended to the import search path so that the
# `from carnatic_functions import ...` statement right below can resolve.
# NOTE(review): absolute, machine-specific Windows path — adjust per machine.
carnatic_functions_path = r"C:\Desktop\Python\Audio Signal Processing\CarnaticAnnotater"
sys.path.append(carnatic_functions_path)
from carnatic_functions import (
    # --- CONFIGURATION ---
    AUDIO_CONFIG,
    CREPE_CONFIG,
    CLUSTERING_CONFIG,
    CARNATIC_RATIOS,

    # --- UTILITY FUNCTIONS ---
    get_data_paths,
    get_closest_note,
    get_closest_frequency,
    clean_np_float_list,
    interpolate_list,

    # --- PHASE 1: DATA PREP & INITIAL CLUSTERING ---
    process_audio_directory,
    add_normalized_frequency_column,
    perform_multistage_clustering,

    # --- PHASE 2: SECONDARY CLUSTERING ---
    recluster_with_dtw,
    recluster_with_pca,

    # --- ANALYSIS & VISUALIZATION ---
    evaluate_clustering_results,
    label_cluster,
    cluster_curve,
    play_and_plot_cluster,
    play_and_plot_secondary_cluster
)
# Input locations for this notebook (absolute, machine-specific Windows paths).
# audio_dir is the first argument of every carnatic_functions call in the
# cells below.
audio_dir = r"C:\Desktop\Python\Audio Signal Processing\CarnaticAnnotater\Mayamalavagowlai_Vocals"
# NOTE(review): carva_csv_path is assigned here but is not passed to any call
# in the cells visible in this notebook export.
carva_csv_path = r"C:\Desktop\Python\Audio Signal Processing\CarnaticAnnotater\VocalAnnotator\data\Mayamalavagowlai\carva_Mayamalavagowlai.csv"
# Reached only if every import above succeeded.
print("✅ All libraries and functions imported successfully!")
print("🎵 Ready to analyze Carnatic music!")


Cell 1: Config

In [None]:
# Parameter dict handed to perform_multistage_clustering and recluster_with_pca
# in later cells. How each key is interpreted is decided inside
# carnatic_functions, which is not shown in this notebook.
# NOTE(review): the commented-out recluster_with_pca copy further down reads
# "interpolation_size", "pca_similarity_threshold" and "pca_components" via
# config.get(...). Only "pca_components" is defined here, so that version
# would fall back to its defaults (50 and 0.7) for the other two and would
# never read the "*_secondary" keys — confirm against the imported
# implementation.
CONFIG = {
    "initial_window_size": 60,
    "decay_size": 2,
    "min_window_size": 20,
    "hop_factor": 5,
    "outlier_threshold": 2,
    "similarity_threshold": 0.3,
    "pca_components": 15,
    "second_phase_window_size": 20,
    "similarity_threshold_secondary": 0.7,
    "pca_components_secondary": 10
}

Cell 2: Crepe Processing

In [None]:
# Run the two "PHASE 1" data-preparation helpers on audio_dir, in this order.
# Neither return value is captured.
process_audio_directory(audio_dir)
add_normalized_frequency_column(audio_dir)

Cell 3: Dynamic Decay Clustering (Round 1)

In [None]:
# Announce the step, then run the multi-stage clustering helper on audio_dir
# with the CONFIG dict defined in Cell 1. The return value is not captured.
print("🔍 STEP 3: Performing multi-stage clustering...")
perform_multistage_clustering(audio_dir, CONFIG)

In [None]:
# def cluster_curve(audio_dir: str, song_index: int,
#                   carva_path: Optional[str] = None,
#                   clusters_to_plot: Optional[Union[int, List[int]]] = None):

#     raaga_name = Path(audio_dir).name
#     if raaga_name.endswith('_Vocals'):
#         raaga_name = raaga_name.replace('_Vocals', '')

#     paths = get_data_paths(raaga_name)

#     try:
#         master_df = pd.read_csv(paths["master_csv"])

#         if carva_path:
#             carva_df = pd.read_csv(carva_path)
#         else:
#             carva_df = pd.read_csv(paths["carva_csv"])

#     except FileNotFoundError as e:
#         print(f"❌ Required data file not found: {e}")
#         return

#     song_data = master_df[master_df["Index"] == song_index]
#     clustered_segments = carva_df[carva_df["Index"] == song_index]

#     if song_data.empty:
#         print(f"⚠️ No data found for song with index {song_index}.")
#         return

#     if clusters_to_plot is not None:
#         if isinstance(clusters_to_plot, int):
#             clusters_to_plot = [clusters_to_plot]
#         label_col = 'Second Labels' if 'Second Labels' in clustered_segments.columns else 'Label'
#         clustered_segments = clustered_segments[clustered_segments[label_col].isin(clusters_to_plot)]

#     full_frequency = song_data["Frequency"].values
#     full_time = np.arange(len(full_frequency))
#     tonic_note = song_data["Tonic"].iloc[0]
#     song_name = song_data["SongName"].iloc[0]

#     label_col = 'Second Labels' if 'Second Labels' in clustered_segments.columns else 'Label'
#     unique_labels = sorted(clustered_segments[label_col].unique())
#     cmap = plt.get_cmap('tab20', len(unique_labels))
#     label_to_color = {label: cmap(i) for i, label in enumerate(unique_labels)}

#     plt.style.use('dark_background')
#     plt.figure(figsize=(18, 8))

#     plt.plot(full_time, full_frequency, color='gray', alpha=0.5)

#     for _, row in clustered_segments.iterrows():
#         start = int(row['StartFrame'])
#         end = int(row['EndFrame'])
#         label = row[label_col]
#         color = label_to_color.get(label, 'black')
        
#         plt.plot(full_time[start:end], full_frequency[start:end],
#                  color=color, linewidth=2)

#     carnatic_frequencies = {note: librosa.note_to_hz(tonic_note) * ratio for note, ratio in CARNATIC_RATIOS.items()}
#     valid_freqs = song_data['Frequency'].dropna()
#     if not valid_freqs.empty:
#         min_freq = valid_freqs.min()
#         max_freq = valid_freqs.max()
#         for note, freq in carnatic_frequencies.items():
#             if min_freq <= freq <= max_freq:
#                 plt.axhline(y=freq, color='orange', linestyle='--', linewidth=0.8)
#                 plt.text(len(full_frequency) * 1.005, freq, note, color='orange', fontsize=9, verticalalignment='center')
    
#     # --- LEGEND CODE REMOVED ---

#     plt.title(f"Clustered F0 Contour for Song: '{song_name}' (Tonic: {tonic_note})")
#     plt.xlabel("Time (frames)")
#     plt.ylabel("Frequency (Hz)")
#     plt.ylim(valid_freqs.min() - 10, valid_freqs.max() + 10)
#     plt.grid(alpha=0.3)
#     plt.tight_layout()
#     plt.show()

# def play_and_plot_cluster(audio_dir: str, cluster_number: int, song_index: int, sr: int = 44100):
#     """
#     Plots segments of a cluster from a specific song in context, then shows a second 
#     plot with only those segments overlaid for similarity, and plays their audio.
#     """
#     raaga_name = Path(audio_dir).name
#     if raaga_name.endswith('_Vocals'):
#         raaga_name = raaga_name.replace('_Vocals', '')

#     paths = get_data_paths(raaga_name)
    
#     try:
#         master_df = pd.read_csv(paths["master_csv"])
#         carva_df = pd.read_csv(paths["carva_csv"])
#     except FileNotFoundError as e:
#         print(f"❌ Required data file not found: {e}")
#         return

#     label_col = 'Second Labels' if 'Second Labels' in carva_df.columns else 'Label'
    
#     # Filter for segments from the specific cluster AND song
#     cluster_in_song_segments = carva_df[
#         (carva_df[label_col] == cluster_number) &
#         (carva_df['Index'] == song_index)
#     ]
    
#     if cluster_in_song_segments.empty:
#         print(f"⚠️ No segments found for cluster {cluster_number} in song index {song_index}.")
#         return

#     song_data = master_df[master_df["Index"] == song_index].reset_index(drop=True)
#     if song_data.empty:
#         print(f"⚠️ No data found for song index {song_index} in the master CSV.")
#         return

#     print(f"🔍 Analyzing Cluster {cluster_number} in Song Index {song_index}...")

#     # PLOT 1: Segments in their original song context
#     plt.style.use('dark_background')
#     plt.figure(figsize=(18, 8))
#     ax1 = plt.gca()

#     full_frequency = song_data["Frequency"].values
#     full_time = np.arange(len(full_frequency))
#     tonic_note = song_data["Tonic"].iloc[0]
#     song_name = song_data["SongName"].iloc[0]

#     ax1.plot(full_time, full_frequency, color='gray', alpha=0.5, label='F0 Contour')
    
#     random.seed(cluster_number)
#     cluster_color = '#%06x' % random.randint(0, 0xFFFFFF)
    
#     for i, (_, row) in enumerate(cluster_in_song_segments.iterrows()):
#         start_frame = int(row['StartFrame'])
#         end_frame = int(row['EndFrame'])
#         ax1.plot(full_time[start_frame:end_frame], 
#                  full_frequency[start_frame:end_frame], 
#                  color=cluster_color, linewidth=2, 
#                  label=f"Cluster {cluster_number}" if i == 0 else "")
            
#     carnatic_frequencies = {note: librosa.note_to_hz(tonic_note) * ratio for note, ratio in CARNATIC_RATIOS.items()}
#     valid_freqs = song_data['Frequency'].dropna()
#     if not valid_freqs.empty:
#         min_freq, max_freq = valid_freqs.min(), valid_freqs.max()
#         for note, freq in carnatic_frequencies.items():
#             if min_freq <= freq <= max_freq:
#                 ax1.axhline(y=freq, color='orange', linestyle='--', linewidth=0.8)
#                 ax1.text(ax1.get_xlim()[1] * 1.005, freq, note, color='orange', fontsize=9, verticalalignment='center')
    
#     ax1.set_title(f"Context Plot for Cluster {cluster_number} in Song: '{song_name}'")
#     ax1.set_xlabel("Time (frames)")
#     ax1.set_ylabel("Frequency (Hz)")
#     ax1.legend(loc='upper right')
#     ax1.grid(alpha=0.3)
#     plt.tight_layout()
#     plt.show()

#     # --- MODIFIED PLOT 2 ---
#     # PLOT 2: Segments from THIS SONG ONLY overlaid for similarity
#     print(f"\n📈 Plotting the {len(cluster_in_song_segments)} segments from Cluster {cluster_number} in this song, overlaid for comparison...")
    
#     plt.style.use('dark_background')
#     plt.figure(figsize=(12, 7))
#     ax2 = plt.gca()
    
#     all_freqs_in_cluster = []

#     # Iterate over the segments from THIS SONG ONLY
#     for _, row in cluster_in_song_segments.iterrows():
#         seg_start = int(row['StartFrame'])
#         seg_end = int(row['EndFrame'])
        
#         # We can use the 'song_data' dataframe directly since we already filtered for this song
#         segment_freq_data = song_data['Frequency'].values[seg_start:seg_end]
        
#         if len(segment_freq_data) > 0:
#             ax2.plot(np.arange(len(segment_freq_data)), segment_freq_data, color='cyan', alpha=0.4, linewidth=1.5)
#             all_freqs_in_cluster.extend(segment_freq_data)
            
#     if all_freqs_in_cluster:
#         valid_cluster_freqs = [f for f in all_freqs_in_cluster if pd.notna(f)]
#         if valid_cluster_freqs:
#             min_freq_cluster, max_freq_cluster = min(valid_cluster_freqs), max(valid_cluster_freqs)
#             for note, freq in carnatic_frequencies.items():
#                 if min_freq_cluster <= freq <= max_freq_cluster:
#                     ax2.axhline(y=freq, color='orange', linestyle='--', linewidth=0.8)
#                     ax2.text(ax2.get_xlim()[1] * 1.005, freq, note, color='orange', fontsize=9, verticalalignment='center')
#             ax2.set_ylim(min_freq_cluster - 10, max_freq_cluster + 10)

#     ax2.set_title(f"Shape Comparison of Segments in Cluster {cluster_number} from Song '{song_name}'")
#     ax2.set_xlabel("Time (frames within segment)")
#     ax2.set_ylabel("Frequency (Hz)")
#     ax2.grid(alpha=0.3)
#     plt.tight_layout()
#     plt.show()

#     # AUDIO PLAYBACK (unchanged)
#     print("\n🎧 Playing segments from the specified song:")
#     for _, row in cluster_in_song_segments.iterrows():
#         start_frame = int(row['StartFrame'])
#         end_frame = int(row['EndFrame'])
#         audio_path = row['AudioPath']
        
#         print(f"   - Frames: {start_frame}-{end_frame}")
#         song_time_data = song_data['Time'].reset_index(drop=True)
#         start_time = song_time_data.get(start_frame, 0)
#         end_time = song_time_data.get(end_frame, song_time_data.iloc[-1])
        
#         try:
#             audio, _ = librosa.load(audio_path, sr=sr)
#             start_sample = int(start_time * sr)
#             end_sample = int(end_time * sr)
#             if end_sample > start_sample:
#                 ipd.display(ipd.Audio(audio[start_sample:end_sample], rate=sr))
#             else:
#                 print("   ⚠️ Invalid segment length, skipping playback.")
#         except Exception as e:
#             print(f"   ⚠️ Could not load or play audio from {audio_path}: {e}")

#     print("---------------------------------")

# def evaluate_clustering_results(audio_dir: str, song_idx: int, carva_path: str):
#     """
#     Evaluates and visualizes clustering results for a specific song.
#     """
#     raaga_name = Path(audio_dir).name
#     if raaga_name.endswith('_Vocals'):
#         raaga_name = raaga_name.replace('_Vocals', '')
        
#     paths = get_data_paths(raaga_name)
    
#     try:
#         master_df = pd.read_csv(paths["master_csv"])
#         carva_df = pd.read_csv(carva_path)
#     except FileNotFoundError as e:
#         print(f"❌ Required data file not found: {e}.")
#         return

#     # Filter data for the specific song
#     song_master_df = master_df[master_df["Index"] == song_idx]
    
#     if song_master_df.empty:
#         print(f"⚠️ No data found for song index {song_idx} in the master file.")
#         return
        
#     song_name = song_master_df["SongName"].iloc[0]

#     print(f"📊 Evaluating clustering for song: '{song_name}' (Index: {song_idx})")

#     # 1. Percentage of song clustered (Corrected for overlap)
#     total_song_frames = len(song_master_df)
    
#     song_carva_df = carva_df[carva_df['Index'] == song_idx]

#     if song_carva_df.empty:
#         total_clustered_frames = 0
#     else:
#         is_clustered = np.zeros(total_song_frames, dtype=bool)
#         for _, row in song_carva_df.iterrows():
#             start = int(row['StartFrame'])
#             end = int(row['EndFrame'])
#             if end > start and end <= total_song_frames: # Added boundary check
#                 is_clustered[start:end] = True
        
#         total_clustered_frames = np.sum(is_clustered)

#     percentage_clustered = (total_clustered_frames / total_song_frames) * 100 if total_song_frames > 0 else 0
#     print(f"   • Percentage of song clustered: {percentage_clustered:.2f}%")

#     # 2. Total number of clusters
#     if song_carva_df.empty:
#         num_clusters = 0
#     else:
#         num_clusters = song_carva_df['Label'].nunique()
#     print(f"   • Total number of clusters found: {num_clusters}")

#     # 3. Plotting the metrics
#     if not song_carva_df.empty:
#         song_carva_df['SegmentLength'] = song_carva_df['EndFrame'] - song_carva_df['StartFrame']
        
#         # Plot 1: Histogram of segment lengths
#         plt.style.use('dark_background')
#         plt.figure(figsize=(10, 6))
#         cluster_lengths = song_carva_df.groupby('Label')['SegmentLength'].first()
#         if not cluster_lengths.empty:
#             plt.hist(cluster_lengths, bins=range(cluster_lengths.min(), cluster_lengths.max() + 2), edgecolor='white')
#             plt.title(f"Distribution of Cluster Segment Lengths for '{song_name}'")
#             plt.xlabel("Segment Length (frames)")
#             plt.ylabel("Number of Clusters")
#             plt.grid(alpha=0.3)
#             plt.show()

#         # --- MODIFIED PLOT ---
#         # Plot 2: Bar chart showing the number of elements in each cluster
#         plt.style.use('dark_background')
#         plt.figure(figsize=(12, 8))
        
#         cluster_sizes = song_carva_df.groupby('Label')['Index'].count().sort_values(ascending=False)
        
#         cluster_sizes.plot(kind='bar', edgecolor='white', alpha=0.8, color='cyan')
        
#         plt.title(f"Number of Segments per Cluster for '{song_name}'")
#         plt.xlabel("Cluster Label")
#         plt.ylabel("Number of Segments")
#         plt.xticks(rotation=45, ha='right')
#         plt.grid(axis='y', alpha=0.3)
#         plt.tight_layout() # Adjust layout to prevent labels from overlapping
#         plt.show()
#     else:
#         print("   • No clustered segments to plot.")

# def clean_np_float_list(seg_str: str) -> np.ndarray:
#     """Convert a stringified list with np.float64 entries into a proper list of floats."""
#     cleaned = re.sub(r'np\\.float64\\(([^)]+)\\)', r'\\1', seg_str)
#     return np.array(ast.literal_eval(cleaned), dtype=float)
# def interpolate_list(lst, target_len: int) -> list:
#     """
#     Corrected version: Interpolates a list or NumPy array to a target length,
#     and correctly handles empty inputs.
#     """
#     # --- FIX: Explicitly check the length of the list/array ---
#     if lst is None or len(lst) == 0:
#         return [0.0] * target_len
#     # -----------------------------------------------------------
    
#     original_len = len(lst)
#     return list(np.interp(np.linspace(0, original_len - 1, target_len),
#                           np.arange(original_len), lst))
# def recluster_with_dtw(audio_dir: str, config: Dict, song_index: Optional[int] = None):
#     """
#     Takes segments from the carva file, interpolates them to a fixed length,
#     and then re-clusters them using an all-vs-all DTW comparison.
#     """
#     raaga_name = Path(audio_dir).name
#     if raaga_name.endswith('_Vocals'):
#         raaga_name = raaga_name.replace('_Vocals', '')
        
#     paths = get_data_paths(raaga_name)
#     carva_path = paths["carva_csv"]

#     print(f"🔬 Starting DTW re-clustering for '{raaga_name}'...")
    
#     try:
#         carva_df = pd.read_csv(carva_path)
#     except FileNotFoundError:
#         print(f"❌ ERROR: File not found at {carva_path}")
#         return

#     # 1. Load Data & Filter
#     if song_index is not None:
#         print(f" targeting song with Index: {song_index}")
#         target_df = carva_df[carva_df['Index'] == song_index].copy()
#     else:
#         print(" targeting all songs.")
#         target_df = carva_df.copy()
        
#     # --- DEBUG ---
#     print(f"DEBUG: Found {len(carva_df)} total rows in the carva file.")
#     if song_index is not None:
#         print(f"DEBUG: Found {len(target_df)} segments belonging to song index {song_index}.")
#     # -------------

#     if target_df.empty:
#         print("⚠️ No segments found for the specified criteria.")
#         return
        
#     interpolation_size = config.get("interpolation_size", 50)
#     similarity_threshold = config.get("dtw_similarity_threshold", 1.5)

#     # 2. Prepare Segments: Parse and Interpolate
#     interpolated_segments = []
#     valid_original_indices = target_df.index.tolist() 
    
#     print("DEBUG: Starting to parse and interpolate segments...")
#     for i, seg_str in target_df['SegmentList'].items(): # Use .items() to get original index i
#         try:
#             segment = clean_np_float_list(seg_str)
#             if len(segment) > 0:
#                 interpolated = interpolate_list(segment, interpolation_size)
#                 interpolated_segments.append(np.array(interpolated))
#                 # --- DEBUG ---
#                 print(f"  Row {i}: ✅ Successfully interpolated segment.")
#                 # -------------
#             else:
#                 interpolated_segments.append(None)
#         except (ValueError, SyntaxError) as e:
#             interpolated_segments.append(None)
#             # --- DEBUG ---
#             print(f"  Row {i}: ❌ FAILED to parse segment. Error: {e}")
#             # -------------
    
#     final_segments = [seg for seg in interpolated_segments if seg is not None]
#     final_indices = [idx for i, idx in enumerate(valid_original_indices) if interpolated_segments[i] is not None]

#     print(f"Found {len(final_segments)} valid segments to re-cluster.")

#     if len(final_segments) < 2:
#         print("⚠️ Not enough valid segments (< 2) to perform DTW clustering.")
#         return

#     # 3. DTW Distance Matrix Calculation
#     num_segments = len(final_segments)
#     dist_matrix = np.zeros((num_segments, num_segments))
    
#     # --- DEBUG ---
#     print(f"DEBUG: Starting DTW comparison for {num_segments} segments.")
#     # -------------

#     for i in tqdm(range(num_segments), desc="Calculating DTW Matrix"):
#         for j in range(i + 1, num_segments):
#             dist, _ = fastdtw(final_segments[i], final_segments[j])
#             dist_matrix[i, j] = dist
#             dist_matrix[j, i] = dist
            
#     # (The rest of the function continues as before)
#     print("🧬 Performing hierarchical clustering...")
#     clustering = AgglomerativeClustering(
#         n_clusters=None,
#         distance_threshold=similarity_threshold,
#         metric='precomputed',
#         linkage='average'
#     )
#     labels = clustering.fit_predict(dist_matrix)

#     if 'DTW_Label' not in carva_df.columns:
#         carva_df['DTW_Label'] = -1
    
#     carva_df.loc[final_indices, 'DTW_Label'] = labels
#     carva_df.to_csv(carva_path, index=False)
    
#     print(f"\n✅ DTW re-clustering finished! Found {len(set(labels))} new clusters.")
#     print(f"💾 Updated {carva_path} with 'DTW_Label' column.")
# def recluster_with_pca(audio_dir: str, config: Dict, song_index: Optional[int] = None):
#     """
#     Takes segments from the carva file, interpolates them, and re-clusters them
#     using the fast PCA method instead of DTW.
#     """
#     raaga_name = Path(audio_dir).name
#     if raaga_name.endswith('_Vocals'):
#         raaga_name = raaga_name.replace('_Vocals', '')
        
#     paths = get_data_paths(raaga_name)
#     carva_path = paths["carva_csv"]

#     print(f"🔬 Starting PCA re-clustering for '{raaga_name}'...")
    
#     try:
#         carva_df = pd.read_csv(carva_path)
#     except FileNotFoundError:
#         print(f"❌ ERROR: File not found at {carva_path}")
#         return

#     # 1. Load Data & Filter
#     if song_index is not None:
#         print(f" targeting song with Index: {song_index}")
#         target_df = carva_df[carva_df['Index'] == song_index].copy()
#     else:
#         print(" targeting all songs.")
#         target_df = carva_df.copy()

#     if target_df.empty:
#         print("⚠️ No segments found for the specified criteria.")
#         return
        
#     interpolation_size = config.get("interpolation_size", 50)
#     similarity_threshold = config.get("pca_similarity_threshold", 0.7)
#     pca_components = config.get("pca_components", 10)

#     # 2. Prepare Segments: Parse and Interpolate
#     interpolated_segments = []
#     valid_original_indices = target_df.index.tolist() 
    
#     for seg_str in target_df['SegmentList']:
#         try:
#             segment = clean_np_float_list(seg_str)
#             if len(segment) > 0:
#                 interpolated = interpolate_list(segment, interpolation_size)
#                 interpolated_segments.append(np.array(interpolated))
#             else:
#                 interpolated_segments.append(None)
#         except (ValueError, SyntaxError):
#             interpolated_segments.append(None)
    
#     final_segments = [seg for seg in interpolated_segments if seg is not None]
#     final_indices = [idx for i, idx in enumerate(valid_original_indices) if interpolated_segments[i] is not None]

#     print(f"Found {len(final_segments)} valid segments to re-cluster.")

#     if len(final_segments) < 2:
#         print("⚠️ Not enough valid segments (< 2) to perform PCA clustering.")
#         return

#     # 3. Perform PCA and Calculate Distance Matrix
#     print("🤖 Performing PCA transformation...")
#     X_abs = np.stack(final_segments)
#     X_shape = X_abs - np.mean(X_abs, axis=1, keepdims=True)
#     X_combined = np.concatenate([X_abs, X_shape], axis=1)

#     pca = PCA(n_components=min(pca_components, X_combined.shape[1]))
#     X_pca = pca.fit_transform(X_combined)
#     dist_matrix = squareform(pdist(X_pca, metric='euclidean'))
            
#     # 4. Perform Clustering
#     print("🧬 Performing hierarchical clustering...")
#     clustering = AgglomerativeClustering(
#         n_clusters=None,
#         distance_threshold=similarity_threshold,
#         metric='precomputed',
#         linkage='average'
#     )
#     labels = clustering.fit_predict(dist_matrix)

#     # 5. Update the carva.csv file
#     if 'PCA_Label' not in carva_df.columns:
#         carva_df['PCA_Label'] = -1
    
#     carva_df.loc[final_indices, 'PCA_Label'] = labels
#     carva_df.to_csv(carva_path, index=False)
    
#     print(f"\n✅ PCA re-clustering finished! Found {len(set(labels))} new clusters.")
#     print(f"💾 Updated {carva_path} with 'PCA_Label' column.")

# def play_and_plot_secondary_cluster(audio_dir: str, cluster_number: int, song_index: int, sr: int = 44100):
#     """
#     Plots segments of a SECONDARY cluster from a specific song in context, shows a 
#     second plot with only those segments overlaid, and plays their audio.
#     """
#     raaga_name = Path(audio_dir).name
#     if raaga_name.endswith('_Vocals'):
#         raaga_name = raaga_name.replace('_Vocals', '')

#     paths = get_data_paths(raaga_name)
    
#     try:
#         master_df = pd.read_csv(paths["master_csv"])
#         carva_df = pd.read_csv(paths["carva_csv"])
#     except FileNotFoundError as e:
#         print(f"❌ Required data file not found: {e}")
#         return

#     # --- KEY CHANGE: Automatically find the secondary label column ---
#     if 'DTW_Label' in carva_df.columns:
#         label_col = 'DTW_Label'
#     elif 'PCA_Label' in carva_df.columns:
#         label_col = 'PCA_Label'
#     elif 'Second Labels' in carva_df.columns:
#         label_col = 'Second Labels'
#     else:
#         print("❌ Error: No secondary label column ('DTW_Label', 'PCA_Label', or 'Second Labels') found in carva.csv.")
#         return
    
#     print(f"INFO: Using secondary cluster column: '{label_col}'")
#     # -----------------------------------------------------------------
    
#     # Filter for segments from the specific secondary cluster AND song
#     cluster_in_song_segments = carva_df[
#         (carva_df[label_col] == cluster_number) &
#         (carva_df['Index'] == song_index)
#     ]
    
#     if cluster_in_song_segments.empty:
#         print(f"⚠️ No segments found for secondary cluster {cluster_number} in song index {song_index}.")
#         return

#     song_data = master_df[master_df["Index"] == song_index].reset_index(drop=True)
#     if song_data.empty:
#         print(f"⚠️ No data found for song index {song_index} in the master CSV.")
#         return

#     print(f"🔍 Analyzing Secondary Cluster {cluster_number} in Song Index {song_index}...")

#     # (The rest of the plotting and playback logic is the same)

#     # PLOT 1: Segments in their original song context
#     plt.style.use('dark_background')
#     plt.figure(figsize=(18, 8))
#     ax1 = plt.gca()
#     full_frequency = song_data["Frequency"].values
#     x_axis_time = song_data["Time"].values # Using real time for x-axis
#     tonic_note = song_data["Tonic"].iloc[0]
#     song_name = song_data["SongName"].iloc[0]
#     ax1.plot(x_axis_time, full_frequency, color='gray', alpha=0.5, label='F0 Contour')
#     random.seed(cluster_number)
#     cluster_color = '#%06x' % random.randint(0, 0xFFFFFF)
#     for i, (_, row) in enumerate(cluster_in_song_segments.iterrows()):
#         start_frame, end_frame = int(row['StartFrame']), int(row['EndFrame'])
#         ax1.plot(x_axis_time[start_frame:end_frame], full_frequency[start_frame:end_frame], 
#                  color=cluster_color, linewidth=2, label=f"Cluster {cluster_number}" if i == 0 else "")
#     carnatic_frequencies = {note: librosa.note_to_hz(tonic_note) * ratio for note, ratio in CARNATIC_RATIOS.items()}
#     valid_freqs = song_data['Frequency'].dropna()
#     if not valid_freqs.empty:
#         min_freq, max_freq = valid_freqs.min(), valid_freqs.max()
#         plot_end_time = ax1.get_xlim()[1]
#         for note, freq in carnatic_frequencies.items():
#             if min_freq <= freq <= max_freq:
#                 ax1.axhline(y=freq, color='orange', linestyle='--', linewidth=0.8)
#                 ax1.text(plot_end_time * 1.005, freq, note, color='orange', fontsize=9, verticalalignment='center')
#     ax1.set_title(f"Context Plot for Secondary Cluster {cluster_number} in Song: '{song_name}'")
#     ax1.set_xlabel("Time (seconds)"), ax1.set_ylabel("Frequency (Hz)"), ax1.legend(loc='upper right'), ax1.grid(alpha=0.3)
#     plt.tight_layout(), plt.show()

#     # PLOT 2: Segments from THIS SONG ONLY overlaid for similarity
#     print(f"\n📈 Plotting the {len(cluster_in_song_segments)} segments from Secondary Cluster {cluster_number} in this song, overlaid for comparison...")
#     plt.style.use('dark_background'), plt.figure(figsize=(12, 7)), (ax2 := plt.gca())
#     all_freqs_in_cluster = []
#     for _, row in cluster_in_song_segments.iterrows():
#         start_frame, end_frame = int(row['StartFrame']), int(row['EndFrame'])
#         segment_freq_data = song_data['Frequency'].values[start_frame:end_frame]
#         if len(segment_freq_data) > 0:
#             ax2.plot(np.arange(len(segment_freq_data)), segment_freq_data, color='cyan', alpha=0.4, linewidth=1.5)
#             all_freqs_in_cluster.extend(segment_freq_data)
#     if all_freqs_in_cluster:
#         valid_cluster_freqs = [f for f in all_freqs_in_cluster if pd.notna(f)]
#         if valid_cluster_freqs:
#             min_freq_cluster, max_freq_cluster = min(valid_cluster_freqs), max(valid_cluster_freqs)
#             for note, freq in carnatic_frequencies.items():
#                 if min_freq_cluster <= freq <= max_freq_cluster:
#                     ax2.axhline(y=freq, color='orange', linestyle='--', linewidth=0.8)
#                     ax2.text(ax2.get_xlim()[1] * 1.005, freq, note, color='orange', fontsize=9, verticalalignment='center')
#             ax2.set_ylim(min_freq_cluster - 10, max_freq_cluster + 10)
#     ax2.set_title(f"Shape Comparison of Segments in Secondary Cluster {cluster_number} from Song '{song_name}'")
#     ax2.set_xlabel("Time (frames within segment)"), ax2.set_ylabel("Frequency (Hz)"), ax2.grid(alpha=0.3)
#     plt.tight_layout(), plt.show()

#     # AUDIO PLAYBACK
#     print("\n🎧 Playing segments from the specified song:")
#     song_time_data = song_data['Time'].values
#     audio_path = cluster_in_song_segments['AudioPath'].iloc[0]
#     try:
#         audio, native_sr = librosa.load(audio_path, sr=None)
#         if sr and native_sr != sr:
#             audio = librosa.resample(y=audio, orig_sr=native_sr, target_sr=sr)
#     except Exception as e:
#         print(f"⚠️ Could not load audio from {audio_path}: {e}")
#         return
#     for _, row in cluster_in_song_segments.iterrows():
#         start_frame, end_frame = int(row['StartFrame']), int(row['EndFrame'])
#         if start_frame >= len(song_time_data) or end_frame > len(song_time_data):
#             print(f"   - Frames: {start_frame}-{end_frame}: ⚠️ Invalid frame indices, skipping.")
#             continue
#         start_time, end_time = song_time_data[start_frame], song_time_data[end_frame - 1]
#         print(f"   - Frames: {start_frame}-{end_frame} -> Playing from {start_time:.2f}s to {end_time:.2f}s")
#         start_sample, end_sample = int(start_time * sr), int(end_time * sr)
#         if end_sample > start_sample:
#             ipd.display(ipd.Audio(audio[start_sample:end_sample], rate=sr))
#         else:
#             print("   ⚠️ Calculated segment duration is zero, skipping.")
#     print("---------------------------------")


Cell 4: Round 1 Evaluation & Results

In [None]:
# Song to inspect; the commented-out helper copies above compare this value
# against the "Index" column of the master/carva CSVs.
song_idx=5
# NOTE(review): the commented-out evaluate_clustering_results copy above has a
# required third parameter, carva_path. If the imported version has the same
# signature this call raises TypeError — confirm, and pass carva_csv_path if so.
evaluate_clustering_results(audio_dir, song_idx)  # evaluates the song selected by song_idx
cluster_curve(audio_dir, song_idx)

In [None]:
# In the commented-out copy of this helper the positional parameters are
# (audio_dir, cluster_number, song_index): cluster 13 within song index 0.
# NOTE(review): the previous cell inspects song_idx = 5, this one song 0 —
# confirm that is intended.
play_and_plot_cluster(audio_dir, 13, 0)

In [None]:
# Secondary clustering with the CONFIG dict from Cell 1. No song_index is
# given; in the commented-out copy of this helper that means segments from all
# songs are re-clustered and the carva CSV is rewritten with a 'PCA_Label'
# column — presumably the imported version behaves the same; verify.
recluster_with_pca(audio_dir, CONFIG)

In [None]:
# NOTE(review): the commented-out copy of this helper declares song_index as a
# required parameter after cluster_number. If the imported version has the same
# signature this two-argument call raises TypeError — confirm and add the song
# index if needed.
play_and_plot_secondary_cluster(audio_dir, 26)

In [None]:
from collections import Counter

# Default value of auto_label_cluster's `config` parameter.
# (auto_label_cluster, defined below, can be moved into carnatic_functions.py.)
# NOTE(review): auto_label_cluster never reads `config`, so none of these keys
# currently has any effect on the generated label.
LABELING_CONFIG = {
    "histogram_bins": 100,      # How many bins to use for frequency analysis
    "peak_prominence": 0.15,    # How prominent a histogram peak must be to be considered a note
    "max_notes_in_label": 3     # The maximum number of notes to include in the final label
}

def auto_label_cluster(audio_dir: str, cluster_number: int, config: Dict = LABELING_CONFIG):
    """Label a cluster with the swara sequence most of its segments share.

    Every segment of the cluster is turned into a swara sequence (one swara
    per frequency sample, via ``get_closest_note``), runs of repeated swaras
    are collapsed, and the most frequent collapsed sequence is stored as the
    cluster's label through ``label_cluster``.

    Args:
        audio_dir: Path whose final component, with ``_Vocals`` removed, is
            used as the raaga name for ``get_data_paths``.
        cluster_number: Cluster id to label. It is matched against the first
            of ``DTW_Label``, ``PCA_Label``, ``Second Labels``, ``Label`` that
            exists in the carva CSV.
        config: Accepted for interface compatibility; the current
            sequence-based logic does not read any key from it.

    Returns:
        None. Progress and problems are reported with ``print``; the label is
        persisted by ``label_cluster``.
    """
    raaga_name = Path(audio_dir).name.replace('_Vocals', '')
    paths = get_data_paths(raaga_name)

    try:
        master_df = pd.read_csv(paths["master_csv"])
        carva_df = pd.read_csv(paths["carva_csv"])
    except FileNotFoundError as e:
        print(f"❌ Required data file not found: {e}")
        return

    # Prefer the most specific (secondary) labelling that is present.
    candidate_cols = ('DTW_Label', 'PCA_Label', 'Second Labels', 'Label')
    label_col = next((col for col in candidate_cols if col in carva_df.columns), None)
    if label_col is None:
        print("❌ No label column found in carva.csv.")
        return

    cluster_rows = carva_df[carva_df[label_col] == cluster_number]
    if cluster_rows.empty:
        print(f"⚠️ No segments found for cluster {cluster_number}.")
        return

    # The song of the first segment supplies the tonic for the whole cluster.
    # NOTE(review): segments belonging to other songs are mapped against this
    # same tonic. If a cluster can span songs with different tonics and
    # SegmentList holds absolute Hz, a per-song swara map may be needed.
    first_song_index = cluster_rows.iloc[0]['Index']
    tonic_values = master_df.loc[master_df['Index'] == first_song_index, 'Tonic']
    if tonic_values.empty:
        # Guard the lookup: .iloc[0] on an empty selection raises IndexError.
        print(f"⚠️ No tonic found for song index {first_song_index} in the master CSV.")
        return
    tonic_hz = librosa.note_to_hz(tonic_values.iloc[0])
    carnatic_frequencies = {note: tonic_hz * ratio for note, ratio in CARNATIC_RATIOS.items()}

    # One collapsed swara sequence (as a hashable tuple) per usable segment.
    list_of_sequences = []
    unparseable = 0

    for _, row in cluster_rows.iterrows():
        raw_segment = row['SegmentList']
        if not isinstance(raw_segment, str):
            # An empty CSV field is read back as NaN, not as a string.
            unparseable += 1
            continue

        try:
            segment_data = clean_np_float_list(raw_segment)
            if len(segment_data) < 5:
                # Too short to carry a meaningful sequence.
                continue
            swaras = [get_closest_note(freq, carnatic_frequencies) for freq in segment_data]
        except (ValueError, SyntaxError):
            unparseable += 1
            continue

        # Keep the first swara, then every swara that differs from its
        # predecessor, i.e. drop consecutive duplicates.
        collapsed = swaras[:1] + [cur for prev, cur in zip(swaras, swaras[1:]) if cur != prev]
        list_of_sequences.append(tuple(collapsed))

    if unparseable:
        print(f"⚠️ Skipped {unparseable} segment(s) in cluster {cluster_number} that could not be parsed.")

    if not list_of_sequences:
        print(f"Could not determine a dominant sequence for cluster {cluster_number}.")
        return

    # The most frequent collapsed sequence becomes the label.
    most_common_sequence, count = Counter(list_of_sequences).most_common(1)[0]
    final_label = " ".join(most_common_sequence)

    # Share of usable segments that produced exactly this sequence.
    confidence = (count / len(list_of_sequences)) * 100
    print(f"Cluster {cluster_number}: Determined dominant sequence -> '{final_label}' (Confidence: {confidence:.1f}%)")

    # Use the existing label_cluster function to save the result
    label_cluster(audio_dir, cluster_number, final_label)


# Label cluster 26 — the same cluster number used by the
# play_and_plot_secondary_cluster cell above.
auto_label_cluster(audio_dir, 26) 

In [2]:
# Scratch cell: prints an empty line.
print("")


