In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import crepe
from scipy.interpolate import interp1d
from scipy.signal import argrelextrema
import pandas as pd
import plotly.graph_objects as go
import IPython.display as ipd
from sklearn.cluster import KMeans
from sklearn.manifold import SpectralEmbedding
from scipy.signal import find_peaks as scipy_find_peaks
import os
from tqdm import tqdm
from scipy.spatial.distance import cdist
from fastdtw import fastdtw
from sklearn.cluster import AgglomerativeClustering
from collections import Counter, defaultdict

In [2]:
def plot_spectrogram_with_crepe(spec_time, conf, S_db, sr):
    """Show a spectrogram with a pitch curve drawn on top of it.

    Parameters
    ----------
    spec_time : x values of the overlaid curve.
    conf : y values of the overlaid curve; it is drawn on the frequency
        axis and labelled 'CREPE Pitch'.
    S_db : spectrogram matrix handed to ``librosa.display.specshow``.
    sr : sampling rate forwarded to ``specshow``.
    """
    spectrogram_options = dict(sr=sr, x_axis='time', y_axis='linear', cmap='viridis')
    overlay_options = dict(color='r', linewidth=1.5, label='CREPE Pitch')

    plt.figure(figsize=(14, 8))
    librosa.display.specshow(S_db, **spectrogram_options)
    plt.plot(spec_time, conf, **overlay_options)

    # Axis cosmetics; the view is limited to the 0-2000 Hz band.
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.ylim(0, 2000)
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.show()
    
def find_tonic(S, sr):
    """Return the pitch-class label with the largest summed chroma energy.

    The magnitude of ``S`` is passed to ``librosa.feature.chroma_stft``; each
    chroma row is summed over time and the label of the strongest row is
    returned. The returned label carries no octave number.

    NOTE(review): the labels assume chroma rows are ordered C, C#, ..., B -
    confirm against the librosa settings in use.
    """
    pitch_labels = ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")
    chroma = librosa.feature.chroma_stft(S=np.abs(S), sr=sr)
    energy_per_class = np.abs(chroma).sum(axis=1)
    # max() keeps the first maximum, so ties resolve to the earliest label.
    strongest_label, _ = max(zip(pitch_labels, energy_per_class), key=lambda pair: pair[1])
    return strongest_label

def get_carnatic_frequencies(tonic):
    """Build a note-name -> frequency (Hz) table spanning three octaves.

    ``tonic`` is a note string understood by ``librosa.note_to_hz``. Keys are
    spelled in lower case for the octave below the tonic ('sa', 'ri1', ...),
    capitalised for the tonic's own octave ('Sa', 'Ri1', ...) and in upper
    case for the octave above ('SA', 'RI1', ...). Entries are ordered from
    the lower octave to the upper one.
    """
    # Ratios of the twelve positions relative to Sa, within a single octave.
    base_ratios = (
        ('Sa', 1.0),
        ('Ri1', 16 / 15),
        ('Ri2', 9 / 8),
        ('Ga1', 6 / 5),
        ('Ga2', 5 / 4),
        ('Ma1', 4 / 3),
        ('Ma2', 45 / 32),
        ('Pa', 3 / 2),
        ('Da1', 8 / 5),
        ('Da2', 5 / 3),
        ('Ni1', 16 / 9),
        ('Ni2', 15 / 8),
    )
    # (octave multiplier, key spelling) for the lower, middle and upper octave.
    # Scaling by a power of two is exact, so the products match the literals
    # 0.5*a/b and 2*a/b bit for bit.
    octaves = ((0.5, str.lower), (1.0, str), (2.0, str.upper))

    carnatic_ratios = {
        spell(name): multiplier * ratio
        for multiplier, spell in octaves
        for name, ratio in base_ratios
    }

    tonic_freq = librosa.note_to_hz(tonic)
    return {note: tonic_freq * ratio for note, ratio in carnatic_ratios.items()}

def get_closest_note(freq, carnatic_frequencies):
    """Return the name of the note whose frequency is nearest to ``freq``.

    On a tie the note that appears first in ``carnatic_frequencies`` wins.
    """
    nearest_name, _ = min(carnatic_frequencies.items(), key=lambda item: abs(item[1] - freq))
    return nearest_name

def get_closest_frequency(freq, carnatic_frequencies):
    """Return the table frequency that lies nearest to ``freq``.

    On a tie the value that appears first in ``carnatic_frequencies`` wins.
    """
    def distance(candidate):
        return abs(candidate - freq)

    return min(carnatic_frequencies.values(), key=distance)

def get_index_from_time(time_input, conf, t_start=None, t_end=None):
    """Map a time stamp to the index of the frame of ``conf`` that contains it.

    The span [start, end] is divided into ``len(conf)`` equally long pieces
    and the index of the piece containing ``time_input`` is returned.

    Parameters
    ----------
    time_input : time in seconds.
    conf : sequence whose length gives the number of pieces.
    t_start, t_end : optional bounds of the span in seconds. When omitted the
        module-level ``start_time`` / ``end_time`` are used, which is what
        existing callers rely on.

    Returns
    -------
    int in ``[0, len(conf) - 1]``.

    Raises
    ------
    ValueError
        If ``conf`` is empty, the span has no duration, or ``time_input``
        lies outside the span.
    """
    lower = start_time if t_start is None else t_start
    upper = end_time if t_end is None else t_end

    num_pieces = len(conf)
    if num_pieces == 0:
        raise ValueError("conf must contain at least one frame.")

    # Check if the input time is within the valid range
    if time_input < lower or time_input > upper:
        raise ValueError(f"Input time must be between {lower} and {upper} seconds.")

    total_duration = upper - lower
    if total_duration <= 0:
        raise ValueError("The end time must be greater than the start time.")

    duration_per_piece = total_duration / num_pieces
    index = int((time_input - lower) / duration_per_piece)

    # time_input == upper would otherwise yield len(conf), one past the end.
    return min(index, num_pieces - 1)

def plot_frequency_with_carnatic_notes(frequency_list, beat_frames, tonic, beat_sr):
    """Plot a pitch contour with Carnatic note lines, extremes and beat markers.

    Parameters
    ----------
    frequency_list : sequence of frequencies in Hz; NaN marks gaps.
    beat_frames : beat positions as frame numbers; converted to seconds with
        ``librosa.frames_to_time`` using ``beat_sr``.
    tonic : note string passed to ``get_carnatic_frequencies``.
    beat_sr : sampling rate used for the frame-to-time conversion.

    The module-level ``start_time`` / ``end_time`` decide which beats fall
    inside the plotted excerpt. The x axis is the frame index into
    ``frequency_list``.

    Raises
    ------
    ValueError
        If ``frequency_list`` has no non-NaN value.
    """
    # Convert once, up front, so plain Python lists work as well as arrays
    # (indexing a list with an index array raises TypeError).
    frequency_array = np.asarray(frequency_list, dtype=float)

    # Identify valid (non-NaN) frames
    valid_indices = ~np.isnan(frequency_array)
    if not valid_indices.any():
        raise ValueError("No valid frequencies to process.")

    # A frame is an "extreme" where the sign of the slope changes. The NaN
    # padding (and NaN gaps) compare unequal to 0, so the two end frames and
    # frames bordering a gap are flagged as well.
    slope_sign = np.sign(np.diff(frequency_array, prepend=np.nan, append=np.nan))
    loc_extremes = np.where(np.diff(slope_sign) != 0)[0]
    extremes = frequency_array[loc_extremes].tolist()

    # Beats inside the excerpt, expressed as frame indices.
    beat_times = librosa.frames_to_time(beat_frames, sr=beat_sr)
    beat_points = [
        get_index_from_time(beat_time, frequency_array)
        for beat_time in beat_times
        if start_time <= beat_time <= end_time
    ]

    carnatic_frequencies = get_carnatic_frequencies(tonic)

    # Plot the graph
    fig = go.Figure()

    # Plot the frequency graph with gaps for NaNs: one trace per run of
    # consecutive valid frames. +1 edges open a run, -1 edges close it.
    run_edges = np.diff(np.concatenate(([0], valid_indices.astype(int), [0])))
    run_starts = np.where(run_edges == 1)[0]
    run_ends = np.where(run_edges == -1)[0]
    for start, end in zip(run_starts, run_ends):
        fig.add_trace(go.Scatter(
            x=np.arange(start, end),
            y=frequency_array[start:end],
            mode='lines',
            name='Frequency (Hz)',
            line=dict(color='blue')
        ))

    # Plot horizontal lines for Carnatic notes
    last_frame = len(frequency_array) - 1
    for note, freq in carnatic_frequencies.items():
        fig.add_trace(go.Scatter(
            x=[0, last_frame],
            y=[freq, freq],
            mode='lines',
            line=dict(dash='dash', color='gray', width=2),
            name=note,
            hovertemplate=f"{note} ({freq:.2f} Hz)"
        ))

    # Plot the extremes as red dots
    fig.add_trace(go.Scatter(
        x=loc_extremes,
        y=extremes,
        mode='markers',
        marker=dict(color='red', size=2, symbol='circle'),
        name='Extremes'
    ))

    # Plot vertical lines for beat points, spanning the full y-range
    y_low, y_high = np.nanmin(frequency_array), np.nanmax(frequency_array)
    for beat in beat_points:
        fig.add_trace(go.Scatter(
            x=[beat, beat],
            y=[y_low, y_high],
            mode='lines',
            line=dict(color='orange', width=2),
            name=f'Beat @ {beat}'
        ))

    fig.update_layout(
        title=f'Frequency with Carnatic Notes (Tonic: {tonic})',
        xaxis_title='Time',
        yaxis_title='Frequency (Hz)',
        showlegend=True
    )

    fig.show()

def breaklist(elements, indexes):
    """Split ``elements`` into consecutive slices at the positions in ``indexes``.

    Each slice runs from the previous cut (initially 0) up to the next one.
    Whatever remains after the last cut is appended as a final slice, but
    only when the last cut lies before the end of ``elements``.
    """
    cut_points = [0, *indexes]
    pieces = [elements[lo:hi] for lo, hi in zip(cut_points, cut_points[1:])]

    tail_start = cut_points[-1]
    if tail_start < len(elements):
        pieces.append(elements[tail_start:])

    return pieces

def plot_with_carnatic_bars(note_num, noteslist, carnatic_frequencies):
    """Plot one contour with the Carnatic note lines that fall inside its range.

    ``noteslist[note_num]`` is plotted. The range runs from the table
    frequency closest to the contour's minimum to the one closest to its
    maximum (NaNs ignored). After the figure is shown, the name of every
    drawn note is printed, one per line.
    """
    contour = noteslist[note_num]

    # Snap the contour's range to the nearest table frequencies.
    lowest_bar = get_closest_frequency(np.nanmin(contour), carnatic_frequencies)
    highest_bar = get_closest_frequency(np.nanmax(contour), carnatic_frequencies)
    visible_bars = [
        freq for freq in carnatic_frequencies.values()
        if lowest_bar <= freq <= highest_bar
    ]

    plt.plot(contour)
    for bar in visible_bars:
        plt.axhline(y=bar, color='r', linestyle='--')
    plt.show()

    for bar in visible_bars:
        print(get_closest_note(bar, carnatic_frequencies))

def spectral_decomp(note, n_clusters, plot=True, tonic="C#3"):
    """Split a pitch contour into ``n_clusters`` groups via spectral embedding + k-means.

    Each sample is embedded as the point (index, frequency), mapped to two
    dimensions with ``SpectralEmbedding`` and clustered with ``KMeans``.
    Clusters are renumbered by the position of their first sample, so
    cluster 0 is the one that starts earliest.

    Parameters
    ----------
    note : sequence of frequencies.
    n_clusters : number of k-means clusters.
    plot : when true, show a plotly figure of the clusters together with the
        Carnatic note lines lying inside the contour's range.
    tonic : note string used to build the reference lines (previously
        hard-coded to "C#3", which remains the default).

    Returns
    -------
    list with one entry per cluster: a tuple of that cluster's frequencies in
    their original order, or an empty list for an empty cluster.
    """
    note = np.array(note)
    X = np.column_stack((np.arange(len(note)), note))
    embedding = SpectralEmbedding(n_components=2, affinity='nearest_neighbors')
    X_transformed = embedding.fit_transform(X)

    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_transformed)

    # Sort clusters based on first occurrence
    cluster_ids, first_seen = np.unique(labels, return_index=True)
    ordered_ids = cluster_ids[np.argsort(first_seen)]
    label_mapping = {old: new for new, old in enumerate(ordered_ids)}
    sorted_labels = np.array([label_mapping[label] for label in labels])

    # Assign frequencies to clusters
    segments = [[] for _ in range(n_clusters)]
    for idx, freq in enumerate(note):
        segments[sorted_labels[idx]].append((idx, freq))

    if plot:
        fig = go.Figure()

        colors = ['red', 'blue', 'green', 'orange', 'purple', 'cyan', 'magenta']
        for i in range(n_clusters):
            indices, freqs = zip(*segments[i]) if segments[i] else ([], [])
            fig.add_trace(go.Scatter(
                x=indices,
                y=freqs,
                mode='markers+lines',
                marker=dict(size=6, color=colors[i % len(colors)]),
            ))

        # Reference lines: build the note table once and keep the notes that
        # lie within the contour's range.
        lowest, highest = note.min(), note.max()
        reference_notes = [
            (name, freq)
            for name, freq in get_carnatic_frequencies(tonic).items()
            if lowest <= freq <= highest
        ]
        x_values = np.linspace(X[:, 0].min(), X[:, 0].max(), num=100)  # Densely spaced x values

        for name, freq in reference_notes:
            fig.add_trace(go.Scatter(
                x=x_values,
                y=np.full_like(x_values, freq),
                mode="lines",
                line=dict(color="gray", dash="dash"),
                showlegend=False,
                hovertemplate=f"{name}({freq:.2f} Hz)"
            ))

        fig.show()

    return [list(zip(*seg))[1] if seg else [] for seg in segments]

def playnote(n, beat_audio, beat_sr, beat_times, start_time):
    """Play the audio between beat ``n`` and beat ``n + 1`` of an excerpt.

    ``beat_times`` (seconds, array-like supporting vector arithmetic) are
    shifted by ``start_time`` and beats falling before the excerpt start are
    dropped; ``n`` indexes the remaining beats. Prints "Invalid note index"
    and returns without playing when ``n`` has no following beat.
    """
    shifted = beat_times - start_time
    shifted = shifted[shifted >= 0]  # Remove negative times

    if not 0 <= n < len(shifted) - 1:
        print("Invalid note index")
        return

    # Seconds -> sample positions for the two beats bounding the note.
    first_sample, last_sample = (int(t * beat_sr) for t in shifted[n:n + 2])
    ipd.display(ipd.Audio(beat_audio[first_sample:last_sample], rate=beat_sr))

def find_peaks_and_valleys(conf):
    """Return the indices of strict local maxima and minima of ``conf``.

    A sample is a peak (valley) when it is strictly greater (smaller) than
    both direct neighbours. The first and last samples are never reported,
    and neither is any sample whose neighbourhood contains a NaN.

    Returns
    -------
    (peaks, valleys) : two lists of integer indices in ascending order.
    """
    peaks, valleys = [], []

    # Slide a three-sample window; `centre` is the index of its middle sample.
    neighbourhoods = zip(conf, conf[1:], conf[2:])
    for centre, (left, mid, right) in enumerate(neighbourhoods, start=1):
        if np.isnan(left) or np.isnan(mid) or np.isnan(right):
            continue
        if mid > left and mid > right:
            peaks.append(centre)
        elif mid < left and mid < right:
            valleys.append(centre)

    return peaks, valleys

def play_segment_between_beats(beat_audio, beat_sr, beat_frames, beat_index, offset=0):
    """Play the audio between two beats, optionally widened by ``offset`` beats.

    The segment starts at beat ``beat_index - offset`` and ends at beat
    ``beat_index + 1 + offset``. Beat frames are converted to seconds with
    ``librosa.frames_to_time`` and then to sample positions.

    Prints a message and returns without playing when either boundary beat
    does not exist (or ``offset`` is negative). The check covers the widened
    boundaries: checking ``beat_index`` alone would let a negative start
    index wrap around to the end of ``beat_frames`` and let the end index run
    past it.
    """
    first_beat = beat_index - offset
    last_beat = beat_index + 1 + offset

    # Ensure both boundary beats are valid
    if offset < 0 or first_beat < 0 or last_beat >= len(beat_frames):
        print("Invalid beat index. Please provide a valid index.")
        return

    # Convert frames to time
    start_time = librosa.frames_to_time(beat_frames[first_beat], sr=beat_sr)
    end_time = librosa.frames_to_time(beat_frames[last_beat], sr=beat_sr)

    # Convert time to sample indices
    start_sample = int(start_time * beat_sr)
    end_sample = int(end_time * beat_sr)

    # Slice and play the audio segment
    audio_segment = beat_audio[start_sample:end_sample]
    ipd.display(ipd.Audio(audio_segment, rate=beat_sr))

def trim(data):
    """Cut ``data`` down to the span between its first and last local extremum.

    Extrema are located on the non-NaN samples only, using
    ``argrelextrema`` with ``order=2`` for both maxima and minima. When fewer
    than two extrema exist the (array-converted) data is returned untrimmed.
    """
    data = np.array(data)
    finite_positions = np.nonzero(~np.isnan(data))[0]
    finite_values = data[finite_positions]

    # Positions (within the NaN-free series) of all maxima and minima.
    turning_points = np.concatenate([
        argrelextrema(finite_values, comparator, order=2)[0]
        for comparator in (np.greater, np.less)
    ])

    if turning_points.size < 2:
        return data  # Not enough peaks/troughs to trim

    # Translate back to positions in the original array.
    first = finite_positions[turning_points.min()]
    last = finite_positions[turning_points.max()]
    return data[first:last + 1]

def shift_beats_to_peaks_or_valleys(beat_frames, conf):
    """
    Shift each beat frame to the nearest peak or valley of ``conf``.

    Parameters:
    - beat_frames: The original beat frames (indices into ``conf``).
    - conf: The array whose local extrema are the snapping targets.

    Returns:
    - shifted_beat_frames: numpy array with one snapped frame per beat.
      On equal distance a peak wins over a valley and an earlier extremum
      wins over a later one. When ``conf`` has no peak or valley the beats
      are returned unchanged (previously an array of ``None`` was produced).
    """
    peaks, valleys = find_peaks_and_valleys(conf)
    candidates = peaks + valleys

    if not candidates:
        # Nothing to snap to: keep the beats where they are.
        return np.array(beat_frames)

    # min() keeps the first of several equally close candidates, which
    # gives the tie-breaking order described above.
    shifted_beat_frames = [
        min(candidates, key=lambda index: abs(index - beat))
        for beat in beat_frames
    ]
    return np.array(shifted_beat_frames)

def extend_sublists(main_list, num=4):
    """Pad every inner sublist with context from its two neighbours.

    For each sublist except the first and the last, the result holds the last
    ``num`` items of the previous sublist, the sublist itself, and the first
    ``num`` items of the next sublist. The first and last sublists are passed
    through unchanged (the same objects, not copies).

    ``num <= 0`` adds no context.
    """
    context = max(num, 0)
    last_position = len(main_list) - 1
    extended_list = []

    for i, current_sublist in enumerate(main_list):
        if i == 0 or i == last_position:
            extended_list.append(current_sublist)
            continue

        previous_sublist = main_list[i - 1]
        # Slice from an explicit start: previous_sublist[-0:] would be the
        # whole sublist rather than nothing.
        tail_start = max(len(previous_sublist) - context, 0)

        new_sublist = []
        new_sublist.extend(previous_sublist[tail_start:])
        new_sublist.extend(current_sublist)
        new_sublist.extend(main_list[i + 1][:context])
        extended_list.append(new_sublist)

    return extended_list

def plot_with_carnatic_bars_with_peaks(segment, carnatic_frequencies, color='lime'):
    """Scatter-plot a contour with its peaks/valleys and labelled Carnatic note lines.

    Parameters
    ----------
    segment : sequence of frequencies (list or array); NaNs are ignored when
        determining the plotted frequency range.
    carnatic_frequencies : mapping of note name -> frequency.
    color : colour of the scatter points.

    Note: this switches matplotlib to the 'dark_background' style for the
    rest of the session, not just for this figure.
    """
    # Convert up front so index-array lookups below also work for lists.
    segment = np.asarray(segment, dtype=float)

    plt.style.use('dark_background')  # Dark mode

    # Keep the note lines between the table frequencies closest to the
    # contour's minimum and maximum.
    min_freq = get_closest_frequency(np.nanmin(segment), carnatic_frequencies)
    max_freq = get_closest_frequency(np.nanmax(segment), carnatic_frequencies)
    newbars = [freq for freq in carnatic_frequencies.values() if min_freq <= freq <= max_freq]

    plt.figure(figsize=(12, 6))
    x_vals = np.arange(len(segment))
    plt.scatter(x_vals, segment, s=8, color=color)

    # Peaks and valleys (valleys are the peaks of the negated contour)
    peaks, _ = scipy_find_peaks(segment)
    valleys, _ = scipy_find_peaks(-segment)
    plt.plot(peaks, segment[peaks], "o", markersize=4, color="cyan", label="Peaks/Valleys")
    plt.plot(valleys, segment[valleys], "o", markersize=4, color="cyan")

    # Plot Carnatic bars with labels
    for freq in newbars:
        note = get_closest_note(freq, carnatic_frequencies)
        plt.axhline(y=freq, color='orange', linestyle='--', linewidth=0.8)
        plt.text(0, freq, note, color='orange', fontsize=9, verticalalignment='bottom')

    plt.xlabel("Frame Index")
    plt.ylabel("Frequency (Hz)")
    plt.title("Segment with Peaks, Valleys, and Carnatic Frequency Bars")
    plt.tight_layout()
    plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.3)
    plt.show()

def interpolate_with_nans(data, target_length=128):
    """Resample ``data`` to ``target_length`` points while keeping its NaN gaps.

    Both the input and the output are laid out on a [0, 1] axis. Values are
    linearly interpolated (and extrapolated) from the non-NaN samples. The
    NaN mask is resampled the same way, and every output point whose
    resampled mask exceeds 0.5 is set back to NaN.

    Returns an all-NaN array when fewer than two samples are non-NaN.
    """
    samples = np.array(data, dtype=np.float64)
    missing = np.isnan(samples)
    present = ~missing

    if np.count_nonzero(present) < 2:
        return np.full(target_length, np.nan)

    source_axis = np.linspace(0, 1, len(samples))
    target_axis = np.linspace(0, 1, target_length)

    resample = interp1d(
        source_axis[present],
        samples[present],
        kind='linear',
        bounds_error=False,
        fill_value="extrapolate",
    )
    resampled = resample(target_axis)

    # Re-open the gaps: a target point counts as missing when more than half
    # of its interpolated mask weight comes from NaN samples.
    gap_weight = np.interp(target_axis, source_axis, missing.astype(float))
    resampled[gap_weight > 0.5] = np.nan
    return resampled

def play_segment(audio_index, sr, start_frame, end_frame):
    """Play the part of a recording lying between two frame positions.

    The recording is looked up in 'carva.csv': the 'AudioPath' of the first
    row whose 'Index' equals ``audio_index`` is loaded at sampling rate
    ``sr``. Frame positions are converted to seconds with
    ``librosa.frames_to_time`` and then to sample positions.

    The row is selected by ``audio_index``; it used to be fixed to 1, so
    every call played the same file regardless of the argument.
    """
    df = pd.read_csv('carva.csv')
    start_time = librosa.frames_to_time(start_frame, sr=sr)
    end_time = librosa.frames_to_time(end_frame, sr=sr)

    # Convert time to sample indices
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr)

    # Load the requested recording and slice the audio segment
    audio_path = df.loc[df['Index'] == audio_index, 'AudioPath'].values[0]
    print(audio_path)
    audio = librosa.load(audio_path, sr=sr)[0]
    audio_segment = audio[start_sample:end_sample]

    # Play the audio segment
    ipd.display(ipd.Audio(audio_segment, rate=sr))

In [None]:
df = pd.read_csv("Master_Crepe.csv")

# === Add new column ===
# Look up the Sa frequency once per distinct tonic instead of rebuilding the
# whole note table for every row.
sa_by_tonic = {
    tonic: get_carnatic_frequencies(tonic)["Sa"]
    for tonic in df["Tonic"].unique()
}
base = df["Tonic"].map(sa_by_tonic)

# Frequency relative to the tonic's Sa; rows without a positive base become NaN.
df["Tonic_Normalized_Frequency"] = (df["Frequency"] / base).where(base > 0)

# === Save it back ===
# NOTE: this overwrites the input file in place.
df.to_csv("Master_Crepe.csv", index=False)

print("✅ Tonic_Normalized_Frequency column added.")


✅ Tonic_Normalized_Frequency column added.
Playing: Vocals_file\Evari_Bodhana_C#3.wav


In [28]:

def play_segment(audio_index, sr, start_frame, end_frame):
    """Return an audio widget for part of a recording listed in 'carva.csv'.

    The 'AudioPath' of the first row whose 'Index' equals ``audio_index`` is
    loaded at sampling rate ``sr``; its path is printed. ``start_frame`` and
    ``end_frame`` are converted to seconds with ``librosa.frames_to_time``
    and then to sample positions delimiting the excerpt.

    Returns an ``IPython.display.Audio`` object; the caller decides whether
    to display it.
    """
    import pandas as pd
    import librosa
    import IPython.display as ipd

    # Resolve the recording's path from the catalogue
    catalogue = pd.read_csv('carva.csv')
    row_matches = catalogue['Index'] == audio_index
    audio_path = catalogue.loc[row_matches, 'AudioPath'].values[0]
    print(f"Playing: {audio_path}")

    waveform = librosa.load(audio_path, sr=sr)[0]

    # Frames -> seconds -> sample positions, for both ends of the excerpt
    first_sample, last_sample = (
        int(librosa.frames_to_time(frame, sr=sr) * sr)
        for frame in (start_frame, end_frame)
    )

    return ipd.Audio(waveform[first_sample:last_sample], rate=sr)


# Four excerpts of recording 1 at 44.1 kHz, each spanning 57 frames.
frame_ranges = [(348, 405), (1316, 1373), (696, 753), (1432, 1489)]
audio1, audio2, audio3, audio4 = [
    play_segment(1, 44100, first_frame, last_frame)
    for first_frame, last_frame in frame_ranges
]

display(audio1, audio2, audio3, audio4)


Playing: Vocals_file\Evari_Bodhana_C#3.wav
Playing: Vocals_file\Evari_Bodhana_C#3.wav
Playing: Vocals_file\Evari_Bodhana_C#3.wav
Playing: Vocals_file\Evari_Bodhana_C#3.wav


In [4]:
csv_path = "Master_Crepe.csv"
# Parse the file once and take every column from the same frame, instead of
# re-reading the whole CSV for each column.
crepe_df = pd.read_csv(csv_path)
new_confidence = crepe_df["Confidence"].values
new_frequency = crepe_df["Frequency"].values
spec_time = crepe_df["Time"].values
normalized_frequency = crepe_df["Tonic_Normalized_Frequency"].values
print(len(new_frequency))


264474


In [5]:
def interpolate_list(a, n):
    """Linearly resample the list ``a`` to ``n`` points.

    The first and last output points coincide with the first and last items
    of ``a``; the points in between are evenly spaced. For ``n <= 1`` the
    result is ``a[:1] * n``, i.e. at most the first item.
    """
    if n <= 1:
        return a[:1] * n

    last = len(a) - 1
    resampled = []
    for step in range(n):
        position = last * step / (n - 1)   # fractional index into `a`
        lower = int(position)
        upper = min(lower + 1, last)       # clamp at the final item
        fraction = position - lower
        resampled.append(a[lower] + (a[upper] - a[lower]) * fraction)
    return resampled

def non_overlapping_segments(conf, window_size, hop_size):
    """Collect NaN-free windows of ``conf`` that do not overlap each other.

    The scan starts at index 0. A window containing a NaN is rejected and the
    scan advances by ``hop_size``; an accepted window advances the scan by
    ``window_size`` so the next candidate cannot overlap it.

    Parameters
    ----------
    conf : 1-D array (NaN marks unusable samples).
    window_size : window length in samples; must be at least 1.
    hop_size : step after a rejected window. Values below 1 are treated as 1,
        since a step of 0 would never move past a rejected window.

    Returns
    -------
    (segments, indices) : numpy arrays with the accepted windows and their
    start positions.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1.")
    step = max(1, int(hop_size))

    segments = []
    indices = []
    last_start = len(conf) - window_size  # inclusive: the final full window counts too
    i = 0
    while i <= last_start:
        segment = conf[i:i + window_size]
        if np.isnan(segment).any():
            i += step
            continue
        segments.append(segment)
        indices.append(i)
        i += window_size  # skip all overlapping windows
    return np.array(segments), np.array(indices)

def dtw_distance_matrix(segments):
    """Return the symmetric matrix of pairwise ``fastdtw`` distances.

    Every unordered pair of segments is compared once; the diagonal is zero.
    A tqdm progress bar advances once per row.
    """
    count = len(segments)
    upper = np.zeros((count, count))
    for row in tqdm(range(count)):
        for col in range(row + 1, count):
            upper[row, col] = fastdtw(segments[row], segments[col])[0]
    # Mirror the strictly upper triangle onto the lower one.
    return upper + upper.T

def extract_notes_from_conf(conf, initial_window_size, decay_size, min_window_size, outlier_threshold, similairity_threshold=100):
    """Repeatedly cluster fixed-size windows of ``conf`` and blank out the recurring ones.

    Starting at ``initial_window_size`` and shrinking by ``decay_size`` per
    pass (down to ``min_window_size``), each pass:

    1. collects NaN-free, non-overlapping windows of the remaining signal,
    2. clusters them by DTW distance (average-linkage agglomerative
       clustering cut at ``similairity_threshold``),
    3. for every cluster with at least ``outlier_threshold`` members,
       replaces its windows by NaN and records them.

    Parameters
    ----------
    conf : array supporting ``.copy()``; it is not modified.
    initial_window_size, min_window_size : first and smallest window length.
    decay_size : amount the window shrinks per pass; must be positive.
    outlier_threshold : minimum cluster size for removal.
    similairity_threshold : distance threshold of the clustering. (The
        parameter name keeps its original spelling for keyword callers.)

    Returns
    -------
    (remaining_conf, all_removed_segments) : the signal with removed windows
    set to NaN, and a list of ``[start, end, label]`` entries. Labels are
    offset per pass so they do not collide across window sizes.

    Raises
    ------
    ValueError
        If ``decay_size`` is not positive (the loop would never end).
    """
    if decay_size <= 0:
        raise ValueError("decay_size must be positive so the window size can shrink.")

    remaining_conf = conf.copy()
    all_removed_segments = []

    window_size = initial_window_size
    global_label_offset = 0

    total_iters = (initial_window_size - min_window_size) // decay_size + 1
    iter_count = 0

    while window_size >= min_window_size:
        iter_count += 1
        print(f"Iteration {iter_count}/{total_iters} — Window Size: {window_size}")

        # Windows shorter than 12 samples would otherwise get a hop of 0.
        hop_size = max(1, int(window_size / 12))
        segments, segment_starts = non_overlapping_segments(remaining_conf, window_size, hop_size)

        if len(segments) == 0:
            print("  Skipped — no valid segments")
            window_size -= decay_size
            continue

        if len(segments) < 2:
            # A single window cannot be clustered. This ends the whole
            # search, as before.
            # NOTE(review): smaller window sizes are never tried in this
            # case - confirm that stopping here is intended.
            return remaining_conf, all_removed_segments

        dtw_dists = dtw_distance_matrix(segments)

        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=similairity_threshold,
            metric='precomputed',
            linkage='average'
        )
        labels = clustering.fit_predict(dtw_dists)
        n_clusters = len(set(labels))

        # Start positions of the windows, grouped by cluster label.
        cluster_origins = defaultdict(list)
        for start_idx, lbl in zip(segment_starts, labels):
            cluster_origins[lbl].append(start_idx)

        clustered = False
        for label, starts in cluster_origins.items():
            if len(starts) < outlier_threshold:
                continue
            clustered = True
            global_label = global_label_offset + label
            for i in starts:
                remaining_conf[i:i + window_size] = np.full(window_size, np.nan)
                all_removed_segments.append([i, i + window_size, global_label])

        if clustered:
            print(f"  Clusters found: {n_clusters}, removed some segments.")
        else:
            print(f"  Clusters found: {n_clusters}, but none met the threshold.")

        global_label_offset += n_clusters
        window_size -= decay_size

    return remaining_conf, all_removed_segments

In [6]:
import ast  # needed here: the notebook's import cell does not bring it in

second_phase_segments = []
second_queue = []
carva_df = pd.read_csv('carva.csv')
segments = carva_df['SegmentList']
for segment in segments:
    # Each cell holds a Python list literal. literal_eval parses literals
    # only, so file content cannot execute code the way eval() would.
    parsed_segment = ast.literal_eval(segment)
    second_phase_segments.append(parsed_segment)
    # Every segment contributes exactly 50 resampled points to the queue.
    second_queue.extend(interpolate_list(parsed_segment, 50))

In [15]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
import ast

# --------- Utility Functions ---------

def interpolate_list(lst, target_len):
    """Linearly resample ``lst`` to ``target_len`` evenly spaced points.

    The output spans the full index range of ``lst`` (first to last item).
    An empty input yields ``target_len`` zeros.
    """
    source_len = len(lst)
    if not source_len:
        return [0] * target_len

    source_positions = np.arange(source_len)
    query_positions = np.linspace(0, source_len - 1, target_len)
    return list(np.interp(query_positions, source_positions, lst))

def non_overlapping_segments(arr, window_size, hop_size):
    """Collect every NaN-free window of ``arr`` on a regular grid.

    Window starts are 0, ``hop_size``, 2 * ``hop_size``, ... up to the last
    position that still fits a full window. Windows only avoid overlapping
    each other when ``hop_size >= window_size``.

    Returns
    -------
    (segments, starts) : two lists - the accepted windows and their start
    positions.
    """
    segments, starts = [], []
    last_start = len(arr) - window_size
    for start in range(0, last_start + 1, hop_size):
        window = arr[start:start + window_size]
        if np.isnan(window).any():
            continue  # skip windows touching a gap
        segments.append(window)
        starts.append(start)
    return segments, starts

# --------- Main Clustering Function ---------

def extract_notes_from_conf_once_pca(conf, window_size=128, hop_size=128,
                                     outlier_threshold=3, similarity_threshold=0.7,
                                     pca_components=10):
    """Cluster fixed-size windows of ``conf`` once, in a PCA-reduced feature space.

    Each NaN-free window is described by its raw values concatenated with
    its mean-removed values. The features are reduced with PCA, pairwise
    Euclidean distances are computed, and the windows are grouped by
    average-linkage agglomerative clustering cut at ``similarity_threshold``.
    Windows of clusters with at least ``outlier_threshold`` members are
    replaced by NaN in the returned copy and recorded.

    Parameters
    ----------
    conf : sequence supporting ``.copy()`` and slice assignment; not modified.
    window_size, hop_size : forwarded to ``non_overlapping_segments``.
    outlier_threshold : minimum cluster size for removal.
    similarity_threshold : distance threshold of the clustering.
    pca_components : upper bound on the number of PCA components. The number
        actually used is also capped by the number of windows and by the
        feature length, because PCA cannot produce more components than
        either.

    Returns
    -------
    (remaining_conf, all_removed_segments, labels) : the copy with removed
    windows set to NaN, a list of ``[start, end, label]`` entries, and the
    cluster label of every window that was clustered (``[]`` when fewer than
    two windows are available). Windows containing NaN get no label.
    """
    remaining_conf = conf.copy()
    all_removed_segments = []

    segments, segment_starts = non_overlapping_segments(remaining_conf, window_size, hop_size)

    if len(segments) < 2:
        print("Not enough segments to compare.")
        return remaining_conf, all_removed_segments, []

    # Features: absolute values plus the mean-removed contour of each window.
    X_abs = np.stack(segments)
    X_shape = X_abs - np.mean(X_abs, axis=1, keepdims=True)
    X_combined = np.concatenate([X_abs, X_shape], axis=1)

    # Cap by both dimensions: with few windows, n_samples is the limit.
    n_components = min(pca_components, *X_combined.shape)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_combined)

    dist_matrix = squareform(pdist(X_pca, metric='euclidean'))

    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=similarity_threshold,
        metric='precomputed',
        linkage='average'
    )
    labels = clustering.fit_predict(dist_matrix)

    # Start positions of the windows, grouped by cluster label.
    cluster_origins = defaultdict(list)
    for start_idx, lbl in zip(segment_starts, labels):
        cluster_origins[lbl].append(start_idx)

    for label, starts in cluster_origins.items():
        if len(starts) < outlier_threshold:
            continue
        for i in starts:
            remaining_conf[i:i + window_size] = np.full(window_size, np.nan)
            all_removed_segments.append([i, i + window_size, label])

    print(f"Clusters found: {len(set(labels))}. Removed {len(all_removed_segments)} segments.")
    return remaining_conf, all_removed_segments, labels

# --------- CSV Update Function ---------

def update_carva_csv_with_labels(file_path, labels):
    """Write ``labels`` into the 'Second Labels' column of the CSV at ``file_path``.

    The file is read, the column is added (or replaced) and the file is
    written back in place without the index.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows.
    """
    table = pd.read_csv(file_path)

    n_labels, n_rows = len(labels), len(table)
    if n_labels != n_rows:
        raise ValueError(f"Label count ({n_labels}) does not match row count ({n_rows})")

    table['Second Labels'] = labels
    table.to_csv(file_path, index=False)
    print(f"Updated {file_path} with Second Labels.")

# --------- Main Driver Code ---------

file_path = 'carva.csv'
# Every segment is resampled to this many points. The same value serves as
# window and hop size, so each window covers exactly one segment.
SEGMENT_LENGTH = 50
second_phase_segments = []
second_queue = []

carva_df = pd.read_csv(file_path)
segments = carva_df['SegmentList']

for segment in segments:
    if pd.isna(segment):
        continue  # skip blank entries
    try:
        seg = ast.literal_eval(segment)
    except (ValueError, SyntaxError, TypeError):
        # Only parse failures are skipped; anything else should surface.
        # A skipped row leaves fewer labels than CSV rows, which
        # update_carva_csv_with_labels rejects with a ValueError.
        continue
    interp = interpolate_list(seg, SEGMENT_LENGTH)
    second_phase_segments.append(interp)
    second_queue += interp  # extend the full signal queue

# --------- Debug Info ---------
print("Total rows in CSV:", len(carva_df))
print("Valid segments used:", len(second_phase_segments))
print("Total length of signal:", len(second_queue))
print(f"Window size: {SEGMENT_LENGTH}, hop size: {SEGMENT_LENGTH}")
print("Expected segments:", len(second_queue) // SEGMENT_LENGTH)

# --------- Run Clustering ---------
_, _, labels = extract_notes_from_conf_once_pca(second_queue, SEGMENT_LENGTH, SEGMENT_LENGTH)

print("Number of labels:", len(labels))

# --------- Update CSV ---------
update_carva_csv_with_labels(file_path, labels)


Total rows in CSV: 7279
Valid segments used: 7279
Total length of signal: 363950
Window size: 50, hop size: 50
Expected segments: 7279
Clusters found: 846. Removed 6696 segments.
Number of labels: 7279
Updated carva.csv with Second Labels.
