In [3]:

import numpy as np
import pickle
from pathlib import Path
from hmmlearn import hmm
import json
from collections import defaultdict


In [None]:
def calculate_target_time_steps(bpm=120, sr=22050, hop_length=512):
    """
    Return how many STFT frames are needed to cover one 16th note.

    Args:
        bpm: Tempo in beats per minute
        sr: Audio sample rate (samples per second)
        hop_length: Number of samples between successive STFT frames

    Returns:
        int: Frame count for a single 16th note, rounded up
    """
    # One beat lasts 60/bpm seconds and holds four 16th notes.
    beat_seconds = 60.0 / bpm
    sixteenth_seconds = beat_seconds / 4.0

    # seconds -> samples -> frames (one frame every hop_length samples)
    sixteenth_frames = sixteenth_seconds * sr / hop_length

    # Take the ceiling so a partially filled last frame is still counted.
    return int(np.ceil(sixteenth_frames))


In [7]:
def extract_16th_note_spectrograms(audio_path, sr=22050, n_mels=80,
                                    n_fft=2048, hop_length=512):
    """
    Slice an audio file into 16th-note segments and compute a normalized
    mel spectrogram for each segment.

    Beats are located with librosa's beat tracker. Each interval between two
    consecutive beats is divided into four equal parts, and every resulting
    segment of the waveform is converted to a mel spectrogram in dB
    (relative to that segment's own maximum) and rescaled with
    ``(dB + 80) / 80``.

    NOTE(review): this function uses the name ``librosa``, which is not
    imported in the visible import cell -- confirm it is imported elsewhere
    before this function is called.

    Args:
        audio_path: Path to audio file
        sr: Sample rate the audio is loaded at
        n_mels: Number of mel bins
        n_fft: FFT window size
        hop_length: Hop length used for the per-segment mel spectrograms

    Returns:
        List of mel spectrograms, one per non-empty 16th-note segment, each
        of shape (n_mels, time_steps); time_steps may differ between
        segments. Returns None when fewer than two beats are detected.
    """
    signal, _ = librosa.load(audio_path, sr=sr)

    # Locate beats and express them in seconds.
    _, beat_frames = librosa.beat.beat_track(y=signal, sr=sr)
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # At least one full beat interval is needed to subdivide.
    if len(beat_times) < 2:
        return None

    # Segment boundaries: four evenly spaced points per beat interval,
    # closed off by the final beat itself.
    quarter_fractions = [step / 4 for step in range(4)]
    boundaries = []
    for onset, next_onset in zip(beat_times[:-1], beat_times[1:]):
        span = next_onset - onset
        boundaries.extend(onset + frac * span for frac in quarter_fractions)
    boundaries.append(beat_times[-1])

    # Boundary times (seconds) -> sample indices into the waveform.
    sample_edges = [int(moment * sr) for moment in boundaries]

    specs = []
    for lo, hi in zip(sample_edges, sample_edges[1:]):
        clip = signal[lo:hi]

        # Boundaries that collapse onto the same sample yield nothing to analyze.
        if len(clip) == 0:
            continue

        mel_power = librosa.feature.melspectrogram(
            y=clip,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            fmin=20,
            fmax=8000
        )

        # dB relative to the loudest bin of this segment.
        mel_db = librosa.power_to_db(mel_power, ref=np.max)

        # Rescale to [0, 1]; presumably relies on power_to_db limiting the
        # range to [-80, 0] dB by default -- verify against librosa docs.
        specs.append((mel_db + 80) / 80.0)

    return specs