In [5]:
import librosa
import numpy as np
import pandas as pd
import joblib
import os
import sys
import scipy.ndimage
import IPython.display
from IPython.display import  Audio

In [6]:
# Notebook configuration: every path and tunable used by the cells below.
# Serialized classifier, loaded with joblib.load() in predict().
MODEL_PATH = 'model_rf.xz'
# Serialized label encoder, loaded with joblib.load() in predict(); its
# inverse_transform() maps model outputs back to chord names.
ENCODER_PATH = 'encoder.xz'
# Length of one analysis segment in seconds; featurize() derives the chroma
# hop length and the per-segment start/end times from it.
SEGMENT_DURATION_SEC = 0.1
# Audio file to analyse (relative path, resolved against the working directory).
INPUT_FILE = '../datasets/ontario.mp3'
# Destination of the merged chord table written by predict().
OUTPUT_CSV_NAME = 'output.csv'

In [12]:
def featurize(input_file=None, segment_duration_sec=None):
    """Extract one chroma feature vector per short segment of an audio file.

    The audio is loaded with librosa, reduced to its harmonic component and
    turned into an STFT chromagram whose hop is ``segment_duration_sec``
    seconds (truncated to a whole number of samples). Each chroma frame
    becomes one feature row.

    Parameters
    ----------
    input_file : str or path-like, optional
        Audio file to analyse. Defaults to the module-level ``INPUT_FILE``.
    segment_duration_sec : float, optional
        Target segment length in seconds. Defaults to the module-level
        ``SEGMENT_DURATION_SEC``.

    Returns
    -------
    features : numpy.ndarray
        Transposed chromagram, shape ``(num_segments, 12)``.
    start_times : numpy.ndarray
        Start time in seconds of every segment, shape ``(num_segments,)``.
    end_times : numpy.ndarray
        End time in seconds of every segment, clipped to the file duration.

    Notes
    -----
    Any exception raised while processing is caught, reported with
    ``print`` and answered with empty arrays (``features`` has shape
    ``(0, 12)`` so it stays two-dimensional like the success case).
    Callers should check for zero rows before using the result.
    """
    # Passed explicitly to chroma_stft so the empty fallback below is
    # guaranteed to have the same number of columns as a real result.
    n_chroma = 12

    if input_file is None:
        input_file = INPUT_FILE
    if segment_duration_sec is None:
        segment_duration_sec = SEGMENT_DURATION_SEC

    try:
        y, sr = librosa.load(input_file)
        file_duration = librosa.get_duration(y=y, sr=sr)

        hop_length = int(segment_duration_sec * sr)
        if hop_length < 1:
            raise ValueError(
                f"segment duration {segment_duration_sec!r}s is shorter than "
                f"one sample at {sr} Hz"
            )

        y_harm = librosa.effects.harmonic(y=y, margin=8)
        # Plain STFT chroma of the harmonic signal is used as-is. A CQT-based
        # chroma and an nn_filter + median-smoothing post-process were tried
        # here previously and are intentionally not applied.
        chroma = librosa.feature.chroma_stft(
            y=y_harm, sr=sr, hop_length=hop_length, n_chroma=n_chroma
        )

        features = chroma.T
        num_segments = features.shape[0]

        # hop_length was truncated to an integer, so the real spacing between
        # frames is hop_length / sr, which can be slightly shorter than the
        # requested duration. Using it keeps the timestamps from drifting.
        step_sec = hop_length / sr
        start_times = np.arange(num_segments) * step_sec
        end_times = np.minimum(start_times + step_sec, file_duration)

        return features, start_times, end_times

    except Exception as e:
        print(f"featurize() -> Error processing audio '{input_file}': {e}")
        return np.empty((0, n_chroma)), np.array([]), np.array([])

In [13]:
def merge_chords(predictions):
    """Collapse consecutive rows that carry the same chord into one row.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Frame with the columns ``'chords'``, ``'starts'`` and ``'ends'``,
        read row by row in its existing order.

    Returns
    -------
    pandas.DataFrame
        One row per run of identical chords with the columns ``'start'``
        (start of the first row in the run), ``'end'`` (end of the last row
        in the run) and ``'chord'``. An input without rows yields an empty
        frame.
    """
    merged = []
    # State of the run currently being extended; a label of None means that
    # no run has been opened yet.
    label = begin = finish = None

    def flush():
        # Record the run as it stands right now.
        merged.append({
            'start': begin,
            'end': finish,
            'chord': label,
        })

    for _, segment in predictions.iterrows():
        seg_label = segment['chords']
        seg_begin = segment['starts']
        seg_finish = segment['ends']

        # Same chord as the open run: just push its end further out.
        if label is not None and seg_label == label:
            finish = seg_finish
            continue

        # Chord changed (or this is the very first row): close the previous
        # run, if there is one, and open a new run at this row.
        if label is not None:
            flush()
        label, begin, finish = seg_label, seg_begin, seg_finish

    # The final run is never closed inside the loop.
    if label is not None:
        flush()

    return pd.DataFrame(merged)

In [14]:
def _format_mmss(total_seconds):
    """Format a duration given in seconds as zero-padded ``MM:SS``."""
    # Round to whole seconds before splitting into minutes and seconds;
    # rounding the seconds part on its own turns 59.5 s into "00:60".
    minutes, seconds = divmod(int(round(total_seconds)), 60)
    return f"{minutes:02d}:{seconds:02d}"


def predict():
    """Run chord recognition on ``INPUT_FILE`` and report the result.

    Steps:
      1. Check that ``INPUT_FILE`` exists and load the model and label
         encoder from ``MODEL_PATH`` / ``ENCODER_PATH``.
      2. Extract per-segment features with ``featurize()``.
      3. Predict a chord per segment and decode it with the encoder.
      4. Merge consecutive identical chords with ``merge_chords()``.
      5. Write the merged table to ``OUTPUT_CSV_NAME`` and print one
         ``MM:SS - MM:SS --> chord`` line per merged segment.

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        With exit code 1 (via ``sys.exit``) when the input file is missing,
        the model or encoder cannot be loaded, or no features could be
        extracted.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"predict() -> File not found {INPUT_FILE}")
        sys.exit(1)

    try:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only point MODEL_PATH / ENCODER_PATH at files you trust.
        model = joblib.load(MODEL_PATH)
        encoder = joblib.load(ENCODER_PATH)
    except FileNotFoundError:
        print(f"predict() -> File not found {MODEL_PATH}  or  {ENCODER_PATH}")
        sys.exit(1)
    except Exception as e:
        # Stop here like the other failure paths: continuing would only fail
        # later with an undefined `model` / `encoder`.
        print(f"predict() -> Error {e}")
        sys.exit(1)

    features, starts, ends = featurize()
    # featurize() reports its own errors and returns empty arrays; passing
    # those to the model would only produce an obscure shape error.
    if len(features) == 0:
        print(f"predict() -> No features extracted from {INPUT_FILE}")
        sys.exit(1)

    raw_predictions_df = pd.DataFrame({
        "starts": starts,
        "ends": ends,
    })

    predictions = model.predict(features)
    predictions_decoded = encoder.inverse_transform(predictions)
    raw_predictions_df["chords"] = predictions_decoded
    final_predictions_df = merge_chords(raw_predictions_df)

    final_predictions_df.to_csv(OUTPUT_CSV_NAME, index=False, float_format="%.3f")
    for segment in final_predictions_df.itertuples(index=False):
        print(f"{_format_mmss(segment.start)} - {_format_mmss(segment.end)} --> {segment.chord}")
    print("------------------------")
    

In [15]:
# Run the whole pipeline: predict() prints one line per merged chord segment
# (shown below) and writes the same table to OUTPUT_CSV_NAME.
predict()



00:00 - 00:00 --> GMin7
00:00 - 00:02 --> AbMin7
00:02 - 00:02 --> BMin7
00:02 - 00:03 --> AbMin7
00:03 - 00:03 --> CMin7
00:03 - 00:03 --> BbMin7
00:03 - 00:04 --> AbMin7
00:04 - 00:05 --> GbMaj7
00:05 - 00:05 --> AbMin7
00:05 - 00:06 --> GbMaj7
00:06 - 00:06 --> GbMin7
00:06 - 00:06 --> GbMaj7
00:06 - 00:06 --> GbMin7
00:06 - 00:07 --> GbMaj7
00:07 - 00:07 --> BbMin7
00:07 - 00:07 --> GbMaj7
00:07 - 00:08 --> FMin7
00:08 - 00:08 --> CMin7
00:08 - 00:08 --> EMin7
00:08 - 00:10 --> CMin7
00:10 - 00:10 --> EbMaj7
00:10 - 00:10 --> CMin7
00:10 - 00:10 --> AbMaj7
00:10 - 00:11 --> CMin7
00:11 - 00:12 --> AbMin7
00:12 - 00:12 --> GMaj7
00:12 - 00:12 --> EMaj7
00:12 - 00:13 --> AbMin7
00:13 - 00:13 --> BMaj7
00:13 - 00:15 --> AbMin7
00:15 - 00:15 --> DbMin7
00:15 - 00:16 --> AbMin7
00:16 - 00:16 --> EbMin7
00:16 - 00:17 --> AbMin7
00:17 - 00:17 --> EMaj7
00:17 - 00:18 --> AbMin7
00:18 - 00:18 --> BbMin7
00:18 - 00:19 --> AbMin7
00:19 - 00:19 --> GbMaj7
00:19 - 00:19 --> Ab7
00:19 - 00:21 --

In [16]:
# Embed an audio player for INPUT_FILE so the printed chords can be checked by ear.
# NOTE(review): `display` is not imported by name in this notebook; presumably
# it relies on IPython providing it as a built-in — confirm before running
# this code outside of IPython/Jupyter.
display(Audio(INPUT_FILE))