## Task1: Tempo estimation

In [7]:
import glob

genres = ["ChaCha", "Jive", "Quickstep", "Rumba", "Samba", "Tango", "Viennese waltz", "Waltz"]
# genres = ["ChaCha", "Jive"]
filepaths = dict()
for genre in genres:
    filepaths[genre] = sorted(glob.glob(f"../Ballroom/BallroomData/{genre}/*.wav"))

In [8]:
from pathlib import Path

ground_truth = dict()
for path in glob.glob('../Ballroom/BallroomAnnotations/ballroomGroundTruth/*.bpm'):
    name = Path(path).stem
    bpm = -1
    with open(path) as f:
        bpm = int(f.readline().strip("\n"))
    ground_truth[name] = bpm

### Q1. 

In [9]:
import librosa
import pandas as pd
import numpy as np
import scipy.signal as sps
import utils

for genre, audio_paths in filepaths.items():
    auto_scores = {"p":[], "alotc":[]}
    fourier_scores = {"p":[], "alotc":[]}

    for audio_path in audio_paths:
        y, sr = librosa.load(audio_path)
        hop_length = 512
        oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

        tempogram_auto = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length, norm=None)
        tempo_vector_auto = np.sum(tempogram_auto, axis=1)
        tempo_vector_auto = [x/tempogram_auto.shape[1] for x in tempo_vector_auto]
        peak_id_auto = sps.argrelmax(np.array(tempo_vector_auto))
        auto_frequency = librosa.tempo_frequencies(len(tempo_vector_auto))
        tempo_vector_auto = [(tempo_vector_auto[pid], pid) for pid in peak_id_auto[0]]
        tempo_vector_auto = sorted(tempo_vector_auto, key=lambda x: x[0], reverse=True)

        T1T2_auto = (auto_frequency[tempo_vector_auto[0][1]], auto_frequency[tempo_vector_auto[1][1]])
        s_auto = (tempo_vector_auto[0][0])/(tempo_vector_auto[0][0] + tempo_vector_auto[1][0])
        reference_bpm = ground_truth[Path(audio_path).stem]
        auto_scores["p"].append(utils.P_SCORE(T1T2_auto, s_auto, reference_bpm))
        auto_scores["alotc"].append(utils.ALOTC_SCORE(T1T2_auto, reference_bpm))


        tempogram_fourier = librosa.feature.fourier_tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length)
        tempo_vector_fourier = np.sum(np.abs(tempogram_fourier), axis=1)
        tempo_vector_fourier = [x/tempogram_fourier.shape[1] for x in tempo_vector_fourier]
        peak_id_fourier = sps.argrelmax(np.array(tempo_vector_fourier))
        fourier_frequency = librosa.fourier_tempo_frequencies(hop_length = hop_length)
        tempo_vector_fourier = [(tempo_vector_fourier[pid], pid) for pid in peak_id_fourier[0]]
        tempo_vector_fourier = sorted(tempo_vector_fourier, key=lambda x: x[0], reverse=True)

        T1T2_fourier = (auto_frequency[tempo_vector_fourier[0][1]], fourier_frequency[tempo_vector_fourier[1][1]])
        s_fourier = (tempo_vector_fourier[0][0])/(tempo_vector_fourier[0][0] + tempo_vector_fourier[1][0])
        reference_bpm = ground_truth[Path(audio_path).stem]
        fourier_scores["p"].append(utils.P_SCORE(T1T2_fourier, s_fourier, reference_bpm))
        fourier_scores["alotc"].append(utils.ALOTC_SCORE(T1T2_fourier, reference_bpm))

    print(f"------{genre}-------")
    print("fourier")
    print(f"  avg p score: {sum(fourier_scores['p']) / len(fourier_scores['p'])}")
    print(f"  avg alotc score: {sum(fourier_scores['alotc']) / len(fourier_scores['alotc'])}")
    print("auto")
    print(f"  avg p score: {sum(auto_scores['p']) / len(auto_scores['p'])}")
    print(f"  avg alotc score: {sum(auto_scores['alotc']) / len(auto_scores['alotc'])}")

------ChaCha-------
fourier
  avg p score: 0.005030572878275906
  avg alotc score: 0.009009009009009009
auto
  avg p score: 0.46831828392232594
  avg alotc score: 0.9099099099099099
------Jive-------
fourier
  avg p score: 0.13477542364349387
  avg alotc score: 0.3
auto
  avg p score: 0.44384553644991015
  avg alotc score: 0.8833333333333333
------Quickstep-------
fourier
  avg p score: 0.03237802817002383
  avg alotc score: 0.07317073170731707
auto
  avg p score: 0.46471286753172286
  avg alotc score: 0.926829268292683
------Rumba-------
fourier
  avg p score: 0.038036416841842266
  avg alotc score: 0.07142857142857142
auto
  avg p score: 0.45723413632328014
  avg alotc score: 0.9081632653061225
------Samba-------
fourier
  avg p score: 0.0121887365635885
  avg alotc score: 0.023255813953488372
auto
  avg p score: 0.35610301116538395
  avg alotc score: 0.7093023255813954
------Tango-------
fourier
  avg p score: 0.1879115902854876
  avg alotc score: 0.4883720930232558
auto
  avg p sco

### Q2.

In [12]:
WINDOW_LEN_PER_SECOND = 43
window_lengths = [4, 8]
for genre, audio_paths in filepaths.items():
    print(f"--------{genre}--------")
    for wl in window_lengths:
        win_length = int(wl * WINDOW_LEN_PER_SECOND)
        auto_scores = {"p":[], "alotc":[]}
        fourier_scores = {"p":[], "alotc":[]}

        for audio_path in audio_paths:
            y, sr = librosa.load(audio_path)
            hop_length = 512
            oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

            tempogram_auto = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length, win_length=win_length, norm=None)
            tempo_vector_auto = np.sum(tempogram_auto, axis=1)
            tempo_vector_auto = [x/tempogram_auto.shape[1] for x in tempo_vector_auto]
            peak_id_auto = sps.argrelmax(np.array(tempo_vector_auto))
            auto_frequency = librosa.tempo_frequencies(nbins=len(tempo_vector_auto))
            tempo_vector_auto = [(tempo_vector_auto[pid], pid) for pid in peak_id_auto[0]]
            tempo_vector_auto = sorted(tempo_vector_auto, key=lambda x: x[0], reverse=True)

            T1T2_auto = (auto_frequency[tempo_vector_auto[0][1]], auto_frequency[tempo_vector_auto[1][1]])
            s_auto = (tempo_vector_auto[0][0])/(tempo_vector_auto[0][0] + tempo_vector_auto[1][0])
            reference_bpm = ground_truth[Path(audio_path).stem]
            auto_scores["p"].append(utils.P_SCORE(T1T2_auto, s_auto, reference_bpm))
            auto_scores["alotc"].append(utils.ALOTC_SCORE(T1T2_auto, reference_bpm))


            tempogram_fourier = librosa.feature.fourier_tempogram(onset_envelope=oenv, sr=sr, win_length=win_length, hop_length=hop_length)
            tempo_vector_fourier = np.sum(np.abs(tempogram_fourier), axis=1)
            tempo_vector_fourier = [x/tempogram_fourier.shape[1] for x in tempo_vector_fourier]
            peak_id_fourier = sps.argrelmax(np.array(tempo_vector_fourier))
            fourier_frequency = librosa.fourier_tempo_frequencies(hop_length = hop_length, win_length=win_length)
            tempo_vector_fourier = [(tempo_vector_fourier[pid], pid) for pid in peak_id_fourier[0]]
            tempo_vector_fourier = sorted(tempo_vector_fourier, key=lambda x: x[0], reverse=True)

            T1T2_fourier = (auto_frequency[tempo_vector_fourier[0][1]], fourier_frequency[tempo_vector_fourier[1][1]])
            s_fourier = (tempo_vector_fourier[0][0])/(tempo_vector_fourier[0][0] + tempo_vector_fourier[1][0])
            reference_bpm = ground_truth[Path(audio_path).stem]
            fourier_scores["p"].append(utils.P_SCORE(T1T2_fourier, s_fourier, reference_bpm))
            fourier_scores["alotc"].append(utils.ALOTC_SCORE(T1T2_fourier, reference_bpm))
        print(f"---- wl: {wl}")
        print(f"avg fourier alotc score: {sum(fourier_scores['alotc']) / len(fourier_scores['alotc'])}")
        print(f"avg auto alotc score: {sum(auto_scores['alotc']) / len(auto_scores['alotc'])}")

--------ChaCha--------
---- wl: 4
avg fourier alotc score: 0.009009009009009009
avg auto alotc score: 0.9099099099099099
---- wl: 8
avg fourier alotc score: 0.009009009009009009
avg auto alotc score: 0.9099099099099099
--------Jive--------
---- wl: 4
avg fourier alotc score: 0.31666666666666665
avg auto alotc score: 0.8833333333333333
---- wl: 8
avg fourier alotc score: 0.26666666666666666
avg auto alotc score: 0.8833333333333333
--------Quickstep--------
---- wl: 4
avg fourier alotc score: 0.4268292682926829
avg auto alotc score: 0.926829268292683
---- wl: 8
avg fourier alotc score: 0.0975609756097561
avg auto alotc score: 0.926829268292683
--------Rumba--------
---- wl: 4
avg fourier alotc score: 0.09183673469387756
avg auto alotc score: 0.9081632653061225
---- wl: 8
avg fourier alotc score: 0.30612244897959184
avg auto alotc score: 0.9081632653061225
--------Samba--------
---- wl: 4
avg fourier alotc score: 0.7325581395348837
avg auto alotc score: 0.7093023255813954
---- wl: 8
avg f

### Q3 (Not Finished)

In [None]:
for genre, audio_paths in filepaths.items():
    auto_scores = {"p":[], "alotc":[]}
    fourier_scores = {"p":[], "alotc":[]}

    for audio_path in audio_paths:
        y, sr = librosa.load(audio_path)
        hop_length = 512
        oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

        tempogram_auto = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length, norm=None)
        tempo_vector_auto = np.sum(tempogram_auto, axis=1)
        tempo_vector_auto = [x/tempogram_auto.shape[1] for x in tempo_vector_auto]
        peak_id_auto = sps.argrelmax(np.array(tempo_vector_auto))
        auto_frequency = librosa.tempo_frequencies(len(tempo_vector_auto))
        tempo_vector_auto = [(tempo_vector_auto[pid], pid) for pid in peak_id_auto[0]]
        tempo_vector_auto = sorted(tempo_vector_auto, key=lambda x: x[0], reverse=True)

        T1T2_auto = (auto_frequency[tempo_vector_auto[0][1]], auto_frequency[tempo_vector_auto[1][1]])


        tempogram_fourier = librosa.feature.fourier_tempogram(onset_envelope=oenv, sr=sr, hop_length=hop_length)
        tempo_vector_fourier = np.sum(np.abs(tempogram_fourier), axis=1)
        tempo_vector_fourier = [x/tempogram_fourier.shape[1] for x in tempo_vector_fourier]
        peak_id_fourier = sps.argrelmax(np.array(tempo_vector_fourier))
        fourier_frequency = librosa.fourier_tempo_frequencies(hop_length = hop_length)
        tempo_vector_fourier = [(tempo_vector_fourier[pid], pid) for pid in peak_id_fourier[0]]
        tempo_vector_fourier = sorted(tempo_vector_fourier, key=lambda x: x[0], reverse=True)

        T1T2_fourier = (auto_frequency[tempo_vector_fourier[0][1]], fourier_frequency[tempo_vector_fourier[1][1]])

        print(f"gt: {ground_truth[Path(audio_path).stem]}, auto: {T1T2_auto}, fourier: {T1T2_fourier}")

gt: 124, auto: (63.02400914634146, 123.046875), fourier: (69.83741554054055, 504.6844482421875)
gt: 124, auto: (123.046875, 61.5234375), fourier: (69.83741554054055, 491.2261962890625)
gt: 124, auto: (61.5234375, 123.046875), fourier: (69.83741554054055, 497.955322265625)
gt: 124, auto: (123.046875, 61.5234375), fourier: (69.83741554054055, 497.955322265625)
gt: 124, auto: (123.046875, 61.5234375), fourier: (69.83741554054055, 497.955322265625)
gt: 124, auto: (61.5234375, 123.046875), fourier: (34.918707770270274, 248.9776611328125)
gt: 124, auto: (123.046875, 61.5234375), fourier: (69.83741554054055, 497.955322265625)
gt: 136, auto: (135.99917763157896, 67.99958881578948), fourier: (64.599609375, 538.330078125)
gt: 123, auto: (123.046875, 60.09265988372093), fourier: (71.77734375, 484.4970703125)
gt: 128, auto: (63.02400914634146, 258.3984375), fourier: (67.99958881578948, 504.6844482421875)
gt: 128, auto: (129.19921875, 64.599609375), fourier: (67.99958881578948, 511.41357421875)
gt:

# Task 2: DP for beat tracking

In [15]:
import mir_eval

beats_ground_truth = dict()
for path in glob.glob('../Ballroom/BallroomAnnotations-master/*.beats'):
    with open(path) as f:
        lines = f.readlines()
        beats_ground_truth[Path(path).stem] = [float(line.split()[0]) for line in lines]

In [16]:
for genre, audio_paths in filepaths.items():
    f_scores = []
    for audio_path in audio_paths:
        y, sr = librosa.load(audio_path)
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
        estimated_beats = librosa.frames_to_time(beats, sr=sr)
        reference_beats = beats_ground_truth[Path(audio_path).stem]
        f_scores.append(mir_eval.beat.f_measure(np.array(reference_beats), estimated_beats))
    print(f"{genre} f-score: {sum(f_scores)/len(f_scores)}")

ChaCha f-score: 0.9032701240928692
Jive f-score: 0.6652708585730417
Quickstep f-score: 0.6215061098650467
Rumba f-score: 0.7925994885206332
Samba f-score: 0.5623747946920912
Tango f-score: 0.8031221442332775


KeyboardInterrupt: 