# Tempo
Testing librosa tempo detection

In [7]:
import numpy as np
from matplotlib import pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import math
import glob

import warnings
warnings.filterwarnings("ignore")

Let's load a snippet of a song.

In [17]:

SRC_PATH = '**/*Ed*'
START = 60.34
# LEN = 5.0
# LEN = 10.0
LEN = 2.0
# LEN = 20.0
KNOWN_TEMPO = 126

# SRC_PATH = '**/*126 BPM*'
# START = 22.8
# LEN = 5
# KNOWN_TEMPO = 126

# SRC_PATH = '**/*Dua*Rules*'
# # START = 60.8
# # LEN = 5
# # LEN = 10
# START = 80.8
# LEN = 20
# KNOWN_TEMPO = 116

END = START + LEN

src = glob.glob(SRC_PATH)[0]
y, sr = librosa.load(src, sr=48000, offset=START, duration=LEN)
ipd.display('Sample rate {}, Num samples {}, single channel: {}'.format(sr, len(y), len(y.shape)))
ipd.Audio(y, rate=sr)

'Sample rate 48000, Num samples 96000, single channel: 1'

-0.89282227

Now use built in beat track method, and overlay clicks onto detected segment. Note that the derived tempo from the detected beats (e.g. the time between, or spb, seconds per beat, an inverse of bpm, beats per minute) is often better than reported.

In [9]:
HOP_LENGTH = 256
onset_env = librosa.onset.onset_strength_multi(y=y, sr=sr,
                                         hop_length=HOP_LENGTH,
                                         aggregate=np.median, # default is mean
                                         lag=1, # default, unit? "time lag for computing differences"
                                         max_size=1, # default, do not filter freq bins
                                         detrend=False, # default, do not "filter onset strength to remove DC component"
                                         center=True, # Centered frame analysis in STFT, by hop length
                                         )
onset_env = onset_env[..., 0, :]
# HOP_LENGTH = 512
# onset_env = librosa.onset.onset_strength(y=y, sr=sr,
#                                          # hop_length=HOP_LENGTH,
#                                          aggregate=np.median, # default is mean
#                                          lag=1, # default, unit? "time lag for computing differences"
#                                          max_size=1, # default, do not filter freq bins
#                                          detrend=False, # default, do not "filter onset strength to remove DC component"
#                                          center=True, # Centered frame analysis in STFT, by hop length
#                                          )

reported_tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr, units='time',
                                                hop_length=HOP_LENGTH,
                                                tightness=1000, # yikers island, what does this do... good? 800 1000, bad 400 600 1600
                                        # start_bpm=126,
                                    #    trim=False,
                                       )
tempo = 60 / np.average(np.diff(beats))
ipd.display('Reported tempo {}, derived {}, num beats detected {} vs (not quite right...) expected {}'.format(
    reported_tempo, tempo, len(beats), math.floor(KNOWN_TEMPO * LEN / 60.0)))
# ipd.display(beats)
# ipd.display(np.diff(beats))

click_track = librosa.clicks(times=beats, sr=sr, length=len(y))
ipd.Audio(y + click_track, rate=sr)


'Reported tempo 126.40449438202248, derived 127.11864406779662, num beats detected 3 vs (not quite right...) expected 4'

Curiously, the reported tempo from Librosa is often not as good as the one indicated by the detected beats.

In [10]:
reported_tempo = tempo
spb = np.average(np.diff(beats)) # beats per sec
tempo = 60 / spb

ipd.display('Reported tempo {} vs derrived {} vs known {}, spb {}'.format(reported_tempo, tempo, KNOWN_TEMPO, spb))

'Reported tempo 127.11864406779662 vs derrived 127.11864406779662 vs known 126, spb 0.472'

Now let's use that prediction and overlay the would be assumed beats onto the next chunk of the track and see how it sounds.

In [13]:
# never got this right...

doubled, _ = librosa.load(src, sr=sr, offset=START, duration=2.0 * LEN)
shift = LEN
# shift = END - beats[-1] + LEN - START
# shift = LEN + LEN - beats[-1]
shifted_beats = beats + shift
# ipd.display(beats[-2], beats[-1], beats[0] + shift, beats[1] + shift)
extra = np.array([])

# if it seems like we'd be missing one, let's stick it in here
if ((LEN + beats[0] - beats[-1]) / (2 * spb) > 0.85):
    extra = np.array([ (LEN + beats[0] + beats[-1]) / 2.0 ])
ipd.display(beats[-1], extra, beats[0] + LEN)
doubled_click = librosa.clicks(times=np.concatenate((beats, extra, shifted_beats)), sr=sr, length=len(doubled))
ipd.Audio(doubled + doubled_click, rate=sr)

1.0986666666666667

array([1.62666667])

2.1546666666666665

Or choose just one beat and use the derived tempo to create overlayed click track.

In [14]:
times = [beats[0]]
while times[-1] < END:
    times = times + [ times[-1] + spb ]
ipd.Audio(doubled + librosa.clicks(times=times, sr=sr, length=len(doubled)), rate=sr)

This might be good enough?

My shift logic isn't quite right. But assuming it was, sometimes one sounds better, or they're often close. It may also be that longer windows get derived (not reported!) tempo closer to known. Maybe I can use the derived to find the best / a good "anchor" time, and listen clicks around it...