# Tempo
Testing librosa tempo detection

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import math
import glob
import os

import warnings
warnings.filterwarnings("ignore")

Let's load a snippet of a song.

In [2]:

SRC_PATH = '**/*Ed*'
START = 60.34
# LEN = 5.0
# LEN = 10.0
LEN = 2.05
# LEN = 20.0
KNOWN_TEMPO = 126

# SRC_PATH = '**/*126 BPM*'
# START = 22.8
# LEN = 5
# KNOWN_TEMPO = 126

# SRC_PATH = '**/*Dua*Rules*'
# # START = 60.8
# # LEN = 5
# # LEN = 10
# START = 80.8
# LEN = 20
# KNOWN_TEMPO = 116

END = START + LEN

src = glob.glob(SRC_PATH)[0]
y, sr = librosa.load(src, sr=48000, offset=START, duration=LEN)
ipd.display('Sample rate {}, Num samples {}, single channel: {}'.format(sr, len(y), len(y.shape)))
ipd.Audio(y, rate=sr)

'Sample rate 48000, Num samples 98400, single channel: 1'

Now use built in beat track method, and then generate a click track at those detected beats and overlay that click audio onto detected segment.

Librosa `beat_track` returns two outputs, the assumed detected tempo or BPM (beats per minute), as well as an array of detected beat events (in seconds time or sample time). We can compare the reported tempo to the tempo inferred by the average time between these beat events.

Also note, depending on the number of detected beat events is often lower than expected (both if you listen for kick drum or clicks in the sample audio or infer number of beats from known tempo and snippet length).

In [3]:
HOP_LENGTH = 256
onset_env = librosa.onset.onset_strength_multi(y=y, sr=sr,
                                         hop_length=HOP_LENGTH,
                                         aggregate=np.median, # default is mean
                                         lag=1, # default, unit? "time lag for computing differences"
                                         max_size=1, # default, do not filter freq bins
                                         detrend=False, # default, do not "filter onset strength to remove DC component"
                                         center=True, # Centered frame analysis in STFT, by hop length
                                         )
onset_env = onset_env[..., 0, :]
# HOP_LENGTH = 512
# onset_env = librosa.onset.onset_strength(y=y, sr=sr,
#                                          # hop_length=HOP_LENGTH,
#                                          aggregate=np.median, # default is mean
#                                          lag=1, # default, unit? "time lag for computing differences"
#                                          max_size=1, # default, do not filter freq bins
#                                          detrend=False, # default, do not "filter onset strength to remove DC component"
#                                          center=True, # Centered frame analysis in STFT, by hop length
#                                          )

reported_tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr, units='time',
                                                hop_length=HOP_LENGTH,
                                                tightness=1000, # yikers island, what does this do... good? 800 1000, bad 400 600 1600
                                        # start_bpm=126,
                                    #    trim=False,
                                       )
derived_tempo = 60 / np.average(np.diff(beats))
ipd.display(('Reported tempo {}, derived avg tempo {}, '
        + 'num beats detected {} vs (not quite right...) expected {}').format(
    reported_tempo, derived_tempo, len(beats), math.floor(KNOWN_TEMPO * LEN / 60.0)))
# ipd.display(beats)
# ipd.display(np.diff(beats))

click_track = librosa.clicks(times=beats, sr=sr, length=len(y))
ipd.Audio(y + click_track, rate=sr)

tempo = derived_tempo

'Reported tempo 126.40449438202248, derived avg tempo 127.11864406779662, num beats detected 3 vs (not quite right...) expected 4'

In the past, the reported tempo from Librosa was often not as good as the one indicated by the detected beats, not sure what may have changed here.

In [4]:
spb = np.average(np.diff(beats)) # beats per sec
tempo = 60 / spb

ipd.display(('Reported tempo {} vs avg derrived {} '
             + 'vs min {} vs max {} vs median {} '
             + 'vs known {}, spb {}').format(
                 reported_tempo, derived_tempo,
                 60 / np.max(np.diff(beats)),
                 60 / np.min(np.diff(beats)),
                 60 / np.median(np.diff(beats)),
                 KNOWN_TEMPO, spb))

'Reported tempo 126.40449438202248 vs avg derrived 127.11864406779662 vs min 126.40449438202248 vs max 127.84090909090908 vs median 127.11864406779662 vs known 126, spb 0.472'

Now let's use that prediction and overlay the would be assumed beats onto the next chunk of the track and see how it sounds. This first method is to duplicate and shift the beat events over, it doesn't sound great, I probably have a bug?

In [5]:
# todo: I don't think I ever got this quite right?...

# first load twice as much audio
doubled, _ = librosa.load(src, sr=sr, offset=START, duration=2.0 * LEN)

shift = LEN
# shift = END - beats[-1] + LEN - START
# shift = LEN + LEN - beats[-1]
shifted_beats = beats + shift
# ipd.display(beats[-2], beats[-1], beats[0] + shift, beats[1] + shift)
extra = np.array([])

# if it seems like we'd be missing one, let's stick it in here
if ((LEN + beats[0] - beats[-1]) / (2 * spb) > 0.85):
    extra = np.array([ (LEN + beats[0] + beats[-1]) / 2.0 ])
ipd.display(beats[-1], extra, beats[0] + LEN)
doubled_click = librosa.clicks(times=np.concatenate((beats, extra, shifted_beats)), sr=sr, length=len(doubled))
ipd.Audio(doubled + doubled_click, rate=sr)

1.0986666666666667

array([1.65166667])

2.2046666666666663

Or choose just one beat (the last one) and use one of the derived tempos as constant spacing to create overlayed click track.

In [6]:
times = [beats[0]]
while times[-1] < END:
    times = times + [ times[-1] + spb ]
ipd.Audio(doubled + librosa.clicks(times=times, sr=sr, length=len(doubled)), rate=sr)

This might be good enough?

My shift logic isn't quite right. But assuming it was, sometimes one sounds better, or they're often close. It may also be that longer windows get derived (not reported!) tempo closer to known. Maybe I can use the derived to find the best / a good "anchor" time, and listen clicks around it...

## Next Steps(?):

- Improve the above, test with multiple sound files (especially ones with multiple songs started, stoppped, transitioned to, etc) and past time windows. (These now ancient notes indicate we can probably improve using the librosa output with even basic / brute-force overlay fitting of the detected beat tempo + sample times?).

- Add visual rendering of waveform, with past window's beat detections and future predictions marked.

- Use real time audio stream. Obviously, this notebook uses pre-baked audio files for quick demo / testing. In this repo, I've started code that uses Python audio lib(s) to listen to a real time audio device stream (like what would be played live, for beat detection and sync), store the samples in a ring buffer, and use the same librosa code to run short-windowed-into-the-past beat detection aglos. Needs revisit + clean up.

- Take that prediction output, and make sure it is sample time syncrhonized / accurate with the real time audio input samples and wall-clock time.

- Fire OSC LX Studio compatible events synchronized to predictions.

- Apply "smoothing". This is where statistics / math nerds might be able to quickly help? I imagine, if I get all the above working, the predictions will vary in accuracy, plus we must remember that anyone can stop a currently playing song and play another one of entirely different tempo. It's luckily not the end of the world if the beat sync is wildly off (I hope), especially for a short time, but it would nice if we could strike a balance that weights the last N calcs / M minutes of tempos, with abrupt changes (stops, starts, new songs, etc.).

- Create LX Studio beat specific FX, namely to test + demo. (Can be done in parallel, before / while actual tempo improvements are being made).

# SCRATCH:

In [7]:
a = [1.3888, 2, 3.37, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 9, 8, 7, 0, 10, 11]
# np.ediff1d(np.r_[0, a == 0, 0]).nonzero()[0].reshape(-1, 2)

def find_zero_runs(a):
    # Create an array that is 1 where a is 0, and pad each end with an extra 0.
    iszero = np.concatenate(([0], np.equal(a, 0), [0]))
    absdiff = np.abs(np.diff(iszero))
    # Runs start and end where absdiff is 1.
    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
    return ranges
zr = find_zero_runs(a)
zr = list(map(list, list(zr)))
ipd.display(zr)

zr = list(filter(lambda x: x[1] - x[0] > 2, zr))
zr

[[3, 9], [12, 16], [19, 20]]

[[3, 9], [12, 16]]

In [8]:
a = [1.3888, 2, 3.37, -1, -2, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 9, 8, 7, 0, 10, 11]
a = np.sign(a)
a = np.diff(a)
a

array([ 0.,  0., -2.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
       -1.,  0.,  0.,  0.,  1.,  0.,  0., -1.,  1.,  0.])

In [9]:
y, sr = librosa.load('/tmp/sigh.wav', sr=48000)
y

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sigh.wav'

In [None]:
sr

In [None]:
zr = find_zero_runs(y)
zr

In [None]:
zero_runs = list(map(list, zr))
if len(zero_runs) >= 4:
    bpm = 60 / ((zero_runs[2][1] - zero_runs[1][1]) / sr)
    beep_len = ((zero_runs[2][0] - zero_runs[1][1] + 1) / sr)
bpm
beep_len