In [2]:
# Quickstart
# Beat Tracking Example
import librosa
import numpy as np

In [3]:
# 1. Get the file path to an included audio example
#    (retrieves the path to the 'nutcracker' audio sample file included with librosa)
filename = librosa.example('nutcracker')

In [4]:
# 2. Load the audio as a waveform 'y'
#    Store the sampling rate as 'sr'
#    (the audio file is loaded and decoded as a time series)
y, sr = librosa.load(filename)

In [5]:
# 3. Run the default beat tracker
#    It yields the estimated tempo and the frame indices of the beat events
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

In [6]:
# beat_track may hand back the tempo as a one-element NumPy array instead of a
# plain float (NOTE(review): believed to be the case for newer librosa
# releases - confirm against the installed version). An array cannot be
# formatted with '.2f', so reduce it to a Python float before printing.
# np.ravel accepts both a scalar and an array.
tempo_bpm = float(np.ravel(tempo)[0])
print('Estimated tempo : {: .2f} beats per minute'.format(tempo_bpm))

Estimated tempo :  107.67 beats per minute


In [7]:
# 4. Map the frame index of every beat event to its timestamp
beat_times = librosa.frames_to_time(frames=beat_frames, sr=sr)

In [8]:
# Show the beat timestamps obtained from the frame-to-time conversion
print(beat_times)

[  1.18421769   1.71827664   2.32199546   2.87927438   3.45977324
   4.01705215   4.59755102   5.13160998   5.7353288    6.29260771
   6.84988662   7.40716553   7.9876644    8.54494331   9.12544218
   9.65950113  10.21678005  10.72761905  11.28489796  11.79573696
  12.32979592  12.86385488  13.42113379  13.95519274  14.4892517
  15.02331066  15.55736961  16.09142857  16.62548753  17.15954649
  17.69360544  18.25088435  18.80816327  19.31900227  19.87628118
  20.38712018  20.92117914  21.4552381   21.98929705  22.52335601
  23.05741497  23.59147392  24.12553288  24.65959184  25.19365079
  25.72770975  26.26176871  26.81904762  27.35310658  27.88716553
  28.44444444  29.00172336  29.55900227  30.11628118  30.67356009
  31.20761905  31.78811791  32.34539683  32.85623583  33.36707483
  33.90113379  34.43519274  34.94603175  35.45687075  35.99092971
  36.52498866  37.03582766  37.56988662  38.12716553  38.66122449
  39.2185034   39.75256236  40.30984127  40.84390023  41.40117914
  41.935238

In [18]:
# Advanced Usage
# Feature extraction example
import numpy as np
import librosa

In [19]:
# Fetch the 'nutcracker' example recording and load it:
# waveform goes into 'y', sampling rate into 'sr'
y, sr = librosa.load(librosa.example('nutcracker'))

In [20]:
# Set the hop length, expressed in samples;
# at a 22050 Hz sampling rate, 512 samples ~= 23 ms
hop_length = 512

In [21]:
# Separate the signal into its harmonic and percussive parts,
# giving two waveforms: y_harmonic and y_percussive
y_harmonic, y_percussive = librosa.effects.hpss(y)

In [23]:
# Beat track on the percussive signal.
# hop_length is passed explicitly so that the beat frame indices live on the
# same frame grid as the features (MFCCs, chroma) that are synchronized to
# these beats further down. Relying on the tracker's own default would
# silently misalign beats and features as soon as hop_length is changed.
tempo, beat_frames = librosa.beat.beat_track(
    y=y_percussive,
    sr=sr,
    hop_length=hop_length,
)
print(tempo)
print(beat_frames)

107.666015625
[  51   74  100  124  149  173  198  221  246  270  295  319  344  367
  393  415  439  461  485  507  531  554  577  600  623  646  669  692
  716  739  762  785  807  831  855  877  900  923  946  969  993 1015
 1038 1061 1085 1107 1131 1155 1178 1201 1225 1248 1272 1296 1320 1344
 1368 1392 1414 1437 1460 1483 1505 1527 1550 1573 1595 1618 1641 1664
 1688 1712 1735 1758 1782 1806 1829 1852 1876 1900 1924 1947 1971 1994
 2018 2041 2064 2087 2110 2132 2155 2177 2200 2222 2244 2266 2289 2312
 2335 2357 2380 2404 2427 2451 2474 2498 2521 2544 2568 2592 2615 2638
 2661 2684 2706 2728 2752 2775 2797 2819 2842 2864 2887 2910 2933 2955
 2978 3001 3024 3050 3075 3100 3125 3150 3174 3199 3223 3246 3269 3295
 3317 3340 3362 3385 3409 3432 3456 3480 3508 3533 3560 3585 3612 3637
 3663 3688 3714 3740 3766 3791 3816 3841 3866 3890 3915 3940 3965 3989
 4014 4038 4063 4087 4111 4135 4159 4183 4208 4231 4255 4279 4304 4327
 4351 4374 4399 4422 4447 4471 4495 4519 4543 4567 4590 4614 46

In [24]:
# Compute 13 MFCC features from the raw (unseparated) signal,
# using the hop length configured above
mfcc = librosa.feature.mfcc(
    y=y,
    sr=sr,
    n_mfcc=13,
    hop_length=hop_length,
)
print(mfcc)

[[-602.36005 -602.36005 -602.36005 ... -602.36005 -602.36005 -602.36005]
 [   0.         0.         0.      ...    0.         0.         0.     ]
 [   0.         0.         0.      ...    0.         0.         0.     ]
 ...
 [   0.         0.         0.      ...    0.         0.         0.     ]
 [   0.         0.         0.      ...    0.         0.         0.     ]
 [   0.         0.         0.      ...    0.         0.         0.     ]]


In [25]:
# And the first-order differences (delta features) of the MFCCs
mfcc_delta = librosa.feature.delta(mfcc)
print(mfcc_delta)

[[ 5.5717695e-14  5.5717695e-14  5.5717695e-14 ... -6.2357583e-03
  -6.2357583e-03 -6.2357583e-03]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ... -3.2735988e-03
  -3.2735988e-03 -3.2735988e-03]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  6.3850912e-03
   6.3850912e-03  6.3850912e-03]
 ...
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ... -6.9502019e-03
  -6.9502019e-03 -6.9502019e-03]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ... -7.6169395e-03
  -7.6169395e-03 -7.6169395e-03]
 [ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  1.2936007e-03
   1.2936007e-03  1.2936007e-03]]


In [26]:
# Stack the MFCCs on top of their deltas, then synchronize between beat events.
# This time, we'll use the mean value (default) instead of median.
beat_mfcc_delta = librosa.util.sync(
    data=np.vstack((mfcc, mfcc_delta)),
    idx=beat_frames,
)
print(beat_mfcc_delta)

[[-5.96049988e+02 -4.71975800e+02 -4.18740723e+02 ... -2.30209213e+02
  -1.81633041e+02 -4.22515228e+02]
 [ 5.29752207e+00  1.19491829e+02  1.19481026e+02 ...  6.63411026e+01
   6.66919479e+01  7.45729675e+01]
 [ 1.79374278e+00  5.01075478e+01  1.31605749e+01 ... -6.48235626e+01
  -6.93393173e+01 -2.06293182e+01]
 ...
 [-6.54890090e-02 -1.46449879e-01  5.33927120e-02 ... -1.69990063e-01
   6.22177720e-01  1.29084527e-01]
 [-2.32196283e-02 -1.37379736e-01 -1.14388466e-02 ... -2.49835119e-01
   2.49134481e-01  8.30471981e-03]
 [ 2.86480282e-02 -2.01024413e-01 -5.54223843e-02 ...  7.50200152e-02
  -6.15619838e-01  7.62829483e-02]]


In [29]:
# Compute chroma features from the harmonic signal.
# hop_length is passed explicitly: the chroma frames are aggregated with the
# same beat_frames as the MFCCs, so all of them must share one frame grid.
# Leaving it to the function's default only works while hop_length happens to
# equal that default.
chromagram = librosa.feature.chroma_cqt(
    y=y_harmonic,
    sr=sr,
    hop_length=hop_length,
)
print(chromagram)

[[0.5482865  0.55957216 0.34219363 ... 0.08856481 0.2162379  0.49575174]
 [0.18835203 0.10942067 0.1231493  ... 0.16239384 0.15858239 0.2961298 ]
 [0.4294046  0.31900612 0.23598962 ... 0.51924497 0.3996351  0.33668664]
 ...
 [0.7164631  0.48105586 0.24196246 ... 0.11832213 0.15542479 0.3689094 ]
 [0.34954774 0.34639952 0.28646216 ... 0.2515552  0.20143913 0.51555187]
 [1.         1.         1.         ... 0.98473096 0.86376625 1.        ]]


In [30]:
# Collapse the chroma frames that fall between consecutive beat events,
# taking the median of each feature within every beat interval
beat_chroma = librosa.util.sync(
    data=chromagram,
    idx=beat_frames,
    aggregate=np.median,
)
print(beat_chroma)

[[0.3025479  0.03345221 0.10583739 ... 0.15276003 0.07555084 0.08141489]
 [0.25782552 0.04074807 0.08428393 ... 0.09633807 0.12807782 0.09325492]
 [0.39978737 0.05756589 0.09429981 ... 0.4461109  0.18658903 0.13245766]
 ...
 [0.2408627  0.01813981 0.03972691 ... 0.18358055 0.90317076 0.04014793]
 [0.1903084  0.01895034 0.05992787 ... 0.15093869 0.18061489 0.05555063]
 [0.24867898 0.04809323 0.35862568 ... 0.40117818 0.29769832 0.35132235]]


In [31]:
# Finally, combine all beat-synchronous features into one matrix:
# the chroma rows come first, followed by the MFCC and delta rows
beat_features = np.vstack((beat_chroma, beat_mfcc_delta))
print(beat_features)

[[ 0.3025479   0.03345221  0.10583739 ...  0.15276003  0.07555084
   0.08141489]
 [ 0.25782552  0.04074807  0.08428393 ...  0.09633807  0.12807782
   0.09325492]
 [ 0.39978737  0.05756589  0.09429981 ...  0.4461109   0.18658903
   0.13245766]
 ...
 [-0.06548901 -0.14644988  0.05339271 ... -0.16999006  0.6221777
   0.12908453]
 [-0.02321963 -0.13737974 -0.01143885 ... -0.24983512  0.24913448
   0.00830472]
 [ 0.02864803 -0.20102441 -0.05542238 ...  0.07502002 -0.61561984
   0.07628295]]
