In [1]:
import numpy as np
import librosa.display
import librosa
from pathlib import Path

# from rp_extract.py import rp_extract
from transformers import JukeboxVQVAE
import torch
import tqdm

from pathlib import Path


In [2]:
# datapath = Path('data_npy/gtzan')
# datapath = Path('data_npy/fma_small')
datapath = Path('data_npy/fma_small_22k')

# Read the three training arrays stored under `datapath`: the waveforms, their
# labels, and the sample rate recorded for each waveform.
data, labels, sr = (
    np.load(datapath / fname)
    for fname in ('data_train.npy', 'labels_train.npy', 'sr_train.npy')
)

# Last expression of the cell: show the three shapes as a tuple.
tuple(arr.shape for arr in (data, labels, sr))

((800, 1321438), (800,), (800,))

## MFCC (Mel-Frequency Cepstral Coefficients)


In [3]:
# One flattened MFCC matrix per clip, computed with that clip's own sample rate,
# stacked into a 2-D (clips x features) array.
mfcc = np.stack(
    [librosa.feature.mfcc(y=d, sr=sr0).ravel() for d, sr0 in zip(data, sr)]
)
np.save(datapath / 'mfcc.npy', mfcc)

print(mfcc.shape)

# The features are on disk now; release the in-memory copy.
del mfcc

(800, 51620)


## Zero Crossing Rate

In [4]:
zero = []

for d in data:
    zero.append(
        librosa.feature.zero_crossing_rate(y=d, frame_length=2048, hop_length=512)
    )

# Flatten each clip's result before stacking (same approach as the other feature
# cells). A bare `.squeeze()` on the stacked array would also drop the clip axis
# whenever `data` holds a single clip, yielding a 1-D array instead of (1, n_frames).
zero = np.stack([z.ravel() for z in zero])
np.save(datapath / 'zero.npy', zero)
print(zero.shape)

# The features are on disk now; release the in-memory copy.
del zero

(800, 2581)


## Tempo

In [5]:
from librosa.feature import rhythm

hop_length = 512  # hop (in samples) shared by the onset envelope and the tempo estimate
tempo = []

for d, sr0 in zip(data, sr):
    # The tempo estimate is derived from the clip's onset-strength envelope.
    oenv = librosa.onset.onset_strength(y=d, sr=sr0, hop_length=hop_length)
    estimate = rhythm.tempo(onset_envelope=oenv, sr=sr0, hop_length=hop_length)
    # Keep only the first value of the returned estimate.
    tempo.append(estimate[0])

# One row per clip, each holding the flattened tempo value.
tempo = np.stack([np.ravel(value) for value in tempo])
np.save(datapath / 'tempo.npy', tempo)

print(tempo.shape)


(800, 1)


## Spectral Contrast

In [6]:
#y_harmonic = librosa.effects.hpss(data)
sc = []

for clip, rate in zip(data, sr):
    # Spectral contrast of one clip at its own sample rate, flattened to 1-D.
    contrast = librosa.feature.spectral_contrast(y=clip, sr=rate)
    sc.append(contrast.ravel())

# Rows are clips, columns are the flattened contrast values.
sc = np.stack(sc)
np.save(datapath / 'sc.npy', sc)

print(sc.shape)



(800, 18067)


## STFT

In [7]:
stft = []

n_fft = 2048
hop_length = 1024

print(data.shape)

# Neither librosa.stft nor amplitude_to_db is given a sample rate here (window
# and hop are specified in samples), so iterate over the waveforms alone
# instead of unpacking an unused `sr0` from zip(data, sr).
for d in data:
    magnitude = np.abs(librosa.stft(y=d, hop_length=hop_length, n_fft=n_fft))
    # Convert the magnitude spectrogram to dB and flatten it immediately so the
    # list holds 1-D rows ready for stacking.
    stft.append(librosa.amplitude_to_db(magnitude).ravel())

# One row per clip: the flattened dB spectrogram.
stft = np.stack(stft)
np.save(datapath / 'stft.npy', stft)

print(stft.shape)

(800, 1321438)
(800, 1323275)


## Mel Spectrogram

In [8]:
def _flat_mel(clip, rate):
    """Return the mel spectrogram of one clip, flattened to 1-D."""
    return librosa.feature.melspectrogram(y=clip, sr=rate).ravel()


# Stack the per-clip flattened mel spectrograms into a (clips x features) array.
mel = np.stack([_flat_mel(d, sr0) for d, sr0 in zip(data, sr)])
np.save(datapath / 'mel.npy', mel)
print(mel.shape)

(800, 330368)


## SSD (Statistical Spectrum Descriptor)

In [None]:
# from rp_extract import rp_extract

# SSD = []

# for d, sr0 in zip(data, sr):
#     SSD.append(
#         rp_extract(
#             wavedata,  # the two-channel wave-data of the audio-file
#             samplerate,  # the samplerate of the audio-file
#             extract_ssd=True,  # <== extract this feature!
#             transform_db=True,  # apply psycho-acoustic transformation
#             transform_phon=True,  # apply psycho-acoustic transformation
#             transform_sone=True,  # apply psycho-acoustic transformation
#             fluctuation_strength_weighting=True,  # apply psycho-acoustic transformation
#             skip_leadin_fadeout=1,  # skip lead-in/fade-out. value = number of segments skipped
#             step_width=1,
#         )  #
#     )
# SSD = np.stack([f.ravel() for f in SSD])
# np.save("SSD.npy", SSD)

# print(SSD.shape)


## Jukebox embedding

In [None]:
# datapath = Path('data_npy/gtzan')

# Load the training waveforms, labels and sample rates from `datapath` again
# for the Jukebox section.
data, labels, sr = map(
    np.load,
    (
        datapath / 'data_train.npy',
        datapath / 'labels_train.npy',
        datapath / 'sr_train.npy',
    ),
)

data.shape, labels.shape, sr.shape

In [None]:
# The whole batch is resampled in one call using a single source rate, so every
# clip must share that rate; otherwise clips recorded at a different rate would
# be silently resampled by the wrong ratio.
orig_sr = sr[0]
if not np.all(sr == orig_sr):
    raise ValueError(
        f"Expected one sample rate for all clips, found: {np.unique(sr).tolist()}"
    )

# NOTE: this overwrites `data` with the 44.1 kHz version. Re-running this cell
# without reloading `data` first would resample the already-resampled audio.
data = librosa.resample(data, orig_sr=orig_sr, target_sr=44100, axis=1)
data.shape

In [None]:
# Append a trailing singleton axis to the 2-D array, then wrap the result as a
# torch tensor.
data_tensor = torch.from_numpy(np.expand_dims(data, axis=2))
data_tensor.shape

In [None]:
# Instantiate the Jukebox VQ-VAE from the pretrained checkpoint named
# "ArthurZ/jukebox-vqvae".
# NOTE(review): presumably this fetches the weights from the Hugging Face Hub on
# first use — confirm network/cache requirements before running offline.
model = JukeboxVQVAE.from_pretrained("ArthurZ/jukebox-vqvae")

In [None]:
# Encode every clip with the VQ-VAE. `unsqueeze(0)` adds a leading batch axis so
# each clip is encoded on its own; every tensor returned by `model.encode` is
# converted to a flat NumPy array.
# This is pure inference, so autograd is disabled: no graph state is recorded
# for the encoder's forward pass on each clip.
with torch.no_grad():
    embedded = [
        [e.numpy().ravel() for e in model.encode(d.unsqueeze(0))]
        for d in tqdm.tqdm(data_tensor)
    ]

In [None]:
# `embedded` holds, per clip, a list of flattened encodings; gather entries
# 0, 1 and 2 across all clips into the "large", "med" and "small" arrays.
embedding_large, embedding_med, embedding_small = (
    np.array([per_clip[level] for per_clip in embedded]) for level in range(3)
)

tuple(arr.shape for arr in (embedding_large, embedding_med, embedding_small))

In [None]:
# Write each embedding array to its own .npy file under `datapath`.
for fname, tokens in (
    ('jukebox_embedding_large.npy', embedding_large),
    ('jukebox_embedding_med.npy', embedding_med),
    ('jukebox_embedding_small.npy', embedding_small),
):
    np.save(datapath / fname, tokens)