```
A Python package to parse and process the MUSDB18 dataset, the largest open-access dataset for music source separation. The tool was originally developed for the Music Separation task of the Signal Separation Evaluation Campaign (SiSEC).
```
## 1. Download & Parse musdb dataset

In [1]:
# One-time environment setup, kept commented out; uncomment to install.
# ffmpeg is presumably needed to decode the dataset's audio files — TODO confirm.
#!sudo apt-get install ffmpeg
#!pip install musdb

import musdb
# Root folder of the MUSDB18 dataset (relative path, resolved against the working directory).
musdb_data_path = 'data/musdb18'
# Dataset handle created without any subset/split filter.
mus_total = musdb.DB(root=musdb_data_path)

## 2. Load Dataset

In [2]:
# Three handles over the same root, selected through the `subsets` / `split` arguments:
# the "train" subset is divided into a 'train' and a 'valid' split, "test" is taken whole.
mus_train = musdb.DB(root=musdb_data_path, subsets="train", split='train')
mus_valid = musdb.DB(root=musdb_data_path, subsets="train", split='valid')
mus_test = musdb.DB(root=musdb_data_path, subsets="test")

In [3]:
import librosa

## 3. Preprocess training, validation and test data

In [4]:
# Presumably the target spectrogram size (frequency bins x time frames); the
# arithmetic below is consistent with that reading.
dim_f = 2**10
dim_t = 2**8
# librosa.stft returns 1 + n_fft/2 frequency bins, so n_fft = 2046 gives exactly dim_f = 1024 bins.
n_fft=2*(dim_f-1)
# The hop is 1/hop_factor of the FFT window: 2046 // 6 = 341 samples.
hop_factor = 6
hop_length=n_fft//hop_factor
# Presumably the dataset sample rate in Hz; not referenced in the cells shown here — TODO confirm usage.
sampling_rate = 44100
# Sample count that yields exactly dim_t frames when the STFT is taken with center=False:
# 1 + (sampling_size - n_fft) // hop_length == dim_t. Not referenced in the cells shown here.
sampling_size = hop_length * (dim_t+hop_factor-1)


def save_as_numpy(_audio, _norm, _dest_dir, _target_name, _audio_idx, _track_name):
    """Down-mix audio to mono, compute its STFT and save the real/imaginary parts.

    Two files are written with ``np.save`` (which appends ``.npy``):
    ``<_dest_dir>/<_target_name>/<num2str(_audio_idx)>_<_track_name>_real`` and
    the matching ``..._imag``.

    The STFT settings ``n_fft`` and ``hop_length`` are read from the notebook's
    module-level constants. ``np``, ``os`` and ``num2str`` are imported in a
    later cell, so that cell must have run before this function is called.

    Args:
        _audio: array whose axis 1 is averaged away to obtain the mono signal.
        _norm: accepted for interface compatibility; not used by this function.
        _dest_dir: root output folder.
        _target_name: sub-folder of ``_dest_dir`` to write into.
        _audio_idx: track index, formatted with ``num2str`` for the file name.
        _track_name: track name used in the file name.

    Returns:
        None.
    """
    mono_signal = _audio.mean(axis=1)

    # center=False: frames start at sample 0, without the padding librosa adds by default.
    stft_matrix = librosa.stft(mono_signal, n_fft=n_fft, center=False, hop_length=hop_length)

    # Save the real part first, then the imaginary part, under the same file stem.
    for suffix, component in (('_real', stft_matrix.real), ('_imag', stft_matrix.imag)):
        file_name = num2str(_audio_idx) + '_' + _track_name + suffix
        np.save(os.path.join(_dest_dir, _target_name, file_name), component)

In [5]:
# parameters: _win_size, _hop_size
from tqdm import tqdm_notebook
import librosa
import numpy as np
import os
from util import num2str

def preprocess_dataset(_dataset, _dest_dir):
    """Convert every track of a dataset into saved spectrogram arrays.

    For each track, the mixture and each of the four stems ('vocals', 'drums',
    'bass', 'other') are handed to ``save_as_numpy``, and a metadata array
    ``[norm, track.rate]`` is saved under ``<_dest_dir>/metadata``.

    Args:
        _dataset: iterable of tracks. Each track is read through ``.audio``,
            ``.name``, ``.rate`` and ``.targets[<stem>].audio``.
        _dest_dir: output root. It, any missing parent folders, and the
            'mixture', stem and 'metadata' sub-folders are created if absent.

    Returns:
        None.
    """
    targets = ['vocals', 'drums', 'bass', 'other']

    # Build the output tree. makedirs(exist_ok=True) creates missing parents and
    # still fills in sub-folders when _dest_dir already exists. The previous
    # mkdir-only-if-root-is-missing logic failed in both situations.
    for folder in ['mixture'] + targets + ['metadata']:
        os.makedirs(os.path.join(_dest_dir, folder), exist_ok=True)

    print('Preprocess data: ' , str(_dataset))

    # NOTE(review): tqdm_notebook is deprecated in recent tqdm releases in favour
    # of tqdm.notebook.tqdm / tqdm.auto.tqdm; kept because it is what the notebook imports.
    for audio_idx, track in enumerate(tqdm_notebook(_dataset)):

        # The mixture goes first; whatever save_as_numpy returns is treated as the
        # track's norm and forwarded to the stem calls below.
        norm = save_as_numpy(track.audio, None, _dest_dir, 'mixture', audio_idx, track.name)

        # save metadata
        # NOTE(review): when norm is None the array has dtype=object, so np.save
        # pickles it and readers need np.load(..., allow_pickle=True).
        np.save(os.path.join(_dest_dir, "metadata", num2str(audio_idx) + '_' + track.name + '_spec'), np.array([norm, track.rate]))

        for target in targets:
            save_as_numpy(track.targets[target].audio, norm, _dest_dir, target, audio_idx, track.name)



# Run the preprocessing once per split, in train -> valid -> test order,
# writing each split to its own output folder.
for split_db, split_dest in (
    (mus_train, 'data/musdb18/preprocessed/fullspec/train'),
    (mus_valid, 'data/musdb18/preprocessed/fullspec/valid'),
    (mus_test, 'data/musdb18/preprocessed/fullspec/test'),
):
    preprocess_dataset(split_db, split_dest)

Preprocess data:  <musdb.DB object at 0x7fe0e7fa1ef0>


HBox(children=(IntProgress(value=0, max=86), HTML(value='')))


Preprocess data:  <musdb.DB object at 0x7fe0e7fa1f28>


HBox(children=(IntProgress(value=0, max=14), HTML(value='')))


Preprocess data:  <musdb.DB object at 0x7fe0e7fa1f98>


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


