```
A Python package to parse and process the MUSDB18 dataset, the largest open-access dataset for music source separation. The tool was originally developed for the Music Separation task as part of the Signal Separation Evaluation Campaign (SiSEC).
```
## 1. Download & Parse the musdb Dataset

In [1]:
# One-time environment setup, kept commented out; uncomment to run these
# shell commands from a notebook cell.
#!sudo apt-get install ffmpeg
#!pip install musdb

import musdb
# Root folder of the MUSDB18 data (relative to the working directory); it is
# passed as `root=` to every musdb.DB(...) call in the visible cells.
musdb_data_path = 'data/musdb18'
# Handle opened without `subsets`/`split` arguments. NOTE(review): it is not
# referenced again in the visible cells — confirm whether it is still needed.
mus_total = musdb.DB(root=musdb_data_path)

## 2. Load Dataset

In [None]:
# The "train" subset is opened twice, once per split name, in the order
# train -> valid; the "test" subset is opened without a split argument.
mus_train, mus_valid = (
    musdb.DB(root=musdb_data_path, subsets="train", split=split_name)
    for split_name in ('train', 'valid')
)
mus_test = musdb.DB(root=musdb_data_path, subsets="test")

## 3. Preprocess the data (train, valid, and test splits)

In [None]:
def save_as_numpy(_audio, _norm, _dest_dir, _target_name, _audio_idx, _track_name,
                  _n_fft, _win_size, _hop_size, _normalize):
    """Save the STFT magnitude and phase of one audio signal as ``.npy`` files.

    The signal is averaged over axis 1, transformed with ``librosa.stft`` and
    split with ``librosa.magphase``. Two arrays are written below
    ``<_dest_dir>/<_target_name>/``, both named
    ``num2str(_audio_idx) + '_' + _track_name`` plus a suffix:

    * ``..._spec``  -- float32 magnitude spectrogram (optionally scaled)
    * ``..._phase`` -- phase component returned by ``librosa.magphase``

    Args:
        _audio: array whose axis 1 is averaged away (presumably the channel
            axis of a ``(samples, channels)`` signal -- confirm with caller).
        _norm: scaling reference. When ``None``, the maximum of this
            signal's own magnitude spectrogram is used instead.
        _dest_dir: root output folder.
        _target_name: sub-folder of ``_dest_dir`` to write into; it must
            already exist.
        _audio_idx: index passed to ``num2str`` to build the file name.
        _track_name: track name used in the file name.
        _n_fft: ``n_fft`` forwarded to ``librosa.stft``.
        _win_size: ``win_length`` forwarded to ``librosa.stft``.
        _hop_size: ``hop_length`` forwarded to ``librosa.stft``.
        _normalize: when truthy, divide the magnitude by the norm.

    Returns:
        The norm that was in effect: ``_norm`` if it was given, otherwise
        the maximum of the magnitude spectrogram.
    """
    audio_mono = _audio.mean(axis=1)

    stft = librosa.stft(audio_mono, n_fft=_n_fft, win_length=_win_size, hop_length=_hop_size)
    # magphase already yields the non-negative magnitude, so no extra np.abs
    # is needed; astype returns a fresh array that is safe to scale in place.
    magnitude, phase = librosa.magphase(stft)
    spectrogram = magnitude.astype(np.float32)

    if _norm is None:
        _norm = spectrogram.max()

    # A zero norm (e.g. an all-silent signal) would turn the whole
    # spectrogram into NaN/inf; leave it unscaled in that case.
    if _normalize and _norm != 0:
        spectrogram /= _norm

    out_dir = os.path.join(_dest_dir, _target_name)
    file_stem = num2str(_audio_idx) + '_' + _track_name
    np.save(os.path.join(out_dir, file_stem + '_spec'), spectrogram)
    np.save(os.path.join(out_dir, file_stem + '_phase'), phase)

    return _norm

In [None]:
# parameters: _win_size, _hop_size
from tqdm import tqdm_notebook
import librosa
import numpy as np
import os
from util import num2str

def preprocess_dataset(_dataset, _dest_dir, _n_fft, _win_size, _hop_size, normalize):
    """Write spectrogram, phase and metadata files for every track of a dataset.

    For each track, the mixture is processed first with ``save_as_numpy``;
    the norm it returns is then reused for the four stems so that mixture and
    stems share one scaling reference. A metadata array ``[norm, track.rate]``
    is saved per track under ``<_dest_dir>/metadata/``.

    Args:
        _dataset: iterable of tracks exposing ``audio``, ``name``, ``rate``
            and ``targets[<stem>].audio``.
        _dest_dir: root output folder; it and its sub-folders are created
            if they do not exist yet.
        _n_fft: ``_n_fft`` forwarded to ``save_as_numpy``.
        _win_size: ``_win_size`` forwarded to ``save_as_numpy``.
        _hop_size: ``_hop_size`` forwarded to ``save_as_numpy``.
        normalize: ``_normalize`` flag forwarded to ``save_as_numpy``.
    """
    targets = ['vocals', 'drums', 'bass', 'other']

    # makedirs(exist_ok=True) creates missing parent folders of a nested
    # destination and also repairs a destination that already exists but
    # lacks some sub-folders (plain os.mkdir fails in both situations).
    for folder in ['mixture'] + targets + ['metadata']:
        os.makedirs(os.path.join(_dest_dir, folder), exist_ok=True)

    print('Preprocess data: ' , str(_dataset))

    for audio_idx, track in enumerate(tqdm_notebook(_dataset)):

        # Passing None as the norm makes save_as_numpy derive it from the
        # mixture itself and return it.
        norm = save_as_numpy(track.audio, None, _dest_dir, 'mixture', audio_idx, track.name,
                             _n_fft, _win_size, _hop_size, normalize)

        # save metadata (the '_spec' suffix is kept so existing readers of
        # these files keep working)
        np.save(os.path.join(_dest_dir, "metadata", num2str(audio_idx) + '_' + track.name + '_spec'),
                np.array([norm, track.rate]))

        for target in targets:
            save_as_numpy(track.targets[target].audio, norm, _dest_dir, target, audio_idx, track.name,
                          _n_fft, _win_size, _hop_size, normalize)

# STFT settings shared by all three runs below.
n_fft, win_size, hop_size = 1024, 512, 256
normalize = False

# (dataset handle, output folder) pairs, processed in this order.
_jobs = [
    (mus_train, 'data/musdb18/preprocessed/plain/train'),
    (mus_valid, 'data/musdb18/preprocessed/plain/valid'),
    (mus_test, 'data/musdb18/preprocessed/plain/test'),
]
for _db, _out_dir in _jobs:
    preprocess_dataset(_db, _out_dir, n_fft, win_size, hop_size, normalize)