#### REMI Tokenization

In [1]:
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


#### Tokenizer Parameters
We will mostly keep the default parameters. The main changes are that velocity tokens are disabled and program tokens are enabled; chord, rest, and tempo tokens are turned on as well.

In [2]:
# Tokenizer parameters: keyword arguments unpacked into TokenizerConfig below
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),  # Default pitch range
    "beat_res": {(0, 4): 8, (4, 12): 4},  # Beat resolution in samples per beat
    "special_tokens": ["PAD", "BOS", "EOS"],  # Padding, Start of Sequence, End of Sequence (no masking token)
    "use_velocities": False,  # No velocity tokens
    "use_chords": True,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": False,
    "use_programs": True,  # Specifies instrument/MIDI program of the notes
    "use_pitch_drum_tokens": False,
    "num_tempos": 32,  # Number of tempo bins
    "tempo_range": (40, 250),  # (Min, Max)
    "chord_unknown": None,  # Set unknown chords to None
    "programs": list(range(-1, 128)),  # Sequence of MIDI programs to use: -1 through 127
    "one_token_stream_for_programs": False,
    "program_changes": False,  # Place Program token for each note
}
# Build the tokenizer configuration from the parameters above
config = TokenizerConfig(**TOKENIZER_PARAMS)

#### Create Tokenizer

In [3]:
# Create the REMI tokenizer from the configuration built in the previous cell
tokenizer = REMI(config)

# Collect the MIDI files to train on (recursive search under sample_dataset).
# Sorted so the corpus is presented in the same order on every run, regardless
# of the order in which the filesystem happens to list the files.
midi_paths = sorted(Path("sample_dataset").glob("**/*.mid"))
if not midi_paths:
    # Stop here with a clear message rather than handing train() an empty corpus
    raise FileNotFoundError(
        "No .mid files found under 'sample_dataset'; nothing to train the tokenizer on"
    )

# Train the tokenizer with Byte Pair Encoding (BPE), targeting a 30k-token vocabulary
tokenizer.train(vocab_size=30000, model="BPE", files_paths=midi_paths)

# Save the trained tokenizer to tokenizer.json so it can be reloaded later
tokenizer.save("tokenizer.json")

#### Use Tokenizer In Model
To use the tokenizer with a model, load it from the saved parameters file (`tokenizer.json`). The MIDI files can be split into smaller chunks based on token sequence length. Then use both the tokenizer and the MIDI files to create a dataset for the model.

Check the credits for example code to prepare the dataset for training.

In [None]:
# Load tokenizer: rebuild a REMI tokenizer from the "tokenizer.json" parameters
# file (the file written by tokenizer.save in the training cell).
# Note that this rebinds the name `tokenizer`.
tokenizer = REMI(params="tokenizer.json")

##### Credits
https://miditok.readthedocs.io/en/latest/examples.html

https://miditok.readthedocs.io/en/v3.0.1/bases.html

https://github.com/Natooz/MidiTok