In [12]:
import os
from pathlib import Path

from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader

# Shared sequence length: the chunking step and the dataset must agree on it,
# so it is defined once instead of being repeated as a literal.
MAX_SEQ_LEN = 1024

# Creating a multitrack tokenizer, read the doc to explore all the parameters
config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True)
tokenizer = REMI(config)

# Train the tokenizer with Byte Pair Encoding (BPE)
files_paths = list(Path("/notebooks/classical-music-gen/midis_test").glob("**/*.midi"))
if not files_paths:
    # Fail early with a clear message rather than deep inside the training code.
    raise FileNotFoundError("No .midi files found under /notebooks/classical-music-gen/midis_test")
tokenizer.train(vocab_size=2000, files_paths=files_paths)
tokenizer.save(Path("tokenizer", "tokenizer.json"))
# And pushing it to the Hugging Face hub (you can download it back with .from_pretrained)
# SECURITY: the access token must never be written in the notebook. It is read
# from the HF_TOKEN environment variable; when that is unset, None is passed and
# the hub client falls back to the credentials stored by `huggingface-cli login`.
# Any token that was previously committed in this cell must be considered
# compromised and revoked in the Hugging Face account settings.
tokenizer.push_to_hub(
    "ABicGrill/miditok_tokenizer",
    private=True,
    token=os.environ.get("HF_TOKEN"),
)

# Split MIDIs into smaller chunks for training
dataset_chunks_dir = Path("chunks")
split_files_for_training(
    files_paths=files_paths,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=MAX_SEQ_LEN,
)

# Create a Dataset, a DataLoader and a collator to train a model
dataset = DatasetMIDI(
    files_paths=list(dataset_chunks_dir.glob("**/*.midi")),
    tokenizer=tokenizer,
    max_seq_len=MAX_SEQ_LEN,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True)
dataloader = DataLoader(dataset, batch_size=1, collate_fn=collator)

# Iterate over the dataloader to train a model
# (only the first batch is printed here as a sanity check)
for batch in dataloader:
    print(batch)
    break




{'input_ids': tensor([[ 256,  368, 1340, 1615,  481,   35,  391,  168,  412,  335, 1516, 1377,
          293,  472,  242,  721,  325,   99,  114,  826,   99,  113,  356, 1866,
          421, 1866,  249,  567, 1715, 1040,  734,  421,  391,  195,  273,  888,
         1051,  238, 1039,  226,  234,  356,  694,  982,  277,  274,   75, 1486,
         1305,  242, 1116, 1218,  262, 1219,  223,  544,  778,  240, 1595,  217,
           55, 1455,  233,  678,  228, 1297,  753,  264,  730,  258,  174,  292,
         1199,  511,  521,  229,  268,   60, 1073,  266,  614,  272,  544,  258,
         1164,  217,   55, 1002,  293, 1120,  779,  228,  730,  697,  230,  394,
           24,  268,   12,  648,  229, 1199, 1456,  272,  217,    7,  258,  186,
          249,  268,   43,  258, 1544,  274,    9,  260,   21,  318, 1705, 1041,
          718,  281,  318,  191,  368,  355,   47, 1348,  275,   35,  355,   38,
          316, 1112,  224, 1783, 1503,   23,  351,   11,  260,   38, 1348,  316,
          1

  split_files_for_training(


In [13]:
# Sanity check: feed the first sequence of the first batch back through the
# tokenizer and report the type of the object it returns.
for batch in dataloader:
    first_sequence = batch["input_ids"][0]
    score = tokenizer(first_sequence)
    print(type(score))
    break

<class 'symusic.core.ScoreTick'>


In [24]:
# Display the vocabulary held by the trained tokenizer model; the output below
# is a dict mapping token strings to integer ids.
# NOTE(review): presumably the keys are the byte sequences learned by BPE
# training — confirm against the MidiTok documentation.
tokenizer.vocab_model

{'\x82\x9d': 699,
 'áĹT': 970,
 '\x82¯': 1852,
 'èĹM': 922,
 '\x85\x8fÞ': 999,
 '\x85\x8eĹ\\': 856,
 '\x88\x93': 347,
 '\x83\x8e': 219,
 '\x81\x8eæ': 1228,
 '\x89\x8eĹQ': 806,
 'C': 33,
 '\x84\x8eì': 562,
 '\x82\x8eÝ': 899,
 '\x83\x8eĹe': 1595,
 '\x89\x8eĹf': 860,
 '\x84\x8fĹ': 296,
 '\x85\x8f': 251,
 '\x84\x8eÛ': 547,
 '\x88\x8e': 226,
 '\x83\x8eé': 584,
 'Ĺ7': 282,
 '\x89\x8eĹM': 1040,
 'ÛĹ[': 1261,
 '\x82\x8eí': 812,
 '\x93': 106,
 '\x89\x90Ĺl': 1623,
 '>': 28,
 'ìĹM': 1564,
 'ØĹT': 1892,
 '\x84\x90×': 1813,
 'áĹ_': 1932,
 'ĹK': 247,
 '\x83\x8eæ': 582,
 'ÒĹD': 1653,
 'ØĹF': 1896,
 '\x85\x8eĹ[': 599,
 '\x89\x8eĹi': 660,
 '\x8a\x90Ĺ': 351,
 'æĹP': 1704,
 '\x84¦': 1856,
 '\x84\x8eĹJ': 936,
 '\x84\x8eĹV': 862,
 '\x83\x91ĹL': 1841,
 '\x82\x91Ĺ': 352,
 'ÛĹM': 1164,
 'ÝĹY': 1530,
 '\x84\x8eĹe': 1438,
 'ÜĹH': 823,
 '\x86\x8fĹO': 1354,
 'ëĹR': 1561,
 'ĹH': 220,
 '\x80\x92': 956,
 '\x88\x8fê': 1006,
 '▁%ÑĹO': 1235,
 'åĹV': 1412,
 '\x88\x8eÓ': 778,
 '\x89\x8eÖ': 1327,
 '▁%Ò': 375,
 '\x8a\x9b':