In [22]:
import torch
from torch import tensor, nn, optim
import torch.nn.functional as F

In [23]:
n_embed = 256
block_size = 32
bs= 16
vocab_size = 500
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [29]:
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader
from pathlib import Path
from transformers import AutoTokenizer

# Creating a multitrack tokenizer, read the doc to explore all the parameters
config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True)
tokenizer = REMI(config)

directories = ['train', 'test']

# Train the tokenizer with Byte Pair Encoding (BPE)
files_paths = list(Path("/notebooks/classical-music-gen/midis_train").glob("**/*.midi")), list(Path("/notebooks/classical-music-gen/midis_test").glob("**/*.midi"))

tokenizer.train(vocab_size=vocab_size, files_paths=files_paths[0])
tokenizer.save(Path("tokenizer", "tokenizer.json"))
# And pushing it to the Hugging Face hub (you can download it back with .from_pretrained)
tokenizer.push_to_hub("ABicGrill/miditok_tokenizer", private=True, token="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")


# Split MIDIs into smaller chunks for training
for i, nm in enumerate(directories):
    dataset_chunks_dir = Path(f"chunks_{nm}")
    split_files_for_training(
        files_paths=files_paths[i],
        tokenizer=tokenizer,
        save_dir=dataset_chunks_dir,
        max_seq_len=block_size+1,
    )


# Create a Dataset, a DataLoader and a collator to train a model
ds_trn = DatasetMIDI(
    files_paths=list(Path('chunks_train').glob("**/*.midi")),
    tokenizer=tokenizer,
    max_seq_len=block_size+1,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

ds_val = DatasetMIDI(
    files_paths=list(Path('chunks_test').glob("**/*.midi")),
    tokenizer=tokenizer,
    max_seq_len=block_size+1,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=False)
dl_trn = DataLoader(ds_trn, batch_size=bs, collate_fn=collator, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=bs * 10, collate_fn=collator, shuffle=True)

len(ds_trn), len(ds_val)

  super().__init__(tokenizer_config, params)







Splitting music files (chunks_train): 100%|██████████| 132/132 [00:11<00:00, 11.09it/s]
Splitting music files (chunks_test): 100%|██████████| 5/5 [00:00<00:00, 11.94it/s]


(38007, 1475)

In [None]:
ds_trn[0]cab_sizeb_val1]

{'input_ids': tensor([[304, 276, 278,  37, 370,  53, 235, 176, 305, 329,  56, 401, 287, 241,
           56, 224, 182, 242, 298,  66, 278,  27, 230,  39, 241,  63, 450, 250,
          224, 186, 276, 278,  41],
         [332, 264, 418, 312, 339, 181, 269, 299, 184, 265, 229,  56, 354, 189,
          325, 368,  57, 102, 117, 194, 285, 266, 197, 263, 418, 198, 265, 306,
          203, 293, 357, 246, 259],
         [273, 255, 381, 178, 258, 323, 182, 247,  99, 128, 186, 261, 324, 189,
          248, 319, 193, 256,  99, 122, 270,  95, 122, 194, 325,  97, 123, 198,
          255, 471, 202, 258, 306],
         [246, 240, 355,  39, 495, 253, 224, 177, 260, 244,  44, 229,  55, 343,
          179, 261, 486, 237, 480, 251, 292,  53, 468, 184, 238, 235, 185, 253,
          277, 187, 275, 236, 189],
         [273, 261, 363,  54, 300, 177, 240, 363,  47, 313, 181, 257, 322, 264,
          323, 182, 243, 299, 185, 261, 363,  54, 300, 188, 251, 363,  56, 334,
          191, 243, 322, 264, 343],
       

In [48]:
dropout = 0.1
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.k = nn.Linear(n_embed, head_size)
        self.q = nn.Linear(n_embed, head_size)
        self.v = nn.Linear(n_embed, head_size)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x) # B, T, head_size
        q = self.q(x) # B, T, head_size
        v = self.k(x) # B, T, head_size
        
        wei = k @ q.transpose(-2, -1) * self.head_size ** -0.5
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        wei = self.dropout(wei.softmax(-1))
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads):
        super().__init__()
        self.head_size = n_embed // num_heads 
        self.heads = nn.ModuleList([Head(self.head_size) for i in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        
        self.dropout = nn.Dropout(dropout)

    
    def forward(self, x):
        out = self.proj(torch.cat([h(x) for h in self.heads], dim=-1))
        return self.dropout(out)
    
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_embed),
            nn.ReLU(),
            nn.Linear(n_embed, n_embed),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        out = self.net(x)
        return out

class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        self.m_head = MultiHeadAttention(n_heads)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = x + self.m_head(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return self.dropout(x)
    

class MusicModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embs = nn.Embedding(vocab_size, n_embed)
        self.pos_embs = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(
            Block(4),
            Block(4),
        )
        self.lm_head = nn.Linear(n_embed, vocab_size)
        
    def forward(self, x, targets=None):
        B, T = x.shape
        token_embs = self.token_embs(x) # B, T, C
        pos_embs = self.pos_embs(torch.arange(T).to(device)) # T, C

        x = token_embs + pos_embs
        x = self.blocks(x)
        out= self.lm_head(x)
        if targets is not None:
            B, T, C = out.shape
            out = out.view(B * T, C)
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(out, targets)
            return out, loss
            
        return out, None
    
    def generate(self, idx, max_tokens):
        for i in range(max_tokens):
            idx_cond = idx[:, -block_size:]
            out, loss = self(idx_cond)
            
            out = out[:,-1,:]
            probs = out.softmax(-1)
            preds = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, preds), dim=-1)
            
        return idx

In [27]:
model = MusicModel().to(device)

In [28]:
opt = optim.AdamW(model.parameters(), lr=0.002)
n_epochs = 4
for epoch in range(n_epochs):
    for i, batch in enumerate(dl_trn):
        ids = batch['input_ids']
        xb = ids[:, :block_size].to(device)
        yb = ids[:, 1:].to(device)
        out, loss = model(xb, yb)
        
        loss.backward()
        with torch.no_grad():
            opt.step()
            opt.zero_grad()
            if i % int(len(dl_trn)/10) == 0 or i == len(dl_trn) -1:
                print(f'trn: {loss.item()}', end=' ')
                val_losses = []
                for batch in dl_val:
                    ids = batch['input_ids']
                    xb = ids[:, :block_size].to(device)
                    yb = ids[:, 1:].to(device)
                    out_val, loss_val = model(xb, yb)
                    val_losses.append(loss_val)
                print(f'val: {tensor(val_losses).mean()}')
                    

IndexError: list index out of range

In [None]:
for i in range(10):
    tokenizer(model.generate(torch.ones((1, 1), dtype=torch.long, device=device), max_tokens=300)[0]).dump_midi(f"recordings/bruh{i}.mid")

In [None]:
score = tokenizer(model.generate(torch.ones((1, 1), dtype=torch.long), max_tokens=3000)[0])

In [206]:
score = tokenizer(model.generate(torch.ones((1, 1), dtype=torch.long), max_tokens=30)[0])
score

Score(ttype=Tick, tpq=8, begin=0, end=77, tracks=1, notes=5, time_sig=1, key_sig=0, markers=0)

In [226]:
tokenizer(score)

TokSequence(tokens=['Bar_None', 'Bar_None', 'Position_5', 'Program_0', 'Pitch_66', 'Velocity_55', 'Duration_5.0.4', 'Program_0', 'Pitch_62', 'Velocity_23', 'Duration_2.5.8', 'Position_6', 'Program_0', 'Pitch_61', 'Velocity_47', 'Duration_1.1.8', 'Position_9', 'Program_0', 'Pitch_66', 'Velocity_31', 'Duration_0.1.8', 'Program_0', 'Pitch_59', 'Velocity_39', 'Duration_0.4.8'], ids=[4, 4, 178, 280, 50, 99, 144, 280, 46, 95, 129, 179, 280, 45, 98, 117, 182, 280, 50, 96, 109, 280, 43, 97, 112], bytes='', events=[Event(type=Bar, value=None, time=0, desc=0), Event(type=Bar, value=None, time=32, desc=0), Event(type=Position, value=5, time=37, desc=37), Event(type=Program, value=0, time=37, desc=77), Event(type=Pitch, value=66, time=37, desc=77), Event(type=Velocity, value=55, time=37, desc=55), Event(type=Duration, value=5.0.4, time=37, desc=40 ticks), Event(type=Program, value=0, time=37, desc=58), Event(type=Pitch, value=62, time=37, desc=58), Event(type=Velocity, value=23, time=37, desc=23),

In [230]:
from symusic import Score

Score('bruh.mid')

Score(ttype=Tick, tpq=8, begin=0, end=306, tracks=2, notes=2, time_sig=1, key_sig=0, markers=0)

In [231]:
tokenizer(Score('bruh.mid'))

[4, 174, 398, 72, 104, 149, 4, 4, 4, 4, 4, 4, 4, 179, 378, 11, 102, 162]