In [None]:
!pip install mido



In [None]:
# Data Exploration

from mido import MidiFile

# Load one madrigal from the mounted Google Drive folder (absolute, Colab-specific path)
midi = MidiFile('/content/drive/MyDrive/Final Project Folder/Midi Files/monteverdi_libri_dei_madrigali_1_10_(c)icking-archive.mid')

# Inspect the structure of the MIDI file: the track count, then for every track its
# name followed by every message it contains (this prints the whole file, so the output is long)
print(f"Number of tracks: {len(midi.tracks)}")
for i, track in enumerate(midi.tracks):
    print(f"\nTrack {i}: {track.name}")
    for msg in track:
        print(msg)

In [None]:
import os

folder_path = "/content/drive/MyDrive/Final Project Folder/Midi Files"

# List all .mid or .midi files (extension match is case-insensitive).
# os.listdir() returns entries in arbitrary order, so the names are sorted: later
# cells pick a file by position (midi_files[3]) and must get the same file on every run.
midi_files = sorted(
    f for f in os.listdir(folder_path) if f.lower().endswith(('.mid', '.midi'))
)

monteverdi_libri_dei_madrigali_1_5_(c)icking-archive.mid
monteverdi_libri_dei_madrigali_1_10_(c)icking-archive.mid
monteverdi_libri_dei_madrigali_2_16_(c)icking-archive.mid
monteverdi_libri_dei_madrigali_3_9_(c)icking-archive.mid
monteverdi_libri_dei_madrigali_4_12_(c)icking-archive.mid
monteverdi_libri_dei_madrigali_4_13_(c)icking-archive.mid


In [None]:
# Tracks 1, 2, 5 and 6: all use the same (weird) singing instrument

# Track 4: trumpet and trombone

# Track 3: nothing

'/content/drive/MyDrive/Final Project Folder/Midi Files/monteverdi_libri_dei_madrigali_1_5_(c)icking-archive.mid'

In [None]:
# Load the file at position 3 of the folder listing. The path is built from
# folder_path rather than repeating the Drive location, so the folder is
# configured in exactly one place.
midi = MidiFile(os.path.join(folder_path, midi_files[3]))

# Inspect the structure of the MIDI file: number of tracks and the name of each track
print(f"Number of tracks: {len(midi.tracks)}")
for i, track in enumerate(midi.tracks):
    print(f"\nTrack {i}: {track.name}")

Number of tracks: 6

Track 0: 

Track 1: Trumpet in C 1

Track 2: Trumpet in C 2

Track 3: Trombone

Track 4: Bass Trombone

Track 5: 


In [None]:
# File names (inside the Drive MIDI folder) of the songs that are tokenized and used
# to build the vocabulary and the training sequences in the cells below.
test_midis = ['monteverdi_libri_dei_madrigali_1_5_(c)icking-archive.mid',
              'monteverdi_libri_dei_madrigali_1_10_(c)icking-archive.mid',
              'monteverdi_libri_dei_madrigali_4_12_(c)icking-archive.mid',
              'monteverdi_libri_dei_madrigali_4_13_(c)icking-archive.mid']

In [None]:
import os
from mido import MidiFile

# NOTE(review): both values are still template placeholders, and neither variable is
# read by any other cell visible in this notebook — confirm whether they are needed.
input_folder = 'path/to/your/midi/folder'
output_folder = 'path/to/your/output/folder'

# Make sure the output folder exists (a relative path, so it is created under the
# current working directory; exist_ok avoids an error when it is already there)
os.makedirs(output_folder, exist_ok=True)

def Midi_File_Input(mids):
    """Turn the messages of a MIDI file into a flat list of string tokens.

    Tracks are visited in order and, inside each track, messages in order.
    Only note_on, note_off, track_name, control_change, program_change and
    key_signature messages yield a token; any other message type is skipped.
    Message timing is not part of any token.

    Args:
        mids: object exposing ``tracks``, an iterable of iterables of messages
            (for example a ``mido.MidiFile``).

    Returns:
        list[str]: one ``<...>`` token per recognised message.
    """
    def _note(m):
        # note_on and note_off share the same field layout
        return f"<{m.type}_{m.channel}_{m.note}_{m.velocity}>"

    formatters = {
        'note_on': _note,
        'note_off': _note,
        'track_name': lambda m: f"<{m.type}_{m.name}>",
        'control_change': lambda m: f"<{m.type}_{m.channel}_{m.control}_{m.value}>",
        # Trailing underscore kept: it is part of the token text the vocabulary is built from
        'program_change': lambda m: f"<{m.type}_{m.program}_>",
        'key_signature': lambda m: f"<{m.type}_{m.key}>",
    }

    token = []
    for track in mids.tracks:
        for message in track:
            render = formatters.get(message.type)
            if render is not None:
                token.append(render(message))
    return token

token_data = {}  # per-song token lists, keyed by file name
all_tokens = []  # Flat list for training


# Process each MIDI file
for filename in test_midis:
    # NOTE(review): the Drive folder is spelled out again here instead of reusing folder_path
    midi_path = '/content/drive/MyDrive/Final Project Folder/Midi Files/' + filename

    midi = MidiFile(midi_path)

    # Tokenize every track of the file into one list of string tokens
    tokens = Midi_File_Input(midi)

    # Add song_start and song_end markers
    song_tokens = ['<song_start>'] + tokens + ['<song_end>']

    # Store in dictionary (per-song); the stored list already includes both markers
    token_data[filename] = song_tokens

    # Append to flat list (for generative model training)
    all_tokens.extend(song_tokens)


Token Data Length analysis

In [None]:
# Number of tokens per song (the <song_start>/<song_end> markers are included),
# listed in the same order as test_midis. A comprehension keeps the loop
# temporaries out of the notebook's global namespace.
token_data_length = [len(token_data[song_name]) for song_name in test_midis]


In [None]:
token_data_length # Observed lengths: [3212, 2739, 4322, 3837]. The songs differ in length, so they are cut to a fixed length and padded further down. NOTE(review): this note originally mentioned a 500-token cut-off, but the training cell uses max_length = 100 — confirm which is intended.

[3212, 2739, 4322, 3837]

Vocab and Numerical Index Creation

In [None]:
from collections import Counter

# One flat list holding every token of every song
all_tokens_flat = [tok for song in token_data.values() for tok in song]

# Tally the tokens; only the set of distinct tokens is used below
token_freq = Counter(all_tokens_flat)

# Number the distinct tokens in sorted (alphabetical) order, starting at 0
vocab = dict(zip(sorted(token_freq), range(len(token_freq))))

# Reverse lookup: id -> token
inv_vocab = dict(zip(vocab.values(), vocab.keys()))


In [None]:
# Encode every song as a list of vocabulary ids, keyed by file name.
# Written as a comprehension so the cell no longer overwrites the notebook-level
# names `filename` and `tokens` as a side effect.
token_ids_data = {
    song_name: [vocab[tok] for tok in song]
    for song_name, song in token_data.items()
}

In [None]:
# Give <pad> the next free id instead of a hard-coded 279, so it can never collide
# with a real token when the set of songs (and therefore the vocabulary size) changes.
# setdefault keeps the cell idempotent: re-running it does not move an assigned id.
vocab.setdefault("<pad>", len(vocab))
# Keep the reverse map in sync so the pad id can be decoded too.
inv_vocab[vocab["<pad>"]] = "<pad>"

In [None]:
# # vocab

# vocab["<song_start>"]



277

Transformer Building

In [None]:
# !pip install lightning

#!pip install pytorch_lightning



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

import lightning as L
from pytorch_lightning import Trainer


In [None]:
# Single-element tensors holding the ids of the two song markers.
# NOTE(review): the TensorDataset that would consume these is commented out further
# down — confirm whether these two tensors are still needed.
inputs = torch.tensor([vocab["<song_start>"]])

outputs = torch.tensor([vocab["<song_end>"]])

{'<control_change_0_100_0>': 0,
 '<control_change_0_101_0>': 1,
 '<control_change_0_10_104>': 2,
 '<control_change_0_10_24>': 3,
 '<control_change_0_10_44>': 4,
 '<control_change_0_10_64>': 5,
 '<control_change_0_10_84>': 6,
 '<control_change_0_12_3>': 7,
 '<control_change_0_38_0>': 8,
 '<control_change_0_6_12>': 9,
 '<control_change_0_7_100>': 10,
 '<control_change_0_7_101>': 11,
 '<control_change_0_7_102>': 12,
 '<control_change_0_7_103>': 13,
 '<control_change_0_7_104>': 14,
 '<control_change_0_7_105>': 15,
 '<control_change_0_7_106>': 16,
 '<control_change_0_7_107>': 17,
 '<control_change_0_7_108>': 18,
 '<control_change_0_7_109>': 19,
 '<control_change_0_7_110>': 20,
 '<control_change_0_7_111>': 21,
 '<control_change_0_7_112>': 22,
 '<control_change_0_7_113>': 23,
 '<control_change_0_7_114>': 24,
 '<control_change_0_7_115>': 25,
 '<control_change_0_7_116>': 26,
 '<control_change_0_7_117>': 27,
 '<control_change_0_7_118>': 28,
 '<control_change_0_7_83>': 29,
 '<control_change_0_7_8

Inputs

In [None]:
# dataset = TensorDataset(inputs, outputs)
# dataloader = DataLoader(dataset, batch_size=1)

Word Embedding

Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

#import lightning as L
from pytorch_lightning import Trainer, LightningModule


Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position information to a batch of token embeddings.

  The table is computed once in ``__init__`` and stored as a buffer (not a
  trainable parameter) of shape ``[1, max_length, d_model]``.
  """

  def __init__(self, d_model, max_length):
    """Pre-compute the position table.

    Args:
      d_model: number of embedding values per token (even or odd).
      max_length: number of positions the table covers; ``forward`` cannot
        handle sequences longer than this.
    """
    super().__init__()

    # Start from zeros; every entry is overwritten below.
    pe = torch.zeros(max_length, d_model)

    # Column vector of positions 0 .. max_length-1, shape [max_length, 1]
    position = torch.arange(start=0, end=max_length, step=1).float().unsqueeze(1)

    # 0, 2, 4, ...: one frequency per (sin, cos) pair of embedding dimensions
    embedding_index = torch.arange(start=0, end=d_model, step=2).float()

    div_term = 1 / torch.tensor(10000.0) ** (embedding_index / d_model)

    # Broadcasts to [max_length, ceil(d_model / 2)]
    angles = position * div_term

    # Even embedding dimensions take the sine, odd dimensions the cosine.
    pe[:, 0::2] = torch.sin(angles)
    # When d_model is odd there is one fewer odd dimension than even ones, so the
    # last frequency has no cosine slot; trimming keeps the assignment valid.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    # Leading batch axis of size 1 so the table broadcasts over the batch.
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, word_embeddings):
    """Add the encodings of the first ``seq_len`` positions to ``word_embeddings``.

    Args:
      word_embeddings: tensor indexed as ``[batch_size, seq_len, d_model]``.

    Returns:
      Tensor of the same shape as ``word_embeddings``.
    """
    seq_len = word_embeddings.size(1)
    return word_embeddings + self.pe[:, :seq_len, :]

Masked Self Attention Value

In [None]:
class Attention(nn.Module):
  """Single-head scaled dot-product attention with an optional mask.

  Works on un-batched ``[seq_len, d_model]`` inputs as well as batched
  ``[batch_size, seq_len, d_model]`` inputs, because every axis is addressed
  from the end of the shape.
  """

  def __init__(self, d_model):
    """
    Args:
      d_model: number of embedding values per token; the query, key and value
        projections are all ``d_model -> d_model``.
    """
    super().__init__()

    # Weight matrices producing the queries, keys and values (no bias term).
    self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
    self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
    self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

    # Negative indices address the last two axes, so the same code is correct with
    # or without a leading batch axis (for 2-D inputs they equal 0 and 1).
    self.row_dim = -2
    self.col_dim = -1

  def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask = None):
    """Compute attention outputs.

    Args:
      encodings_for_q: encodings the queries are computed from.
      encodings_for_k: encodings the keys are computed from.
      encodings_for_v: encodings the values are computed from.
      mask: optional boolean tensor broadcastable to the score matrix; positions
        that are True are excluded from attention.

    Returns:
      Tensor with the same shape as the projected values.
    """
    q = self.W_q(encodings_for_q)
    k = self.W_k(encodings_for_k)
    v = self.W_v(encodings_for_v)

    # Query/key similarities: [..., seq_len_q, seq_len_k]
    sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

    scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

    if mask is not None:
      # A very negative score becomes a weight of ~0 after the softmax.
      scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

    # Normalise over the KEY axis (the last one) so each query's weights sum to 1.
    # Using dim=1 on batched scores would normalise across queries instead, which
    # lets later positions influence earlier ones despite the mask.
    attention_percents = F.softmax(scaled_sims, dim=self.col_dim)

    attention_scores = torch.matmul(attention_percents, v)

    return attention_scores

Residual Connection

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

#from pytorch_lightning import
from pytorch_lightning import Trainer, LightningModule


class DecodeOTrans(LightningModule):
  """Decoder-only transformer with a single masked self-attention layer.

  Pipeline: token embedding -> positional encoding -> masked self-attention ->
  residual add -> linear projection to one logit per vocabulary entry.
  """

  def __init__(self, d_model, max_length, num_tokens = 100):
    """
    Args:
      d_model: number of embedding values per token.
      max_length: number of positions the positional encoding is built for.
      num_tokens: vocabulary size (rows of the embedding table and width of
        the output layer).
    """
    super().__init__()

    # Token id -> embedding vector
    self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)

    # Position information added on top of the embeddings
    self.pe = PositionalEncoding(d_model=d_model, max_length=max_length)

    self.self_attention = Attention(d_model=d_model)

    # Fully connected output layer: d_model features -> one logit per token
    self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)

    self.loss = nn.CrossEntropyLoss()

  def forward(self, token_ids):
    """Map ``[batch_size, seq_len]`` token ids to ``[batch_size, seq_len, num_tokens]`` logits."""
    embedded = self.pe(self.we(token_ids))

    n_positions = token_ids.size(1)
    # True marks what must be hidden: every entry strictly above the diagonal,
    # i.e. positions that come after the one being computed.
    all_ones = torch.ones((n_positions, n_positions), device=token_ids.device)
    mask = (torch.triu(all_ones, diagonal=1) != 0).unsqueeze(0)

    attended = self.self_attention(embedded, embedded, embedded, mask=mask)

    # Residual connection, then projection to vocabulary logits
    return self.fc_layer(embedded + attended)


  def configure_optimizers(self):
    """Adam over all model parameters with a learning rate of 1e-4."""
    optimizer = Adam(self.parameters(), lr=1e-4)
    return optimizer

  def training_step(self, batch, batch_idx):
    """Cross-entropy between the logits at every position and the target token ids."""
    inputs_batch, targets_batch = batch

    logits = self.forward(inputs_batch)
    vocab_size = logits.size(-1)

    # CrossEntropyLoss takes [N, C] logits and [N] targets, so batch and
    # sequence axes are merged into one.
    flat_logits = logits.view(-1, vocab_size)
    flat_targets = targets_batch.view(-1)

    return self.loss(flat_logits, flat_targets)


Training Our Model From Video

In [None]:
# def configure_optimizers(self):
#   return Adam(self.parameters(), lr = 0.1) #Here we are using an Adam Optimiser to train all our parmeters with a learning rate of 0.1 (This is a fast learning rate)

# def training_step(self, batch, batch_idx):

#   input_tokens, labels = batch #Next, we split the training data into inputs and labels

#   output = self.forward(input_tokens[0]) #Then we pass our input tokens into the forward class to get our output

#   loss = self.loss(output, labels[0]) #Next we compare the output to the known labels into our loss function to minimise
#   #Thhis does the softmax for us.

#   return loss

Training Model From GPT

In [None]:


#We want <start_token>, .... ,<end_token>

# <start_token> ..... <pad>, <end_token>

In [None]:
max_length = 100
d_model = 128
max_token_count = max_length - 2  # for <song_start> and <song_end>
pad_token_id = vocab["<pad>"]
song_start_id = vocab["<song_start>"]
song_end_id = vocab["<song_end>"]

input_sequences = []
target_sequences = []

for tokens in token_data.values():
    # Use THIS song's tokens. Slicing the flat all_tokens list here would make every
    # training example the same first few tokens of the first song.
    # The stored lists already carry the song markers, so drop them before
    # truncating; the markers are re-added explicitly below.
    body_tokens = [t for t in tokens if t not in ("<song_start>", "<song_end>")]

    # Truncate the tokens to fit within max_token_count
    midi_tokens = body_tokens[:max_token_count]

    # Full token sequence with special tokens
    full_tokens = [song_start_id] + [vocab[t] for t in midi_tokens] + [song_end_id]

    # Next-token prediction: the target is the input shifted left by one
    input_ids = full_tokens[:-1]
    target_ids = full_tokens[1:]

    # Calculate padding length
    pad_len = max_length - len(input_ids)

    # Pad at the end AFTER <song_end>, using the dedicated <pad> id so that padding
    # is not confused with a real end-of-song marker.
    input_ids += [pad_token_id] * pad_len
    target_ids += [pad_token_id] * pad_len

    input_sequences.append(input_ids)
    target_sequences.append(target_ids)


In [None]:
# Convert to tensors
inputs_tensor = torch.tensor(input_sequences, dtype=torch.long)
targets_tensor = torch.tensor(target_sequences, dtype=torch.long)

dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
import torch.nn.functional as F

# def training_step(self, batch, batch_idx):
#     input_tokens, target_tokens = batch  # Shape: (batch_size, seq_len)

#     logits = self(input_tokens)  # Shape: (batch_size, seq_len, vocab_size)

#     # Reshape for loss: flatten batch and seq dims
#     logits = logits.view(-1, logits.size(-1))             # (batch * seq_len, vocab_size)
#     target_tokens = target_tokens.view(-1)                # (batch * seq_len)

#     loss = self.loss(logits, target_tokens)
#     self.log("train_loss", loss)
#     return loss

# def configure_optimizers(self):
#     return torch.optim.Adam(self.parameters(), lr=1e-4)

# Build the model from the same d_model / max_length that were used to prepare the
# training sequences, so the positional-encoding table always covers the sequence
# length and the two cells cannot drift apart.
model = DecodeOTrans(
    num_tokens=len(vocab),  # includes <pad>
    d_model=d_model,
    max_length=max_length,
)

trainer = Trainer(
    max_epochs=10,
    accelerator="auto",
)

trainer.fit(model, dataloader)


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | we             | Embedding          | 35.8 K | train
1 | pe             | PositionalEncoding | 0      | train
2 | self_attention | Attention          | 49.2 K | train
3 | fc_layer       | Linear             | 36.1 K | train
4 | loss           | CrossEn

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
max_gen_len = 500
start_token_id = vocab["<song_start>"]
end_token_id = vocab["<song_end>"]

import torch

# The positional-encoding buffer has shape [1, max_length, d_model]; the model
# cannot be fed more positions than that in one call.
context_len = model.pe.pe.size(1)

model.eval()

# Start with just <song_start>
generated = [start_token_id]

for _ in range(max_gen_len):
    # Feed at most the last `context_len` tokens. Without this crop the loop fails as
    # soon as the generated sequence grows past the positional-encoding table.
    # (Once the window slides, positions are counted from the start of the window.)
    context = generated[-context_len:]
    # Create the input on the model's device so this also works on a GPU.
    input_tensor = torch.tensor(context, device=model.device).unsqueeze(0)  # shape: (1, seq_len)
    with torch.no_grad():
        logits = model(input_tensor)  # shape: (1, seq_len, vocab_size)

    # Get the logits for the last token position
    next_token_logits = logits[0, -1, :]  # shape: (vocab_size,)
    # Greedy decoding: always take the single most likely token
    next_token = torch.argmax(next_token_logits).item()

    # Append the new token
    generated.append(next_token)

    if next_token == end_token_id:
        break


In [None]:
vocab['<song_end>']

276

In [None]:
generated

[277, 100, 55, 147, 85, 58, 247, 70, 70, 70, 70, 70, 70, 70, 70, 70, 276]

# Transformer Building All Inclusive

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position information to a batch of token embeddings.

  The table is computed once in ``__init__`` and stored as a buffer (not a
  trainable parameter) of shape ``[1, max_length, d_model]``.
  """

  def __init__(self, d_model, max_length):
    """Pre-compute the position table.

    Args:
      d_model: number of embedding values per token (even or odd).
      max_length: number of positions the table covers; ``forward`` cannot
        handle sequences longer than this.
    """
    super().__init__()

    # Start from zeros; every entry is overwritten below.
    pe = torch.zeros(max_length, d_model)

    # Column vector of positions 0 .. max_length-1, shape [max_length, 1]
    position = torch.arange(start=0, end=max_length, step=1).float().unsqueeze(1)

    # 0, 2, 4, ...: one frequency per (sin, cos) pair of embedding dimensions
    embedding_index = torch.arange(start=0, end=d_model, step=2).float()

    div_term = 1 / torch.tensor(10000.0) ** (embedding_index / d_model)

    # Broadcasts to [max_length, ceil(d_model / 2)]
    angles = position * div_term

    # Even embedding dimensions take the sine, odd dimensions the cosine.
    pe[:, 0::2] = torch.sin(angles)
    # When d_model is odd there is one fewer odd dimension than even ones, so the
    # last frequency has no cosine slot; trimming keeps the assignment valid.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    # Leading batch axis of size 1 so the table broadcasts over the batch.
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, word_embeddings):
    """Add the encodings of the first ``seq_len`` positions to ``word_embeddings``.

    Args:
      word_embeddings: tensor indexed as ``[batch_size, seq_len, d_model]``.

    Returns:
      Tensor of the same shape as ``word_embeddings``.
    """
    # Slice the POSITION axis (axis 1) of the buffer. Slicing axis 0 by the batch
    # size leaves all max_length positions in place and only works when the
    # sequence happens to be exactly max_length long.
    seq_len = word_embeddings.size(1)
    return word_embeddings + self.pe[:, :seq_len, :]

class Attention(nn.Module):
  """Single-head scaled dot-product attention with an optional mask.

  Works on un-batched ``[seq_len, d_model]`` inputs as well as batched
  ``[batch_size, seq_len, d_model]`` inputs, because every axis is addressed
  from the end of the shape.
  """

  def __init__(self, d_model):
    """
    Args:
      d_model: number of embedding values per token; the query, key and value
        projections are all ``d_model -> d_model``.
    """
    super().__init__()

    # Weight matrices producing the queries, keys and values (no bias term).
    self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
    self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
    self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

    # Negative indices address the last two axes. With 0 and 1 a batched input
    # would have its batch and sequence axes swapped by the transpose below; for
    # 2-D inputs -2/-1 are the same axes as 0/1.
    self.row_dim = -2
    self.col_dim = -1

  def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask = None):
    """Compute attention outputs.

    Args:
      encodings_for_q: encodings the queries are computed from.
      encodings_for_k: encodings the keys are computed from.
      encodings_for_v: encodings the values are computed from.
      mask: optional boolean tensor broadcastable to the score matrix; positions
        that are True are excluded from attention.

    Returns:
      Tensor with the same shape as the projected values.
    """
    q = self.W_q(encodings_for_q)
    k = self.W_k(encodings_for_k)
    v = self.W_v(encodings_for_v)

    # Query/key similarities: [..., seq_len_q, seq_len_k]
    sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

    scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

    if mask is not None:
      # A very negative score becomes a weight of ~0 after the softmax.
      scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

    # Normalise over the key axis (the last one) so each query's weights sum to 1.
    attention_percents = F.softmax(scaled_sims, dim=self.col_dim)

    attention_scores = torch.matmul(attention_percents, v)

    return attention_scores

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

#from pytorch_lightning import
from pytorch_lightning import Trainer, LightningModule


class DecodeOTrans(LightningModule):
  """Decoder-only transformer with a single masked self-attention layer.

  Pipeline: token embedding -> positional encoding -> masked self-attention ->
  residual add -> linear projection to one logit per vocabulary entry.
  """

  def __init__(self, d_model, max_length, num_tokens = 100):
    """
    Args:
      d_model: number of embedding values per token.
      max_length: number of positions the positional encoding is built for.
      num_tokens: vocabulary size (rows of the embedding table and width of
        the output layer).
    """
    super().__init__()

    # Token id -> embedding vector
    self.we = nn.Embedding(num_embeddings = num_tokens, embedding_dim = d_model)

    # Position information added on top of the embeddings
    self.pe = PositionalEncoding(d_model = d_model, max_length = max_length)

    self.self_attention = Attention(d_model = d_model)

    # Fully connected output layer: d_model features -> one logit per token
    self.fc_layer = nn.Linear(in_features = d_model, out_features = num_tokens)

    self.loss = nn.CrossEntropyLoss()

  def forward(self, token_ids):
    """Map ``[batch_size, seq_len]`` token ids to ``[batch_size, seq_len, num_tokens]`` logits."""
    word_embeddings = self.we(token_ids)  # [batch_size, seq_len, d_model]
    positional_encoding = self.pe(word_embeddings)  # add positional encoding

    seq_len = token_ids.size(1)
    # True above the diagonal = positions a token must not look at. The explicit
    # leading axis makes the broadcast over the batch visible.
    mask = torch.tril(torch.ones((seq_len, seq_len), device=token_ids.device)).unsqueeze(0) == 0

    self_attention_values = self.self_attention(
        positional_encoding,
        positional_encoding,
        positional_encoding,
        mask=mask
    )

    residual_connection_value = positional_encoding + self_attention_values
    fc_layer_output = self.fc_layer(residual_connection_value)

    return fc_layer_output


  def configure_optimizers(self):
    """Adam over all model parameters.

    Uses 1e-4, the rate the earlier definition of this class was trained with;
    this copy previously used 0.1, and because it is defined later it would
    silently replace the trained configuration when the cell is run.
    """
    return Adam(self.parameters(), lr = 1e-4)

  def training_step(self, batch, batch_idx):
    """Cross-entropy between the logits at every position and the target token ids."""
    input_tokens, labels = batch  # input: [batch_size, seq_len], labels: [batch_size, seq_len]

    logits = self.forward(input_tokens)  # output: [batch_size, seq_len, vocab_size]

    # Flatten for CrossEntropyLoss: expects [N, C] and [N]
    loss = self.loss(
        logits.view(-1, logits.size(-1)),  # [batch_size * seq_len, vocab_size]
        labels.view(-1)                    # [batch_size * seq_len]
    )

    return loss



# Next Stage

In [None]:
!pip install musiclang_predict

Collecting musiclang_predict
  Downloading musiclang-predict-1.2.0.tar.gz (146 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/146.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.8/146.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting musiclang>=0.25 (from musiclang_predict)
  Downloading musiclang-0.26.0-py3-none-any.whl.metadata (5.8 kB)
Collecting torchtoolkit (from musiclang_predict)
  Downloading torchtoolkit-0.0.4-py3-none-any.whl.metadata (2.1 kB)
Collecting mido==1.2.10 (from musiclang>=0.25->musiclang_predict)
  Downloading mido-1.2.10-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting music21==8.1.0 (from musiclang>=0.25->musiclang_predict)
  Downloading music21-8.1.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pandas==1.5.3 (from musiclang>=0.25->musiclang_predict)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_

In [None]:
from musiclang_predict import MusicLangTokenizer
from musiclang import Score
# Tokenize a MIDI file with the MusicLang tokenizer ('musiclang/musiclang-4k').
# NOTE(review): midi_file is still a template placeholder, and the captured output of
# this cell is a ModuleNotFoundError for musiclang_predict — confirm the package is
# importable in this kernel before relying on this cell.
midi_file = 'path_to_your_midi_file.mid'
score = Score.from_midi(midi_file)
tokenizer = MusicLangTokenizer('musiclang/musiclang-4k')
tokens = tokenizer.tokenize(score)
print(tokens)

ModuleNotFoundError: No module named 'musiclang_predict'