In [1]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('Using GPU (Windows)')
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print('Using GPU (Mac)')
else:
    device = torch.device("cpu")
    print('Using CPU')

from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import math
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import Trainer, TrainingArguments



import json
from pathlib import Path
import os
from typing import List

# Load miditok tokenizer
from miditok import REMI, TokenizerConfig, TokSequence
from miditoolkit import MidiFile, Instrument, Note
from miditok.pytorch_data import DatasetJSON




Using GPU (Windows)


In [7]:
tokenizer = REMI.from_pretrained("tokenizer.json")

class MIDITokenDataset(Dataset):
    def __init__(self, files_paths, bos_token_id=None, eos_token_id=None, max_seq_len=1024):
        self.paths = files_paths
        self.bos = bos_token_id
        self.eos = eos_token_id
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        # Load list of ids
        with open(self.paths[idx], "r") as f:
            ids = json.load(f)

        # Wrap in TokSequence (optional — just to stay consistent)
        seq = TokSequence(ids=ids)

        # Add BOS and EOS if specified
        tokens = []
        if self.bos is not None:
            tokens.append(self.bos)
        tokens += seq.ids
        if self.eos is not None:
            tokens.append(self.eos)

        # Truncate or pad as needed
        tokens = tokens[:self.max_seq_len]

        return torch.tensor(tokens)

    
class MIDIDataCollator:
    def __init__(self, pad_token_id):
        self.pad_token_id = pad_token_id

    def __call__(self, batch):
        # batch: list of 1D tensors
        input_ids_padded = pad_sequence(batch, batch_first=True, padding_value=self.pad_token_id)
        labels_padded = input_ids_padded.clone()  # copy for labels

        return {
            "input_ids": input_ids_padded,
            "labels": labels_padded,
            "attention_mask": (input_ids_padded != self.pad_token_id).long(),
        }



data_collator = MIDIDataCollator(tokenizer['PAD_None'])
right_hand_jsons = list(Path("tokenized_json/right_hand").glob("*.json"))

dataset = MIDITokenDataset(
    files_paths=right_hand_jsons,
    bos_token_id = tokenizer["BOS"] if "BOS" in tokenizer.special_tokens else tokenizer.vocab["BOS_None"],
    eos_token_id = tokenizer["EOS"] if "EOS" in tokenizer.special_tokens else tokenizer.vocab["EOS_None"],
    max_seq_len=512*2
)


  super().__init__(tokenizer_config, params)


In [9]:
model = GPT2LMHeadModel.from_pretrained("model_weights").to(device)

bos_token_id = tokenizer["BOS"] if "BOS" in tokenizer.special_tokens else tokenizer.vocab["BOS_None"]
eos_token_id = tokenizer["EOS"] if "EOS" in tokenizer.special_tokens else tokenizer.vocab["EOS_None"]

# Dataset and DataLoader
loader = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=data_collator)

def compute_perplexity(model, dataloader, device='cuda'):
    model.eval()
    model.to(device)
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            # Count tokens ignoring padding:
            num_tokens = attention_mask.sum().item()

            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity


# Evaluate
ppl = compute_perplexity(model, loader, device=device)
print(f"Perplexity: {ppl:.2f}")

Perplexity: 1.01
