In [17]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import random

# Special token ids shared by the model classes in this notebook.
# EOS_ID is the largest of the three, so any embedding table or output layer
# that has to index these ids needs at least EOS_ID + 1 (= 131) entries.
REST_ID = 128  # presumably the "rest"/silence token — TODO confirm against the tokenizer
BOS_ID = 129   # begin-of-sequence id; MIDIVAE hands it to the decoder's generate()
EOS_ID = 130   # end-of-sequence id; MIDIVAE hands it to the decoder's generate()

class MIDIVAE(nn.Module):
    """Variational auto-encoder over MIDI token sequences with a composer classifier head.

    Wires together three sub-modules supplied by the caller:

    * ``encoder(x, label) -> (z, mean, logvar)``
    * ``decoder`` - sequence-first transformer decoder exposing ``__call__`` (teacher
      forcing, returns logits ``[seq, batch, vocab]``) and ``generate`` (sampling)
    * ``classifier(z)`` - predicts composer logits from the latent code

    Args:
        encoder, decoder, classifier: the sub-modules described above.
        max_length: maximum number of tokens requested from ``decoder.generate``.
        device: device that ``train_model`` moves each batch to.
        kld_weight: multiplier of the KL term in the training loss.
        classifier_weight: multiplier of the classifier term in the training loss.
    """

    def __init__(self, encoder, decoder, classifier, max_length=502, device='cuda',
                 kld_weight=1.0, classifier_weight=1.0):
        super().__init__()

        # untrained components
        self.encoder = encoder
        self.decoder = decoder  # cross-attention transformer that produces MIDI tokens
        self.max_length = max_length
        # pre-trained classifier
        self.classifier = classifier
        self.device = device

        # constants
        self.kld_weight = kld_weight
        self.classifier_weight = classifier_weight
        self.BOS_ID = BOS_ID
        self.EOS_ID = EOS_ID
        self.REST_ID = REST_ID

    def forward(self, x, label, teacher_forcing_ratio=0.9):
        """Encode ``x`` and reconstruct it.

        Args:
            x: token ids, batch-first ``[batch, seq_len]``.
            label: conditioning label forwarded to encoder and decoder.
            teacher_forcing_ratio: probability (in training mode only) of taking
                the teacher-forced branch instead of free-running sampling.

        Returns:
            ``(recon_midi, mean, logvar, composer_pred)``. In the teacher-forced
            branch ``recon_midi`` holds next-token logits ``[batch, seq_len - 1, vocab]``
            where position ``t`` predicts ``x[:, t + 1]``. Otherwise it is whatever
            ``decoder.generate`` returns (sampled token ids).
        """
        # Encoder pass
        z, mean, logvar = self.encoder(x, label)

        # Prepare decoder inputs
        memory = self._prepare_memory(z)

        if self.training and random.random() < teacher_forcing_ratio:
            # The decoder is sequence-first, so batch-first x is transposed on the
            # way in and the logits are transposed back on the way out.
            # The last token is dropped from the input: with a causal mask,
            # position t can see token t itself, so scoring logits[t] against
            # x[t] would be a copy task. Logits are scored against the next token.
            decoder_input = x[:, :-1].transpose(0, 1)
            logits = self.decoder(
                tgt=decoder_input,
                memory=memory,
                z=z,
                label=label,
                teacher_forcing=True
            )
            recon_midi = logits.transpose(0, 1)
        else:
            # Autoregressive generation
            recon_midi = self.decoder.generate(
                memory=memory,
                z=z,
                label=label,
                max_len=self.max_length,
                bos_id=self.BOS_ID,
                eos_id=self.EOS_ID
            )

        # Classifier uses latent space
        composer_pred = self.classifier(z)

        return recon_midi, mean, logvar, composer_pred

    def train_model(self, dataloader, optimizer, epochs=10):
        """Run a plain training loop and print the mean loss per epoch.

        Args:
            dataloader: iterable of ``(x, label)`` or ``(x, label, composer)``
                batches. With two elements the label doubles as composer target.
            optimizer: optimizer over this model's parameters.
            epochs: number of passes over ``dataloader``.
        """
        self.train()
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for batch in dataloader:
                if len(batch) == 3:
                    x, label, composer = batch
                else:
                    # Batches of (sequence, label): the label is the classifier target.
                    x, label = batch
                    composer = label
                x = x.to(self.device)
                label = label.to(self.device)
                composer = composer.to(self.device)

                optimizer.zero_grad()
                # Always teacher-force here: the sampling branch returns token ids,
                # which carry no gradient and cannot be fed to cross_entropy.
                recon_x, mu, logvar, pred = self(x, label, teacher_forcing_ratio=1.0)

                # Loss calculations
                kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
                recon_loss = F.cross_entropy(
                    recon_x.transpose(1, 2),  # [batch, vocab_size, seq_len - 1]
                    x[:, 1:]                  # [batch, seq_len - 1] next-token targets
                )
                cls_loss = F.cross_entropy(pred, composer)

                loss = recon_loss + self.kld_weight*kld + self.classifier_weight*cls_loss
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                num_batches += 1

            # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
            print(f"Epoch {epoch+1} | Loss: {total_loss/max(num_batches, 1):.4f}")

    def _prepare_memory(self, z):
        """Add a leading length-1 axis to ``z`` so it can serve as decoder memory."""
        return z.unsqueeze(0)

    def generate(self, z, label, temperature=1.0):
        """Sample a token sequence from latent code ``z`` conditioned on ``label``.

        Switches the module to eval mode (and leaves it there) and runs the
        decoder's ``generate`` without gradient tracking.
        """
        self.eval()
        with torch.no_grad():
            memory = self._prepare_memory(z)
            return self.decoder.generate(
                memory=memory,
                z=z,
                label=label,
                max_len=self.max_length,
                temperature=temperature,
                # __init__ stores these in upper case.
                bos_id=self.BOS_ID,
                eos_id=self.EOS_ID
            )


In [18]:

class Token_Embedding(nn.Module):
    """Learned lookup table that maps integer token ids to dense vectors."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # One row of width ``embedding_dim`` for each id in ``[0, vocab_size)``.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, x):
        """Return the embedding rows selected by the integer ids in ``x``."""
        token_vectors = self.embedding(x)
        return token_vectors

class Pos_Embedding(nn.Module):
    """Learned absolute position embedding for batch-first inputs."""

    def __init__(self, max_len, embedding_dim):
        super().__init__()
        # One learnable vector per position index in ``[0, max_len)``.
        self.pos_embedding = nn.Embedding(num_embeddings=max_len, embedding_dim=embedding_dim)

    def forward(self, x):
        """Return position vectors for ``x``.

        Only ``x.size(1)`` (the sequence length) and ``x.device`` are read. The
        result has a leading axis of size 1 so it broadcasts over the batch.
        """
        positions = torch.arange(x.size(1), device=x.device)
        return self.pos_embedding(positions[None, :])

class Transformer_Encoder(nn.Module):
    """Stack of ``num_layers`` batch-first ``nn.TransformerEncoderLayer`` blocks."""

    def __init__(self, embedding_dim, num_heads, num_layers, ff_dim):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            # batch_first=True: inputs are laid out as [batch, seq, dim].
            block = nn.TransformerEncoderLayer(
                d_model=embedding_dim,
                nhead=num_heads,
                dim_feedforward=ff_dim,
                batch_first=True,
            )
            self.layers.append(block)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

class LatentSpace_Mean_Log(nn.Module):
    """Two parallel linear heads producing the latent mean and log-variance."""

    def __init__(self, embedding_dim, latent_dim):
        super().__init__()
        self.fc_mu = nn.Linear(in_features=embedding_dim, out_features=latent_dim)
        self.fc_logvar = nn.Linear(in_features=embedding_dim, out_features=latent_dim)

    def forward(self, x):
        """Return ``(mu, logvar)``, each the output of its own linear head on ``x``."""
        return self.fc_mu(x), self.fc_logvar(x)

In [19]:
class Variational_Encoder(nn.Module):
    """Transformer encoder that maps a token sequence to a sampled latent code.

    Pipeline: token + position embeddings -> optional label conditioning ->
    transformer encoder -> mean-pool over the sequence -> (mu, logvar) ->
    reparameterised sample ``z``.

    Args:
        vocab_size: number of rows in the token embedding table.
        embedding_dim: model width.
        max_len: number of positions in the position embedding (must be >= 502).
        latent_dim: size of the latent code.
        num_heads, num_layers, ff_dim: transformer encoder hyper-parameters.
        label_dim: width of the label vector (number of classes when labels are
            class indices). ``0`` disables label conditioning.
    """

    def __init__(self, vocab_size, embedding_dim, max_len=502, latent_dim=64,
                 num_heads=8, num_layers=6, ff_dim=512, label_dim=0):
        super().__init__()
        assert max_len >= 502, "max_len must cover BOS+EOS+500 tokens"

        self.token_embedding = Token_Embedding(vocab_size, embedding_dim)
        self.pos_embedding = Pos_Embedding(max_len, embedding_dim)
        self.encoder = Transformer_Encoder(embedding_dim, num_heads, num_layers, ff_dim)
        self.latent_proj = LatentSpace_Mean_Log(embedding_dim, latent_dim)

        # Optional label conditioning
        self.label_proj = nn.Linear(label_dim, embedding_dim) if label_dim > 0 else None

    def forward(self, x, label=None):
        """Encode ``x`` and sample a latent code.

        Args:
            x: integer token ids, ``[batch, seq_len]``.
            label: optional conditioning. Either a float tensor
                ``[batch, label_dim]`` or a 1-D integer tensor of class indices
                ``[batch]``, which is one-hot encoded to ``label_dim`` first.
                Ignored when the encoder was built with ``label_dim == 0``.

        Returns:
            ``(z, mu, logvar)`` with ``z = mu + eps * exp(0.5 * logvar)`` and
            ``eps`` drawn from a standard normal.
        """
        embeddings = self.token_embedding(x) + self.pos_embedding(x)  # [B, T, D]

        # Inject label info (if provided). Explicit ``is not None`` avoids
        # relying on the truthiness of an nn.Module.
        if self.label_proj is not None and label is not None:
            if not torch.is_floating_point(label) and label.dim() == 1:
                # Class indices cannot go through nn.Linear directly (wrong
                # dtype and shape), so expand them to one-hot rows first.
                label = torch.nn.functional.one_hot(
                    label.long(), num_classes=self.label_proj.in_features
                )
            label_emb = self.label_proj(label.to(embeddings.dtype)).unsqueeze(1)  # [B, 1, D]
            # Broadcasts over the sequence axis; out-of-place on purpose.
            embeddings = embeddings + label_emb

        # Transformer process
        output = self.encoder(embeddings)  # [B, T, D]

        # Pool over the sequence and project to latent space
        pooled = output.mean(dim=1)  # [B, D]
        mu, logvar = self.latent_proj(pooled)

        # Reparameterisation trick keeps the sample differentiable w.r.t. mu/logvar.
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        z = mu + eps * std

        return z, mu, logvar


In [20]:
#!!! Adapted from HW5!!!

from torch import nn, Tensor
import torch
import math
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to sequence-first embeddings.

    The table is computed once in ``__init__`` and stored as the non-trainable
    buffer ``pe`` of shape ``[max_len, 1, d_model]``; even feature indices hold
    sines, odd ones cosines.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(max_len).unsqueeze(1)                      # [max_len, 1]
        inv_freq = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq                                        # [max_len, d_model / 2]

        table = torch.zeros(max_len, 1, d_model)
        table[:, 0, 0::2] = torch.sin(angles)
        table[:, 0, 1::2] = torch.cos(angles)
        self.register_buffer('pe', table)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            ``x`` plus the first ``seq_len`` rows of the table, passed through dropout.
        """
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len])

# Cross-Attention Transformer Decoder
class Transformer_Decoder(nn.Module):
    """Sequence-first transformer decoder conditioned on a latent code and a label.

    Target tokens are embedded, position-encoded, modulated feature-wise by the
    latent code ``z`` (scale and shift), mixed with a label embedding through an
    attention layer, and then decoded against ``memory`` with a causal mask.

    Args:
        src_vocab_size: rows of the embedding used for *discrete* memory.
        tgt_vocab_size: number of output tokens (size of the logits).
        d_model: model width.
        nhead: attention heads.
        d_hid: feed-forward width of each decoder layer.
        nlayers: number of decoder layers.
        latent_dim: width of ``z`` (and of continuous memory).
        label_dim: width of the label vector / number of label classes.
        dropout: dropout probability.
        device: stored on ``self.device``; tensors created inside the module
            follow the device of their inputs instead.
    """

    def __init__(self,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 d_model: int,
                 nhead: int,
                 d_hid: int,
                 nlayers: int,
                 latent_dim: int,
                 label_dim: int,
                 dropout: float = 0.5,
                 device = 'cuda'
    ):
        super().__init__()
        self.model_type = 'Transformer'
        self.device = device
        self.d_model = d_model

        # discrete memory tokens
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        # continuous memory (e.g. the VAE latent ``z.unsqueeze(0)``)
        self.memory_projection = nn.Linear(latent_dim, d_model)
        # predicted MIDI tokens
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Decoder (self-attention and cross-attention)
        dec_layer = TransformerDecoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_decoder = TransformerDecoder(dec_layer, nlayers)

        self.linear = nn.Linear(d_model, tgt_vocab_size)

        # feature-wise modulation driven by z
        self.z_scale = nn.Linear(latent_dim, d_model)
        self.z_shift = nn.Linear(latent_dim, d_model)
        # label conditioning layers
        self.label_projection = nn.Linear(label_dim, d_model)
        self.label_attention = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.init_weights()

    def _label_features(self, label: Tensor, dtype) -> Tensor:
        """Return ``label`` as a float matrix ``[batch, label_dim]``.

        1-D integer tensors are treated as class indices and one-hot encoded;
        anything else is only cast to ``dtype``.
        """
        if not torch.is_floating_point(label) and label.dim() == 1:
            label = F.one_hot(label.long(), num_classes=self.label_projection.in_features)
        return label.to(dtype)

    def _embed_memory(self, memory: Tensor) -> Tensor:
        """Bring ``memory`` to ``[src_len, batch, d_model]`` and add positions.

        Integer memory is looked up in ``src_embedding`` (scaled by
        ``sqrt(d_model)``); floating-point memory of width ``latent_dim`` is
        projected linearly, because an embedding table cannot be indexed with
        floats.
        """
        if torch.is_floating_point(memory):
            src = self.memory_projection(memory)
        else:
            src = self.src_embedding(memory) * math.sqrt(self.d_model)
        return self.pos_encoder(src)

    def autoregressive_embed(self, tgt: Tensor, z: Tensor, label: Tensor) -> Tensor:
        """Embed target tokens and apply latent and label conditioning once.

        Args:
            tgt: token ids ``[tgt_seq_len, batch]``.
            z: latent codes ``[batch, latent_dim]``.
            label: class indices ``[batch]`` or float vectors ``[batch, label_dim]``.

        Returns:
            Conditioned embeddings ``[tgt_seq_len, batch, d_model]``.
        """
        # scale and embed target sequence and add pos encoding
        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.pos_encoder(tgt_emb)

        # feature-wise modulation: the same scale/shift for every position
        scale = self.z_scale(z).unsqueeze(0)
        shift = self.z_shift(z).unsqueeze(0)
        tgt_emb = tgt_emb * scale + shift

        # label conditioning
        label_emb = self.label_projection(self._label_features(label, tgt_emb.dtype)).unsqueeze(0)
        attended = self.label_attention(query=tgt_emb, key=label_emb, value=label_emb)[0]
        # With a single key the attention weights are all 1, so ``attended`` is the
        # same vector at every position and carries no token information. It is
        # therefore added as a residual rather than replacing the embeddings.
        return tgt_emb + attended

    def init_weights(self) -> None:
        """Re-initialise both embedding tables and the output layer uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        self.src_embedding.weight.data.uniform_(-initrange, initrange)
        self.tgt_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, tgt: Tensor, memory: Tensor, z: Tensor, label: Tensor, teacher_forcing=True):
        """
        Args:
          tgt: [tgt_seq_len, batch_size] - target token ids (discrete MIDI tokens)
          memory: [src_seq_len, batch_size] integer codes, or
                  [src_seq_len, batch_size, latent_dim] float latents
          z: [batch_size, latent_dim] latent code used for feature-wise modulation
          label: [batch_size] class indices or [batch_size, label_dim] float vectors
          teacher_forcing: kept for call compatibility. The network computation
            is the same either way; the flag only describes where ``tgt`` came
            from (ground truth vs. previously sampled tokens).
        Returns:
          [tgt_seq_len, batch_size, tgt_vocab_size] logits; position ``t`` only
          attends to target positions ``<= t``.
        """
        src = self._embed_memory(memory)

        # One embedding pipeline for both training and sampling, so the model
        # sees identically conditioned inputs in both modes.
        tgt_emb = self.autoregressive_embed(tgt, z, label)

        # Causal mask: -inf strictly above the diagonal, 0 elsewhere. Built on the
        # embeddings' own device rather than a notebook-global ``device``.
        steps = tgt_emb.size(0)
        tgt_mask = torch.triu(
            torch.full((steps, steps), float('-inf'), device=tgt_emb.device),
            diagonal=1,
        )

        output = self.transformer_decoder(tgt=tgt_emb, memory=src, tgt_mask=tgt_mask)
        return self.linear(output)

    def generate(self, memory, z, label, max_len=502, temperature=1.0, bos_id=None, eos_id=None):
        """Sample token sequences autoregressively.

        Args:
            memory, z, label: as for ``forward``.
            max_len: maximum number of tokens to sample after the start token.
            temperature: divides the logits before the softmax.
            bos_id: start token id; falls back to the notebook-level ``BOS_ID``.
            eos_id: stop token id; falls back to the notebook-level ``EOS_ID``.

        Returns:
            Long tensor ``[batch, n]`` with ``n <= max_len`` (start token removed).
            Once a sequence has produced ``eos_id`` it is padded with ``eos_id``.
        """
        if bos_id is None:
            bos_id = BOS_ID
        if eos_id is None:
            eos_id = EOS_ID

        batch_size = z.size(0)
        device = z.device

        # 1. Start with the BOS token for all sequences in the batch
        tgt = torch.full((batch_size, 1), bos_id, dtype=torch.long, device=device)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

        for _ in range(max_len):
            # 2. Forward pass; only the logits of the last position are needed
            logits = self(
                tgt=tgt.transpose(0, 1),  # decoder expects [seq_len, batch_size]
                memory=memory,
                z=z,
                label=label,
                teacher_forcing=False
            )[-1]

            # 3. Sample next token
            probs = F.softmax(logits / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # [batch_size, 1]
            # Sequences that already ended keep emitting EOS.
            next_token = next_token.masked_fill(finished.unsqueeze(1), eos_id)

            # 4. Update sequence
            tgt = torch.cat([tgt, next_token], dim=1)

            # 5. Stop once every sequence has produced EOS at some step (not
            #    necessarily the same one).
            finished = finished | (next_token.squeeze(1) == eos_id)
            if finished.all():
                break

        return tgt[:, 1:]  # Remove BOS

In [21]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only; the google.colab package
# is imported above and this call is pure I/O).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Location of the token-sequence CSV on the mounted Drive. Despite the name this
# is a file path, not a directory: it is handed to prepare_data(), which reads it
# with pd.read_csv.
file_directory = '/content/drive/MyDrive/maestro_token_sequences.csv'

In [23]:
import pandas as pd
import numpy as np
import io

def string_to_vector(seq_str):
    """Parse a whitespace-separated string of integers into a list of ints."""
    return list(map(int, seq_str.split()))

def prepare_data(file, augmentations=False, collaborations=False,split='train'):
    """Load the token-sequence CSV and return model-ready rows plus label maps.

    Args:
        file: path or file-like object accepted by ``pd.read_csv``. The CSV must
            provide the columns ``split``, ``composer``, ``transposition amount``
            and ``sequence`` (whitespace-separated integers).
        augmentations: keep rows whose ``transposition amount`` is non-zero.
        collaborations: keep rows whose ``composer`` contains a ``/``.
        split: value of the ``split`` column to keep, or ``'all'`` for every row.

    Returns:
        ``(data, composer_label_dict, index_composer_dict)`` where ``data`` is a
        DataFrame with columns ``sequence_vector`` (list of ints) and ``label``
        (int), and the two dicts map composer name -> index and back. Indices
        follow the sorted composer names *of the rows that survive filtering*,
        so different ``split`` values can yield different mappings.
    """
    # Read the data
    df = pd.read_csv(file)

    # Combine every row filter into one boolean mask before doing any parsing.
    keep = pd.Series(True, index=df.index)
    if split != 'all':
        keep &= df['split'] == split
    if not collaborations:
        keep &= ~df['composer'].str.contains('/', na=False)
    if not augmentations:
        keep &= df['transposition amount'] == 0

    clean_df = df[keep].copy()

    # Parse sequences only for surviving rows; parsing is the expensive step,
    # so there is no point doing it for rows the filters discard.
    clean_df['sequence_vector'] = clean_df['sequence'].apply(string_to_vector)

    # Sorted for a stable composer -> index assignment
    composer_list = sorted(list(set(clean_df['composer'])))
    num_composers = len(composer_list)
    print(f"Unique composers: {composer_list}")
    print(f"Total composers: {num_composers}")

    # Forward and inverse label dictionaries
    composer_label_dict = {composer: idx for idx, composer in enumerate(composer_list)}
    index_composer_dict = {idx: composer for composer, idx in composer_label_dict.items()}
    clean_df['label'] = clean_df['composer'].map(composer_label_dict)

    # Select only the two columns we want
    data = clean_df[['sequence_vector', 'label']].copy()

    return data,composer_label_dict,index_composer_dict

In [24]:
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
class MIDI_Dataset(Dataset):
    """Map-style dataset of fixed-length token sequences with optional labels.

    Args:
        sequences: indexable container of integer token lists (a list, an array
            or a pandas Series).
        labels: matching container of integer labels, or ``None`` for an
            unlabelled dataset.
    """

    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    @staticmethod
    def _item_at(container, idx):
        """Return the ``idx``-th element by position.

        pandas objects are read through ``.iloc`` so a filtered, non-contiguous
        index cannot cause a label-based lookup; everything else uses ``[]``.
        """
        if hasattr(container, 'iloc'):
            return container.iloc[idx]
        return container[idx]

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` long tensors, or just ``sequence`` without labels."""
        # Convert to tensor directly (no padding is applied here)
        seq = torch.tensor(self._item_at(self.sequences, idx), dtype=torch.long)

        if self.labels is None:
            return seq

        label = torch.tensor(self._item_at(self.labels, idx), dtype=torch.long)
        return seq, label
def create_MIDI_Dataloaders(train_data, batch_size=32, shuffle=True, num_workers=4, pin_memory=True):
    """Wrap a prepared DataFrame in a ``MIDI_Dataset`` and return its DataLoader.

    Args:
        train_data: DataFrame with the columns ``sequence_vector`` and ``label``.
        batch_size: samples per batch.
        shuffle: reshuffle the data every epoch.
        num_workers: loader worker processes; ``0`` loads in the main process.
        pin_memory: forwarded to ``DataLoader``.

    Returns:
        A single ``DataLoader`` over the whole of ``train_data``.
    """
    # Sequences go in as a plain list so the dataset indexes them by position.
    train_dataset = MIDI_Dataset(
        sequences=train_data['sequence_vector'].tolist(),
        labels=train_data['label']
    )

    return DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=pin_memory,
        num_workers=num_workers
    )


In [25]:
import torch

# Set random seeds for reproducibility. MIDIVAE.forward draws from Python's
# `random` module to choose teacher forcing, so that generator is seeded too.
torch.manual_seed(0)
random.seed(0)

# Shared Parameters
# The special ids REST_ID / BOS_ID / EOS_ID (128 / 129 / 130) are token ids, so
# the embedding tables and the output layer need EOS_ID + 1 = 131 entries. The
# previous value of 130 left EOS_ID out of range.
vocab_size = EOS_ID + 1 # Number of unique MIDI tokens, special tokens included
embedding_dim = 256     # Size of token embeddings
max_len = 502           # Max sequence length (500 tokens + EOS + BOS)
latent_dim = 128        # Latent space dimension
num_heads = 8           # Number of attention heads
num_layers = 6          # Number of transformer layers
ff_dim = 512            # Feed-forward layer dimension
dropout = 0.1           # Dropout rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Additional Decoder-specific Parameters
# NOTE(review): the recorded run of prepare_data below reports 34 composers, not
# 55 - confirm which number the pre-trained classifier was built for.
label_dim = 55          # Dimension for label embeddings (# of composers)
hidden_dim = 256        # Hidden dimension in decoder (d_hid in your code)

# Prepare Data
data,composer_to_label_map,inv_map = prepare_data(file_directory,augmentations=True)
train_data = create_MIDI_Dataloaders(data)

print(data.shape)
# Initialize Encoder
# NOTE(review): label_dim is not passed here, so the encoder is built without
# label conditioning even though MIDIVAE forwards the label to it.
encoder = Variational_Encoder(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_len=max_len,
    latent_dim=latent_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    ff_dim=ff_dim
).to(device)

# Initialize Decoder
decoder = Transformer_Decoder(
    src_vocab_size=vocab_size,    # Same as encoder vocab size
    tgt_vocab_size=vocab_size,    # Same unless you have different input/output vocabs
    d_model=embedding_dim,        # Should match encoder's embedding_dim
    nhead=num_heads,              # Same as encoder
    d_hid=hidden_dim,             # Decoder-specific hidden dim
    nlayers=num_layers,           # Same as encoder
    latent_dim=latent_dim,        # Same as encoder
    label_dim=label_dim,          # For composer/style conditioning
    device=device,
    dropout=dropout,
).to(device)

#load classifier
# TODO: `classifier` must be defined (pre-trained model loaded) before the next
# statement; as recorded, this cell stops with
# "NameError: name 'classifier' is not defined".
# classifier = load_model

# Pass the resolved device explicitly; MIDIVAE defaults to 'cuda', which would
# make train_model move batches to a GPU that may not exist.
model = MIDIVAE(encoder,decoder,classifier,max_len,device=device)

print("Encoder initialized with:")
print(f"- Vocab size: {vocab_size}")
print(f"- Embedding dim: {embedding_dim}")
print(f"- Latent dim: {latent_dim}")
print(f"- {num_layers} layers with {num_heads} attention heads each")

print("\nDecoder initialized with:")
print(f"- Same vocab size: {vocab_size}")
print(f"- Label embedding dim: {label_dim}")
print(f"- Hidden dim: {hidden_dim}")
print(f"- Using dropout: {dropout}")



Unique composers: ['Alban Berg', 'Alexander Scriabin', 'Anton Arensky', 'Antonio Soler', 'Carl Maria von Weber', 'Claude Debussy', 'César Franck', 'Domenico Scarlatti', 'Edvard Grieg', 'Felix Mendelssohn', 'Franz Liszt', 'Franz Schubert', 'Frédéric Chopin', 'George Enescu', 'George Frideric Handel', 'Henry Purcell', 'Isaac Albéniz', 'Jean-Philippe Rameau', 'Johann Pachelbel', 'Johann Sebastian Bach', 'Johannes Brahms', 'Joseph Haydn', 'Leoš Janáček', 'Ludwig van Beethoven', 'Mily Balakirev', 'Modest Mussorgsky', 'Muzio Clementi', 'Nikolai Medtner', 'Orlando Gibbons', 'Percy Grainger', 'Pyotr Ilyich Tchaikovsky', 'Robert Schumann', 'Sergei Rachmaninoff', 'Wolfgang Amadeus Mozart']
Total composers: 34
(438646, 2)




NameError: name 'classifier' is not defined