# **GAN Experiments**

In [None]:
!pip install mido



In [None]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import mido
import os

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from google.colab import drive
# Mount Google Drive at /content/drive; every dataset, checkpoint and plot
# path used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    # Report the GPU model and PyTorch's current memory usage on device 0,
    # converted from bytes to GiB (1024**3) and rounded to two decimals.
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Memory Allocated:", round(torch.cuda.memory_allocated(0)/1024**3, 2), "GB")
    print("Memory Cached:", round(torch.cuda.memory_reserved(0)/1024**3, 2), "GB")

Using device: cuda
GPU Name: Tesla T4
Memory Allocated: 0.0 GB
Memory Cached: 0.0 GB


In [None]:
# Release cached GPU memory left over from earlier runs in this session.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# NOTE(review): anomaly detection is a debugging aid that records forward
# tracebacks for failing backward ops; it slows training noticeably and is
# presumably meant to be switched off once the training loop is stable.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7db37f4b2c90>

In [None]:
# MAESTRO v3.0.0 metadata table; the code below reads its "split" and
# "midi_filename" columns.
metadata = pd.read_csv("/content/drive/MyDrive/Piano generation/Project/MAESTRO dataset/maestro-v3.0.0.csv")

For the GAN architecture, we are going to use a different method of tokenizing the music, inspired by [Olof Mogren](https://arxiv.org/abs/1611.09904). In his work, he modeled each tone as a quadruplet of *tone length, frequency, intensity* and *time spent since the previous tone*. This results in a matrix of shape `(n, 4)`. In this function, if `max_values` is provided, we also normalize the *tone_length* and *time_since_prev* values and pad or truncate the sequence to `max_seq_len` rows. If it's `None`, we only normalize the frequency and intensity columns.

In [None]:
def preprocess_midi(file_path, max_values=None):
    """Convert a MIDI file into a ``(n, 4)`` float32 matrix of note features.

    Each row describes one note as
    ``[tone_length, frequency, intensity, time_since_prev]`` and rows are in
    note-onset order:

    * ``tone_length`` - ticks between the note's onset and its release.
    * ``frequency`` - MIDI note number divided by 127.
    * ``intensity`` - onset velocity divided by 127.
    * ``time_since_prev`` - ticks between this note's onset and the previous
      note's onset (0 for the first note).

    Args:
        file_path: Path of the MIDI file to read.
        max_values: Optional dict with the keys ``"max_tone_length"``,
            ``"max_time_since_prev"`` and ``"max_seq_len"``. When given, the
            two time columns are divided by the respective maxima and the
            result is zero-padded or truncated to exactly ``max_seq_len``
            rows. When ``None``, the time columns stay in raw ticks and the
            number of rows equals the number of completed notes.

    Returns:
        A ``numpy.ndarray`` of dtype float32 and shape ``(n, 4)``; ``(0, 4)``
        for a file without notes when ``max_values`` is ``None``.

    Raises:
        KeyError: If ``max_values`` is given but lacks one of the keys above.
    """
    midi = mido.MidiFile(file_path)

    now = 0
    onset_count = 0
    prev_onset = None
    # pitch -> (onset order, onset time, velocity, gap to previous onset)
    open_notes = {}
    rows = []

    # Merging the tracks yields one stream whose delta times are relative to
    # the previous message of *any* track, so a single running clock is valid
    # for multi-track files as well.
    for msg in mido.merge_tracks(midi.tracks):
        # Meta messages carry delta time too; skipping it would shift every
        # later note.
        now += msg.time
        if msg.is_meta or msg.type not in ("note_on", "note_off"):
            continue

        if msg.type == "note_on" and msg.velocity > 0:
            gap = now - prev_onset if prev_onset is not None else 0
            # Keep the gap with the note itself: with overlapping notes a
            # shared variable would be overwritten before the note is released.
            open_notes[msg.note] = (onset_count, now, msg.velocity, gap)
            onset_count += 1
            prev_onset = now
        elif msg.note in open_notes:
            # note_off, or note_on with velocity 0, closes the sounding note.
            order, onset, velocity, gap = open_notes.pop(msg.note)
            rows.append((order, now - onset, msg.note, velocity, gap))

    # Notes are completed in release order; restore onset order so that the
    # time_since_prev column refers to the row directly above it.
    rows.sort(key=lambda row: row[0])

    # reshape keeps the array two-dimensional even when there are no notes.
    features = np.array([row[1:] for row in rows], dtype=np.float32).reshape(-1, 4)

    if max_values is not None:
        # "or 1" avoids 0/0 = NaN when a maximum happens to be zero.
        length_scale = max_values["max_tone_length"] or 1
        gap_scale = max_values["max_time_since_prev"] or 1
    else:
        length_scale = gap_scale = 1

    features[:, 0] /= length_scale
    features[:, 1] /= 127
    features[:, 2] /= 127
    features[:, 3] /= gap_scale

    if max_values is not None:
        seq_len = int(max_values["max_seq_len"])
        if len(features) < seq_len:
            # float32 padding keeps the dtype identical to the truncated case.
            padding = np.zeros((seq_len - len(features), 4), dtype=np.float32)
            features = np.vstack([features, padding])
        else:
            features = features[:seq_len]

    return features

This is the function to get a dict of the max values for each metric in order to normalize them for training:

In [None]:
def get_max_values(metadata):
    """Collect the normalisation maxima over the training split.

    Every file whose ``"split"`` is ``"train"`` is run through
    ``preprocess_midi`` without ``max_values`` and the largest tone length,
    largest time-since-previous-note and largest number of notes are tracked.

    Args:
        metadata: DataFrame with at least the columns ``"split"`` and
            ``"midi_filename"`` (paths relative to the MAESTRO MIDI folder).

    Returns:
        A dict with the keys ``"max_tone_length"`` (float),
        ``"max_time_since_prev"`` (float) and ``"max_seq_len"`` (int). All
        three are 0 when the training split is empty or contains no notes.
    """
    midi_dir = "/content/drive/MyDrive/Piano generation/Project/MAESTRO dataset/maestro-v3.0.0-midi/maestro-v3.0.0/"
    train_files = list(metadata[metadata["split"] == "train"]["midi_filename"])

    max_tone_length = 0.0
    max_time_since_prev = 0.0
    max_seq_len = 0

    # Track running maxima instead of concatenating every song's columns.
    for filename in tqdm(train_files):
        features = preprocess_midi(os.path.join(midi_dir, filename))
        # Guard against a flat empty array for files without any notes.
        features = np.asarray(features).reshape(-1, 4)
        if features.shape[0] == 0:
            continue

        max_tone_length = max(max_tone_length, float(features[:, 0].max()))
        max_time_since_prev = max(max_time_since_prev, float(features[:, 3].max()))
        max_seq_len = max(max_seq_len, features.shape[0])

    return {
        "max_tone_length": max_tone_length,
        "max_time_since_prev": max_time_since_prev,
        # preprocess_midi pads/truncates every song to this many rows.
        "max_seq_len": max_seq_len,
    }

This is the dataset we are going to use. Upon initializing, it converts all of the midi files into the desired representation. Then, when `__getitem__` is called, it just returns a song.

In [None]:
class GanMusicDataset(Dataset):
    """In-memory dataset of MIDI files encoded as note-feature matrices.

    All files listed in ``metadata["midi_filename"]`` are converted with
    ``preprocess_midi`` at construction time and stacked into one float32
    tensor stored in ``self.songs``; indexing returns one song.

    Args:
        metadata: DataFrame with a ``"midi_filename"`` column (paths relative
            to the MAESTRO MIDI folder). When ``max_values`` is not given it
            must also contain the training rows, because the maxima are
            computed from its ``"train"`` split.
        max_values: Optional pre-computed dict as returned by
            ``get_max_values``. Pass the training maxima here to build a
            validation or test dataset with the same normalisation.
    """

    def __init__(self, metadata, max_values=None):
        super().__init__()
        midi_dir = "/content/drive/MyDrive/Piano generation/Project/MAESTRO dataset/maestro-v3.0.0-midi/maestro-v3.0.0/"

        if max_values is None:
            max_values = get_max_values(metadata)

        songs = []
        for filename in tqdm(list(metadata["midi_filename"])):
            features = preprocess_midi(os.path.join(midi_dir, filename), max_values=max_values)
            # Force float32 so every song has the same dtype before stacking.
            songs.append(torch.from_numpy(features).float())

        # Every song was padded/truncated to the same length, so they stack.
        self.songs = torch.stack(songs)

    def __len__(self):
        """Return the number of songs."""
        return len(self.songs)

    def __getitem__(self, idx):
        """Return the feature matrix of the song at position ``idx``."""
        return self.songs[idx]

Again, it's way faster if we load it from a file.

In [None]:
# Load the previously pickled training dataset instead of re-parsing every
# MIDI file.
# NOTE(review): weights_only=False unpickles arbitrary Python objects, so this
# file must come from a trusted source; presumably it holds a GanMusicDataset,
# in which case that class has to be defined before this line runs.
music_dataset_train = torch.load("/content/drive/MyDrive/Piano generation/Project/saved_data/datasets/train/gan_music_dataset_train.pt", weights_only=False)

Here is our generator. It's a simple architecture with just 2 LSTM layers and a Linear layer to project the output to the desired dimensions.

In [None]:
class Generator(nn.Module):
    """LSTM generator that maps a noise sequence to a note-feature sequence.

    Args:
        input_size: Size of the noise vector fed in at every time step.
        hidden_size: Hidden size of each of the two stacked LSTM layers.
        output_size: Number of features produced per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        # Fixed: this previously referenced the undefined name `hidden_sizet`,
        # so constructing the model raised NameError.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True)
        self.ff = nn.Linear(hidden_size, output_size)

    def forward(self, z):
        """Map noise ``z`` of shape ``(batch, seq_len, input_size)`` to
        features of shape ``(batch, seq_len, output_size)``."""
        out, _ = self.lstm(z)
        # The Linear layer is applied independently at every time step.
        out = self.ff(out)

        return out

This is the discriminator. It's again a simple architecture with 2 LSTM layers, although this time they are bidirectional. Again, in the end there is a Linear layer, which maps the concatenated final hidden and cell states of both directions of the last LSTM layer to a single value that tells whether the input is real or not.

In [None]:
class Discriminator(nn.Module):
    """Bidirectional LSTM critic that scores a whole note-feature sequence.

    Args:
        input_size: Number of features per time step of the input sequence.
        hidden_size: Hidden size of each direction of the two LSTM layers.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions x (hidden state + cell state) -> 4 * hidden_size.
        self.ff = nn.Linear(4 * hidden_size, 1)

    def forward(self, x):
        """Return one unnormalised score per sequence, shape ``(batch, 1)``."""
        _, (hidden, cell) = self.lstm(x)

        # The last two entries of the state tensors are taken as the summary
        # of the sequence (per the torch.nn.LSTM docs these belong to the two
        # directions of the final layer).
        summary = torch.cat([hidden[-2], hidden[-1], cell[-2], cell[-1]], dim=1)

        return self.ff(summary)

In [None]:
# Batches of two songs, reshuffled every epoch; train_gan reads this loader
# as a module-level global.
music_dataloader_train = DataLoader(music_dataset_train, batch_size=2, shuffle=True)

Here we initialize the hyperparameters for both models with their hidden size being 350, as inspired again by the same paper ([Olof Mogren](https://arxiv.org/abs/1611.09904)). The output size of the generator and the input size of the discriminator are both 4, because that's what the data representation requires. This time we are going to experiment with different values for the betas argument, beginning with 0.5 for beta1 and 0.999 for beta2.

In [None]:
# Generator: 100-dimensional noise per time step -> 4 note features.
generator = Generator(input_size=100, hidden_size=350, output_size=4).to(device)
# Discriminator: 4 note features per time step -> one score per sequence.
discriminator = Discriminator(input_size=4, hidden_size=350).to(device)

optimizer_G = optim.AdamW(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_D = optim.AdamW(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

# The discriminator ends in a plain Linear layer, i.e. it emits unbounded
# logits. nn.BCELoss only accepts probabilities in [0, 1] and fails on such
# values, so use the variant that applies the sigmoid internally (it is also
# numerically more stable).
criterion = nn.BCEWithLogitsLoss()

In [None]:
def train_gan(generator, discriminator, optimizer_d, optimizer_g, criterion, num_epochs, generator_loss_history, discriminator_loss_history, epochs_count=1, create_plot=True):
    """Train the GAN on the global ``music_dataloader_train``.

    For every batch the discriminator is updated first (real batch labelled 1,
    generated batch labelled 0), then the generator is updated so that the
    discriminator labels its output as real. Both models are checkpointed to
    Google Drive after every epoch.

    Args:
        generator: Model mapping noise ``(batch, seq_len, 100)`` to fake data.
        discriminator: Model mapping a data batch to scores ``(batch, 1)``.
        optimizer_d: Optimizer over the **discriminator's** parameters.
        optimizer_g: Optimizer over the **generator's** parameters.
        criterion: Binary classification loss applied to the discriminator
            output and 0/1 labels of shape ``(batch, 1)``.
        num_epochs: Number of passes over the data loader.
        generator_loss_history: List that per-batch generator losses are
            appended to (mutated in place).
        discriminator_loss_history: List that per-batch discriminator losses
            are appended to (mutated in place).
        epochs_count: Number of the first epoch, used for progress output and
            checkpoint/plot file names.
        create_plot: Whether to plot, save and show both loss curves.

    Returns:
        The tuple ``(generator_loss_history, discriminator_loss_history)``.
    """
    for epoch in range(num_epochs):
        epoch_number = epoch + epochs_count

        # Iterating the loader directly lets tqdm show the total batch count.
        for real_data in tqdm(music_dataloader_train, desc=f"Epoch {epoch_number}:"):
            real_data = real_data.to(device).float()
            batch_size = real_data.size(0)

            # One 100-dimensional noise vector per time step of the real batch.
            z = torch.randn(batch_size, real_data.size(1), 100, dtype=torch.float32, device=device)
            fake_data = generator(z)

            real_labels = torch.ones(batch_size, 1, device=device)
            fake_labels = torch.zeros(batch_size, 1, device=device)

            # ---- Discriminator step ----
            optimizer_d.zero_grad()

            real_loss = criterion(discriminator(real_data), real_labels)
            # detach(): the discriminator update must not back-propagate into
            # the generator, which also removes the need for retain_graph.
            fake_loss = criterion(discriminator(fake_data.detach()), fake_labels)

            d_loss = real_loss + fake_loss
            d_loss.backward()
            optimizer_d.step()

            discriminator_loss_history.append(d_loss.item())

            # ---- Generator step ----
            optimizer_g.zero_grad()

            # The generator wants its samples to be classified as real, so
            # the target is the "real" label, not the "fake" one.
            g_loss = criterion(discriminator(fake_data), real_labels)
            g_loss.backward()
            optimizer_g.step()

            generator_loss_history.append(g_loss.item())

        torch.save(generator.state_dict(), f"/content/drive/MyDrive/Piano generation/Project/saved_data/saved_models/gan_generator_epoch_{epoch_number}.pt")
        torch.save(discriminator.state_dict(), f"/content/drive/MyDrive/Piano generation/Project/saved_data/saved_models/gan_discriminator_epoch_{epoch_number}.pt")

    if create_plot:
        # Number of the last epoch that ran, matching the checkpoint names.
        last_epoch = epochs_count + num_epochs - 1

        plt.plot(generator_loss_history, label="Generator loss")
        plt.plot(discriminator_loss_history, label="Discriminator loss")
        # One loss value is recorded per batch, not per epoch.
        plt.xlabel("Iteration")
        plt.ylabel("Loss")
        plt.title("Training Loss")
        plt.legend()
        # Save before show(): saving afterwards writes an empty figure.
        plt.savefig(f"/content/drive/MyDrive/Piano generation/Project/saved_data/plots/gan_loss_epoch_{last_epoch}_plot.png")
        plt.show()

    return generator_loss_history, discriminator_loss_history

Let's try to run 10 epochs:

In [None]:
# train_gan takes the discriminator's optimizer before the generator's.
# Passing them positionally as (optimizer_G, optimizer_D) swapped the two, so
# the generator's weights were stepped in place between the two backward
# passes ("modified by an inplace operation"). Keywords make the pairing
# explicit.
generator_loss_history, discriminator_loss_history = train_gan(
    generator,
    discriminator,
    optimizer_d=optimizer_D,
    optimizer_g=optimizer_G,
    criterion=criterion,
    num_epochs=10,
    generator_loss_history=[],
    discriminator_loss_history=[],
)

  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
    self._run_once()
  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 69

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [350, 4]], which is output 0 of AsStridedBackward0, is at version 3; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Due to time constraints, I couldn't fix this problem.