<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Dissertation_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q condacolab
import condacolab
condacolab.install()
# install mamba to use instead of pip

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:25
🔁 Restarting kernel...


In [1]:
# Create the config file and build the environment.
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the string content to a file -  'environment.yml'.
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# create the environment using mamba from the yml file.
print("\n Creating environment")

!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

environment.yml file created successfully.

 Creating environment
Channels:
 - pytorch
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done

 'dissertation' environment is ready to use.


In [2]:
# imports and setting up of GitHub and W&B

# clone project repository from GitHub
print("⏳ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

#imports
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #loading the tokenizer for lyrics processing
print("Tokenizer loaded.")

⏳ Cloning GitHub repository...
Cloning into 'Dissertation'...
remote: Enumerating objects: 95, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 95 (delta 40), reused 5 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (95/95), 578.22 KiB | 2.15 MiB/s, done.
Resolving deltas: 100% (40/40), done.
Repository cloned.
Mounted at /content/drive
Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded.


In [3]:
#load the data from the saved file
#path to where this was saved
base_path = '/content/drive/MyDrive/dissertation/output_from_code/'
master_file_path = os.path.join(base_path, 'master_processed_file_list.csv')

#load the master csv with all the paths and VA values
master_df = pd.read_csv(master_file_path)

print("Master csv loaded")
display(master_df.head())

#check data for 1 song
#pick a song between 0 and 2215
test_final_song_index = 1111
song_info = master_df.iloc[test_final_song_index]

print(f"\n--- Loading data for song: {song_info['song_id']} ---")

#Load the spectrogram from the file
spectrogram = np.load(song_info['spectrogram_path'])

#load the lyric tensors
encoded_lyrics = torch.load(song_info['lyrics_path'], weights_only=False)

#Get the labels
valence = song_info['valence']
arousal = song_info['arousal']

#check the data
print("Spectrogram Shape:", spectrogram.shape)
print("Encoded Lyrics Tensors:", encoded_lyrics)
print(f"Labels - Valence: {valence}, Arousal: {arousal}")


Master csv loaded


Unnamed: 0,song_id,spectrogram_path,lyrics_path,valence,arousal,quadrant
0,A001,/content/drive/MyDrive/dissertation/output_fro...,/content/drive/MyDrive/dissertation/output_fro...,0.89375,0.29375,Q4
1,A002,/content/drive/MyDrive/dissertation/output_fro...,/content/drive/MyDrive/dissertation/output_fro...,0.68125,0.3375,Q4
2,A003,/content/drive/MyDrive/dissertation/output_fro...,/content/drive/MyDrive/dissertation/output_fro...,0.225,0.25,Q3
3,A004,/content/drive/MyDrive/dissertation/output_fro...,/content/drive/MyDrive/dissertation/output_fro...,0.18125,0.2,Q3
4,A005,/content/drive/MyDrive/dissertation/output_fro...,/content/drive/MyDrive/dissertation/output_fro...,0.6875,0.7875,Q1



--- Loading data for song: MT0006531857 ---
Spectrogram Shape: (128, 1879)
Encoded Lyrics Tensors: {'input_ids': tensor([[  101,  1997,  8714,  1998,  8714,  2066,  1037, 11689,  1997,  6804,
          6953,  6532,  2013,  1996,  2353,  3239, 19262,  2943,  2207,  2013,
          1996,  2568,  1010,  7302,  2000,  1996,  2455,  1997,  1996,  7160,
         23172,  3357,  1999,  1996, 13576, 22774,  2182,  4832,  1996,  1062,
          8004, 27390,  4017, 21305, 12123,  7302,  2000,  8714,  1005,  1055,
          9812, 12702,  1011, 14448, 11305,  1997,  1996,  2534, 14194, 19185,
          4432,  1037,  2088,  1997, 27906,  1006,  1998,  1007,  2061, 19170,
          2854,  2391,  1996,  8782, 16681,  2012,  2017, 13769,  8754,  1997,
          1996, 13374,  7487,  1996,  7619,  3606,  1037,  2088,  2007, 27906,
          1006,  1998,  1007,  2061, 19170,  2854, 13769,  8754,  1997,  1996,
         18823,  2391,  1996, 16681,  2012,  2017,  2009,  1005,  1055,  2035,
          1999,  

To-Do:

Wrap the Dataset in a DataLoader.

Start building the model architecture (CNN for audio, Transformer for text).

In [4]:
class MER_Dataset(Dataset):
    """
    Custom PyTorch Dataset for loading MER spectrograms and lyric tensors.
    """
    def __init__(self, annotations_file, tokenizer):
        """
        Creation of the Dataset from the CSV file.
        """
        self.annotations = pd.read_csv(annotations_file)
        self.tokenizer = tokenizer

    def __len__(self):
        """
        Function to return the total number of songs in the dataset.
        """
        return len(self.annotations)

    def __getitem__(self, index):
        """
        Function to get a song from the dataset.
        """
        song_info = self.annotations.iloc[index] #which song ID/row is picked from the dataset as per the index

        spectrogram_path = song_info['spectrogram_path'] # columns from the df
        lyrics_path = song_info['lyrics_path'] # columns from the df
        valence = song_info['valence'] # columns from the df
        arousal = song_info['arousal'] # columns from the df

        #change spectorgram into a tensor
        spectrogram = np.load(spectrogram_path) #loading spectorgram from path saved in df
        spectrogram_tensor = torch.from_numpy(spectrogram).float() # changing the np array to tensor
        spectrogram_tensor = spectrogram_tensor.unsqueeze(0) #Adding a "channel" dimension for CNN

        #Load the lyric tokens
        encoded_lyrics = torch.load(lyrics_path, weights_only=False)
        input_ids = encoded_lyrics['input_ids'].squeeze(0) #remove the batch dimension from input ids so 1d
        attention_mask = encoded_lyrics['attention_mask'].squeeze(0) #remove the batch dimension from attention mask so 1d


        labels = torch.tensor([valence, arousal], dtype=torch.float32) # extract labels


        return spectrogram_tensor, input_ids, attention_mask, labels

In [6]:
master_file_path = '/content/drive/MyDrive/dissertation/output_from_code/master_processed_file_list.csv'

#create an instance of dataset
mer_dataset = MER_Dataset(annotations_file=master_file_path, tokenizer=tokenizer)

#check the length
print(f"Dataset length: {len(mer_dataset)} songs")

#Check an item
spectrogram, input_ids, attention_mask, labels = mer_dataset[0]

print("\n--- Verifying a single item from the Dataset ---")
print(f"Spectrogram shape: {spectrogram.shape}, Type: {spectrogram.dtype}")
print(f"Input IDs shape: {input_ids.shape}, Type: {input_ids.dtype}")
print(f"Attention Mask shape: {attention_mask.shape}, Type: {attention_mask.dtype}")
print(f"Labels: {labels}, Type: {labels.dtype}")

Dataset length: 2216 songs

--- Verifying a single item from the Dataset ---
Spectrogram shape: torch.Size([1, 128, 1879]), Type: torch.float32
Input IDs shape: torch.Size([512]), Type: torch.int64
Attention Mask shape: torch.Size([512]), Type: torch.int64
Labels: tensor([0.8938, 0.2937]), Type: torch.float32


In [None]:
# Create the DataLoader
data_loader = DataLoader(mer_dataset, batch_size=16, shuffle=True)

# Grab one batch of data
spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch = next(iter(data_loader))

# Print the shapes to verify
print("\n--- Verifying a batch from the DataLoader ---")
print(f"Spectrogram batch shape: {spectrogram_batch.shape}")
print(f"Input IDs batch shape: {input_ids_batch.shape}")
print(f"Attention Mask batch shape: {attention_mask_batch.shape}")
print(f"Labels batch shape: {labels_batch.shape}")