<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Dissertation_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap conda/mamba inside the Colab runtime using the condacolab helper package.
!pip install -q condacolab
import condacolab
condacolab.install()
# After this cell, mamba is available and is used instead of pip for the environment setup below.
# NOTE(review): condacolab.install() is expected to restart the kernel - confirm, and run this cell on its own first.

In [None]:
# Create the conda environment config file and build the environment from it.
# The spec below is written verbatim to 'environment.yml' and then handed to mamba.
# NOTE(review): the next cell imports transformers, matplotlib and tqdm, none of which
# are listed here, and nothing visible in this notebook activates the 'dissertation'
# environment - confirm which environment the notebook kernel actually runs in.
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the string content to a file named 'environment.yml' in the current working directory.
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# Create the environment using mamba from the yml file.
print("\n Creating environment")

# The success message is only echoed when `mamba env create` exits without error (shell `&&`).
!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

In [None]:
# imports and setting up of GitHub and W&B

# clone project repository from GitHub
print("⏳ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

#imports
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #loading the tokenizer for lyrics processing
print("Tokenizer loaded.")

In [None]:
# Read the master CSV (all file paths and VA values) from where it was saved on Drive.
base_path = '/content/drive/MyDrive/dissertation/output_from_code/'
master_file_path = os.path.join(base_path, 'master_processed_file_list.csv')
master_df = pd.read_csv(master_file_path)

print("Master csv loaded")
display(master_df.head())

# Sanity check on a single song: pick any row index between 0 and 2215.
test_final_song_index = 9
song_info = master_df.iloc[test_final_song_index]

print(f"\n--- Loading data for song: {song_info['song_id']} ---")

# Audio features: spectrogram array stored on disk at the path recorded in the row.
spectrogram = np.load(song_info['spectrogram_path'])

# Lyrics: pre-tokenised tensors saved with torch.
# NOTE: weights_only=False unpickles arbitrary objects, so only load files you trust.
encoded_lyrics = torch.load(song_info['lyrics_path'], weights_only=False)

# Regression targets for this song.
valence, arousal = song_info['valence'], song_info['arousal']

# Show what was loaded.
print("Spectrogram Shape:", spectrogram.shape)
print("Encoded Lyrics Tensors:", encoded_lyrics)
print(f"Labels - Valence: {valence}, Arousal: {arousal}")


In [None]:
class MER_Dataset(Dataset):
    """
    Map-style PyTorch Dataset serving one song per index for MER training.

    Each row of the annotations dataframe must provide the columns
    'spectrogram_path', 'lyrics_path', 'valence' and 'arousal'.
    """

    def __init__(self, annotations_df, tokenizer):
        """
        Store the dataframe describing the songs of one split.

        annotations_df: dataframe with one row per song (see class docstring).
        tokenizer: kept on the instance as `self.tokenizer`; the lyrics are
            loaded already tokenised, so it is not used when fetching items.
        """
        self.tokenizer = tokenizer
        self.annotations = annotations_df

    def __len__(self):
        """Number of songs (rows) available in this dataset."""
        return len(self.annotations)

    def __getitem__(self, index):
        """
        Load everything needed for the song at positional row `index`.

        Returns a tuple (spectrogram_tensor, input_ids, attention_mask, labels):
        the float spectrogram with a leading channel dimension, the two lyric
        tensors with their leading batch dimension squeezed away, and a
        float32 tensor holding [valence, arousal].
        """
        row = self.annotations.iloc[index]

        # Pull the needed columns out of the row up front.
        spec_file, lyrics_file, val, aro = (
            row[column]
            for column in ('spectrogram_path', 'lyrics_path', 'valence', 'arousal')
        )

        # Audio: numpy array on disk -> float tensor with an extra "channel" axis for the CNN.
        mel = torch.from_numpy(np.load(spec_file)).float().unsqueeze(0)

        # Lyrics: pre-computed token tensors saved with torch.
        # NOTE: weights_only=False unpickles arbitrary objects, so only load trusted files.
        tokens = torch.load(lyrics_file, weights_only=False)
        ids = tokens['input_ids'].squeeze(0)
        mask = tokens['attention_mask'].squeeze(0)

        # Regression targets.
        target = torch.tensor([val, aro], dtype=torch.float32)

        return mel, ids, mask, target

In [None]:
# Folder in Google Drive holding the predefined 70/15/15 train/validate/test split CSVs.
split_folder_path = '/content/drive/MyDrive/dissertation/MERGE_Bimodal_Complete/tvt_dataframes/tvt_70_15_15/'

# The split CSVs name the id column 'Song'; rename it to match the master data.
id_column_name = 'song_id'

# Read the split files. `.rename` returns a new frame, so re-running this cell is safe.
train_split_df = pd.read_csv(
    os.path.join(split_folder_path, 'tvt_70_15_15_train_bimodal_complete.csv')
).rename(columns={'Song': id_column_name})
val_split_df = pd.read_csv(
    os.path.join(split_folder_path, 'tvt_70_15_15_validate_bimodal_complete.csv')
).rename(columns={'Song': id_column_name})
test_split_df = pd.read_csv(
    os.path.join(split_folder_path, 'tvt_70_15_15_test_bimodal_complete.csv')
).rename(columns={'Song': id_column_name})

# Filter the master dataset into the three smaller datasets (inner join on the song id).
train_df = pd.merge(master_df, train_split_df, on=id_column_name)
val_df = pd.merge(master_df, val_split_df, on=id_column_name)
test_df = pd.merge(master_df, test_split_df, on=id_column_name)

# An inner join silently drops songs that are missing from master_df (and repeats rows
# for duplicated ids), so make any mismatch with the split files visible.
for _split_name, _split_df, _merged_df in (
    ('train', train_split_df, train_df),
    ('validation', val_split_df, val_df),
    ('test', test_split_df, test_df),
):
    if len(_merged_df) != len(_split_df):
        print(
            f"WARNING: {_split_name} split lists {len(_split_df)} songs "
            f"but {len(_merged_df)} rows matched the master data."
        )

print(f"Total training samples: {len(train_df)}")  # check against the csv: train should have 1552 songs
print(f"Total validation samples: {len(val_df)}")  # check against the csv: validation should have 332 songs
print(f"Total test samples: {len(test_df)}")  # check against the csv: test should have 332 songs

# Create separate datasets and dataloaders for each split.
train_dataset = MER_Dataset(annotations_df=train_df, tokenizer=tokenizer)
val_dataset = MER_Dataset(annotations_df=val_df, tokenizer=tokenizer)
test_dataset = MER_Dataset(annotations_df=test_df, tokenizer=tokenizer)

BATCH_SIZE = 16  # using for now, but can change this in the future
# Shuffle the training data only; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Verify a batch: pull one from the training loader and inspect the shapes.
spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch = next(iter(train_loader))

# The original string began with "\V", an invalid escape sequence; a newline was intended.
print("\n--- Verifying a batch from the new train_loader ---")
print(f"Spectrogram batch shape: {spectrogram_batch.shape}")
print(f"Input IDs batch shape: {input_ids_batch.shape}")
print(f"Labels batch shape: {labels_batch.shape}")