<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Dissertation_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap conda/mamba inside the Colab runtime via condacolab.
# NOTE: condacolab.install() restarts the kernel when it finishes (see the
# "Restarting kernel..." line in this cell's output), so run this cell on its
# own and continue with the next cell once the runtime is back.
!pip install -q condacolab
import condacolab
condacolab.install()
# install mamba to use instead of pip

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:06
🔁 Restarting kernel...


In [None]:
# Create the conda environment spec and build the environment with mamba.
#
# NOTE(review): no later cell visible in this notebook activates the
# 'dissertation' environment, and several packages imported further down
# (transformers, matplotlib, scikit-learn) are not listed in this spec.
# Confirm whether the notebook is meant to run inside this environment or in
# the default Colab kernel.
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the spec to 'environment.yml' in the current working directory
# (an existing file of that name is overwritten).
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# Build the environment from the spec. The trailing `echo` is chained with
# `&&`, so the "ready to use" message only appears when mamba exits successfully.
print("\n Creating environment")

!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

environment.yml file created successfully.

 Creating environment
Channels:
 - pytorch
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... 

done

 'dissertation' environment is ready to use.


In [None]:
# Session setup: clone the project repo, mount Google Drive, import the
# libraries used by the rest of the notebook and load the lyrics tokenizer.
# (wandb is only imported here; the run itself is started in the training cell.)

# Clone the project repository from GitHub into the current working directory.
# NOTE(review): a failing `!git clone` (e.g. the folder already exists on a
# re-run) does not raise in a notebook, so "Repository cloned." is printed
# regardless - confirm whether a guard is wanted here.
print("⏳ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

# Mount Google Drive at /content/drive (the dataset archive is read from it
# and the trained weights are written to it in later cells).
from google.colab import drive
drive.mount('/content/drive')

# Imports used throughout the notebook.
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel
import torch.optim as optim
import wandb
import subprocess

# Load the tokenizer matching the 'bert-base-uncased' lyrics model.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # tokenizer for lyrics processing
print("Tokenizer loaded.")

⏳ Cloning GitHub repository...
Cloning into 'Dissertation'...
remote: Enumerating objects: 107, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 107 (delta 48), reused 5 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (107/107), 592.03 KiB | 5.43 MiB/s, done.
Resolving deltas: 100% (48/48), done.
Repository cloned.
Mounted at /content/drive
Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded.


In [None]:
class MER_Dataset(Dataset):
    """PyTorch Dataset that serves one song per index for music emotion recognition.

    Every row of the annotations dataframe must provide the columns
    'spectrogram_path', 'lyrics_path', 'valence' and 'arousal'.
    """

    def __init__(self, annotations_df, tokenizer):
        """Keep a reference to the split dataframe and the tokenizer.

        Args:
            annotations_df: dataframe with one row per song (one of the
                predefined MERGE dataset splits).
            tokenizer: stored on the instance; it is not used when fetching
                items because the lyrics are read back from saved files.
        """
        self.annotations = annotations_df
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of songs (rows) in this split."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load the song stored at positional row ``index``.

        Returns:
            A tuple ``(spectrogram_tensor, input_ids, attention_mask, labels)``:
            the spectrogram as a float32 tensor with a leading channel
            dimension, the two saved lyric tensors with their leading
            dimension squeezed out, and a float32 tensor ``[valence, arousal]``.
        """
        row = self.annotations.iloc[index]  # positional lookup of the song's row
        spec_file, lyrics_file, valence, arousal = (
            row[column]
            for column in ('spectrogram_path', 'lyrics_path', 'valence', 'arousal')
        )

        # Audio: saved numpy array -> float32 tensor, plus a "channel" axis for the CNN.
        spectrogram_tensor = torch.from_numpy(np.load(spec_file)).float().unsqueeze(0)

        # Lyrics: saved mapping holding 'input_ids' and 'attention_mask'.
        # NOTE: weights_only=False lets torch.load unpickle arbitrary objects,
        # so only point this at files produced by the project's own preprocessing.
        saved_tokens = torch.load(lyrics_file, weights_only=False)
        input_ids, attention_mask = (
            saved_tokens[key].squeeze(0)  # drop the leading batch dimension
            for key in ('input_ids', 'attention_mask')
        )

        # Regression targets in the fixed order [valence, arousal].
        labels = torch.tensor([valence, arousal], dtype=torch.float32)

        return spectrogram_tensor, input_ids, attention_mask, labels

In [None]:
class VGGish_Audio_Model(nn.Module):
    """VGG-style CNN used as the audio tower of the starting model.

    Four 3x3 convolution stages (1 -> 64 -> 128 -> 256 -> 512 channels), each
    followed by ReLU; the first three end in a 2x2 max-pool and the last in a
    global average pool. A small MLP then maps the 512 pooled features to a
    64-dimensional audio feature vector.

    This no longer tries to implement the method from the MERGE paper, as the
    paper contained mistakes.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels) of each convolution stage, in order.
        stage_channels = [(1, 64), (64, 128), (128, 256), (256, 512)]
        final_stage = len(stage_channels) - 1

        feature_layers = []
        for stage, (channels_in, channels_out) in enumerate(stage_channels):
            feature_layers.append(nn.Conv2d(channels_in, channels_out, kernel_size=3, padding=1))
            feature_layers.append(nn.ReLU(inplace=True))
            if stage < final_stage:
                # Halve the spatial resolution between stages.
                feature_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                # Collapse whatever spatial size is left to 1x1.
                feature_layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.features = nn.Sequential(*feature_layers)

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(512, 256),  # input size = channels of the last conv stage
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 64),  # 64 must match the audio input size of the combined model
        )

    def forward(self, x):
        """Map a batch of single-channel spectrograms to 64-d feature vectors."""
        pooled = self.features(x)
        # (batch, 512, 1, 1) -> (batch, 512) so the linear layers can consume it.
        flattened = pooled.view(pooled.shape[0], -1)
        return self.classifier(flattened)


In [None]:
class BimodalClassifier(nn.Module):
    """
    The final bimodal model: a CNN audio tower plus a frozen BERT lyrics tower,
    fused by concatenation and fed to a small regression head.

    No longer using the MERGE architecture, as a transformer would be better.
    Also, due to mistakes in the paper it is unclear what some of the
    parameters are.
    """
    def __init__(self):
        super(BimodalClassifier, self).__init__()

        # Audio tower: spectrogram -> 64-d feature vector.
        self.audio_tower = VGGish_Audio_Model()

        # Lyrics tower: pretrained transformer (bert-base-uncased for now, may
        # change). All of its weights are frozen, so it acts as a fixed
        # feature extractor.
        self.lyrics_tower = AutoModel.from_pretrained('bert-base-uncased')
        for param in self.lyrics_tower.parameters():
            param.requires_grad = False

        # Feature sizes coming out of the audio tower and out of BERT.
        AUDIO_FEATURES_OUT = 64
        LYRICS_FEATURES_OUT = 768
        COMBINED_FEATURES = AUDIO_FEATURES_OUT + LYRICS_FEATURES_OUT

        self.classifier_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(in_features=COMBINED_FEATURES, out_features=100),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(in_features=100, out_features=2) # 2 outputs: valence and arousal
        )

    def train(self, mode=True):
        """Switch train/eval mode while keeping a frozen lyrics tower in eval mode.

        ``nn.Module.train`` flips every sub-module, which would re-enable the
        dropout layers inside the lyrics tower and make its (frozen) features
        noisy during training. While none of the lyrics tower's parameters
        require gradients it is therefore forced back to eval mode; if the
        tower is unfrozen later, it follows ``mode`` as usual.

        Args:
            mode: True for training mode, False for evaluation mode.

        Returns:
            self, as ``nn.Module.train`` does.
        """
        super().train(mode)
        lyrics_frozen = not any(p.requires_grad for p in self.lyrics_tower.parameters())
        if lyrics_frozen:
            self.lyrics_tower.eval()
        return self

    def forward(self, x_audio, input_ids, attention_mask):
        """Predict valence and arousal for a batch of songs.

        Args:
            x_audio: spectrogram batch passed to the audio tower.
            input_ids: lyric token ids passed to the lyrics tower.
            attention_mask: attention mask passed to the lyrics tower.

        Returns:
            Tensor with 2 values per sample (valence, arousal).
        """
        # Audio features from the CNN tower.
        audio_features = self.audio_tower(x_audio)

        # Lyric features from the transformer.
        lyrics_outputs = self.lyrics_tower(input_ids=input_ids, attention_mask=attention_mask)

        # Use the embedding of the first ([CLS]) token as the feature vector
        # for the whole lyrics.
        lyrics_features = lyrics_outputs.last_hidden_state[:, 0, :]

        # Fuse both modalities by concatenating along the feature dimension.
        combined_features = torch.cat((audio_features, lyrics_features), dim=1)

        # Regress valence/arousal from the fused features.
        output = self.classifier_head(combined_features)

        return output

In [None]:
# Data loading and prep: copy the zipped dataset from Google Drive to the
# Colab VM's local disk and extract it there.

print("Starting data transfer from Google Drive to local Colab storage...")

# Source archive on the mounted Drive and destination folder on the local disk.
gdrive_zip_path = '/content/drive/MyDrive/dissertation/merge_dataset_zipped.zip'
local_storage_path = '/content/local_dissertation_data/'
local_zip_path = os.path.join(local_storage_path, 'merge_dataset_zipped.zip')
os.makedirs(local_storage_path, exist_ok=True) # Ensure the destination directory exists

# Copy the single zip file from Drive into the local folder.
print("Copying single archive file from Google Drive...")
!rsync -ah --progress "{gdrive_zip_path}" "{local_storage_path}"

# Count the entries in the archive so the extraction progress bar has a total.
# NOTE(review): the path is interpolated into a shell command unquoted; fine for
# the constant path above, but it would break on a path containing spaces.
total_files = int(subprocess.check_output(f"zipinfo -1 {local_zip_path} | wc -l", shell=True))

# Extract (overwriting existing files); unzip's per-file output lines are piped
# into the tqdm command-line tool purely to draw a progress bar.
print("Extracting files locally... This will be quick!")
!unzip -o "{local_zip_path}" -d "{local_storage_path}" | tqdm --unit=files --total={total_files} > /dev/null

print("Data transfer and extraction complete.")

# Read the master file list from the locally extracted copy of the dataset.
local_output_path = os.path.join(local_storage_path, 'merge_dataset/output_from_code/')
master_file_path = os.path.join(local_output_path, 'master_processed_file_list.csv')
master_df = pd.read_csv(master_file_path)

# Point the recorded file paths at local storage: wherever a path contains the
# Google Drive output folder, that part is swapped for the local output folder
# (plain substring replacement, no regex).
print("Updating dataframe paths to use fast local storage...")
gdrive_output_path = '/content/drive/MyDrive/dissertation/output_from_code/'
for path_column in ('spectrogram_path', 'lyrics_path'):
    master_df[path_column] = master_df[path_column].str.replace(
        gdrive_output_path, local_output_path, regex=False
    )
print("Dataframe paths updated.")

# Load the predefined train/validate/test split files (tvt_70_15_15) from local
# storage and join each of them with the master file list.
id_column_name = 'song_id'
local_split_folder_path = os.path.join(local_storage_path, 'merge_dataset/MERGE_Bimodal_Complete/tvt_dataframes/tvt_70_15_15/')


def _read_split(csv_name):
    """Read one split CSV and rename its 'Song' column to the shared id column."""
    split_frame = pd.read_csv(os.path.join(local_split_folder_path, csv_name))
    return split_frame.rename(columns={'Song': id_column_name})


train_split_df = _read_split('tvt_70_15_15_train_bimodal_complete.csv')
val_split_df = _read_split('tvt_70_15_15_validate_bimodal_complete.csv')
test_split_df = _read_split('tvt_70_15_15_test_bimodal_complete.csv')
print("Split files loaded from local storage.")

# Default (inner) merge on the song id: only songs present in both the master
# list and the split file are kept.
train_df = master_df.merge(train_split_df, on=id_column_name)
val_df = master_df.merge(val_split_df, on=id_column_name)
test_df = master_df.merge(test_split_df, on=id_column_name)

# Sanity checks: no songs lost in the merges, and every split has the expected size.
print("\nchecking data")

# A merged frame shorter (or longer) than its split file means rows were
# dropped (or duplicated) by the merge; this only warns.
for split_label, merged_frame, split_frame in (
    ("Training", train_df, train_split_df),
    ("Validation", val_df, val_split_df),
    ("Test", test_df, test_split_df),
):
    songs_lost = len(split_frame) - len(merged_frame)
    if songs_lost == 0:
        print(f"{split_label} split: Merge successful. All songs accounted for.")
    else:
        print(f"WARNING: {split_label} split lost {songs_lost} songs during merge.")

# Expected number of songs per split; a mismatch stops the notebook here.
expected_train_len = 1552
expected_val_len = 332
expected_test_len = 332

for split_name, merged_frame, expected_len in (
    ("training", train_df, expected_train_len),
    ("validation", val_df, expected_val_len),
    ("test", test_df, expected_test_len),
):
    assert len(merged_frame) == expected_len, f"Expected {expected_len} {split_name} samples, but found {len(merged_frame)}"

print(f"Final dataset lengths are correct: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")
print("Data Check Complete")

# Create the datasets and loaders: one Dataset per split, one DataLoader per Dataset.
BATCH_SIZE = 16

train_dataset, val_dataset, test_dataset = (
    MER_Dataset(annotations_df=split_frame, tokenizer=tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Only the training data is shuffled; validation and test keep dataframe order.
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE, shuffle=shuffle_split)
    for split_dataset, shuffle_split in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

print("DataLoaders created successfully.")

Starting data transfer from Google Drive to local Colab storage...
Copying single archive file from Google Drive...
sending incremental file list
Extracting files locally... This will be quick!
4442files [00:15, 279.22files/s]           
Data transfer and extraction complete.
Updating dataframe paths to use fast local storage...
Dataframe paths updated.
Split files loaded from local storage.

checking data
Training split: Merge successful. All songs accounted for.
Validation split: Merge successful. All songs accounted for.
Test split: Merge successful. All songs accounted for.
Final dataset lengths are correct: Train(1552), Val(332), Test(332)
Data Check Complete
DataLoaders created successfully.


In [None]:
# This notebook is GPU-only: stop immediately when no CUDA device is visible.
if not torch.cuda.is_available():
    raise RuntimeError("Error: No GPU found. This script requires a GPU to run.")

# A CUDA-enabled GPU was found, so use it.
device = torch.device("cuda")
print("GPU is available. Using CUDA device.")


GPU is available. Using CUDA device.


In [None]:
# Final model: build it, then train and validate for NUM_EPOCHS epochs,
# logging one row of metrics per epoch to Weights & Biases.

# Hyperparameters of this run (also recorded in the W&B run config below).
SEED = 42
LEARNING_RATE = 0.001
NUM_EPOCHS = 50

# Seed torch's RNG before the model is built so weight initialisation and the
# training shuffle order are repeatable (some CUDA kernels can still be
# non-deterministic).
torch.manual_seed(SEED)

model = BimodalClassifier()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Only hand the optimizer the parameters that can actually be updated
# (the lyrics tower is frozen inside BimodalClassifier).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=LEARNING_RATE)
loss_fn = nn.MSELoss() # Using Mean Squared Error for regression

wandb.init(
    project="dissertation-mer-regression",
    config={"seed": SEED, "learning_rate": LEARNING_RATE, "epochs": NUM_EPOCHS},
)

for epoch in range(NUM_EPOCHS):
    # ---- training ----
    model.train()
    train_loss_sum = 0.0

    for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        spectrogram_batch = spectrogram_batch.to(device)
        input_ids_batch = input_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        labels_batch = labels_batch.to(device)

        optimizer.zero_grad()
        outputs = model(spectrogram_batch, input_ids_batch, attention_mask_batch)
        loss = loss_fn(outputs, labels_batch)
        loss.backward()
        optimizer.step()

        # Weight each batch's mean loss by its size so a smaller final batch
        # does not count as much as a full one.
        train_loss_sum += loss.item() * labels_batch.size(0)

    avg_train_loss = train_loss_sum / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Average Training Loss: {avg_train_loss:.4f}")

    # ---- validation ----
    model.eval()
    val_loss_sum = 0.0

    with torch.no_grad():  # no gradients needed for validation
        for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            spectrogram_batch = spectrogram_batch.to(device)
            input_ids_batch = input_ids_batch.to(device)
            attention_mask_batch = attention_mask_batch.to(device)
            labels_batch = labels_batch.to(device)

            outputs = model(spectrogram_batch, input_ids_batch, attention_mask_batch)
            loss = loss_fn(outputs, labels_batch)
            val_loss_sum += loss.item() * labels_batch.size(0)

    avg_val_loss = val_loss_sum / len(val_loader.dataset)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Average Validation Loss: {avg_val_loss:.4f}")

    # Log once per epoch: every wandb.log call advances the W&B step, so
    # logging train and validation loss separately would put them on
    # different steps.
    wandb.log({"epoch": epoch+1, "train_loss": avg_train_loss, "val_loss": avg_val_loss})

print("--- Pipeline Test Complete ---")



0,1
epoch,▁
train_loss,▁
val_loss,▁

0,1
epoch,1.0
train_loss,0.06497
val_loss,0.02505


Training Epoch 1:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 1/50, Average Training Loss: 0.0621


Validation Epoch 1:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 1/50, Average Validation Loss: 0.0353


Training Epoch 2:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 2/50, Average Training Loss: 0.0441


Validation Epoch 2:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 2/50, Average Validation Loss: 0.0265


Training Epoch 3:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 3/50, Average Training Loss: 0.0394


Validation Epoch 3:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 3/50, Average Validation Loss: 0.0221


Training Epoch 4:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 4/50, Average Training Loss: 0.0351


Validation Epoch 4:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 4/50, Average Validation Loss: 0.0215


Training Epoch 5:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 5/50, Average Training Loss: 0.0353


Validation Epoch 5:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 5/50, Average Validation Loss: 0.0221


Training Epoch 6:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 6/50, Average Training Loss: 0.0328


Validation Epoch 6:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 6/50, Average Validation Loss: 0.0218


Training Epoch 7:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 7/50, Average Training Loss: 0.0324


Validation Epoch 7:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 7/50, Average Validation Loss: 0.0246


Training Epoch 8:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 8/50, Average Training Loss: 0.0301


Validation Epoch 8:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 8/50, Average Validation Loss: 0.0233


Training Epoch 9:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 9/50, Average Training Loss: 0.0282


Validation Epoch 9:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 9/50, Average Validation Loss: 0.0214


Training Epoch 10:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 10/50, Average Training Loss: 0.0284


Validation Epoch 10:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 10/50, Average Validation Loss: 0.0207


Training Epoch 11:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 11/50, Average Training Loss: 0.0275


Validation Epoch 11:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 11/50, Average Validation Loss: 0.0219


Training Epoch 12:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 12/50, Average Training Loss: 0.0277


Validation Epoch 12:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 12/50, Average Validation Loss: 0.0207


Training Epoch 13:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 13/50, Average Training Loss: 0.0272


Validation Epoch 13:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 13/50, Average Validation Loss: 0.0202


Training Epoch 14:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 14/50, Average Training Loss: 0.0263


Validation Epoch 14:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 14/50, Average Validation Loss: 0.0203


Training Epoch 15:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 15/50, Average Training Loss: 0.0259


Validation Epoch 15:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 15/50, Average Validation Loss: 0.0219


Training Epoch 16:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 16/50, Average Training Loss: 0.0262


Validation Epoch 16:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 16/50, Average Validation Loss: 0.0216


Training Epoch 17:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 17/50, Average Training Loss: 0.0258


Validation Epoch 17:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 17/50, Average Validation Loss: 0.0202


Training Epoch 18:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 18/50, Average Training Loss: 0.0264


Validation Epoch 18:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 18/50, Average Validation Loss: 0.0201


Training Epoch 19:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 19/50, Average Training Loss: 0.0257


Validation Epoch 19:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 19/50, Average Validation Loss: 0.0202


Training Epoch 20:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 20/50, Average Training Loss: 0.0256


Validation Epoch 20:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 20/50, Average Validation Loss: 0.0207


Training Epoch 21:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 21/50, Average Training Loss: 0.0255


Validation Epoch 21:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 21/50, Average Validation Loss: 0.0200


Training Epoch 22:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 22/50, Average Training Loss: 0.0249


Validation Epoch 22:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 22/50, Average Validation Loss: 0.0208


Training Epoch 23:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 23/50, Average Training Loss: 0.0252


Validation Epoch 23:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 23/50, Average Validation Loss: 0.0202


Training Epoch 24:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 24/50, Average Training Loss: 0.0251


Validation Epoch 24:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 24/50, Average Validation Loss: 0.0203


Training Epoch 25:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 25/50, Average Training Loss: 0.0254


Validation Epoch 25:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 25/50, Average Validation Loss: 0.0197


Training Epoch 26:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 26/50, Average Training Loss: 0.0256


Validation Epoch 26:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 26/50, Average Validation Loss: 0.0199


Training Epoch 27:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 27/50, Average Training Loss: 0.0251


Validation Epoch 27:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 27/50, Average Validation Loss: 0.0199


Training Epoch 28:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 28/50, Average Training Loss: 0.0250


Validation Epoch 28:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 28/50, Average Validation Loss: 0.0201


Training Epoch 29:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 29/50, Average Training Loss: 0.0254


Validation Epoch 29:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 29/50, Average Validation Loss: 0.0205


Training Epoch 30:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 30/50, Average Training Loss: 0.0249


Validation Epoch 30:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 30/50, Average Validation Loss: 0.0202


Training Epoch 31:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 31/50, Average Training Loss: 0.0245


Validation Epoch 31:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 31/50, Average Validation Loss: 0.0200


Training Epoch 32:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 32/50, Average Training Loss: 0.0251


Validation Epoch 32:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 32/50, Average Validation Loss: 0.0194


Training Epoch 33:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 33/50, Average Training Loss: 0.0249


Validation Epoch 33:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 33/50, Average Validation Loss: 0.0200


Training Epoch 34:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 34/50, Average Training Loss: 0.0254


Validation Epoch 34:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 34/50, Average Validation Loss: 0.0198


Training Epoch 35:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 35/50, Average Training Loss: 0.0249


Validation Epoch 35:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 35/50, Average Validation Loss: 0.0200


Training Epoch 36:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 36/50, Average Training Loss: 0.0247


Validation Epoch 36:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 36/50, Average Validation Loss: 0.0204


Training Epoch 37:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 37/50, Average Training Loss: 0.0248


Validation Epoch 37:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 37/50, Average Validation Loss: 0.0196


Training Epoch 38:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 38/50, Average Training Loss: 0.0258


Validation Epoch 38:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 38/50, Average Validation Loss: 0.0201


Training Epoch 39:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 39/50, Average Training Loss: 0.0256


Validation Epoch 39:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 39/50, Average Validation Loss: 0.0196


Training Epoch 40:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 40/50, Average Training Loss: 0.0251


Validation Epoch 40:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 40/50, Average Validation Loss: 0.0201


Training Epoch 41:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 41/50, Average Training Loss: 0.0246


Validation Epoch 41:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 41/50, Average Validation Loss: 0.0195


Training Epoch 42:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 42/50, Average Training Loss: 0.0251


Validation Epoch 42:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 42/50, Average Validation Loss: 0.0197


Training Epoch 43:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 43/50, Average Training Loss: 0.0254


Validation Epoch 43:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 43/50, Average Validation Loss: 0.0200


Training Epoch 44:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 44/50, Average Training Loss: 0.0245


Validation Epoch 44:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 44/50, Average Validation Loss: 0.0195


Training Epoch 45:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 45/50, Average Training Loss: 0.0254


Validation Epoch 45:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 45/50, Average Validation Loss: 0.0195


Training Epoch 46:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 46/50, Average Training Loss: 0.0258


Validation Epoch 46:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 46/50, Average Validation Loss: 0.0200


Training Epoch 47:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 47/50, Average Training Loss: 0.0254


Validation Epoch 47:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 47/50, Average Validation Loss: 0.0197


Training Epoch 48:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 48/50, Average Training Loss: 0.0246


Validation Epoch 48:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 48/50, Average Validation Loss: 0.0202


Training Epoch 49:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 49/50, Average Training Loss: 0.0255


Validation Epoch 49:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 49/50, Average Validation Loss: 0.0199


Training Epoch 50:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 50/50, Average Training Loss: 0.0248


Validation Epoch 50:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 50/50, Average Validation Loss: 0.0197
--- Pipeline Test Complete ---


In [None]:
# --- EVALUATION ON THE TEST SET ---
# Run the trained model over the test loader, then report MSE / MAE / R-squared
# separately for valence and arousal and log them to W&B.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _regression_scores(y_true, y_pred):
    """Return (MSE, MAE, R-squared) for one target dimension."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Evaluation mode for the whole pass over the test set.
model.eval()

prediction_chunks = []
label_chunks = []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(test_loader, desc="Testing"):
        outputs = model(
            spectrogram_batch.to(device),
            input_ids_batch.to(device),
            attention_mask_batch.to(device),
        )

        # Keep predictions and true labels as numpy arrays on the CPU.
        prediction_chunks.append(outputs.cpu().numpy())
        label_chunks.append(labels_batch.cpu().numpy())

# Stack the per-batch arrays into one array per quantity.
all_predictions = np.concatenate(prediction_chunks, axis=0)
all_labels = np.concatenate(label_chunks, axis=0)

# Column 0 is valence and column 1 is arousal (the order the labels were built in).
valence_true, valence_pred = all_labels[:, 0], all_predictions[:, 0]
arousal_true, arousal_pred = all_labels[:, 1], all_predictions[:, 1]

# --- FINAL RESULTS ---
print("\n--- Test Set Evaluation Results ---")

mse_v, mae_v, r2_v = _regression_scores(valence_true, valence_pred)
print(f"Valence  -> MSE: {mse_v:.4f}, MAE: {mae_v:.4f}, R-squared: {r2_v:.4f}")

mse_a, mae_a, r2_a = _regression_scores(arousal_true, arousal_pred)
print(f"Arousal  -> MSE: {mse_a:.4f}, MAE: {mae_a:.4f}, R-squared: {r2_a:.4f}")

# Log the final test metrics to the active wandb run.
test_metrics = {
    "test_mse_valence": mse_v, "test_mae_valence": mae_v, "test_r2_valence": r2_v,
    "test_mse_arousal": mse_a, "test_mae_arousal": mae_a, "test_r2_arousal": r2_a,
}
wandb.log(test_metrics)

print("\n--- Evaluation Complete ---")

Testing:   0%|          | 0/21 [00:00<?, ?it/s]


--- Test Set Evaluation Results ---
Valence  -> MSE: 0.0260, MAE: 0.1283, R-squared: 0.4693
Arousal  -> MSE: 0.0092, MAE: 0.0761, R-squared: 0.3486

--- Evaluation Complete ---


In [None]:


# Persist the trained model to Google Drive so it survives the Colab session.
save_path = '/content/drive/MyDrive/dissertation/bimodal_regression_model.pth'

print("\n💾 Saving the trained model weights...")

# Only the learned parameters (the state dictionary) are written, not the
# model object itself; this includes the frozen lyrics tower's weights.
trained_weights = model.state_dict()
torch.save(trained_weights, save_path)

print(f"Model saved successfully to: {save_path}")


💾 Saving the trained model weights...
Model saved successfully to: /content/drive/MyDrive/dissertation/bimodal_regression_model.pth
