<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Similarity_Analysis_Audio_vs_Lyrics_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap conda tooling inside Colab: install the condacolab package with pip,
# then run its installer so that mamba can be used instead of pip in later cells.
!pip install -q condacolab
import condacolab
condacolab.install()
# NOTE(review): condacolab.install() presumably restarts the Colab kernel - confirm before running the next cells.

In [None]:
# Create the conda environment config file and build the environment from it.
# NOTE(review): no cell visible here activates the 'dissertation' environment afterwards, and
# packages imported later (transformers, seaborn, scikit-learn, scipy, matplotlib, tqdm) are not
# listed below - confirm which environment the notebook kernel actually uses.
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the YAML text to 'environment.yml' in the current working directory.
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# Create the environment with mamba from the yml file.
print("\n Creating environment")

# The confirmation message is echoed only when `mamba env create` exits successfully (&&).
!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

In [None]:
# Session setup: clone the project repository, mount Google Drive, import the
# libraries used by the notebook, load the BERT tokenizer and set plotting defaults.

# Clone the project repository from GitHub into the current working directory
print("⏳ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

# Mount Google Drive at /content/drive (dataset archive and model weights are read from there)
from google.colab import drive
drive.mount('/content/drive')

# Core imports: data handling, audio, plotting, PyTorch, transformers, W&B
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel
import torch.optim as optim
import wandb
import subprocess

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # tokenizer matching the 'bert-base-uncased' lyrics model
print("Tokenizer loaded.")

# Imports for the similarity analysis and evaluation metrics
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_decomposition import CCA
from scipy.stats import pearsonr
import warnings
# NOTE(review): this silences every warning for the rest of the session, not only noisy library ones.
warnings.filterwarnings('ignore')
import types
import json
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.spatial.distance import euclidean

# Set visualization style and the default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
class MER_Dataset(Dataset):
    """Map-style PyTorch dataset serving one song per row of an annotations dataframe."""

    def __init__(self, annotations_df, tokenizer):
        """Keep references to the annotations dataframe and the tokenizer."""
        self.tokenizer = tokenizer
        self.annotations = annotations_df

    def __len__(self):
        """Return the number of rows (songs) in the annotations dataframe."""
        return len(self.annotations)

    def __getitem__(self, index):
        """
        Load everything needed for the song at row position `index`.

        Returns a tuple (spectrogram_tensor, input_ids, attention_mask, labels).
        """
        row = self.annotations.iloc[index]

        # Read the four dataframe columns this item depends on
        spectrogram_path, lyrics_path = row['spectrogram_path'], row['lyrics_path']
        valence, arousal = row['valence'], row['arousal']

        # Spectrogram: .npy array -> float32 tensor with a new leading "channel" axis for the CNN
        spectrogram_tensor = torch.from_numpy(np.load(spectrogram_path)).float().unsqueeze(0)

        # Lyric tokens: saved dict of tensors; squeeze(0) drops a size-1 leading (batch) axis
        tokens = torch.load(lyrics_path, weights_only=False)
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)

        # Regression targets in [valence, arousal] order
        labels = torch.tensor([valence, arousal], dtype=torch.float32)

        return spectrogram_tensor, input_ids, attention_mask, labels

In [None]:
class AttentionModule(nn.Module): #Addition from V1
    """
    Feature-wise gating: a bottleneck MLP produces one sigmoid weight per feature,
    and the input is scaled element-wise by those weights.
    """

    def __init__(self, feature_dim):
        super().__init__()
        bottleneck_dim = feature_dim // 4  # squeeze to a quarter of the width, then expand back
        gate_layers = [
            nn.Linear(feature_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Linear(bottleneck_dim, feature_dim),
            nn.Sigmoid(),  # weights end up in (0, 1)
        ]
        self.attention = nn.Sequential(*gate_layers)

    def forward(self, x):
        # x and the gate both have shape [batch_size, feature_dim]
        return x * self.attention(x)

In [None]:
class VGGish_Audio_Model(nn.Module):
    """
    VGG-style audio tower mapping a single-channel spectrogram to a 64-dimensional feature vector.

    Convolutional part: four blocks of two 3x3 convolutions, each followed by batch
    normalisation and ReLU. The first three blocks end with 2x2 max pooling; the last ends
    with adaptive average pooling to 1x1.
    Head: dropout -> Linear(512, 256) -> ReLU -> dropout -> AttentionModule(256) -> Linear(256, 64).
    """

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels):
        """Return [3x3 conv with padding 1, batch norm, in-place ReLU] as a list of layers."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]

    def __init__(self):
        super().__init__()

        # (input channels, output channels) of each VGG block, in order
        block_channels = [(1, 64), (64, 128), (128, 256), (256, 512)]
        last_block = len(block_channels) - 1

        layers = []
        for block_idx, (c_in, c_out) in enumerate(block_channels):
            # two convolutions per block
            layers += self._conv_bn_relu(c_in, c_out)
            layers += self._conv_bn_relu(c_out, c_out)
            # blocks 1-3 halve the spatial size; the final block collapses it to 1x1
            if block_idx == last_block:
                layers.append(nn.AdaptiveAvgPool2d((1, 1)))
            else:
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.features = nn.Sequential(*layers)

        self.dropout1 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(512, 256)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(0.5)
        self.attention = AttentionModule(256)  # gates the 256-d hidden features
        self.fc2 = nn.Linear(256, 64)  # 64-d output is what the fusion head expects from the audio tower

    def forward(self, x):
        pooled = self.features(x)
        # collapse everything except the batch axis before the fully connected layers
        flat = pooled.view(pooled.size(0), -1)
        hidden = self.relu1(self.fc1(self.dropout1(flat)))
        gated = self.attention(self.dropout2(hidden))
        return self.fc2(gated)

In [None]:
class BimodalClassifier(nn.Module):
    """
    Bimodal regressor: a VGG-style audio tower and a frozen 'bert-base-uncased' lyrics tower
    whose features are concatenated and passed to a small MLP head with two outputs
    (valence and arousal).
    """

    def __init__(self):
        super().__init__()

        # Audio branch
        self.audio_tower = VGGish_Audio_Model()

        # Lyrics branch: pretrained BERT with every parameter frozen
        self.lyrics_tower = AutoModel.from_pretrained('bert-base-uncased')
        self.lyrics_tower.requires_grad_(False)

        # Widths of the two feature vectors that get concatenated
        audio_dim, lyrics_dim = 64, 768
        fused_dim = audio_dim + lyrics_dim

        self.classifier_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(fused_dim, 100),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(100, 2),  # valence and arousal
        )

    def forward(self, x_audio, input_ids, attention_mask):
        # audio features from the spectrogram
        audio_vec = self.audio_tower(x_audio)

        # lyrics features: hidden state at sequence position 0 (the [CLS] token)
        bert_out = self.lyrics_tower(input_ids=input_ids, attention_mask=attention_mask)
        cls_vec = bert_out.last_hidden_state[:, 0, :]

        # fuse along the feature axis and regress
        fused = torch.cat((audio_vec, cls_vec), dim=1)
        return self.classifier_head(fused)

In [None]:
def get_features(self, x_audio, input_ids, attention_mask):
    """
    Run both towers and the fusion head, exposing the per-modality features.

    Written as a free function taking `self` so it can be bound to a model instance.
    Returns: (audio_features, lyrics_features, predictions)
    """
    # Audio tower output
    audio_vec = self.audio_tower(x_audio)

    # Lyrics tower output: keep only the hidden state at sequence position 0
    text_out = self.lyrics_tower(input_ids=input_ids, attention_mask=attention_mask)
    cls_vec = text_out.last_hidden_state[:, 0, :]

    # Predictions come from the concatenation of both feature vectors
    fused = torch.cat((audio_vec, cls_vec), dim=1)
    return audio_vec, cls_vec, self.classifier_head(fused)

In [None]:
# Data loading and preparation: copy the zipped dataset from Google Drive to local Colab
# storage, extract it, load the master CSV and the predefined train/val/test split files,
# merge them, sanity-check the sizes and build the datasets and DataLoaders.

# Announce the Drive -> local copy
print("Starting data transfer from Google Drive to local Colab storage...")

# Source archive on Drive and destination folder on the Colab VM
gdrive_zip_path = '/content/drive/MyDrive/dissertation/merge_dataset_zipped.zip'
local_storage_path = '/content/local_dissertation_data/'
local_zip_path = os.path.join(local_storage_path, 'merge_dataset_zipped.zip')
os.makedirs(local_storage_path, exist_ok=True) # Ensure the destination directory exists

# Copy the single zip file from Drive to Colab
print("Copying single archive file from Google Drive...")
!rsync -ah --progress "{gdrive_zip_path}" "{local_storage_path}"

# Count the archive entries (one name per line from zipinfo -1) to size the progress bar
total_files = int(subprocess.check_output(f"zipinfo -1 {local_zip_path} | wc -l", shell=True))

# Unzip the archive, overwriting existing files (-o); unzip's output is piped into the tqdm CLI
print("Extracting files locally")
!unzip -o "{local_zip_path}" -d "{local_storage_path}" | tqdm --unit=files --total={total_files} > /dev/null

print("Data transfer and extraction complete.")

# Load the master file list from the extracted local copy
local_output_path = os.path.join(local_storage_path, 'merge_dataset/output_from_code/')
master_file_path = os.path.join(local_output_path, 'master_processed_file_list.csv')
master_df = pd.read_csv(master_file_path)

# Check the valence and arousal range in the dataset
print(f"\nValence range in data: [{master_df['valence'].min()}, {master_df['valence'].max()}]")
print(f"Arousal range in data: [{master_df['arousal'].min()}, {master_df['arousal'].max()}]")
print(f"Valence mean: {master_df['valence'].mean():.4f}, std: {master_df['valence'].std():.4f}")
print(f"Arousal mean: {master_df['arousal'].mean():.4f}, std: {master_df['arousal'].std():.4f}")
print(f"Total samples in master_df: {len(master_df)}")

# Verify these are the right columns (many distinct values) rather than the quadrant column
print(f"\nNumber of unique valence values: {master_df['valence'].nunique()}")
print(f"Number of unique arousal values: {master_df['arousal'].nunique()}")
print(f"Number of unique quadrant values: {master_df['quadrant'].nunique()}")

# Print a random sample of actual values (differs between runs; no random_state is set)
print(f"\nSample valence values: {master_df['valence'].sample(10).values}")
print(f"Sample arousal values: {master_df['arousal'].sample(10).values}")

# Rewrite the paths stored in the CSV: swap the Drive output prefix for the local output folder
print("\nUpdating dataframe paths to use fast local storage...")
gdrive_output_path = '/content/drive/MyDrive/dissertation/output_from_code/'
master_df['spectrogram_path'] = master_df['spectrogram_path'].str.replace(gdrive_output_path, local_output_path, regex=False)
master_df['lyrics_path'] = master_df['lyrics_path'].str.replace(gdrive_output_path, local_output_path, regex=False)
print("Dataframe paths updated.")

# Load the predefined 70/15/15 train/validate/test split files from the extracted dataset
local_split_folder_path = os.path.join(local_storage_path, 'merge_dataset/MERGE_Bimodal_Complete/tvt_dataframes/tvt_70_15_15/')
train_split_df = pd.read_csv(os.path.join(local_split_folder_path, 'tvt_70_15_15_train_bimodal_complete.csv'))
val_split_df = pd.read_csv(os.path.join(local_split_folder_path, 'tvt_70_15_15_validate_bimodal_complete.csv'))
test_split_df = pd.read_csv(os.path.join(local_split_folder_path, 'tvt_70_15_15_test_bimodal_complete.csv'))
print("\nSplit files loaded from local storage.")

# Merge each split with the master dataframe; the split files' 'Song' column is renamed to the shared key first
id_column_name = 'song_id'
train_split_df.rename(columns={'Song': id_column_name}, inplace=True)
val_split_df.rename(columns={'Song': id_column_name}, inplace=True)
test_split_df.rename(columns={'Song': id_column_name}, inplace=True)

# pd.merge defaults to an inner join, so songs missing from either side are silently dropped here
train_df = pd.merge(master_df, train_split_df, on=id_column_name)
val_df = pd.merge(master_df, val_split_df, on=id_column_name)
test_df = pd.merge(master_df, test_split_df, on=id_column_name)

# Check that no songs were lost in merging, then check the length of the dataframes.
print("\nchecking data")

# Compare merged row counts against the split files
# NOTE(review): duplicate song_id values would add rows and show up as a negative "lost" count - confirm ids are unique.
if len(train_df) == len(train_split_df):
    print("\nTraining split: Merge successful. All songs accounted for.")
else:
    print(f"\nWARNING: Training split lost {len(train_split_df) - len(train_df)} songs during merge.")

if len(val_df) == len(val_split_df):
    print("Validation split: Merge successful. All songs accounted for.")
else:
    print(f"WARNING: Validation split lost {len(val_split_df) - len(val_df)} songs during merge.")

if len(test_df) == len(test_split_df):
    print("Test split: Merge successful. All songs accounted for.")
else:
    print(f"WARNING: Test split lost {len(test_split_df) - len(test_df)} songs during merge.")

# Hard-coded expected split sizes; the asserts below stop the cell when they do not match
expected_train_len = 1552
expected_val_len = 332
expected_test_len = 332

assert len(train_df) == expected_train_len, f"Expected {expected_train_len} training samples, but found {len(train_df)}"
assert len(val_df) == expected_val_len, f"Expected {expected_val_len} validation samples, but found {len(val_df)}"
assert len(test_df) == expected_test_len, f"Expected {expected_test_len} test samples, but found {len(test_df)}"

print(f"\nFinal dataset lengths are correct: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")
print("Data Check Complete")

# Create the datasets and loaders; only the training loader shuffles
train_dataset = MER_Dataset(annotations_df=train_df, tokenizer=tokenizer)
val_dataset = MER_Dataset(annotations_df=val_df, tokenizer=tokenizer)
test_dataset = MER_Dataset(annotations_df=test_df, tokenizer=tokenizer)

BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("\nDataLoaders created successfully.")

In [None]:
# Choose which split the similarity analysis runs on.
# Work on a copy of the test split; swap in train_df or val_df to analyse another split.
analysis_df = test_df.copy()

# Summary: chosen split, number of songs, and the first ten song ids
print("\n".join([
    "\n✓ Selected dataset for similarity analysis: TEST SET",
    f"  Total songs to analyze: {len(analysis_df)}",
    f"  Song IDs: {analysis_df[id_column_name].head(10).tolist()}...",
]))

In [None]:
# Require a CUDA-enabled GPU: stop immediately when none is visible,
# otherwise bind `device` to it for the rest of the notebook.
if not torch.cuda.is_available():
    raise RuntimeError("Error: No GPU found. This script requires a GPU to run.")

device = torch.device("cuda")
print("GPU is available. Using CUDA device.")


In [None]:
# Build the bimodal network on the selected device and restore the saved weights (model 4)
model_path = '/content/drive/MyDrive/dissertation/bimodal_regression_model.pth'
model = BimodalClassifier().to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()  # evaluation mode for inference


# Bind the module-level get_features function to this instance so it can be called as a method
model.get_features = types.MethodType(get_features, model)

print("Feature extraction added to model.")



In [None]:
def extract_features_from_dataset(model, dataloader, device):
    """
    Run `model.get_features` over every batch of `dataloader` and gather the results.

    Returns a dict of NumPy arrays keyed by 'audio_features', 'lyrics_features',
    'predictions' and 'ground_truth', each concatenated along the batch axis.
    """
    # One list of per-batch arrays per output; key order is also the order of the returned dict
    collected = {
        'audio_features': [],
        'lyrics_features': [],
        'predictions': [],
        'ground_truth': [],
    }

    # Evaluation mode, and no gradient tracking while extracting
    model.eval()

    with torch.no_grad():
        for spec, ids, mask, labels in tqdm(dataloader, desc="Extracting features"):
            # Inputs go to the model's device; the labels are only stored, never fed to the model
            audio_feat, lyrics_feat, preds = model.get_features(
                spec.to(device),
                ids.to(device),
                mask.to(device),
            )

            # Bring every tensor back to the CPU as a NumPy array
            for key, tensor in zip(collected, (audio_feat, lyrics_feat, preds, labels)):
                collected[key].append(tensor.cpu().numpy())

    # Stitch the per-batch arrays together along axis 0
    results = {key: np.concatenate(chunks, axis=0) for key, chunks in collected.items()}

    # Print summary
    print(f"\n✓ Feature extraction complete!")
    print(f"  Total songs processed: {len(results['audio_features'])}")
    print(f"  Audio features shape:  {results['audio_features'].shape}")
    print(f"  Lyrics features shape: {results['lyrics_features'].shape}")
    print(f"  Predictions shape:     {results['predictions'].shape}")
    print(f"  Ground truth shape:    {results['ground_truth'].shape}")

    return results



In [None]:
# Run the bimodal model over the test loader and keep its features and predictions
print("\n" + "=" * 70)
print("EXTRACTING BIMODAL FEATURES AND PREDICTIONS")
print("=" * 70)

features_dict = extract_features_from_dataset(model, test_loader, device)

# Unpack the result dict into notebook-level names; `predictions` is reused for later comparisons
audio_features, lyrics_features, predictions, ground_truth_bimodal = (
    features_dict[key]
    for key in ('audio_features', 'lyrics_features', 'predictions', 'ground_truth')
)

print("\n✓ Bimodal predictions extracted and ready for comparison")

In [None]:
class AudioOnlyModel(nn.Module):
    """Audio-only baseline: the VGG-style audio tower followed by a small regression head."""

    def __init__(self):
        super().__init__()
        # Same audio tower class as the bimodal model uses
        self.audio_tower = VGGish_Audio_Model()

        # Head maps the 64-d audio features to two outputs (valence, arousal)
        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(64, 100),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(100, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x_audio):
        return self.classifier(self.audio_tower(x_audio))

class LyricsOnlyModel(nn.Module):
    """Lyrics-only baseline: frozen 'bert-base-uncased' followed by a small regression head."""

    def __init__(self):
        super().__init__()
        # Pretrained BERT with every parameter frozen; only the head below is trainable
        self.lyrics_tower = AutoModel.from_pretrained('bert-base-uncased')
        self.lyrics_tower.requires_grad_(False)

        # Head maps the 768-d [CLS] embedding to two outputs (valence, arousal)
        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(768, 100),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(100, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        bert_out = self.lyrics_tower(input_ids=input_ids, attention_mask=attention_mask)
        # hidden state at sequence position 0, i.e. the [CLS] token
        cls_vec = bert_out.last_hidden_state[:, 0, :]
        return self.classifier(cls_vec)


In [None]:
# Initialise the two unimodal baselines on the selected device
audio_only_model = AudioOnlyModel().to(device)
lyrics_only_model = LyricsOnlyModel().to(device)

# Locations of previously saved weights on Google Drive (used only when TRAIN_UNIMODAL is False)
audio_model_path = '/content/drive/MyDrive/dissertation/audio_only_model.pth'
lyrics_model_path = '/content/drive/MyDrive/dissertation/lyrics_only_model.pth'

# Flag to control whether to train
TRAIN_UNIMODAL = True  # Set to False if you want to load pre-trained weights instead

if not TRAIN_UNIMODAL:
    # Loading is best-effort: any failure falls back to training. Only ordinary errors are
    # caught (a bare `except:` would also swallow KeyboardInterrupt/SystemExit), and a failure
    # that is not a missing file is reported with its cause instead of as "not found".
    # Note that TRAIN_UNIMODAL is a single flag, so one failed load causes both models to be trained.
    try:
        audio_only_model.load_state_dict(torch.load(audio_model_path, map_location=device))
        print("Loaded pre-trained audio-only model")
    except FileNotFoundError:
        print("No pre-trained audio-only model found")
        TRAIN_UNIMODAL = True
    except Exception as load_error:
        print(f"Could not load pre-trained audio-only model: {load_error}")
        TRAIN_UNIMODAL = True

    try:
        lyrics_only_model.load_state_dict(torch.load(lyrics_model_path, map_location=device))
        print("Loaded pre-trained lyrics-only model")
    except FileNotFoundError:
        print("No pre-trained lyrics-only model found")
        TRAIN_UNIMODAL = True
    except Exception as load_error:
        print(f"Could not load pre-trained lyrics-only model: {load_error}")
        TRAIN_UNIMODAL = True

# Train the audio-only and lyrics-only baselines one after the other (MSE loss, Adam,
# StepLR schedule, gradient clipping, early stopping on validation loss), each in its own
# W&B run. After training, each model is left holding the weights of its best validation epoch.
if TRAIN_UNIMODAL:
    print("\n" + "="*70)
    print("TRAINING UNIMODAL MODELS")
    print("="*70)

    # Training configuration matching MODEL 4
    NUM_EPOCHS = 50
    optimizer_audio = optim.Adam(audio_only_model.parameters(), lr=0.001)
    optimizer_lyrics = optim.Adam(lyrics_only_model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    # Halve the learning rate every 15 epochs
    scheduler_audio = optim.lr_scheduler.StepLR(optimizer_audio, step_size=15, gamma=0.5)
    scheduler_lyrics = optim.lr_scheduler.StepLR(optimizer_lyrics, step_size=15, gamma=0.5)

    # W&B run for the audio model (the lyrics model gets its own run further down)
    wandb.init(project="dissertation-mer-unimodal", name="audio-only-training")

    # Early stopping setup for audio model
    best_val_loss_audio = float('inf')
    patience = 10
    patience_counter_audio = 0
    best_audio_state = None
    best_epoch_audio = 0

    print("\n--- Training Audio-Only Model ---\n")

    for epoch in range(NUM_EPOCHS):
        # Training
        audio_only_model.train()
        total_train_loss = 0

        for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(train_loader, desc=f"Audio Training Epoch {epoch+1}"):
            # Only the spectrogram and the labels are needed for the audio model
            spectrogram_batch = spectrogram_batch.to(device)
            labels_batch = labels_batch.to(device)

            optimizer_audio.zero_grad()
            outputs = audio_only_model(spectrogram_batch)
            loss = loss_fn(outputs, labels_batch)
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(audio_only_model.parameters(), max_norm=1.0)
            optimizer_audio.step()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Audio Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {avg_train_loss:.4f}")
        wandb.log({"audio_epoch": epoch+1, "audio_train_loss": avg_train_loss})

        # Validation
        audio_only_model.eval()
        total_val_loss = 0

        with torch.no_grad():
            for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(val_loader, desc=f"Audio Validation Epoch {epoch+1}"):
                spectrogram_batch = spectrogram_batch.to(device)
                labels_batch = labels_batch.to(device)

                outputs = audio_only_model(spectrogram_batch)
                loss = loss_fn(outputs, labels_batch)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Audio Epoch {epoch+1}/{NUM_EPOCHS}, Validation Loss: {avg_val_loss:.4f}")
        wandb.log({"audio_val_loss": avg_val_loss})

        # Early stopping
        if avg_val_loss < best_val_loss_audio:
            best_val_loss_audio = avg_val_loss
            best_epoch_audio = epoch + 1
            patience_counter_audio = 0
            # state_dict() hands back references to the live tensors and dict.copy() is shallow,
            # so clone every tensor; otherwise later optimizer steps would overwrite the snapshot.
            best_audio_state = {
                name: tensor.detach().clone()
                for name, tensor in audio_only_model.state_dict().items()
            }
            print(f"✓ New best audio validation loss: {best_val_loss_audio:.4f}")
        else:
            patience_counter_audio += 1
            print(f"No improvement for {patience_counter_audio} epochs (patience: {patience})")

            if patience_counter_audio >= patience:
                print(f"Early stopping triggered! Best validation loss: {best_val_loss_audio:.4f}")
                break

        scheduler_audio.step()
        current_lr = scheduler_audio.get_last_lr()[0]
        print(f"Learning Rate: {current_lr:.6f}")
        wandb.log({"audio_learning_rate": current_lr})

    # Restore the best weights whether training ended by early stopping or by running all
    # epochs, so the model that is evaluated matches the reported best validation loss.
    if best_audio_state is not None:
        audio_only_model.load_state_dict(best_audio_state)

    print(f"\nAudio model training complete. Best val loss: {best_val_loss_audio:.4f} at epoch {best_epoch_audio}")

    # Finish audio W&B run
    wandb.finish()

    # Now train lyrics model in a separate W&B run
    wandb.init(project="dissertation-mer-unimodal", name="lyrics-only-training")

    best_val_loss_lyrics = float('inf')
    patience_counter_lyrics = 0
    best_lyrics_state = None
    best_epoch_lyrics = 0

    print("\n--- Training Lyrics-Only Model ---\n")

    for epoch in range(NUM_EPOCHS):
        # Training
        lyrics_only_model.train()
        total_train_loss = 0

        for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(train_loader, desc=f"Lyrics Training Epoch {epoch+1}"):
            # Only the token ids, attention mask and labels are needed for the lyrics model
            input_ids_batch = input_ids_batch.to(device)
            attention_mask_batch = attention_mask_batch.to(device)
            labels_batch = labels_batch.to(device)

            optimizer_lyrics.zero_grad()
            outputs = lyrics_only_model(input_ids_batch, attention_mask_batch)
            loss = loss_fn(outputs, labels_batch)
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(lyrics_only_model.parameters(), max_norm=1.0)
            optimizer_lyrics.step()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Lyrics Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {avg_train_loss:.4f}")
        wandb.log({"lyrics_epoch": epoch+1, "lyrics_train_loss": avg_train_loss})

        # Validation
        lyrics_only_model.eval()
        total_val_loss = 0

        with torch.no_grad():
            for spectrogram_batch, input_ids_batch, attention_mask_batch, labels_batch in tqdm(val_loader, desc=f"Lyrics Validation Epoch {epoch+1}"):
                input_ids_batch = input_ids_batch.to(device)
                attention_mask_batch = attention_mask_batch.to(device)
                labels_batch = labels_batch.to(device)

                outputs = lyrics_only_model(input_ids_batch, attention_mask_batch)
                loss = loss_fn(outputs, labels_batch)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Lyrics Epoch {epoch+1}/{NUM_EPOCHS}, Validation Loss: {avg_val_loss:.4f}")
        wandb.log({"lyrics_val_loss": avg_val_loss})

        # Early stopping
        if avg_val_loss < best_val_loss_lyrics:
            best_val_loss_lyrics = avg_val_loss
            best_epoch_lyrics = epoch + 1
            patience_counter_lyrics = 0
            # Clone the tensors for the same reason as in the audio loop: a shallow dict copy
            # would keep pointing at the weights that training continues to update.
            best_lyrics_state = {
                name: tensor.detach().clone()
                for name, tensor in lyrics_only_model.state_dict().items()
            }
            print(f"✓ New best lyrics validation loss: {best_val_loss_lyrics:.4f}")
        else:
            patience_counter_lyrics += 1
            print(f"No improvement for {patience_counter_lyrics} epochs (patience: {patience})")

            if patience_counter_lyrics >= patience:
                print(f"Early stopping triggered! Best validation loss: {best_val_loss_lyrics:.4f}")
                break

        scheduler_lyrics.step()
        current_lr = scheduler_lyrics.get_last_lr()[0]
        print(f"Learning Rate: {current_lr:.6f}")
        wandb.log({"lyrics_learning_rate": current_lr})

    # Restore the best lyrics weights regardless of how the loop ended
    if best_lyrics_state is not None:
        lyrics_only_model.load_state_dict(best_lyrics_state)

    print(f"\nLyrics model training complete. Best val loss: {best_val_loss_lyrics:.4f} at epoch {best_epoch_lyrics}")

    # Finish lyrics W&B run
    wandb.finish()

    print("\n" + "="*70)
    print("TRAINING COMPLETE")
    print("="*70)
    print(f"Audio model  - Best val loss: {best_val_loss_audio:.4f} at epoch {best_epoch_audio}")
    print(f"Lyrics model - Best val loss: {best_val_loss_lyrics:.4f} at epoch {best_epoch_lyrics}")

# Set to evaluation mode (applies whether the models were trained above or loaded from disk)
audio_only_model.eval()
lyrics_only_model.eval()

In [None]:
def extract_unimodal_predictions(audio_model, lyrics_model, dataloader, device):
    """
    Collect the outputs of the audio-only and lyrics-only models over a dataloader.

    Returns a tuple (audio_predictions, lyrics_predictions, ground_truth) of NumPy
    arrays, each concatenated along the batch axis.
    """
    audio_chunks, lyrics_chunks, label_chunks = [], [], []

    # Both networks in evaluation mode
    for net in (audio_model, lyrics_model):
        net.eval()

    with torch.no_grad():
        for spec, ids, mask, labels in tqdm(dataloader, desc="Extracting predictions"):
            # Model inputs go to the device; labels are only stored
            spec, ids, mask = (t.to(device) for t in (spec, ids, mask))

            # One forward pass per modality
            audio_out = audio_model(spec)
            lyrics_out = lyrics_model(ids, mask)

            # Keep CPU/NumPy copies of this batch
            audio_chunks.append(audio_out.cpu().numpy())
            lyrics_chunks.append(lyrics_out.cpu().numpy())
            label_chunks.append(labels.cpu().numpy())

    # Join the per-batch arrays along axis 0
    audio_predictions, lyrics_predictions, ground_truth = (
        np.concatenate(chunks, axis=0)
        for chunks in (audio_chunks, lyrics_chunks, label_chunks)
    )

    print(f"\n✓ Extraction complete!")
    print(f"  Audio predictions shape:  {audio_predictions.shape}")
    print(f"  Lyrics predictions shape: {lyrics_predictions.shape}")
    print(f"  Ground truth shape:       {ground_truth.shape}")

    return audio_predictions, lyrics_predictions, ground_truth

# Run both unimodal models over the test loader and keep their predictions and the labels
audio_predictions, lyrics_predictions, ground_truth = extract_unimodal_predictions(
    audio_model=audio_only_model,
    lyrics_model=lyrics_only_model,
    dataloader=test_loader,
    device=device,
)


In [None]:
def _rowwise_euclidean(a, b):
    """Return the Euclidean distance between corresponding rows of *a* and *b*."""
    delta = np.asarray(a, dtype=np.float64) - np.asarray(b, dtype=np.float64)
    return np.linalg.norm(delta, axis=1)


def _regression_scores(y_true, y_pred):
    """Return (RMSE, MAE, R²) for one set of predictions as plain Python floats."""
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))
    r2 = float(r2_score(y_true, y_pred))
    return rmse, mae, r2


def compute_unimodal_comparison(audio_preds, lyrics_preds, bimodal_preds, ground_truth):
    """
    Compare predictions between modalities using Euclidean distance.
    Also compute performance metrics for each modality.

    Args:
        audio_preds: 2-D array of audio-only predictions; column 0 is treated
            as valence and column 1 as arousal.
        lyrics_preds: Lyrics-only predictions, same layout as ``audio_preds``.
        bimodal_preds: Bimodal predictions, same layout as ``audio_preds``.
        ground_truth: Target values, same layout as ``audio_preds``.

    Returns:
        Tuple ``(metrics, audio_lyrics_distances, audio_bimodal_distances,
        lyrics_bimodal_distances)``. ``metrics`` maps ``'Audio-Only'``,
        ``'Lyrics-Only'`` and ``'Bimodal'`` to a dict of overall and
        per-dimension RMSE / MAE / R² values stored as plain Python floats
        (so the dict can be passed straight to ``json.dump``). The three
        distance arrays hold one Euclidean distance per sample.

    Raises:
        ValueError: If the four inputs do not all have the same shape.
    """
    # Fail early with a clear message instead of part-way through the
    # distance or metric computations.
    shapes = {
        'audio_preds': np.shape(audio_preds),
        'lyrics_preds': np.shape(lyrics_preds),
        'bimodal_preds': np.shape(bimodal_preds),
        'ground_truth': np.shape(ground_truth),
    }
    if len(set(shapes.values())) != 1:
        raise ValueError(f"All inputs must have the same shape, got {shapes}")

    print("\n" + "="*70)
    print("UNIMODAL VS BIMODAL COMPARISON")
    print("="*70)

    # Per-sample Euclidean distances between each pair of prediction sets,
    # computed in one vectorised call per pair rather than a Python loop.
    audio_lyrics_distances = _rowwise_euclidean(audio_preds, lyrics_preds)
    audio_bimodal_distances = _rowwise_euclidean(audio_preds, bimodal_preds)
    lyrics_bimodal_distances = _rowwise_euclidean(lyrics_preds, bimodal_preds)

    # Print distance statistics
    print("\n1. EUCLIDEAN DISTANCES BETWEEN PREDICTIONS:")
    print(f"   Audio ↔ Lyrics:   Mean={audio_lyrics_distances.mean():.4f}, Std={audio_lyrics_distances.std():.4f}")
    print(f"   Audio ↔ Bimodal:  Mean={audio_bimodal_distances.mean():.4f}, Std={audio_bimodal_distances.std():.4f}")
    print(f"   Lyrics ↔ Bimodal: Mean={lyrics_bimodal_distances.mean():.4f}, Std={lyrics_bimodal_distances.std():.4f}")

    # Compute performance metrics for each modality
    print("\n2. PERFORMANCE METRICS:")

    metrics = {}

    for name, preds in [('Audio-Only', audio_preds),
                        ('Lyrics-Only', lyrics_preds),
                        ('Bimodal', bimodal_preds)]:

        # Overall metrics (over both output columns)
        rmse, mae, r2 = _regression_scores(ground_truth, preds)

        # Per-dimension metrics: column 0 = valence, column 1 = arousal
        valence_rmse, valence_mae, valence_r2 = _regression_scores(
            ground_truth[:, 0], preds[:, 0])
        arousal_rmse, arousal_mae, arousal_r2 = _regression_scores(
            ground_truth[:, 1], preds[:, 1])

        metrics[name] = {
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2,
            'Valence_RMSE': valence_rmse,
            'Arousal_RMSE': arousal_rmse,
            'Valence_MAE': valence_mae,
            'Arousal_MAE': arousal_mae,
            'Valence_R2': valence_r2,
            'Arousal_R2': arousal_r2
        }

        print(f"\n   {name}:")
        print(f"     Overall  - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
        print(f"     Valence  - RMSE: {valence_rmse:.4f}, MAE: {valence_mae:.4f}, R²: {valence_r2:.4f}")
        print(f"     Arousal  - RMSE: {arousal_rmse:.4f}, MAE: {arousal_mae:.4f}, R²: {arousal_r2:.4f}")

    return metrics, audio_lyrics_distances, audio_bimodal_distances, lyrics_bimodal_distances

# Compare the two unimodal prediction sets with the bimodal model's output.
# `predictions` comes from the bimodal model (Cell 14).
metrics, audio_lyrics_dist, audio_bimodal_dist, lyrics_bimodal_dist = compute_unimodal_comparison(
    audio_preds=audio_predictions,
    lyrics_preds=lyrics_predictions,
    bimodal_preds=predictions,
    ground_truth=ground_truth,
)

In [None]:
# Summary figure: a full-width histogram of prediction distances on the top
# row, overall RMSE / R² / MAE bars on the middle row, and per-dimension RMSE
# bars plus an audio-vs-lyrics valence scatter on the bottom row.
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Model order and bar colours shared by every bar panel.
models = ['Audio-Only', 'Lyrics-Only', 'Bimodal']
colors_rmse = ['steelblue', 'orange', 'green']


def _distance_histogram(cell):
    """Draw the overlaid distance histograms in `cell` and return the axes."""
    ax = fig.add_subplot(cell)
    series = (
        (audio_lyrics_dist, 'Audio ↔ Lyrics', 'steelblue'),
        (audio_bimodal_dist, 'Audio ↔ Bimodal', 'orange'),
        (lyrics_bimodal_dist, 'Lyrics ↔ Bimodal', 'green'),
    )
    for data, label, colour in series:
        ax.hist(data, bins=30, alpha=0.6, label=label, color=colour)
    ax.set_xlabel('Euclidean Distance', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title('Distribution of Prediction Distances', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)
    return ax


def _bar_panel(cell, metric_key, ylabel, title, *,
               label_offset=None, label_fmt='.4f', zero_line=False):
    """Draw one per-model bar chart of `metric_key`; return (axes, values).

    When `label_offset` is given, each bar is annotated with its value placed
    that far above the bar top. `zero_line` adds a dashed red line at y=0.
    """
    ax = fig.add_subplot(cell)
    values = [metrics[name][metric_key] for name in models]
    ax.bar(models, values, color=colors_rmse, alpha=0.7, edgecolor='black')
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=12, fontweight='bold')
    if zero_line:
        ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
    ax.grid(axis='y', alpha=0.3)
    if label_offset is not None:
        for position, value in enumerate(values):
            ax.text(position, value + label_offset, format(value, label_fmt),
                    ha='center', fontsize=10)
    return ax, values


# 1. Euclidean distance distributions (top row, all three columns)
ax1 = _distance_histogram(gs[0, :])

# 2-4. Overall RMSE, R² and MAE (middle row, with value labels)
ax2, rmse_vals = _bar_panel(gs[1, 0], 'RMSE', 'RMSE', 'Overall RMSE Comparison',
                            label_offset=0.005)
ax3, r2_vals = _bar_panel(gs[1, 1], 'R2', 'R² Score', 'Overall R² Comparison',
                          label_offset=0.02, label_fmt='.3f', zero_line=True)
ax4, mae_vals = _bar_panel(gs[1, 2], 'MAE', 'MAE', 'Overall MAE Comparison',
                           label_offset=0.003)

# 5-6. Per-dimension RMSE (bottom row, no value labels)
ax5, val_rmse = _bar_panel(gs[2, 0], 'Valence_RMSE', 'RMSE', 'Valence RMSE')
ax6, aro_rmse = _bar_panel(gs[2, 1], 'Arousal_RMSE', 'RMSE', 'Arousal RMSE')

# 7. Audio vs lyrics valence predictions; the dashed diagonal from (0, 0) to
# (1, 1) marks where both models would predict the same value.
ax7 = fig.add_subplot(gs[2, 2])
audio_valence = audio_predictions[:, 0]
lyrics_valence = lyrics_predictions[:, 0]
ax7.scatter(audio_valence, lyrics_valence, alpha=0.5, s=20, color='purple')
ax7.plot([0, 1], [0, 1], 'r--', linewidth=2, label='Perfect agreement')
ax7.set_xlabel('Audio Valence Prediction', fontsize=10)
ax7.set_ylabel('Lyrics Valence Prediction', fontsize=10)
ax7.set_title('Audio vs Lyrics: Valence', fontsize=12, fontweight='bold')
ax7.legend()
ax7.grid(alpha=0.3)

fig.suptitle('Unimodal vs Bimodal Prediction Comparison', fontsize=16, fontweight='bold', y=0.995)
plt.show()

In [None]:
# Build one table with, per row: the ID column, the ground-truth valence and
# arousal from analysis_df, every model's predictions, the pairwise prediction
# distances and the absolute errors. The table is then written to Excel.
results_df = analysis_df[[id_column_name, 'valence', 'arousal']].copy()

# (column prefix, prediction array) for each model; in every array column 0
# is valence and column 1 is arousal.
_model_outputs = (
    ('audio', audio_predictions),
    ('lyrics', lyrics_predictions),
    ('bimodal', predictions),
)

# Predictions from all three models
for _tag, _preds in _model_outputs:
    results_df[f'{_tag}_valence_pred'] = _preds[:, 0]
    results_df[f'{_tag}_arousal_pred'] = _preds[:, 1]

# Euclidean distances between the models' predictions
results_df['audio_lyrics_distance'] = audio_lyrics_dist
results_df['audio_bimodal_distance'] = audio_bimodal_dist
results_df['lyrics_bimodal_distance'] = lyrics_bimodal_dist

# Absolute error of each model against ground_truth, per dimension
for _tag, _preds in _model_outputs:
    results_df[f'{_tag}_valence_error'] = np.abs(_preds[:, 0] - ground_truth[:, 0])
    results_df[f'{_tag}_arousal_error'] = np.abs(_preds[:, 1] - ground_truth[:, 1])

# Write the table to a single-sheet Excel workbook (row index omitted)
excel_path = '/content/drive/MyDrive/dissertation/unimodal_comparison_results.xlsx'
results_df.to_excel(excel_path, sheet_name='Predictions', index=False)

print(f"Results saved to Excel: {excel_path}")
print(
    "\nColumns saved:",
    "  - Song ID and ground truth (valence, arousal)",
    "  - Audio predictions (valence, arousal)",
    "  - Lyrics predictions (valence, arousal)",
    "  - Bimodal predictions (valence, arousal)",
    "  - Euclidean distances between predictions",
    "  - Individual errors for each modality",
    sep="\n",
)

In [None]:
# Save unimodal model weights with descriptive names, plus a JSON summary of
# the comparison metrics.
import json  # imported here so the JSON export works even if no earlier cell imported it

output_dir = '/content/drive/MyDrive/dissertation/'

# Save audio-only model (state_dict only, not the full module object)
audio_save_path = os.path.join(output_dir, 'audio_only_model_weights.pth')
torch.save(audio_only_model.state_dict(), audio_save_path)
print(f"Audio-only model saved: {audio_save_path}")

# Save lyrics-only model (state_dict only)
lyrics_save_path = os.path.join(output_dir, 'lyrics_only_model_weights.pth')
torch.save(lyrics_only_model.state_dict(), lyrics_save_path)
print(f"Lyrics-only model saved: {lyrics_save_path}")


def _to_builtin_floats(scores):
    """Return a copy of a {metric name: number} dict with plain Python floats."""
    return {key: float(value) for key, value in scores.items()}


# Also save a summary JSON with metrics.
# The metric values may be NumPy scalars (e.g. float32), which json.dump
# rejects with a TypeError, so everything is converted to built-in floats.
metrics_summary = {
    'audio_only': _to_builtin_floats(metrics['Audio-Only']),
    'lyrics_only': _to_builtin_floats(metrics['Lyrics-Only']),
    'bimodal': _to_builtin_floats(metrics['Bimodal']),
    'distance_statistics': {
        'audio_lyrics_mean': float(audio_lyrics_dist.mean()),
        'audio_lyrics_std': float(audio_lyrics_dist.std()),
        'audio_bimodal_mean': float(audio_bimodal_dist.mean()),
        'audio_bimodal_std': float(audio_bimodal_dist.std()),
        'lyrics_bimodal_mean': float(lyrics_bimodal_dist.mean()),
        'lyrics_bimodal_std': float(lyrics_bimodal_dist.std())
    }
}

metrics_path = os.path.join(output_dir, 'unimodal_comparison_metrics.json')
with open(metrics_path, 'w') as f:
    json.dump(metrics_summary, f, indent=2)
print(f"Metrics summary saved: {metrics_path}")