<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Siamese_network_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q condacolab
import condacolab
condacolab.install()
# install mamba to use instead of pip

‚è¨ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
üì¶ Installing...
üìå Adjusting configuration...
ü©π Patching environment...
‚è≤ Done in 0:00:09
üîÅ Restarting kernel...


In [1]:
# Create the config file and build the environment.
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the string content to a file -  'environment.yml'.
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# create the environment using mamba from the yml file.
print("\n Creating environment")

!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

environment.yml file created successfully.

 Creating environment
Channels:
 - pytorch
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done

 'dissertation' environment is ready to use.


In [2]:
# imports and setting up of GitHub and W&B

# clone project repository from GitHub
print("‚è≥ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

#imports
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel
import torch.optim as optim
import wandb
import subprocess
import shutil


print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #loading the tokenizer for lyrics processing
print("Tokenizer loaded.")

import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')
import types
import json
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.spatial.distance import euclidean

import torch.nn.functional as F

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

‚è≥ Cloning GitHub repository...
Cloning into 'Dissertation'...
remote: Enumerating objects: 403, done.[K
remote: Counting objects: 100% (111/111), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 403 (delta 95), reused 64 (delta 64), pack-reused 292 (from 1)[K
Receiving objects: 100% (403/403), 5.67 MiB | 12.27 MiB/s, done.
Resolving deltas: 100% (217/217), done.
Repository cloned.
Mounted at /content/drive
Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded.


In [11]:
class MER_Dataset(Dataset):
    """ Custom PyTorch Dataset with On-the-Fly Augmentation """
    def __init__(self, annotations_df, tokenizer, augment=False):
        self.annotations = annotations_df
        self.tokenizer = tokenizer
        self.augment = augment  # Only augment training data

    def __len__(self):
        return len(self.annotations)

    def augment_spectrogram(self, spec):
        """
        Apply Frequency Masking and Time Masking (SpecAugment)
        Input shape: (Freq, Time) -> 2D Array
        """
        # Frequency Masking (Masking horizontal strips)
        freq_mask_param = 10
        if spec.shape[0] > freq_mask_param:
            f = np.random.randint(0, freq_mask_param)
            f_start = np.random.randint(0, spec.shape[0] - f)
            spec[f_start:f_start+f, :] = 0  # Apply mask to rows

        # Time Masking (Masking vertical strips)
        time_mask_param = 20
        if spec.shape[1] > time_mask_param:
            t = np.random.randint(0, time_mask_param)
            t_start = np.random.randint(0, spec.shape[1] - t)
            spec[:, t_start:t_start+t] = 0  # Apply mask to columns

        # Add slight Gaussian Noise
        noise = np.random.randn(*spec.shape) * 0.01
        spec = spec + noise
        return spec

    def __getitem__(self, index):
        song_info = self.annotations.iloc[index]

        # Load Audio
        spectrogram = np.load(song_info['spectrogram_path'])

        # Apply Audio Augmentation
        if self.augment:
            spectrogram = self.augment_spectrogram(spectrogram)

        # Convert to Tensor and ADD Channel Dimension
        spectrogram_tensor = torch.from_numpy(spectrogram).float()
        spectrogram_tensor = spectrogram_tensor.unsqueeze(0) # Shape becomes [1, Freq, Time]

        #Load Lyrics
        encoded_lyrics = torch.load(song_info['lyrics_path'], weights_only=False)
        input_ids = encoded_lyrics['input_ids'].squeeze(0)
        attention_mask = encoded_lyrics['attention_mask'].squeeze(0)

        # Apply Text Augmentation (If training) - Randomly mask 15% of tokens
        if self.augment:
            mask_prob = 0.15
            # Create a mask for tokens to hide (exclude [CLS] and [SEP])
            rand = torch.rand(input_ids.shape)
            mask_indices = (rand < mask_prob) & (input_ids != 101) & (input_ids != 102) & (input_ids != 0)
            input_ids[mask_indices] = 103 # mask id in bert

        labels = torch.tensor([song_info['valence'], song_info['arousal']], dtype=torch.float32)

        return spectrogram_tensor, input_ids, attention_mask, labels

In [12]:
class AttentionModule(nn.Module): #Addition from V1
    def __init__(self, feature_dim):
        super(AttentionModule, self).__init__()
        '''
        Attention mechanism to weight the importance of different features
        '''
        self.attention = nn.Sequential(
            nn.Linear(feature_dim, feature_dim // 4),  # input is 64 will map to16
            nn.ReLU(),
            nn.Linear(feature_dim // 4, feature_dim),  #reverts back to 64 from 16
            nn.Sigmoid()
        )

    def forward(self, x):
        # x shape: [batch_size, 64]
        attention_weights = self.attention(x)  # [batch_size, 64]
        weighted_features = x * attention_weights  # Element-wise multiplication
        return weighted_features

In [13]:
class VGGish_Audio_Model(nn.Module):
    '''As previous vERSION but adding in the following
      - Batch normalisation
      - Attention mechanism
      - Learning rate scheduling
      - early stopping'''

    def __init__(self):
        super(VGGish_Audio_Model, self).__init__()
        '''
        A VGG-style model for the audio tower for a starting model.
        No longer trying to implement the method from MERGE paper as this had mistakes in the paper
        V1.1 includes attention to see if this improves performance.
        V1.2  implements true VGG-style blocks with multiple convolutions per block.
        '''
        self.features = nn.Sequential(
            # Block 1 - 2 convolutions
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 2 - 2 convolutions
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 3 - 2 convolutions
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 4 - 2 convolutions
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1))
        )

        self.dropout1 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(512, 256)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(0.5)
        self.attention = AttentionModule(256) #Add attention here from v2 (model 3)
        self.fc2 = nn.Linear(256, 64) # Final feature vector size should be 64 - needs to match input of combined

    def forward(self, x):
        x = self.features(x)
        # Flatten the features for the classifier
        x = x.view(x.size(0), -1)
        x = self.dropout1(x)
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout2(x)
        x = self.attention(x)
        x = self.fc2(x)
        return x

In [14]:
#Data loading and prep

#get the paths to dissertation folder and new folder on colab
print("Starting data transfer from Google Drive to local Colab storage...")

#get paths for old file location and new colab one
gdrive_zip_path = '/content/drive/MyDrive/dissertation/merge_dataset_zipped.zip'
local_storage_path = '/content/local_dissertation_data/'
local_zip_path = os.path.join(local_storage_path, 'merge_dataset_zipped.zip')
os.makedirs(local_storage_path, exist_ok=True) # Ensure the destination directory exists

#Copy zip file from Drive to Colab
print("Copying single archive file from Google Drive...")
!rsync -ah --progress "{gdrive_zip_path}" "{local_storage_path}"

#get total number of files for progress
total_files = int(subprocess.check_output(f"zipinfo -1 {local_zip_path} | wc -l", shell=True))

#unzip the file
print("Extracting files locally")
!unzip -o "{local_zip_path}" -d "{local_storage_path}" | tqdm --unit=files --total={total_files} > /dev/null

print("Data transfer and extraction complete.")

#load master data from new location
local_output_path = os.path.join(local_storage_path, 'merge_dataset/output_from_code/')
master_file_path = os.path.join(local_output_path, 'master_processed_file_list.csv')
master_df = pd.read_csv(master_file_path)

#checking the valence and arousal range in the dataset
print(f"\nValence range in data: [{master_df['valence'].min()}, {master_df['valence'].max()}]")
print(f"Arousal range in data: [{master_df['arousal'].min()}, {master_df['arousal'].max()}]")
print(f"Valence mean: {master_df['valence'].mean():.4f}, std: {master_df['valence'].std():.4f}")
print(f"Arousal mean: {master_df['arousal'].mean():.4f}, std: {master_df['arousal'].std():.4f}")
print(f"Total samples in master_df: {len(master_df)}")

# Verify its the right column - not quadrants
print(f"\nNumber of unique valence values: {master_df['valence'].nunique()}")
print(f"Number of unique arousal values: {master_df['arousal'].nunique()}")
print(f"Number of unique quadrant values: {master_df['quadrant'].nunique()}")

# Sample some actual values
print(f"\nSample valence values: {master_df['valence'].sample(10).values}")
print(f"Sample arousal values: {master_df['arousal'].sample(10).values}")

#update the paths in the csv
print("\nUpdating dataframe paths to use fast local storage...")
gdrive_output_path = '/content/drive/MyDrive/dissertation/output_from_code/'
master_df['spectrogram_path'] = master_df['spectrogram_path'].str.replace(gdrive_output_path, local_output_path, regex=False)
master_df['lyrics_path'] = master_df['lyrics_path'].str.replace(gdrive_output_path, local_output_path, regex=False)
print("Dataframe paths updated.")

#load the data splits from the new path in the predefined splits folder tvt
local_split_folder_path = os.path.join(local_storage_path, 'merge_dataset/MERGE_Bimodal_Complete/tvt_dataframes/tvt_70_15_15/')
train_split_df = pd.read_csv(os.path.join(local_split_folder_path, 'tvt_70_15_15_train_bimodal_complete.csv'))
val_split_df = pd.read_csv(os.path.join(local_split_folder_path, 'tvt_70_15_15_validate_bimodal_complete.csv'))
test_split_df = pd.read_csv(os.path.join(local_split_folder_path, 'tvt_70_15_15_test_bimodal_complete.csv'))
print("\nSplit files loaded from local storage.")

#merge the files
id_column_name = 'song_id'
train_split_df.rename(columns={'Song': id_column_name}, inplace=True)
val_split_df.rename(columns={'Song': id_column_name}, inplace=True)
test_split_df.rename(columns={'Song': id_column_name}, inplace=True)

train_df = pd.merge(master_df, train_split_df, on=id_column_name)
val_df = pd.merge(master_df, val_split_df, on=id_column_name)
test_df = pd.merge(master_df, test_split_df, on=id_column_name)

#checking no files are lost in merging - and checking length of the dataframes.
print("\nchecking data")

#check no data lost in merge
if len(train_df) == len(train_split_df):
    print("\nTraining split: Merge successful. All songs accounted for.")
else:
    print(f"\nWARNING: Training split lost {len(train_split_df) - len(train_df)} songs during merge.")

if len(val_df) == len(val_split_df):
    print("Validation split: Merge successful. All songs accounted for.")
else:
    print(f"WARNING: Validation split lost {len(val_split_df) - len(val_df)} songs during merge.")

if len(test_df) == len(test_split_df):
    print("Test split: Merge successful. All songs accounted for.")
else:
    print(f"WARNING: Test split lost {len(test_split_df) - len(test_df)} songs during merge.")

#check length
expected_train_len = 1552
expected_val_len = 332
expected_test_len = 332

assert len(train_df) == expected_train_len, f"Expected {expected_train_len} training samples, but found {len(train_df)}"
assert len(val_df) == expected_val_len, f"Expected {expected_val_len} validation samples, but found {len(val_df)}"
assert len(test_df) == expected_test_len, f"Expected {expected_test_len} test samples, but found {len(test_df)}"

print(f"\nFinal dataset lengths are correct: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")
print("Data Check Complete")

#createthe datasets and loaders - WITH ENABLE FOR TRAIN
train_dataset = MER_Dataset(annotations_df=train_df, tokenizer=tokenizer, augment=True)
val_dataset = MER_Dataset(annotations_df=val_df, tokenizer=tokenizer, augment=False)
test_dataset = MER_Dataset(annotations_df=test_df, tokenizer=tokenizer, augment=False)

# Increase Batch Size to 32 from 16 previous
BATCH_SIZE = 32

print(f"Creating DataLoaders with Batch Size: {BATCH_SIZE}")

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("\nDataLoaders updated with Augmentation and larger Batch Size.")

Starting data transfer from Google Drive to local Colab storage...
Copying single archive file from Google Drive...
sending incremental file list
Extracting files locally
4442files [00:15, 284.05files/s]           
Data transfer and extraction complete.

Valence range in data: [0.0187499999999999, 0.9875]
Arousal range in data: [0.0625, 0.975]
Valence mean: 0.5050, std: 0.2311
Arousal mean: 0.4823, std: 0.1395
Total samples in master_df: 2216

Number of unique valence values: 465
Number of unique arousal values: 443
Number of unique quadrant values: 4

Sample valence values: [0.75375  0.2675   0.73625  0.1375   0.249375 0.66     0.3125   0.3425
 0.7125   0.26875 ]
Sample arousal values: [0.54375  0.595    0.0875   0.5725   0.300625 0.26125  0.285    0.36875
 0.476875 0.515   ]

Updating dataframe paths to use fast local storage...
Dataframe paths updated.

Split files loaded from local storage.

checking data

Training split: Merge successful. All songs accounted for.
Validation split:

In [15]:
# Check if a CUDA-enabled GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using CUDA device.")
else:
    # If no GPU is found, print an error and stop execution by raising an error.
    raise RuntimeError("Error: No GPU found. This script requires a GPU to run.")


GPU is available. Using CUDA device.


In [17]:

print("\n" + "="*70)
print("BUILDING SIAMESE ARCHITECTURE (FIXED)")
print("="*70)

class SiameseNetwork(nn.Module):
    def __init__(self, audio_tower_class, feature_dim=128):
        super(SiameseNetwork, self).__init__()

        # Audio Tower
        self.audio_tower = audio_tower_class()
        self.audio_projector = nn.Sequential(
            nn.Linear(64, feature_dim),
            nn.BatchNorm1d(feature_dim),
            nn.ReLU(),
            nn.Linear(feature_dim, feature_dim)
        )

        # Lyrics Tower
        self.lyrics_tower = AutoModel.from_pretrained('bert-base-uncased')


        #Freeze everything first (Safety default)
        for param in self.lyrics_tower.parameters():
            param.requires_grad = False

        #nfreeze the last 2 encoder layers
        for layer in self.lyrics_tower.encoder.layer[-2:]:
            for param in layer.parameters():
                param.requires_grad = True

        # Unfreeze the Pooler layer
        for param in self.lyrics_tower.pooler.parameters():
            param.requires_grad = True
        # --------------------------------------------

        self.lyrics_projector = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, feature_dim)
        )

    def forward_audio(self, spec):
        feat = self.audio_tower(spec)
        emb = self.audio_projector(feat)
        return F.normalize(emb, p=2, dim=1)

    def forward_lyrics(self, input_ids, mask):
        out = self.lyrics_tower(input_ids=input_ids, attention_mask=mask)
        feat = out.last_hidden_state[:, 0, :]
        emb = self.lyrics_projector(feat)
        return F.normalize(emb, p=2, dim=1)

    def forward(self, spec, input_ids, mask):
        z_audio = self.forward_audio(spec)
        z_lyrics = self.forward_lyrics(input_ids, mask)
        return z_audio, z_lyrics

class ContrastiveLoss(nn.Module):
    """
    The Loss Function.
    - Pulls matching (positive) pairs together.
    - Pushes mismatched (negative) pairs apart.
    """
    def __init__(self, temperature=0.1):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, audio_embeddings, lyrics_embeddings):
        # Calculate Similarity Matrix (Audio x Lyrics)
        logits = torch.matmul(audio_embeddings, lyrics_embeddings.T) / self.temperature  # divide by temperature to sharpen the probability distribution

        # The "Correct Answer" is the diagonal (0, 1, 2...)
        labels = torch.arange(logits.shape[0]).to(logits.device)

        # Calculate Loss Audio->Lyrics and Lyrics->Audio (Symmetric Loss)
        loss_a = self.criterion(logits, labels)
        loss_l = self.criterion(logits.T, labels)

        return (loss_a + loss_l) / 2

# Re-Initialise the Model
siamese_model = SiameseNetwork(VGGish_Audio_Model).to(device)

# Set Temperature to 0.1
contrastive_loss = ContrastiveLoss(temperature=0.1).to(device)

print("‚úì Siamese Model Updated: BERT partially unfrozen.")
print("‚úì Contrastive Loss class defined and ready.")


BUILDING SIAMESE ARCHITECTURE (FIXED)
‚úì Siamese Model Updated: BERT partially unfrozen.
‚úì Contrastive Loss class defined and ready.


In [18]:

print("\n" + "="*70)
print("TRAINING SIAMESE NETWORK (CONTRASTIVE LEARNING)")
print("="*70)

# --- CONFIGURATION ---
NUM_EPOCHS = 30 # Contrastive converges faster than regression usually
LEARNING_RATE = 1e-4 # Lower learning rate is better for Siamese stability
PATIENCE = 5 # Early stopping

#Optimiser
optimizer = optim.Adam(siamese_model.parameters(), lr=LEARNING_RATE)

#scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

#start Tracking
best_val_loss = float('inf')
patience_counter = 0

# Start W&B
wandb.init(project="dissertation-siamese", name="contrastive-training-v1")

for epoch in range(NUM_EPOCHS):
    # --- TRAINING LOOP ---
    siamese_model.train()
    total_train_loss = 0

    for spec, input_ids, mask, _ in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]"):
        # Move to GPU
        spec = spec.to(device)
        input_ids = input_ids.to(device)
        mask = mask.to(device)

        optimizer.zero_grad()

        # Forward Pass (Get the two vectors)
        z_audio, z_lyrics = siamese_model(spec, input_ids, mask)

        #Calculate Contrastive Loss
        # (Pull matching pair together, push others in batch apart)
        loss = contrastive_loss(z_audio, z_lyrics)

        #Backward Pass
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # --- VALIDATION LOOP ---
    siamese_model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for spec, input_ids, mask, _ in tqdm(val_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Val]"):
            spec = spec.to(device)
            input_ids = input_ids.to(device)
            mask = mask.to(device)

            # Get embeddings
            z_audio, z_lyrics = siamese_model(spec, input_ids, mask)

            # Calculate Loss
            loss = contrastive_loss(z_audio, z_lyrics)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    # --- LOGGING ---
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    wandb.log({"train_loss": avg_train_loss, "val_loss": avg_val_loss, "epoch": epoch+1})
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Current Learning Rate: {current_lr}")
    wandb.log({"learning_rate": current_lr})

    # --- EARLY STOPPING ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save the best model
        torch.save(siamese_model.state_dict(), 'best_siamese_augmented.pth')
        print(f"  ‚úì New best model saved! (Loss: {best_val_loss:.4f})")
    else:
        patience_counter += 1
        print(f"  No improvement ({patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print("  Early stopping triggered")
            break

    # Update Learning Rate
    scheduler.step(avg_val_loss)

print("Training Complete.")
wandb.finish()


TRAINING SIAMESE NETWORK (CONTRASTIVE LEARNING)


Epoch 1/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:32<00:00,  1.46it/s]
Epoch 1/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.63it/s]


Epoch 1 | Train Loss: 3.4523 | Val Loss: 3.3648
Current Learning Rate: 0.0001
  ‚úì New best model saved! (Loss: 3.3648)


Epoch 2/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.54it/s]
Epoch 2/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.61it/s]


Epoch 2 | Train Loss: 3.0832 | Val Loss: 3.1551
Current Learning Rate: 0.0001
  ‚úì New best model saved! (Loss: 3.1551)


Epoch 3/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.54it/s]
Epoch 3/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.63it/s]


Epoch 3 | Train Loss: 2.8981 | Val Loss: 2.8938
Current Learning Rate: 0.0001
  ‚úì New best model saved! (Loss: 2.8938)


Epoch 4/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.53it/s]
Epoch 4/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.65it/s]


Epoch 4 | Train Loss: 2.8058 | Val Loss: 2.9045
Current Learning Rate: 0.0001
  No improvement (1/5)


Epoch 5/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.53it/s]
Epoch 5/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.64it/s]


Epoch 5 | Train Loss: 2.7381 | Val Loss: 2.8286
Current Learning Rate: 0.0001
  ‚úì New best model saved! (Loss: 2.8286)


Epoch 6/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.54it/s]
Epoch 6/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.64it/s]


Epoch 6 | Train Loss: 2.6582 | Val Loss: 2.7969
Current Learning Rate: 0.0001
  ‚úì New best model saved! (Loss: 2.7969)


Epoch 7/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.54it/s]
Epoch 7/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.65it/s]


Epoch 7 | Train Loss: 2.5612 | Val Loss: 2.9426
Current Learning Rate: 0.0001
  No improvement (1/5)


Epoch 8/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:30<00:00,  1.55it/s]
Epoch 8/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.65it/s]


Epoch 8 | Train Loss: 2.5005 | Val Loss: 3.0470
Current Learning Rate: 0.0001
  No improvement (2/5)


Epoch 9/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.55it/s]
Epoch 9/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.66it/s]


Epoch 9 | Train Loss: 2.4170 | Val Loss: 2.8236
Current Learning Rate: 0.0001
  No improvement (3/5)


Epoch 10/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.55it/s]
Epoch 10/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.64it/s]


Epoch 10 | Train Loss: 2.2658 | Val Loss: 2.8645
Current Learning Rate: 5e-05
  No improvement (4/5)


Epoch 11/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.54it/s]
Epoch 11/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.66it/s]


Epoch 11 | Train Loss: 2.1916 | Val Loss: 2.7757
Current Learning Rate: 5e-05
  ‚úì New best model saved! (Loss: 2.7757)


Epoch 12/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.55it/s]
Epoch 12/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.66it/s]


Epoch 12 | Train Loss: 2.0854 | Val Loss: 2.9568
Current Learning Rate: 5e-05
  No improvement (1/5)


Epoch 13/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.53it/s]
Epoch 13/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.64it/s]


Epoch 13 | Train Loss: 2.0708 | Val Loss: 3.3100
Current Learning Rate: 5e-05
  No improvement (2/5)


Epoch 14/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.53it/s]
Epoch 14/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.63it/s]


Epoch 14 | Train Loss: 2.0064 | Val Loss: 2.9901
Current Learning Rate: 5e-05
  No improvement (3/5)


Epoch 15/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.53it/s]
Epoch 15/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.63it/s]


Epoch 15 | Train Loss: 1.8672 | Val Loss: 3.0768
Current Learning Rate: 2.5e-05
  No improvement (4/5)


Epoch 16/30 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 48/48 [00:31<00:00,  1.54it/s]
Epoch 16/30 [Val]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.64it/s]

Epoch 16 | Train Loss: 1.8437 | Val Loss: 3.0272
Current Learning Rate: 2.5e-05
  No improvement (5/5)
  Early stopping triggered
Training Complete.





0,1
epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà
learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÅ‚ñÅ
train_loss,‚ñà‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ
val_loss,‚ñà‚ñÜ‚ñÇ‚ñÉ‚ñÇ‚ñÅ‚ñÉ‚ñÑ‚ñÇ‚ñÇ‚ñÅ‚ñÉ‚ñá‚ñÑ‚ñÖ‚ñÑ

0,1
epoch,16.0
learning_rate,3e-05
train_loss,1.84369
val_loss,3.02719


In [19]:
print("\n" + "="*70)
print("TESTING AUGMENTED MODEL: THE 'MATCHING GAME'")
print("="*70)

#Load the Best AUGMENTED Model
weights_path = 'best_siamese_augmented.pth'
if not os.path.exists(weights_path):
    print("Local weights not found, checking Drive...")
    weights_path = '/content/drive/MyDrive/dissertation/siamese_augmented_model.pth'

if os.path.exists(weights_path):
    siamese_model.load_state_dict(torch.load(weights_path, map_location=device))
    print(f"‚úì Loaded weights from: {weights_path}")
else:
    print("‚ö† ERROR: Weights not found! Did training finish?")

siamese_model.eval()

# Extract Embeddings for the Test Set (No Augmentation here!)
test_audio_embeddings = []
test_lyrics_embeddings = []

print("Extracting Test Set Embeddings...")
with torch.no_grad():
    for spec, input_ids, mask, _ in tqdm(test_loader):
        spec = spec.to(device)
        input_ids = input_ids.to(device)
        mask = mask.to(device)

        z_a, z_l = siamese_model(spec, input_ids, mask)

        test_audio_embeddings.append(z_a.cpu().numpy())
        test_lyrics_embeddings.append(z_l.cpu().numpy())

# Concatenate
a_emb = np.concatenate(test_audio_embeddings)
l_emb = np.concatenate(test_lyrics_embeddings)

# Calculate Full Distance Matrix
print("Calculating all distances...")
dist_matrix = cdist(a_emb, l_emb, metric='euclidean')

# Calculate Ranks
ranks = []
for i in range(len(dist_matrix)):
    # Distance to the correct lyrics is at index [i]

    dists = dist_matrix[i]
    sorted_indices = np.argsort(dists)
    rank = np.where(sorted_indices == i)[0][0] + 1
    ranks.append(rank)

#Calculate Metrics
recall_at_1 = np.mean(np.array(ranks) == 1) * 100
recall_at_5 = np.mean(np.array(ranks) <= 5) * 100
recall_at_10 = np.mean(np.array(ranks) <= 10) * 100
median_rank = np.median(ranks)

print("-" * 40)
print(f"RESULTS FOR AUGMENTED MODEL:")
print(f"Recall@1 (Exact Match): {recall_at_1:.2f}%  (Prev: 1.2%)")
print(f"Recall@5 (Top 5):       {recall_at_5:.2f}%  (Prev: 9.3%)")
print(f"Median Rank:            {median_rank:.0f}")
print("-" * 40)

# 6. Plot
plt.figure(figsize=(10, 6))
sns.histplot(ranks, bins=50, color='teal', kde=True)
plt.title(f"Rank Distribution (Augmented Model)\nRecall@1: {recall_at_1:.1f}%")
plt.xlabel("Rank (1 is perfect)")
plt.xlim(0, 100) # Zoom in on the top 100
plt.grid(True, alpha=0.3)

save_path = '/content/drive/MyDrive/dissertation/siamese_augmented_rank_plot.png'
plt.savefig(save_path)
print(f"Graph saved to: {save_path}")
plt.show()


TESTING AUGMENTED MODEL: THE 'MATCHING GAME'
‚úì Loaded weights from: best_siamese_augmented.pth
Extracting Test Set Embeddings...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:02<00:00,  3.78it/s]

Calculating all distances...





NameError: name 'cdist' is not defined

In [20]:

from scipy.spatial.distance import cdist
print("\n" + "="*70)
print("GENERATING DETAILED EXCEL REPORT (AUGMENTED MODEL)")
print("="*70)

# Calculate the full Distance Matrix (Audio x Lyrics)
print("Calculating full distance matrix...")
dist_matrix = cdist(a_emb, l_emb, metric='euclidean')

# Prepare Lists for DataFrame
ranks = []
correct_distances = []
nearest_neighbor_dists = []
is_top_1 = []
is_top_5 = []
is_top_10 = []

num_songs = len(dist_matrix)

print(f"Analyzing {num_songs} songs...")

for i in range(num_songs):
    # Get all distances for Audio song 'i'
    dists_for_song = dist_matrix[i]

    # Distance to the CORRECT lyrics (which is at index i)
    correct_dist = dists_for_song[i]
    correct_distances.append(correct_dist)

    # Sort to find Rank
    sorted_indices = np.argsort(dists_for_song)

    # Find Rank (add 1 because python is 0-indexed)
    rank = np.where(sorted_indices == i)[0][0] + 1
    ranks.append(rank)

    # Distance to the "Winner" (Closest Match)
    nearest_neighbor_dists.append(dists_for_song[sorted_indices[0]])

    # Recall Metrics
    is_top_1.append(1 if rank == 1 else 0)
    is_top_5.append(1 if rank <= 5 else 0)
    is_top_10.append(1 if rank <= 10 else 0)

#Create DataFrame
results_df = test_df[['song_id', 'valence', 'arousal', 'quadrant']].copy()

results_df['augment_model_rank'] = ranks
results_df['is_exact_match'] = is_top_1
results_df['in_top_5'] = is_top_5
results_df['dist_to_correct'] = correct_distances
results_df['dist_to_closest'] = nearest_neighbor_dists

#Calculate Summary Stats for verification
mean_rank = np.mean(ranks)
recall_1 = np.mean(is_top_1) * 100
recall_5 = np.mean(is_top_5) * 100

print(f"\n--- REPORT SUMMARY ---")
print(f"Mean Rank: {mean_rank:.1f}")
print(f"Recall@1:  {recall_1:.2f}%")
print(f"Recall@5:  {recall_5:.2f}%")

# Save to Excel
excel_path = '/content/drive/MyDrive/dissertation/siamese_augmented_results.xlsx'
try:
    results_df.to_excel(excel_path, index=False)
    print(f"\n‚úÖ Excel report successfully saved to: {excel_path}")
except Exception as e:
    print(f"Error saving Excel file: {e}")


GENERATING DETAILED EXCEL REPORT (AUGMENTED MODEL)
Calculating full distance matrix...
Analyzing 332 songs...

--- REPORT SUMMARY ---
Mean Rank: 61.9
Recall@1:  1.51%
Recall@5:  7.23%

‚úÖ Excel report successfully saved to: /content/drive/MyDrive/dissertation/siamese_augmented_results.xlsx
