<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Similarity_Analysis_Audio_vs_Lyrics_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap conda/mamba inside the Colab runtime: install the condacolab helper
# package with pip, then let it install the conda tooling.
# NOTE(review): condacolab.install() presumably restarts the kernel when it finishes,
# so the following cells should only be run after that restart — confirm against the condacolab docs.
!pip install -q condacolab
import condacolab
condacolab.install()
# install mamba to use instead of pip

✨🍰✨ Everything looks OK!


In [None]:
# Create the config file and build the environment.
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the string content to a file -  'environment.yml'.
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# create the environment using mamba from the yml file.
print("\n Creating environment")

!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

environment.yml file created successfully.

 Creating environment

CondaValueError: prefix already exists: /usr/local/envs/dissertation



In [None]:
# imports and setting up of GitHub and W&B

# clone project repository from GitHub
print("⏳ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

# Mount Google Drive: the dataset archive and the saved model checkpoint used
# later in this notebook are read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# imports: data handling, audio, plotting, PyTorch, Hugging Face and experiment tracking
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel
import torch.optim as optim
import wandb
import subprocess

# The tokenizer is handed to MER_Dataset below; the dataset itself reads
# already-encoded lyrics from disk and only stores this object.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #loading the tokenizer for lyrics processing
print("Tokenizer loaded.")

# imports for the similarity analysis (plot styling, similarity / correlation measures)
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_decomposition import CCA
from scipy.stats import pearsonr
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and numerical warnings from the analysis code — consider narrowing the filter.
warnings.filterwarnings('ignore')
import types
import json

# Set visualization style: seaborn grid theme and a default 12x8 inch figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

⏳ Cloning GitHub repository...
fatal: destination path 'Dissertation' already exists and is not an empty directory.
Repository cloned.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizer loaded.


In [None]:
class MER_Dataset(Dataset):
    """
    PyTorch Dataset over one split of the MER annotations.

    Every row of the dataframe must provide 'spectrogram_path', 'lyrics_path',
    'valence' and 'arousal'; those four columns are the only ones read here.
    """

    def __init__(self, annotations_df, tokenizer):
        """
        Keep references to the split dataframe and the tokenizer.

        The tokenizer is only stored; nothing in this class calls it, because
        __getitem__ reads already-encoded lyrics from disk.
        """
        self.tokenizer = tokenizer
        self.annotations = annotations_df

    def __len__(self):
        """Number of rows (songs) in the split dataframe."""
        return len(self.annotations)

    def __getitem__(self, index):
        """
        Load one song by positional index.

        Returns the tuple (spectrogram_tensor, input_ids, attention_mask, labels):
        the spectrogram as a float tensor with a new leading dimension, the two
        lyric tensors with dimension 0 squeezed, and a float32 tensor
        [valence, arousal].
        """
        row = self.annotations.iloc[index]

        # Read every column up front so a missing column fails before any file I/O.
        mel_file, tokens_file = row['spectrogram_path'], row['lyrics_path']
        target = [row['valence'], row['arousal']]

        # Audio: .npy array -> float tensor, plus a leading "channel" axis for the CNN.
        mel = torch.from_numpy(np.load(mel_file)).float().unsqueeze(0)

        # Lyrics: a saved mapping holding 'input_ids' and 'attention_mask'.
        # NOTE(review): weights_only=False unpickles arbitrary objects — only load
        # files that were produced by this project.
        tokens = torch.load(tokens_file, weights_only=False)
        ids = tokens['input_ids'].squeeze(0)
        mask = tokens['attention_mask'].squeeze(0)

        return mel, ids, mask, torch.tensor(target, dtype=torch.float32)

In [None]:
class AttentionModule(nn.Module): #Addition from V1
    """
    Feature-wise gating: learns a weight in (0, 1) for every feature and
    rescales the input by it.
    """

    def __init__(self, feature_dim):
        """
        feature_dim: size of the last dimension of the tensors given to forward().

        The gate squeezes the features to feature_dim // 4, applies ReLU, expands
        back to feature_dim and ends with a sigmoid.
        """
        super().__init__()
        bottleneck_dim = feature_dim // 4
        self.attention = nn.Sequential(
            nn.Linear(feature_dim, bottleneck_dim),   # squeeze
            nn.ReLU(),
            nn.Linear(bottleneck_dim, feature_dim),   # expand back to the input width
            nn.Sigmoid(),                             # one gate value per feature
        )

    def forward(self, x):
        """Return x multiplied element-wise by its learned gate (same shape as x)."""
        return x * self.attention(x)

In [None]:
class VGGish_Audio_Model(nn.Module):
    """
    VGG-style audio tower: four conv blocks followed by a small fully connected
    head that ends in a 64-dimensional feature vector.

    Compared with the earlier version this model adds batch normalisation after
    every convolution and an attention (gating) module before the final layer.
    It no longer follows the MERGE paper's architecture, which the author found
    to contain mistakes.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels) of the four convolution blocks, in order.
        channel_plan = [(1, 64), (64, 128), (128, 256), (256, 512)]
        final_block = len(channel_plan) - 1

        # Built as one flat Sequential so the layer indices (features.0 ... features.15)
        # stay the same as in saved checkpoints.
        layers = []
        for block_idx, (c_in, c_out) in enumerate(channel_plan):
            layers.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm2d(c_out)) #Addition from V1
            layers.append(nn.ReLU(inplace=True))
            if block_idx == final_block:
                # last block: collapse the remaining spatial extent to 1x1
                layers.append(nn.AdaptiveAvgPool2d((1, 1)))
            else:
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.features = nn.Sequential(*layers)

        self.dropout1 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(512, 256)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(0.5)
        self.attention = AttentionModule(256) #Add attention here from v2 (model 3)
        # Final feature vector size is 64; it has to match what the combined model expects.
        self.fc2 = nn.Linear(256, 64)

    def forward(self, x):
        """Map a batch of single-channel spectrograms to a [batch, 64] feature tensor."""
        pooled = self.features(x)
        # flatten everything except the batch dimension for the fully connected head
        flat = pooled.view(pooled.size(0), -1)
        hidden = self.relu1(self.fc1(self.dropout1(flat)))
        gated = self.attention(self.dropout2(hidden))
        return self.fc2(gated)

In [None]:
class BimodalClassifier(nn.Module):
    """
    Bimodal valence/arousal regressor: an audio tower and a frozen BERT lyrics
    tower whose features are concatenated and passed to a small MLP head.

    The MERGE architecture is no longer used: a transformer is expected to
    serve the lyrics better, and mistakes in the paper leave some of its
    parameters unclear.
    """

    def __init__(self):
        super().__init__()

        # audio tower (outputs 64 features per song)
        self.audio_tower = VGGish_Audio_Model()

        # lyrics tower: pretrained BERT (bert-base-uncased for now), kept frozen
        self.lyrics_tower = AutoModel.from_pretrained('bert-base-uncased')
        self.lyrics_tower.requires_grad_(False)

        # widths of the two feature vectors that get concatenated
        audio_dim = 64
        lyrics_dim = 768
        fused_dim = audio_dim + lyrics_dim

        self.classifier_head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(in_features=fused_dim, out_features=100),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(in_features=100, out_features=2),  # 2 outputs: valence and arousal
        )

    def forward(self, x_audio, input_ids, attention_mask):
        """
        Predict [valence, arousal] for a batch.

        x_audio is fed to the audio tower; input_ids and attention_mask are fed
        to the lyrics tower. Returns the classifier head output (2 values per song).
        """
        audio_vec = self.audio_tower(x_audio)

        bert_out = self.lyrics_tower(input_ids=input_ids, attention_mask=attention_mask)
        # the hidden state at position 0 ([CLS]) stands in for the whole lyric
        cls_vec = bert_out.last_hidden_state[:, 0, :]

        fused = torch.cat([audio_vec, cls_vec], dim=1)
        return self.classifier_head(fused)

In [None]:
def get_features(self, x_audio, input_ids, attention_mask):
    """
    Run both towers and return their features before fusion, plus the prediction.

    Written as a free function taking `self` so it can be bound to a model
    instance; it uses `self.audio_tower`, `self.lyrics_tower` and
    `self.classifier_head`.

    Returns: (audio_features, lyrics_features, predictions). With the
    BimodalClassifier defined in this notebook the feature widths are 64 (audio)
    and 768 (lyrics).
    """
    audio_vec = self.audio_tower(x_audio)

    # position 0 of the last hidden state is used as the lyric representation
    cls_vec = self.lyrics_tower(
        input_ids=input_ids,
        attention_mask=attention_mask,
    ).last_hidden_state[:, 0, :]

    # same fusion as the model's forward pass: concatenate, then classify
    fused = torch.cat([audio_vec, cls_vec], dim=1)
    return audio_vec, cls_vec, self.classifier_head(fused)

In [None]:
#Data loading and prep
# Step 1 of this cell: copy the zipped dataset from Google Drive to the Colab
# VM's local disk and extract it there.

#get the paths to dissertation folder and new folder on colab
print("Starting data transfer from Google Drive to local Colab storage...")

#get paths for old file location and new colab one
gdrive_zip_path = '/content/drive/MyDrive/dissertation/merge_dataset_zipped.zip'
local_storage_path = '/content/local_dissertation_data/'
local_zip_path = os.path.join(local_storage_path, 'merge_dataset_zipped.zip')
os.makedirs(local_storage_path, exist_ok=True) # Ensure the destination directory exists

#Copy zip file from Drive to Colab
# rsync flags: -a archive mode, -h human-readable sizes, --progress shows transfer progress
print("Copying single archive file from Google Drive...")
!rsync -ah --progress "{gdrive_zip_path}" "{local_storage_path}"

#get total number of files for progress
# `zipinfo -1` lists one archive entry per line, so the line count is the entry count.
# NOTE(review): the path is interpolated unquoted into a shell=True command; fine for the
# constant path above, but it would break on a path containing spaces.
total_files = int(subprocess.check_output(f"zipinfo -1 {local_zip_path} | wc -l", shell=True))

#unzip the file
# -o overwrites existing files without prompting; unzip's output is piped through the tqdm
# command-line tool, which turns it into a progress bar, and the piped text is discarded.
print("Extracting files locally")
!unzip -o "{local_zip_path}" -d "{local_storage_path}" | tqdm --unit=files --total={total_files} > /dev/null

print("Data transfer and extraction complete.")

# Load the master file list from the extracted local copy of the dataset.
local_output_path = os.path.join(local_storage_path, 'merge_dataset/output_from_code/')
master_file_path = os.path.join(local_output_path, 'master_processed_file_list.csv')
master_df = pd.read_csv(master_file_path)

# Sanity-check the two regression targets: range, mean / std and dataset size.
valence_values = master_df['valence']
arousal_values = master_df['arousal']
print(f"\nValence range in data: [{valence_values.min()}, {valence_values.max()}]")
print(f"Arousal range in data: [{arousal_values.min()}, {arousal_values.max()}]")
print(f"Valence mean: {valence_values.mean():.4f}, std: {valence_values.std():.4f}")
print(f"Arousal mean: {arousal_values.mean():.4f}, std: {arousal_values.std():.4f}")
print(f"Total samples in master_df: {len(master_df)}")

# Continuous targets should have many distinct values, unlike the quadrant column.
print(f"\nNumber of unique valence values: {valence_values.nunique()}")
print(f"Number of unique arousal values: {arousal_values.nunique()}")
print(f"Number of unique quadrant values: {master_df['quadrant'].nunique()}")

# Eyeball a few randomly drawn values of each target.
print(f"\nSample valence values: {valence_values.sample(10).values}")
print(f"Sample arousal values: {arousal_values.sample(10).values}")

# The CSV stores Google Drive paths; point both path columns at the local copy instead.
print("\nUpdating dataframe paths to use fast local storage...")
gdrive_output_path = '/content/drive/MyDrive/dissertation/output_from_code/'
for path_column in ('spectrogram_path', 'lyrics_path'):
    master_df[path_column] = master_df[path_column].str.replace(
        gdrive_output_path, local_output_path, regex=False
    )
print("Dataframe paths updated.")

# Read the predefined 70/15/15 train / validate / test split files from local storage.
local_split_folder_path = os.path.join(local_storage_path, 'merge_dataset/MERGE_Bimodal_Complete/tvt_dataframes/tvt_70_15_15/')
train_split_df, val_split_df, test_split_df = (
    pd.read_csv(os.path.join(local_split_folder_path, split_csv))
    for split_csv in (
        'tvt_70_15_15_train_bimodal_complete.csv',
        'tvt_70_15_15_validate_bimodal_complete.csv',
        'tvt_70_15_15_test_bimodal_complete.csv',
    )
)
print("\nSplit files loaded from local storage.")

# The split files call the id column 'Song'; rename it so it matches the master file.
id_column_name = 'song_id'
for split_frame in (train_split_df, val_split_df, test_split_df):
    split_frame.rename(columns={'Song': id_column_name}, inplace=True)

# Attach the master-file columns (paths, labels) to each split via the song id.
train_df = master_df.merge(train_split_df, on=id_column_name)
val_df = master_df.merge(val_split_df, on=id_column_name)
test_df = master_df.merge(test_split_df, on=id_column_name)

# Verify that merging dropped no songs and that every split has the expected size.
print("\nchecking data")

# A row-count difference between a split file and its merged frame means songs went missing.
train_missing = len(train_split_df) - len(train_df)
if train_missing:
    print(f"\nWARNING: Training split lost {train_missing} songs during merge.")
else:
    print("\nTraining split: Merge successful. All songs accounted for.")

val_missing = len(val_split_df) - len(val_df)
if val_missing:
    print(f"WARNING: Validation split lost {val_missing} songs during merge.")
else:
    print("Validation split: Merge successful. All songs accounted for.")

test_missing = len(test_split_df) - len(test_df)
if test_missing:
    print(f"WARNING: Test split lost {test_missing} songs during merge.")
else:
    print("Test split: Merge successful. All songs accounted for.")

# Hard stop if any split does not have the expected number of songs.
expected_train_len, expected_val_len, expected_test_len = 1552, 332, 332

assert len(train_df) == expected_train_len, f"Expected {expected_train_len} training samples, but found {len(train_df)}"
assert len(val_df) == expected_val_len, f"Expected {expected_val_len} validation samples, but found {len(val_df)}"
assert len(test_df) == expected_test_len, f"Expected {expected_test_len} test samples, but found {len(test_df)}"

print(f"\nFinal dataset lengths are correct: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")
print("Data Check Complete")

# Wrap each merged split in a Dataset, then in a DataLoader.
BATCH_SIZE = 16

train_dataset, val_dataset, test_dataset = (
    MER_Dataset(annotations_df=split_frame, tokenizer=tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep dataframe order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

print("\nDataLoaders created successfully.")

Starting data transfer from Google Drive to local Colab storage...
Copying single archive file from Google Drive...
sending incremental file list
merge_dataset_zipped.zip
          1.23G 100%   41.94MB/s    0:00:27 (xfr#1, to-chk=0/1)
Extracting files locally
4442files [00:16, 274.49files/s]           
Data transfer and extraction complete.

Valence range in data: [0.0187499999999999, 0.9875]
Arousal range in data: [0.0625, 0.975]
Valence mean: 0.5050, std: 0.2311
Arousal mean: 0.4823, std: 0.1395
Total samples in master_df: 2216

Number of unique valence values: 465
Number of unique arousal values: 443
Number of unique quadrant values: 4

Sample valence values: [0.6125   0.365    0.80625  0.1875   0.619375 0.705    0.81375  0.26875
 0.359375 0.305   ]
Sample arousal values: [0.431875 0.39875  0.50625  0.300625 0.56     0.925    0.38625  0.564375
 0.47375  0.41125 ]

Updating dataframe paths to use fast local storage...
Dataframe paths updated.

Split files loaded from local storage.



In [None]:

#select dataset for similarity analysis

# Name of the split to analyse. Change this one value to switch splits; the
# dataframe and the printed label are both derived from it, so they cannot
# drift apart.
# NOTE(review): any DataLoader used for feature extraction later on must be
# switched to the same split — confirm in the cells that follow.
ANALYSIS_SPLIT = "test"  # one of "train", "val", "test"

_split_frames = {"train": train_df, "val": val_df, "test": test_df}
if ANALYSIS_SPLIT not in _split_frames:
    raise ValueError(
        f"Unknown ANALYSIS_SPLIT {ANALYSIS_SPLIT!r}; expected one of {sorted(_split_frames)}"
    )

# Work on a copy so later edits to analysis_df cannot alter the split dataframe.
analysis_df = _split_frames[ANALYSIS_SPLIT].copy()

print(f"\n✓ Selected dataset for similarity analysis: {ANALYSIS_SPLIT.upper()} SET")
print(f"  Total songs to analyze: {len(analysis_df)}")
print(f"  Song IDs: {analysis_df[id_column_name].head(10).tolist()}...")

In [None]:
# This notebook only runs on a CUDA GPU: stop immediately when none is visible.
if not torch.cuda.is_available():
    raise RuntimeError("Error: No GPU found. This script requires a GPU to run.")

# A GPU is present; every later cell moves the model and batches to this device.
device = torch.device("cuda")
print("GPU is available. Using CUDA device.")


GPU is available. Using CUDA device.


In [None]:
# Checkpoint of the trained bimodal regression model ("model 4") on Google Drive.
model_path = '/content/drive/MyDrive/dissertation/bimodal_regression_model.pth'

# Build the architecture, move it to the GPU, then restore the trained weights.
model = BimodalClassifier()
model.to(device)
trained_weights = torch.load(model_path, map_location=device)
model.load_state_dict(trained_weights)
model.eval()  # evaluation mode: dropout off, batch norm uses its running statistics

# Bind the free function get_features to this instance so it can be called as
# model.get_features(...).
model.get_features = types.MethodType(get_features, model)

print("Feature extraction added to model.")



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgemwrigley[0m ([33mgemwrigley-university-of-bath[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training Epoch 1:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 1/50, Average Training Loss: 0.0534


Validation Epoch 1:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 1/50, Average Validation Loss: 0.0263
✓ New best validation loss: 0.0263
Learning Rate: 0.001000


Training Epoch 2:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 2/50, Average Training Loss: 0.0407


Validation Epoch 2:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 2/50, Average Validation Loss: 0.0356
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.001000


Training Epoch 3:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 3/50, Average Training Loss: 0.0355


Validation Epoch 3:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 3/50, Average Validation Loss: 0.0236
✓ New best validation loss: 0.0236
Learning Rate: 0.001000


Training Epoch 4:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 4/50, Average Training Loss: 0.0346


Validation Epoch 4:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 4/50, Average Validation Loss: 0.0225
✓ New best validation loss: 0.0225
Learning Rate: 0.001000


Training Epoch 5:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 5/50, Average Training Loss: 0.0304


Validation Epoch 5:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 5/50, Average Validation Loss: 0.0210
✓ New best validation loss: 0.0210
Learning Rate: 0.001000


Training Epoch 6:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 6/50, Average Training Loss: 0.0285


Validation Epoch 6:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 6/50, Average Validation Loss: 0.0216
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.001000


Training Epoch 7:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 7/50, Average Training Loss: 0.0285


Validation Epoch 7:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 7/50, Average Validation Loss: 0.0204
✓ New best validation loss: 0.0204
Learning Rate: 0.001000


Training Epoch 8:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 8/50, Average Training Loss: 0.0268


Validation Epoch 8:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 8/50, Average Validation Loss: 0.0194
✓ New best validation loss: 0.0194
Learning Rate: 0.001000


Training Epoch 9:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 9/50, Average Training Loss: 0.0261


Validation Epoch 9:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 9/50, Average Validation Loss: 0.0206
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.001000


Training Epoch 10:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 10/50, Average Training Loss: 0.0257


Validation Epoch 10:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 10/50, Average Validation Loss: 0.0222
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.001000


Training Epoch 11:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 11/50, Average Training Loss: 0.0244


Validation Epoch 11:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 11/50, Average Validation Loss: 0.0188
✓ New best validation loss: 0.0188
Learning Rate: 0.001000


Training Epoch 12:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 12/50, Average Training Loss: 0.0236


Validation Epoch 12:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 12/50, Average Validation Loss: 0.0236
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.001000


Training Epoch 13:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 13/50, Average Training Loss: 0.0235


Validation Epoch 13:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 13/50, Average Validation Loss: 0.0197
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.001000


Training Epoch 14:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 14/50, Average Training Loss: 0.0241


Validation Epoch 14:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 14/50, Average Validation Loss: 0.0179
✓ New best validation loss: 0.0179
Learning Rate: 0.001000


Training Epoch 15:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 15/50, Average Training Loss: 0.0236


Validation Epoch 15:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 15/50, Average Validation Loss: 0.0205
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 16:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 16/50, Average Training Loss: 0.0231


Validation Epoch 16:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 16/50, Average Validation Loss: 0.0194
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 17:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 17/50, Average Training Loss: 0.0225


Validation Epoch 17:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 17/50, Average Validation Loss: 0.0182
No improvement for 3 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 18:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 18/50, Average Training Loss: 0.0218


Validation Epoch 18:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 18/50, Average Validation Loss: 0.0178
✓ New best validation loss: 0.0178
Learning Rate: 0.000500


Training Epoch 19:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 19/50, Average Training Loss: 0.0206


Validation Epoch 19:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 19/50, Average Validation Loss: 0.0183
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 20:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 20/50, Average Training Loss: 0.0217


Validation Epoch 20:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 20/50, Average Validation Loss: 0.0196
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 21:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 21/50, Average Training Loss: 0.0215


Validation Epoch 21:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 21/50, Average Validation Loss: 0.0169
✓ New best validation loss: 0.0169
Learning Rate: 0.000500


Training Epoch 22:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 22/50, Average Training Loss: 0.0205


Validation Epoch 22:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 22/50, Average Validation Loss: 0.0175
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 23:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 23/50, Average Training Loss: 0.0208


Validation Epoch 23:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 23/50, Average Validation Loss: 0.0170
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 24:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 24/50, Average Training Loss: 0.0220


Validation Epoch 24:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 24/50, Average Validation Loss: 0.0175
No improvement for 3 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 25:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 25/50, Average Training Loss: 0.0217


Validation Epoch 25:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 25/50, Average Validation Loss: 0.0170
No improvement for 4 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 26:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 26/50, Average Training Loss: 0.0212


Validation Epoch 26:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 26/50, Average Validation Loss: 0.0164
✓ New best validation loss: 0.0164
Learning Rate: 0.000500


Training Epoch 27:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 27/50, Average Training Loss: 0.0210


Validation Epoch 27:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 27/50, Average Validation Loss: 0.0170
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 28:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 28/50, Average Training Loss: 0.0211


Validation Epoch 28:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 28/50, Average Validation Loss: 0.0169
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 29:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 29/50, Average Training Loss: 0.0208


Validation Epoch 29:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 29/50, Average Validation Loss: 0.0183
No improvement for 3 epochs (patience: 10)
Learning Rate: 0.000500


Training Epoch 30:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 30/50, Average Training Loss: 0.0204


Validation Epoch 30:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 30/50, Average Validation Loss: 0.0179
No improvement for 4 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 31:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 31/50, Average Training Loss: 0.0203


Validation Epoch 31:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 31/50, Average Validation Loss: 0.0164
✓ New best validation loss: 0.0164
Learning Rate: 0.000250


Training Epoch 32:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 32/50, Average Training Loss: 0.0200


Validation Epoch 32:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 32/50, Average Validation Loss: 0.0168
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 33:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 33/50, Average Training Loss: 0.0205


Validation Epoch 33:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 33/50, Average Validation Loss: 0.0166
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 34:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 34/50, Average Training Loss: 0.0206


Validation Epoch 34:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 34/50, Average Validation Loss: 0.0162
✓ New best validation loss: 0.0162
Learning Rate: 0.000250


Training Epoch 35:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 35/50, Average Training Loss: 0.0204


Validation Epoch 35:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 35/50, Average Validation Loss: 0.0166
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 36:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 36/50, Average Training Loss: 0.0201


Validation Epoch 36:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 36/50, Average Validation Loss: 0.0163
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 37:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 37/50, Average Training Loss: 0.0202


Validation Epoch 37:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 37/50, Average Validation Loss: 0.0165
No improvement for 3 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 38:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 38/50, Average Training Loss: 0.0209


Validation Epoch 38:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 38/50, Average Validation Loss: 0.0161
✓ New best validation loss: 0.0161
Learning Rate: 0.000250


Training Epoch 39:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 39/50, Average Training Loss: 0.0202


Validation Epoch 39:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 39/50, Average Validation Loss: 0.0174
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 40:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 40/50, Average Training Loss: 0.0194


Validation Epoch 40:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 40/50, Average Validation Loss: 0.0167
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 41:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 41/50, Average Training Loss: 0.0200


Validation Epoch 41:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 41/50, Average Validation Loss: 0.0162
No improvement for 3 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 42:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 42/50, Average Training Loss: 0.0194


Validation Epoch 42:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 42/50, Average Validation Loss: 0.0163
No improvement for 4 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 43:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 43/50, Average Training Loss: 0.0206


Validation Epoch 43:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 43/50, Average Validation Loss: 0.0161
✓ New best validation loss: 0.0161
Learning Rate: 0.000250


Training Epoch 44:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 44/50, Average Training Loss: 0.0196


Validation Epoch 44:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 44/50, Average Validation Loss: 0.0167
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000250


Training Epoch 45:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 45/50, Average Training Loss: 0.0199


Validation Epoch 45:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 45/50, Average Validation Loss: 0.0160
✓ New best validation loss: 0.0160
Learning Rate: 0.000125


Training Epoch 46:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 46/50, Average Training Loss: 0.0185


Validation Epoch 46:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 46/50, Average Validation Loss: 0.0157
✓ New best validation loss: 0.0157
Learning Rate: 0.000125


Training Epoch 47:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 47/50, Average Training Loss: 0.0191


Validation Epoch 47:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 47/50, Average Validation Loss: 0.0158
No improvement for 1 epochs (patience: 10)
Learning Rate: 0.000125


Training Epoch 48:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 48/50, Average Training Loss: 0.0198


Validation Epoch 48:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 48/50, Average Validation Loss: 0.0160
No improvement for 2 epochs (patience: 10)
Learning Rate: 0.000125


Training Epoch 49:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 49/50, Average Training Loss: 0.0189


Validation Epoch 49:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 49/50, Average Validation Loss: 0.0165
No improvement for 3 epochs (patience: 10)
Learning Rate: 0.000125


Training Epoch 50:   0%|          | 0/97 [00:00<?, ?it/s]

Epoch 50/50, Average Training Loss: 0.0192


Validation Epoch 50:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 50/50, Average Validation Loss: 0.0164
No improvement for 4 epochs (patience: 10)
Learning Rate: 0.000125
--- Training Completed All Epochs ---
Final best validation loss: 0.0157
--- Pipeline Test Complete ---


In [None]:
def extract_features_from_dataset(model, dataloader, device):
    """
    Run ``model.get_features`` over every batch and gather the outputs.

    Args:
        model: Object exposing ``eval()`` and
            ``get_features(spectrogram, input_ids, attention_mask)`` which
            returns ``(audio_features, lyrics_features, predictions)``.
        dataloader: Iterable yielding
            ``(spectrogram, input_ids, attention_mask, labels)`` batches.
        device: Target passed to ``.to(...)`` for the three model inputs.

    Returns:
        dict with keys ``'audio_features'``, ``'lyrics_features'``,
        ``'predictions'`` and ``'ground_truth'``; each value is the per-batch
        NumPy arrays concatenated along axis 0.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXTRACTING FEATURES FROM DATASET")
    print(banner)

    # Per-batch arrays, grouped under the key they are returned as
    collected = {
        'audio_features': [],
        'lyrics_features': [],
        'predictions': [],
        'ground_truth': [],
    }

    # Evaluation mode for the forward passes below
    model.eval()

    # Forward passes only, so gradient tracking is switched off
    with torch.no_grad():
        for spec, ids, mask, labels in tqdm(dataloader, desc="Extracting features"):
            audio_feat, lyrics_feat, preds = model.get_features(
                spec.to(device),
                ids.to(device),
                mask.to(device),
            )

            # Labels are never moved to the device; .cpu() is applied to all
            # four tensors before the NumPy conversion
            batch_outputs = (audio_feat, lyrics_feat, preds, labels)
            for key, tensor in zip(collected, batch_outputs):
                collected[key].append(tensor.cpu().numpy())

    # Stitch the batches back together along the sample axis
    results = {
        key: np.concatenate(chunks, axis=0)
        for key, chunks in collected.items()
    }

    # Summary of what was extracted
    print(f"\n✓ Feature extraction complete!")
    print(f"  Total songs processed: {len(results['audio_features'])}")
    shape_lines = (
        ("  Audio features shape:  ", 'audio_features'),
        ("  Lyrics features shape: ", 'lyrics_features'),
        ("  Predictions shape:     ", 'predictions'),
        ("  Ground truth shape:    ", 'ground_truth'),
    )
    for label, key in shape_lines:
        print(f"{label}{results[key].shape}")

    return results

# Confirmation message printed when this cell is executed; it runs at cell
# level, after the function definition above, and does not call the function.
print("Feature extraction function defined.")


In [None]:
# Extract features for every batch served by the test loader
features_dict = extract_features_from_dataset(model, test_loader, device)

# Unpack into notebook-level names (chosen to match MODEL 4)
audio_features, lyrics_features, predictions, ground_truth = (
    features_dict[key]
    for key in ('audio_features', 'lyrics_features', 'predictions', 'ground_truth')
)

# List the variables now available to the cells below
print(
    "\n✓ Features stored in variables:",
    "  - audio_features",
    "  - lyrics_features",
    "  - predictions",
    "  - ground_truth",
    sep="\n",
)

In [None]:
def compute_cosine_similarity_analysis(audio_features, lyrics_features):
    """
    Compare audio and lyrics features with pairwise cosine similarity.

    Three matrices are computed (audio vs audio, lyrics vs lyrics, audio vs
    lyrics), summary statistics are printed, and a coarse interpretation is
    printed based on the mean of the cross-modal diagonal.

    Args:
        audio_features: Audio feature rows passed to ``cosine_similarity``.
        lyrics_features: Lyrics feature rows passed to ``cosine_similarity``.
            Row i is treated as belonging to the same song as audio row i.

    Returns:
        Tuple ``(audio_sim, lyrics_sim, cross_modal_sim, self_similarity)``
        where ``self_similarity`` is the diagonal of ``cross_modal_sim``.
    """
    # Within-modality and cross-modality similarity matrices
    audio_sim = cosine_similarity(audio_features, audio_features)
    lyrics_sim = cosine_similarity(lyrics_features, lyrics_features)
    cross_modal_sim = cosine_similarity(audio_features, lyrics_features)

    # Entry (i, i): a song's audio against its own lyrics
    self_similarity = np.diag(cross_modal_sim)

    # Selector for every entry off the main diagonal (i != j); reused for the
    # within-modality matrices below
    off_diagonal = ~np.eye(*cross_modal_sim.shape, dtype=bool)
    cross_song_sim = cross_modal_sim[off_diagonal]

    mean_self = self_similarity.mean()

    # Report the statistics
    print(f"\n1. SELF-SIMILARITY (audio vs own lyrics):")
    print(f"   Mean:  {mean_self:.4f}")
    print(f"   Std:   {self_similarity.std():.4f}")
    print(f"   Range: [{self_similarity.min():.4f}, {self_similarity.max():.4f}]")

    print(f"\n2. CROSS-SONG SIMILARITY (audio_i vs lyrics_j, i≠j):")
    print(f"   Mean:  {cross_song_sim.mean():.4f}")
    print(f"   Std:   {cross_song_sim.std():.4f}")

    print(f"\n3. WITHIN-MODALITY SIMILARITY:")
    print(f"   Audio-to-audio mean:   {audio_sim[off_diagonal].mean():.4f}")
    print(f"   Lyrics-to-lyrics mean: {lyrics_sim[off_diagonal].mean():.4f}")

    # Coarse verdict from fixed thresholds on the mean self-similarity
    if mean_self > 0.7:
        verdict = "   ✓ STRONG alignment: Audio and lyrics are highly similar"
    elif mean_self > 0.5:
        verdict = "   ✓ MODERATE alignment: Some similarity between audio and lyrics"
    else:
        verdict = "   ! WEAK alignment: Audio and lyrics encode different information"

    print(f"\n4. INTERPRETATION:")
    print(verdict)

    return audio_sim, lyrics_sim, cross_modal_sim, self_similarity

# Compute the similarity matrices for the extracted features and keep all
# four results at notebook level for the plotting and saving cells
audio_sim, lyrics_sim, cross_modal_sim, self_sim = compute_cosine_similarity_analysis(
    audio_features=audio_features,
    lyrics_features=lyrics_features,
)


In [None]:
# One entry per heatmap panel: (matrix, title, x-axis label, y-axis label)
heatmap_specs = [
    (audio_sim, 'Audio-to-Audio Similarity', 'Song Index', 'Song Index'),
    (lyrics_sim, 'Lyrics-to-Lyrics Similarity', 'Song Index', 'Song Index'),
    # Cross-modal panel (KEY PLOT): rows are audio, columns are lyrics
    (cross_modal_sim, 'Audio-to-Lyrics Cross-Modal Similarity', 'Lyrics Index', 'Audio Index'),
]

# Cosine similarity lies in [-1, 1]. With vmin=0 every negative value is drawn
# in the same colour as 0, so negative similarity becomes invisible. Keep the
# 0-1 scale when no matrix has a negative entry; otherwise use the symmetric
# [-1, 1] range, which also centres the diverging 'coolwarm' map on zero.
# All three panels share one scale so their colours stay comparable.
has_negative = any(
    matrix.size > 0 and np.min(matrix) < 0
    for matrix, *_ in heatmap_specs
)
sim_vmin = -1.0 if has_negative else 0.0

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (matrix, title, xlabel, ylabel) in zip(axes, heatmap_specs):
    image = ax.imshow(matrix, cmap='coolwarm', vmin=sim_vmin, vmax=1, aspect='auto')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.colorbar(image, ax=ax, fraction=0.046)

plt.tight_layout()
plt.show()

# Histogram of self-similarity (each song's audio vs its own lyrics), with the
# mean marked as a dashed vertical line
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(self_sim, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
hist_ax.axvline(self_sim.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {self_sim.mean():.3f}')
hist_ax.set_xlabel('Cosine Similarity (audio vs own lyrics)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of Self-Similarity Scores', fontsize=14, fontweight='bold')
hist_ax.legend()
hist_ax.grid(axis='y', alpha=0.3)
plt.show()

## Canonical Correlation Analysis (CCA)

In [None]:
def perform_cca_analysis(audio_features, lyrics_features, n_components=10):
    """
    Fit CCA on the audio/lyrics features and report per-component correlation.

    The CCA model is fitted and then applied to the same data it was fitted
    on, so the reported correlations are in-sample values.

    Args:
        audio_features: Audio feature rows, used as the first CCA view.
        lyrics_features: Lyrics feature rows, used as the second CCA view.
        n_components: Number of canonical components to fit and report.

    Returns:
        Tuple ``(cca, correlations, audio_canonical, lyrics_canonical)``:
        the fitted CCA object, a NumPy array with one Pearson correlation per
        component, and the two transformed feature sets.
    """
    # Learn the projections, then map both views into the canonical space
    cca = CCA(n_components=n_components, max_iter=1000)
    cca.fit(audio_features, lyrics_features)
    audio_canonical, lyrics_canonical = cca.transform(audio_features, lyrics_features)

    # Pearson r between the paired canonical variates, one per component
    correlations = np.array([
        pearsonr(audio_canonical[:, idx], lyrics_canonical[:, idx])[0]
        for idx in range(n_components)
    ])

    # Per-component listing (1-based numbering)
    print(f"\nCanonical correlations (n={n_components}):")
    for number, value in enumerate(correlations, start=1):
        print(f"  Component {number}: {value:.4f}")

    print(f"\nSummary statistics:")
    print(f"  Mean correlation: {correlations.mean():.4f}")
    print(f"  Max correlation:  {correlations.max():.4f}")
    print(f"  Std:              {correlations.std():.4f}")

    # Coarse verdict from fixed thresholds on the first component only
    first = correlations[0]
    if first > 0.7:
        verdict = f"  STRONG shared structure: First component correlation = {first:.3f}"
    elif first > 0.5:
        verdict = "  MODERATE shared structure: Some shared latent dimensions"
    else:
        verdict = "  LIMITED shared structure: Modalities may be complementary"

    print(f"\nINTERPRETATION:")
    print(verdict)

    return cca, correlations, audio_canonical, lyrics_canonical

# Fit CCA with 10 components on the extracted features; the fitted model,
# correlations and canonical projections are kept for plotting and saving
cca_model, cca_corrs, audio_can, lyrics_can = perform_cca_analysis(
    audio_features=audio_features,
    lyrics_features=lyrics_features,
    n_components=10,
)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, scatter_ax = axes

# Left panel: one bar per canonical component, with reference lines at the
# thresholds used in the printed interpretation
component_numbers = range(1, len(cca_corrs) + 1)
bar_ax.bar(component_numbers, cca_corrs, color='steelblue', alpha=0.7, edgecolor='black')
reference_lines = (
    (0.5, 'red', 'Moderate (0.5)'),
    (0.7, 'darkred', 'Strong (0.7)'),
)
for level, colour, label in reference_lines:
    bar_ax.axhline(y=level, color=colour, linestyle='--', linewidth=2, label=label)
bar_ax.set_xlabel('Canonical Component', fontsize=12)
bar_ax.set_ylabel('Correlation Coefficient', fontsize=12)
bar_ax.set_title('Canonical Correlations', fontsize=14, fontweight='bold')
bar_ax.set_ylim([0, 1])
bar_ax.legend()
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: audio vs lyrics projection on the first canonical component
first_audio = audio_can[:, 0]
first_lyrics = lyrics_can[:, 0]
scatter_ax.scatter(first_audio, first_lyrics, alpha=0.6, s=50, color='purple', edgecolors='black', linewidth=0.5)
scatter_ax.set_xlabel('Audio Canonical Component 1', fontsize=12)
scatter_ax.set_ylabel('Lyrics Canonical Component 1', fontsize=12)
scatter_ax.set_title(f'First Canonical Component (r={cca_corrs[0]:.3f})', fontsize=14, fontweight='bold')
scatter_ax.grid(True, alpha=0.3)

# Degree-1 least-squares fit drawn over the scatter
z = np.polyfit(first_audio, first_lyrics, 1)
p = np.poly1d(z)
scatter_ax.plot(first_audio, p(first_audio), "r--", linewidth=2, label='Linear fit')
scatter_ax.legend()

plt.tight_layout()
plt.show()



## Cross-Modal Retrieval (CMR)

In [None]:
def _top_k_hit_rate(sim_matrix, k):
    """
    Fraction of rows whose own index is among the k highest-scoring columns.

    Rows are ranked with ``np.argsort`` (descending after reversal), so ties
    are broken the way ``np.argsort`` breaks them.
    """
    ranked = np.argsort(sim_matrix, axis=1)[:, ::-1][:, :k]
    own_index = np.arange(sim_matrix.shape[0])[:, np.newaxis]
    return np.mean((ranked == own_index).any(axis=1))


def cross_modal_retrieval_analysis(audio_features, lyrics_features, top_k=5):
    """
    Measure how often a song's audio retrieves its own lyrics, and vice versa.

    Row i of ``audio_features`` and row i of ``lyrics_features`` are treated
    as the same song. Candidates are ranked by cosine similarity and a query
    counts as a hit when its own index is among the top-ranked candidates.

    Args:
        audio_features: Audio feature rows passed to ``cosine_similarity``.
        lyrics_features: Lyrics feature rows passed to ``cosine_similarity``.
        top_k: Cut-off used for the two headline accuracies.

    Returns:
        dict with keys:
            ``'audio_to_lyrics_acc'``: top-k hit rate, audio query -> lyrics.
            ``'lyrics_to_audio_acc'``: top-k hit rate, lyrics query -> audio.
            ``'top1_acc'``: share of audio rows whose ``argmax`` is the
                matching lyrics row.
            ``'top10_acc'``: top-10 hit rate (audio -> lyrics), or ``None``
                when there are fewer than 10 samples.
    """
    # Rows are audio queries, columns are lyrics candidates
    sim_matrix = cosine_similarity(audio_features, lyrics_features)
    n_samples = len(audio_features)

    # Audio -> Lyrics uses the matrix as is; Lyrics -> Audio uses its transpose
    audio_to_lyrics_acc = _top_k_hit_rate(sim_matrix, top_k)
    lyrics_to_audio_acc = _top_k_hit_rate(sim_matrix.T, top_k)

    # Top-1 (exact match) via argmax, which picks the first maximum on ties
    audio_to_lyrics_top1 = np.argmax(sim_matrix, axis=1)
    top1_acc = np.mean(audio_to_lyrics_top1 == np.arange(n_samples))

    # Top-10 is only computed when at least 10 candidates exist
    top10_acc = _top_k_hit_rate(sim_matrix, 10) if n_samples >= 10 else None

    # Print results
    print(f"\n1. RETRIEVAL ACCURACY:")
    print(f"   Audio → Lyrics (Top-{top_k}): {audio_to_lyrics_acc:.2%}")
    print(f"   Lyrics → Audio (Top-{top_k}): {lyrics_to_audio_acc:.2%}")

    print(f"\n2. ADDITIONAL METRICS:")
    print(f"   Top-1 accuracy (exact match):  {top1_acc:.2%}")
    # Compare against None rather than truthiness: an accuracy of exactly 0.0
    # is a real result and must still be reported
    if top10_acc is not None:
        print(f"   Top-10 accuracy:               {top10_acc:.2%}")

    # Coarse verdict from fixed thresholds on the audio -> lyrics accuracy
    print(f"\n3. INTERPRETATION:")
    if audio_to_lyrics_acc > 0.5:
        print(f"   GOOD alignment: Audio features predict matching lyrics well")
    elif audio_to_lyrics_acc > 0.2:
        print(f"   MODERATE alignment: Some predictive power")
    else:
        print(f"   WEAK alignment: Limited cross-modal predictability")

    print(f"\n   Meaning: {audio_to_lyrics_acc:.1%} of the time, given a song's audio,")
    print(f"   the correct lyrics are in the top-{top_k} most similar lyrics.")

    return {
        'audio_to_lyrics_acc': audio_to_lyrics_acc,
        'lyrics_to_audio_acc': lyrics_to_audio_acc,
        'top1_acc': top1_acc,
        'top10_acc': top10_acc
    }

# Evaluate cross-modal retrieval with a top-5 cut-off; the returned dict is
# read again when the metrics summary is written
retrieval_results = cross_modal_retrieval_analysis(
    audio_features=audio_features,
    lyrics_features=lyrics_features,
    top_k=5,
)

In [None]:
k_values = [1, 2, 3, 5, 10, 20]

sim_matrix = cosine_similarity(audio_features, lyrics_features)
n_samples = len(audio_features)

# Rank the lyrics for every audio row once (best match first); each k below
# simply takes the first k columns of this ranking
ranked_lyrics = np.argsort(sim_matrix, axis=1)[:, ::-1]

# Top-k accuracy for each k, or None when k exceeds the number of songs
accuracies = [
    np.mean([i in ranked_lyrics[i, :k] for i in range(n_samples)])
    if k <= n_samples else None
    for k in k_values
]

# Keep only the k values that could be evaluated
evaluated = [(k, acc) for k, acc in zip(k_values, accuracies) if acc is not None]
valid_k = [k for k, _ in evaluated]
valid_acc = [acc for _, acc in evaluated]

# Accuracy-vs-k curve
curve_fig, curve_ax = plt.subplots(figsize=(10, 6))
curve_ax.plot(valid_k, valid_acc, marker='o', linewidth=2, markersize=8, color='steelblue')
curve_ax.set_xlabel('Top-K', fontsize=12)
curve_ax.set_ylabel('Retrieval Accuracy', fontsize=12)
curve_ax.set_title('Cross-Modal Retrieval Accuracy (Audio → Lyrics)', fontsize=14, fontweight='bold')
curve_ax.grid(True, alpha=0.3)
curve_ax.set_ylim([0, 1])

# Percentage label just above each marker
for k, acc in evaluated:
    curve_ax.text(k, acc + 0.03, f'{acc:.2%}', ha='center', fontsize=10)

curve_fig.tight_layout()
plt.show()

In [None]:
# Destination folder on the mounted Drive (created if it does not exist)
output_dir = '/content/drive/MyDrive/dissertation/similarity_analysis_results/'
os.makedirs(output_dir, exist_ok=True)

# Every array artefact keyed by its file name, in the order it is written:
# similarity matrices, CCA results, then the extracted features
array_outputs = {
    'audio_similarity_matrix.npy': audio_sim,
    'lyrics_similarity_matrix.npy': lyrics_sim,
    'cross_modal_similarity_matrix.npy': cross_modal_sim,
    'cca_correlations.npy': cca_corrs,
    'audio_canonical.npy': audio_can,
    'lyrics_canonical.npy': lyrics_can,
    'audio_features.npy': audio_features,
    'lyrics_features.npy': lyrics_features,
}
for filename, array in array_outputs.items():
    np.save(os.path.join(output_dir, filename), array)

# Per-song summary: id and true labels from analysis_df, plus self-similarity
# and the model's predictions.
# NOTE(review): values are attached by position, so analysis_df rows are
# assumed to be in the same order as the extracted features; confirm.
results_df = analysis_df[[id_column_name, 'valence', 'arousal']].assign(
    self_similarity=self_sim,
    valence_predicted=predictions[:, 0],
    arousal_predicted=predictions[:, 1],
)
results_df.to_csv(os.path.join(output_dir, 'similarity_summary.csv'), index=False)

# Headline metrics, cast to plain floats so they serialise to JSON
metrics = {
    'dataset': 'test_set',
    'n_songs': len(analysis_df),
    'mean_self_similarity': float(self_sim.mean()),
    'std_self_similarity': float(self_sim.std()),
    'cca_correlation_1': float(cca_corrs[0]),
    'cca_mean_correlation': float(cca_corrs.mean()),
    'retrieval_audio_to_lyrics': float(retrieval_results['audio_to_lyrics_acc']),
    'retrieval_lyrics_to_audio': float(retrieval_results['lyrics_to_audio_acc']),
    'retrieval_top1': float(retrieval_results['top1_acc'])
}

metrics_path = os.path.join(output_dir, 'metrics_summary.json')
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

# Report what was written, in the order it was written
print(f"✓ Results saved to: {output_dir}")
print(f"\nFiles created:")
for filename in [*array_outputs, 'similarity_summary.csv', 'metrics_summary.json']:
    print(f"  - {filename}")
