# Similarity Analysis: Lyrics vs Music Outputs

This notebook performs comprehensive similarity analysis between:
- **Audio features** (64-dim from VGGish)
- **Lyrics features** (768-dim from BERT)
- **Emotion predictions** (valence & arousal)

## Methods Implemented:
1. **Cosine Similarity** - Direct feature comparison
2. **Canonical Correlation Analysis** - Find correlated components
3. **Cross-Modal Retrieval** - Retrieval accuracy between modalities
4. **Prediction Correlation** - Compare emotion predictions

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_decomposition import CCA
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

## Step 1: Load Your Trained Model and Extract Features

We need to extract intermediate features (before fusion) from both modalities.

In [None]:
# Load your trained bimodal model
# REPLACE THIS PATH with your actual model path.
# The file is later read with torch.load() and handed to load_state_dict(),
# so it must hold a state_dict whose keys match BimodalClassifier.
model_path = '/content/drive/MyDrive/dissertation/bimodal_regression_model.pth'

# Define your model architecture (copy from your training notebook)
# This is a placeholder - you need to use your actual BimodalClassifier class
from transformers import BertModel, BertTokenizer

class VGGish_Audio_Model(nn.Module):
    """VGG-style CNN that turns a single-channel spectrogram into an embedding.

    Architecture: four conv -> batch-norm -> ReLU stages (the first three are
    each followed by 2x2 max-pooling), global average pooling, and a two-layer
    fully connected head.

    Args:
        num_classes: size of the output feature vector (64 by default).
    """

    def __init__(self, num_classes=64):
        super().__init__()

        # Stage 1: 1 input channel -> 64 feature maps, then halve H and W.
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: 64 -> 128 feature maps, then halve H and W.
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 3: 128 -> 256 feature maps, then halve H and W.
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(256)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 4: 256 -> 512 feature maps; deliberately no pooling here.
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(512)

        # Collapses whatever spatial size remains to 1x1, so the head always
        # receives 512 values per sample regardless of input resolution.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Head: 512 -> 256 -> num_classes.
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute the audio embedding.

        Args:
            x: tensor of shape [B, 1, H, W]; the notebook feeds mel-spectrograms
               of shape [B, 1, 128, 1292].

        Returns:
            Tensor of shape [B, num_classes]. No activation is applied after
            the final linear layer.
        """
        activate = self.relu

        # Convolutional trunk (shapes in comments assume a 128x1292 input).
        x = self.pool1(activate(self.bn1(self.conv1(x))))   # [B, 64, 64, 646]
        x = self.pool2(activate(self.bn2(self.conv2(x))))   # [B, 128, 32, 323]
        x = self.pool3(activate(self.bn3(self.conv3(x))))   # [B, 256, 16, 161]
        x = activate(self.bn4(self.conv4(x)))               # [B, 512, 16, 161]

        # Global average pool, then drop the two singleton spatial axes.
        pooled = self.adaptive_pool(x)                      # [B, 512, 1, 1]
        flat = pooled.view(pooled.size(0), -1)              # [B, 512]

        # Fully connected head.
        hidden = activate(self.fc1(flat))                   # [B, 256]
        return self.fc2(hidden)                             # [B, num_classes]

class BimodalClassifier(nn.Module):
    """Two-tower emotion regressor: CNN audio embedding + frozen BERT [CLS] vector.

    The two embeddings are concatenated and passed through a small MLP that
    outputs `num_emotions` values (valence and arousal in this notebook).

    Args:
        audio_feature_dim: size of the audio embedding produced by the CNN.
        text_feature_dim: size of the BERT embedding (768 for bert-base).
        num_emotions: number of regression outputs.
    """

    def __init__(self, audio_feature_dim=64, text_feature_dim=768, num_emotions=2):
        super().__init__()

        # Audio tower.
        self.audio_model = VGGish_Audio_Model(num_classes=audio_feature_dim)

        # Lyrics tower: pretrained BERT with every weight frozen.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert.requires_grad_(False)

        # Fusion head: (audio + text) -> 100 -> num_emotions.
        fused_dim = audio_feature_dim + text_feature_dim
        self.classifier = nn.Sequential(
            nn.Linear(fused_dim, 100),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(100, num_emotions),
        )

    def get_audio_features(self, spectrogram):
        """Return the audio embedding [B, audio_feature_dim] for a spectrogram batch."""
        return self.audio_model(spectrogram)

    def get_lyrics_features(self, input_ids, attention_mask):
        """Return BERT's hidden state at sequence position 0 ([CLS]) for each sample."""
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return hidden_states[:, 0, :]

    def forward(self, spectrogram, input_ids, attention_mask):
        """Predict emotions and expose the per-modality embeddings.

        Args:
            spectrogram: audio input, [B, 1, H, W].
            input_ids: BERT token ids, [B, seq_len].
            attention_mask: BERT attention mask, [B, seq_len].

        Returns:
            Tuple `(emotions, audio_features, lyrics_features)` where
            `emotions` is [B, num_emotions] and the two feature tensors are
            the pre-fusion embeddings of each tower.
        """
        audio_features = self.get_audio_features(spectrogram)
        lyrics_features = self.get_lyrics_features(input_ids, attention_mask)

        # Fuse by concatenating along the feature axis, then regress.
        fused = torch.cat([audio_features, lyrics_features], dim=1)
        emotions = self.classifier(fused)

        return emotions, audio_features, lyrics_features

# Load the trained model
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize model
# Constructing BimodalClassifier also calls BertModel.from_pretrained('bert-base-uncased').
model = BimodalClassifier(audio_feature_dim=64, text_feature_dim=768, num_emotions=2)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # Evaluation mode: dropout is disabled and batch-norm uses running statistics

print("Model loaded successfully!")

## Step 2: Load Dataset and Extract Features

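We'll extract features for a subset of songs. As written, the code below uses the first 100 rows of the master file list; to report results on the test set, filter the DataFrame with your train/validation/test split file first.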

In [None]:
# Load your dataset (use the same dataloader from training)
# REPLACE THESE PATHS with your actual paths
csv_path = '/content/drive/MyDrive/dissertation/merge_dataset/output_from_code/master_processed_file_list.csv'
# NOTE(review): tvt_path contains a literal '*' wildcard and is not read anywhere
# in the code shown here — expand it (e.g. with glob) and use it to select the split.
tvt_path = '/content/drive/MyDrive/dissertation/merge_dataset/tvt_70_15_15_*.csv'  # Use appropriate split file

# Load metadata
# Read the CSV file containing all processed song information
df = pd.read_csv(csv_path)

# Display the row count and the first few rows to verify data structure
print(f"Total songs: {len(df)}")
print(df.head())

# For demonstration, let's use a subset (e.g., first 100 songs)
# Select subset for faster analysis (remove .head(100) to use all data)
# NOTE(review): this is simply the first 100 rows of the master list, not a
# held-out split — presumably it can include training songs; confirm before
# reporting results.
df_subset = df.head(100)

# Initialize BERT tokenizer
# Create tokenizer object - converts text to BERT-compatible tokens
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def extract_features_from_dataset(model, df, device, tokenizer):
    """
    Extract audio features, lyrics features and predictions for every song in `df`.

    Each row is processed independently. A row that fails for any reason
    (missing file, missing column, bad tensor shape, ...) is reported and
    skipped as a whole, so the returned arrays always stay row-aligned.

    Args:
        model: Trained BimodalClassifier; called as
            model(spectrogram, input_ids, attention_mask) and expected to
            return (predictions, audio_features, lyrics_features).
        df: DataFrame with columns 'spectrogram_path' (.npy file),
            'lyrics_path' (torch-saved dict with 'input_ids' and
            'attention_mask'), 'valence', 'arousal' and 'song_id'.
        device: torch device (cuda or cpu)
        tokenizer: BERT tokenizer. Unused here because the lyrics are loaded
            pre-tokenized; kept so existing callers keep working.

    Returns:
        Dictionary containing:
        - audio_features: [N, 64] numpy array
        - lyrics_features: [N, 768] numpy array
        - predictions: [N, 2] numpy array (valence, arousal)
        - ground_truth: [N, 2] numpy array (true valence, arousal)
        - song_ids: list of song IDs
        where N is the number of rows processed successfully.

    Raises:
        ValueError: if no row could be processed.
    """
    # One tuple per successfully processed song. Collecting complete records
    # (instead of appending to five separate lists as we go) guarantees that a
    # failure half-way through a row cannot leave the outputs misaligned.
    records = []
    n_failed = 0

    model.eval()
    # Inference only: no gradients needed.
    with torch.no_grad():
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
            # Looked up defensively so the error handler itself cannot raise
            # when the 'song_id' column is the thing that is missing.
            song_label = row.get('song_id', '<unknown>')
            try:
                # Pre-computed mel-spectrogram, [H, W] -> [1, 1, H, W]
                # (batch of one, single channel).
                spectrogram = np.load(row['spectrogram_path'])
                spectrogram = torch.FloatTensor(spectrogram).unsqueeze(0).unsqueeze(0).to(device)

                # Pre-tokenized lyrics.
                # NOTE(review): torch.load unpickles the file — only load
                # token files you produced yourself.
                lyrics_tokens = torch.load(row['lyrics_path'])
                input_ids = lyrics_tokens['input_ids'].to(device)
                attention_mask = lyrics_tokens['attention_mask'].to(device)

                preds, audio_feat, lyrics_feat = model(spectrogram, input_ids, attention_mask)

                record = (
                    audio_feat.squeeze().cpu().numpy(),     # [64]
                    lyrics_feat.squeeze().cpu().numpy(),    # [768]
                    preds.squeeze().cpu().numpy(),          # [2]
                    [row['valence'], row['arousal']],       # [2]
                    row['song_id'],
                )
            except Exception as e:
                # Deliberate best-effort: one bad song must not abort the run.
                n_failed += 1
                print(f"Error processing {song_label}: {e}")
                continue

            records.append(record)

    if not records:
        raise ValueError(
            f"No songs could be processed ({n_failed} of {len(df)} rows failed); "
            "check the paths and column names in the DataFrame."
        )

    # Transpose the list of records into one sequence per field.
    audio_list, lyrics_list, preds_list, truth_list, song_ids = map(list, zip(*records))

    audio_features = np.stack(audio_list)      # [N, 64]
    lyrics_features = np.stack(lyrics_list)    # [N, 768]
    predictions = np.stack(preds_list)         # [N, 2]
    ground_truth = np.stack(truth_list)        # [N, 2]

    print(f"\nExtracted features for {len(song_ids)} songs")
    if n_failed:
        print(f"Skipped {n_failed} songs because of errors")
    print(f"Audio features shape: {audio_features.shape}")
    print(f"Lyrics features shape: {lyrics_features.shape}")
    print(f"Predictions shape: {predictions.shape}")

    return {
        'audio_features': audio_features,
        'lyrics_features': lyrics_features,
        'predictions': predictions,
        'ground_truth': ground_truth,
        'song_ids': song_ids
    }

# Extract features
# Runs the model over every row of df_subset and collects the per-song
# embeddings, predictions and labels into a dict of arrays.
features_dict = extract_features_from_dataset(model, df_subset, device, tokenizer)

## Method 1: Cosine Similarity Analysis

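**What it does**: Measures the cosine of the angle between feature vectors. Values range from -1 (opposite direction) to 1 (same direction). Cosine similarity is only defined for vectors of equal length, so the 64-dim audio and 768-dim lyrics features must be projected into a shared space before they can be compared across modalities.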

**When to use**: To see if songs with similar audio have similar lyrics encoding.

In [None]:
def compute_pairwise_similarity(audio_features, lyrics_features, n_components=10):
    """
    Compute pairwise cosine similarity within and across the two modalities.

    Cosine similarity requires both inputs to have the same number of columns
    (scikit-learn raises ValueError otherwise). When the audio and lyrics
    feature dimensions differ (64 vs 768 in this notebook), both modalities
    are first projected into a shared space with CCA and the cross-modal
    similarity is computed there.

    Args:
        audio_features: [N, D_audio] array, one row per song.
        lyrics_features: [N, D_lyrics] array, rows aligned with audio_features.
        n_components: dimensionality of the shared CCA space; only used when
            D_audio != D_lyrics. Clamped to what the data allow.

    Returns:
        - audio_similarity: [N, N] matrix - similarity between audio vectors
        - lyrics_similarity: [N, N] matrix - similarity between lyrics vectors
        - cross_modal_similarity: [N, N] matrix - similarity between audio[i] and lyrics[j]

    Note:
        The CCA projection is fitted on the same songs it is evaluated on, so
        the diagonal of cross_modal_similarity is an optimistic (in-sample)
        estimate of audio/lyrics alignment.
    """
    audio_features = np.asarray(audio_features)
    lyrics_features = np.asarray(lyrics_features)

    # Within-modality similarity: entry [i, j] compares song i with song j.
    audio_similarity = cosine_similarity(audio_features, audio_features)
    lyrics_similarity = cosine_similarity(lyrics_features, lyrics_features)

    if audio_features.shape[1] == lyrics_features.shape[1]:
        # Same dimensionality: the raw vectors are directly comparable.
        cross_modal_similarity = cosine_similarity(audio_features, lyrics_features)
    else:
        # Different dimensionality: map both into a common canonical space.
        n_shared = min(
            n_components,
            audio_features.shape[0],
            audio_features.shape[1],
            lyrics_features.shape[1],
        )
        n_shared = max(1, n_shared)
        shared_space = CCA(n_components=n_shared, max_iter=1000)
        shared_space.fit(audio_features, lyrics_features)
        audio_shared, lyrics_shared = shared_space.transform(audio_features, lyrics_features)
        print(
            f"Feature dimensions differ ({audio_features.shape[1]} vs {lyrics_features.shape[1]}); "
            f"cross-modal similarity computed in a {n_shared}-dim CCA space."
        )
        cross_modal_similarity = cosine_similarity(audio_shared, lyrics_shared)

    # Summary statistics of the three similarity distributions.
    print(f"Audio similarity - Mean: {audio_similarity.mean():.3f}, Std: {audio_similarity.std():.3f}")
    print(f"Lyrics similarity - Mean: {lyrics_similarity.mean():.3f}, Std: {lyrics_similarity.std():.3f}")
    print(f"Cross-modal similarity - Mean: {cross_modal_similarity.mean():.3f}, Std: {cross_modal_similarity.std():.3f}")

    return audio_similarity, lyrics_similarity, cross_modal_similarity

# Compute similarities
# Unpacks the three [N, N] matrices: audio-audio, lyrics-lyrics and audio-lyrics.
audio_sim, lyrics_sim, cross_modal_sim = compute_pairwise_similarity(
    features_dict['audio_features'], 
    features_dict['lyrics_features']
)

In [None]:
# Visualize similarity matrices
def plot_similarity_matrices(audio_sim, lyrics_sim, cross_modal_sim):
    """
    Draw the three similarity matrices as heatmaps and print diagonal statistics.

    Args:
        audio_sim: [N, N] audio-to-audio cosine similarity.
        lyrics_sim: [N, N] lyrics-to-lyrics cosine similarity.
        cross_modal_sim: [N, N] audio-to-lyrics cosine similarity
            (rows = audio, columns = lyrics).

    Returns:
        None. Shows a matplotlib figure and prints summary statistics.
    """
    # (matrix, title, x label, y label) for each panel, left to right.
    panels = [
        (audio_sim, 'Audio-to-Audio Similarity', 'Song Index', 'Song Index'),
        (lyrics_sim, 'Lyrics-to-Lyrics Similarity', 'Song Index', 'Song Index'),
        # The KEY matrix: how audio features relate to lyrics features.
        (cross_modal_sim, 'Audio-to-Lyrics Cross-Modal Similarity', 'Lyrics Index', 'Audio Index'),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for ax, (matrix, title, xlabel, ylabel) in zip(axes, panels):
        # Cosine similarity lives in [-1, 1]. Using the full range keeps
        # negative similarities visible (a 0..1 scale would paint them all the
        # same colour as 0) and centres the diverging 'coolwarm' map on 0.
        image = ax.imshow(matrix, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        plt.colorbar(image, ax=ax)

    plt.tight_layout()
    plt.show()

    # Diagonal of the cross-modal matrix = each song's audio vs its OWN lyrics.
    # High values mean the two modalities are aligned for that song.
    diagonal_similarity = np.diag(cross_modal_sim)
    print(f"\nSelf-similarity (audio vs own lyrics):")
    print(f"  Mean: {diagonal_similarity.mean():.3f}")
    print(f"  Std: {diagonal_similarity.std():.3f}")
    print(f"  Min: {diagonal_similarity.min():.3f}")
    print(f"  Max: {diagonal_similarity.max():.3f}")

# Render the three heatmaps and print the same-song (diagonal) statistics.
plot_similarity_matrices(audio_sim, lyrics_sim, cross_modal_sim)

## Method 2: Canonical Correlation Analysis (CCA)

**What it does**: Finds linear transformations of audio and lyrics features that maximize correlation.

**When to use**: To discover if there are latent shared dimensions between modalities.

In [None]:
def perform_cca_analysis(audio_features, lyrics_features, n_components=10):
    """
    Perform Canonical Correlation Analysis between audio and lyrics features.

    Args:
        audio_features: [N, 64] audio feature matrix
        lyrics_features: [N, 768] lyrics feature matrix (rows aligned with audio)
        n_components: number of canonical components to extract. Reduced
            automatically (with a message) if it exceeds
            min(N, audio dim, lyrics dim), which CCA cannot go beyond.

    Returns:
        Tuple of four items:
        - cca: fitted CCA model
        - correlations: [n_components] array, Pearson correlation of each
          pair of canonical variates
        - audio_canonical: [N, n_components] audio projected onto the
          canonical dimensions
        - lyrics_canonical: [N, n_components] lyrics projected onto the
          canonical dimensions

    Note:
        Correlations are measured on the same songs the model was fitted on.
        With few songs relative to the feature count they are inflated by
        overfitting; a message is printed when that is likely.
    """
    n_samples, audio_dim = audio_features.shape
    lyrics_dim = lyrics_features.shape[1]

    # CCA cannot extract more components than the smallest of these three.
    max_components = min(n_samples, audio_dim, lyrics_dim)
    if n_components > max_components:
        print(f"Requested n_components={n_components} exceeds the maximum of "
              f"{max_components} for this data; using {max_components}.")
        n_components = max_components

    # print() rather than warnings.warn(): this notebook silences all warnings.
    if n_samples <= audio_dim + lyrics_dim:
        print(f"Warning: only {n_samples} songs for {audio_dim + lyrics_dim} feature "
              f"dimensions - in-sample canonical correlations will be optimistic.")

    # Learn weights W_audio and W_lyrics such that audio_features @ W_audio and
    # lyrics_features @ W_lyrics are maximally correlated, component by component.
    cca = CCA(n_components=n_components, max_iter=1000)
    cca.fit(audio_features, lyrics_features)

    # Project both modalities into the canonical space.
    audio_canonical, lyrics_canonical = cca.transform(audio_features, lyrics_features)

    # pearsonr returns (coefficient, p_value); only the coefficient is kept.
    correlations = np.array([
        pearsonr(audio_canonical[:, i], lyrics_canonical[:, i])[0]
        for i in range(n_components)
    ])

    print(f"\nCCA Results (n_components={n_components}):")
    for i, corr in enumerate(correlations):
        print(f"  Component {i+1}: {corr:.4f}")
    print(f"\nMean correlation: {correlations.mean():.4f}")
    print(f"Max correlation: {correlations.max():.4f}")

    return cca, correlations, audio_canonical, lyrics_canonical

# Perform CCA
# Unpacks: fitted model, per-component correlations, and the audio / lyrics
# projections onto the canonical dimensions.
cca_model, cca_correlations, audio_can, lyrics_can = perform_cca_analysis(
    features_dict['audio_features'],
    features_dict['lyrics_features'],
    n_components=10
)

In [None]:
# Visualize CCA correlations
def plot_cca_results(correlations):
    """
    Show the canonical correlations as a bar chart and print reading guidance.

    Args:
        correlations: sequence of per-component correlation coefficients.
    """
    # Components are numbered from 1 on the x-axis.
    component_numbers = range(1, len(correlations) + 1)

    plt.figure(figsize=(10, 6))
    plt.bar(component_numbers, correlations, color='steelblue', alpha=0.7)

    # Reference line: the notebook treats 0.5 as the "moderate" threshold.
    plt.axhline(y=0.5, color='red', linestyle='--', label='Moderate correlation (0.5)')

    # Axis labels, title and cosmetics.
    plt.xlabel('Canonical Component', fontsize=12)
    plt.ylabel('Correlation Coefficient', fontsize=12)
    plt.title('Canonical Correlations Between Audio and Lyrics Features', fontsize=14, fontweight='bold')
    plt.ylim([0, 1])
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # How to read the chart.
    guidance = (
        "\nInterpretation:",
        "- High correlations (>0.7): Strong shared latent dimensions between audio and lyrics",
        "- Moderate correlations (0.5-0.7): Some shared structure",
        "- Low correlations (<0.5): Modalities encode different information",
    )
    print("\n".join(guidance))

# Bar chart of the per-component canonical correlations computed above.
plot_cca_results(cca_correlations)

## Method 3: Cross-Modal Retrieval Analysis

**What it does**: For each song's audio, find the most similar lyrics (and vice versa). Measures retrieval accuracy.

**When to use**: To test if one modality can predict the other.

In [None]:
def cross_modal_retrieval(audio_features, lyrics_features, top_k=5, n_components=10):
    """
    Perform cross-modal retrieval: given audio, find matching lyrics and vice versa.

    Songs are ranked by cosine similarity. Because cosine similarity needs
    vectors of equal length, audio and lyrics features of different
    dimensionality (64 vs 768 here) are first projected into a shared CCA
    space; features that already share a dimensionality are compared directly.

    Args:
        audio_features: [N, 64]
        lyrics_features: [N, 768], rows aligned with audio_features
        top_k: how many top matches to retrieve
        n_components: size of the shared CCA space; only used when the two
            feature dimensions differ. Clamped to what the data allow.

    Returns:
        Dictionary with:
        - audio_to_lyrics_accuracy: fraction of songs whose own lyrics are
          among the top_k lyrics retrieved from their audio
        - lyrics_to_audio_accuracy: the same in the other direction
        - audio_to_lyrics_indices: [N, top_k] retrieved lyrics indices, best first
        - lyrics_to_audio_indices: [N, top_k] retrieved audio indices, best first

    Note:
        When CCA is used it is fitted on the very songs being retrieved, so
        the accuracies are in-sample and optimistic.
    """
    audio_features = np.asarray(audio_features)
    lyrics_features = np.asarray(lyrics_features)

    if audio_features.shape[1] != lyrics_features.shape[1]:
        n_shared = min(
            n_components,
            audio_features.shape[0],
            audio_features.shape[1],
            lyrics_features.shape[1],
        )
        n_shared = max(1, n_shared)
        shared_space = CCA(n_components=n_shared, max_iter=1000)
        shared_space.fit(audio_features, lyrics_features)
        audio_query, lyrics_query = shared_space.transform(audio_features, lyrics_features)
    else:
        audio_query, lyrics_query = audio_features, lyrics_features

    # similarity_matrix[i, j] = similarity between audio_i and lyrics_j
    similarity_matrix = cosine_similarity(audio_query, lyrics_query)

    n_samples = len(audio_features)
    # The correct match for query i is item i (the diagonal).
    correct_index = np.arange(n_samples)[:, None]

    # === AUDIO-TO-LYRICS RETRIEVAL ===
    # argsort is ascending; reverse each row for best-first, keep the top_k.
    audio_to_lyrics_indices = np.argsort(similarity_matrix, axis=1)[:, ::-1][:, :top_k]
    audio_to_lyrics_acc = np.mean((audio_to_lyrics_indices == correct_index).any(axis=1))

    # === LYRICS-TO-AUDIO RETRIEVAL ===
    # Transposed matrix: rows = lyrics queries, columns = audio candidates.
    lyrics_to_audio_indices = np.argsort(similarity_matrix.T, axis=1)[:, ::-1][:, :top_k]
    lyrics_to_audio_acc = np.mean((lyrics_to_audio_indices == correct_index).any(axis=1))

    print(f"\nCross-Modal Retrieval Results (Top-{top_k}):")
    print(f"  Audio → Lyrics accuracy: {audio_to_lyrics_acc:.2%}")
    print(f"  Lyrics → Audio accuracy: {lyrics_to_audio_acc:.2%}")
    if n_samples:
        # Random ranking would hit the correct item with this probability.
        print(f"  Chance level: {min(top_k, n_samples) / n_samples:.2%}")
    print(f"\nInterpretation:")
    print(f"  {audio_to_lyrics_acc:.1%} of times, given a song's audio, the correct lyrics")
    print(f"  are in the top-{top_k} most similar lyrics based on feature similarity.")

    return {
        'audio_to_lyrics_accuracy': audio_to_lyrics_acc,
        'lyrics_to_audio_accuracy': lyrics_to_audio_acc,
        'audio_to_lyrics_indices': audio_to_lyrics_indices,
        'lyrics_to_audio_indices': lyrics_to_audio_indices
    }

# Perform retrieval analysis
# The returned dict holds both directions' top-k accuracies and retrieved indices.
retrieval_results = cross_modal_retrieval(
    features_dict['audio_features'],
    features_dict['lyrics_features'],
    top_k=5
)

## Method 4: Prediction-Level Correlation

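**What it does**: Correlates each audio and lyrics feature dimension with the bimodal model's valence and arousal predictions. (Comparing audio-only vs lyrics-only predictions would require separately trained unimodal models, which this notebook does not do.)

**When to use**: To see which modality's features track the predicted emotion more closely.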

In [None]:
def analyze_prediction_agreement(features_dict, model, device):
    """
    Correlate every feature dimension with the bimodal model's predictions.

    For each audio and each lyrics feature dimension, the absolute Pearson
    correlation with the predicted valence and with the predicted arousal is
    computed across songs. Per-modality means and maxima are printed, followed
    by which modality correlates more strongly on average.

    This does NOT compare audio-only against lyrics-only predictions; that
    would need separately trained unimodal models.

    Args:
        features_dict: dict with 'audio_features' [N, D_audio],
            'lyrics_features' [N, D_lyrics] and 'predictions' [N, 2]
            (column 0 = valence, column 1 = arousal).
        model: unused; kept for backward compatibility with existing callers.
        device: unused; kept for backward compatibility with existing callers.

    Returns:
        Dict of 1-D arrays of absolute correlations, one entry per feature
        dimension: 'audio_valence_corr', 'audio_arousal_corr',
        'lyrics_valence_corr', 'lyrics_arousal_corr'. A dimension that is
        constant across songs has no defined correlation and is NaN; such
        entries are ignored in the printed summaries.
    """
    audio_features = np.asarray(features_dict['audio_features'], dtype=np.float64)
    lyrics_features = np.asarray(features_dict['lyrics_features'], dtype=np.float64)
    combined_predictions = np.asarray(features_dict['predictions'], dtype=np.float64)

    def _abs_pearson(features, targets):
        """|Pearson r| between every column of features and every column of targets."""
        centered_f = features - features.mean(axis=0)
        centered_t = targets - targets.mean(axis=0)
        covariance = centered_f.T @ centered_t
        scale = np.outer(
            np.linalg.norm(centered_f, axis=0),
            np.linalg.norm(centered_t, axis=0),
        )
        with np.errstate(divide='ignore', invalid='ignore'):
            corr = np.abs(covariance / scale)
        # Constant columns: mark explicitly as undefined rather than trusting
        # floating-point round-off in the centring step.
        corr[np.ptp(features, axis=0) == 0, :] = np.nan
        corr[:, np.ptp(targets, axis=0) == 0] = np.nan
        # Guard against |r| creeping above 1 through round-off (NaN is kept).
        return np.clip(corr, 0.0, 1.0)

    def _mean_and_max(values):
        """NaN-aware mean and max; (nan, nan) when nothing is defined."""
        if np.isnan(values).all():
            return np.nan, np.nan
        return np.nanmean(values), np.nanmax(values)

    audio_corr = _abs_pearson(audio_features, combined_predictions)      # [D_audio, 2]
    lyrics_corr = _abs_pearson(lyrics_features, combined_predictions)    # [D_lyrics, 2]

    audio_valence_corr, audio_arousal_corr = audio_corr[:, 0], audio_corr[:, 1]
    lyrics_valence_corr, lyrics_arousal_corr = lyrics_corr[:, 0], lyrics_corr[:, 1]

    audio_val_mean, audio_val_max = _mean_and_max(audio_valence_corr)
    audio_ar_mean, audio_ar_max = _mean_and_max(audio_arousal_corr)
    lyrics_val_mean, lyrics_val_max = _mean_and_max(lyrics_valence_corr)
    lyrics_ar_mean, lyrics_ar_max = _mean_and_max(lyrics_arousal_corr)

    print("\nFeature-Prediction Correlation Analysis:")
    print(f"\nAudio features correlation with predictions:")
    print(f"  Valence - Mean: {audio_val_mean:.3f}, Max: {audio_val_max:.3f}")
    print(f"  Arousal - Mean: {audio_ar_mean:.3f}, Max: {audio_ar_max:.3f}")

    print(f"\nLyrics features correlation with predictions:")
    print(f"  Valence - Mean: {lyrics_val_mean:.3f}, Max: {lyrics_val_max:.3f}")
    print(f"  Arousal - Mean: {lyrics_ar_mean:.3f}, Max: {lyrics_ar_max:.3f}")

    # Which modality's features track the predictions more closely on average?
    if audio_val_mean > lyrics_val_mean:
        print(f"\n→ Audio features have stronger correlation with valence predictions")
    else:
        print(f"\n→ Lyrics features have stronger correlation with valence predictions")

    if audio_ar_mean > lyrics_ar_mean:
        print(f"→ Audio features have stronger correlation with arousal predictions")
    else:
        print(f"→ Lyrics features have stronger correlation with arousal predictions")

    return {
        'audio_valence_corr': audio_valence_corr,
        'audio_arousal_corr': audio_arousal_corr,
        'lyrics_valence_corr': lyrics_valence_corr,
        'lyrics_arousal_corr': lyrics_arousal_corr
    }

# Analyze prediction agreement
# Returns per-dimension absolute correlations with predicted valence/arousal.
prediction_analysis = analyze_prediction_agreement(features_dict, model, device)

In [None]:
# Visualize feature importance
def plot_feature_importance(prediction_analysis):
    """
    Bar-plot the absolute feature/prediction correlations in a 2x2 grid.

    Top row: audio dimensions vs valence and arousal (all dimensions).
    Bottom row: lyrics dimensions vs valence and arousal, showing only every
    10th dimension so the bars stay readable.

    Args:
        prediction_analysis: dict with keys 'audio_valence_corr',
            'audio_arousal_corr', 'lyrics_valence_corr', 'lyrics_arousal_corr'.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # One row per panel:
    # (grid position, dict key, sampling step, bar colour, title, x label)
    panel_specs = [
        ((0, 0), 'audio_valence_corr', 1, 'steelblue',
         'Audio Feature Correlation with Valence',
         'Audio Feature Dimension (0-63)'),
        ((0, 1), 'audio_arousal_corr', 1, 'coral',
         'Audio Feature Correlation with Arousal',
         'Audio Feature Dimension (0-63)'),
        ((1, 0), 'lyrics_valence_corr', 10, 'green',
         'Lyrics Feature Correlation with Valence (sampled)',
         'Lyrics Feature Dimension (sampled every 10th)'),
        ((1, 1), 'lyrics_arousal_corr', 10, 'purple',
         'Lyrics Feature Correlation with Arousal (sampled)',
         'Lyrics Feature Dimension (sampled every 10th)'),
    ]

    for position, key, step, colour, title, x_label in panel_specs:
        panel = axes[position]
        # step == 1 keeps every dimension; step == 10 thins the lyrics bars.
        heights = prediction_analysis[key][::step]
        panel.bar(range(len(heights)), heights, color=colour, alpha=0.7)
        panel.set_title(title, fontweight='bold')
        panel.set_xlabel(x_label)
        panel.set_ylabel('Absolute Correlation')
        panel.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

# 2x2 grid of bar charts: audio / lyrics dimensions vs valence / arousal.
plot_feature_importance(prediction_analysis)

## Summary Report

Generate a comprehensive summary of all similarity analyses.

In [None]:
def generate_similarity_report(audio_sim, lyrics_sim, cross_modal_sim, 
                                 cca_correlations, retrieval_results, 
                                 features_dict):
    """
    Print a comprehensive text report summarising all similarity analyses.

    Args:
        audio_sim: [N, N] audio similarity matrix (accepted for interface
            symmetry; not used in the report).
        lyrics_sim: [N, N] lyrics similarity matrix (accepted for interface
            symmetry; not used in the report).
        cross_modal_sim: [N, N] audio-to-lyrics similarity matrix.
        cca_correlations: sequence of canonical correlations, strongest first.
        retrieval_results: dict with 'audio_to_lyrics_accuracy' and
            'lyrics_to_audio_accuracy'. If 'audio_to_lyrics_indices' ([N, k])
            is present, k is reported as the top-k actually used; otherwise
            the report assumes k = 5.
        features_dict: dict with 'song_ids', 'audio_features' and
            'lyrics_features'.

    Returns:
        None. The report is printed to stdout.
    """
    cca_correlations = np.asarray(cca_correlations)

    # Recover the k that retrieval really used instead of assuming 5.
    retrieved = retrieval_results.get('audio_to_lyrics_indices')
    if retrieved is not None and np.ndim(retrieved) == 2:
        top_k = np.shape(retrieved)[1]
    else:
        top_k = 5

    audio_to_lyrics_acc = retrieval_results['audio_to_lyrics_accuracy']
    lyrics_to_audio_acc = retrieval_results['lyrics_to_audio_accuracy']

    print("="*80)
    print(" "*20 + "SIMILARITY ANALYSIS REPORT")
    print("="*80)

    # --- 1. Dataset summary ---
    print(f"\n1. DATASET SUMMARY")
    print(f"{'-'*80}")
    print(f"   Total songs analyzed: {len(features_dict['song_ids'])}")
    print(f"   Audio feature dimension: {features_dict['audio_features'].shape[1]}")
    print(f"   Lyrics feature dimension: {features_dict['lyrics_features'].shape[1]}")

    # --- 2. Cosine similarity ---
    print(f"\n2. COSINE SIMILARITY ANALYSIS")
    print(f"{'-'*80}")

    # Diagonal = each song's audio compared with its own lyrics.
    self_similarity = np.diag(cross_modal_sim)
    print(f"   Within-song similarity (audio vs own lyrics):")
    print(f"     Mean: {self_similarity.mean():.4f}")
    print(f"     Std:  {self_similarity.std():.4f}")
    print(f"     Range: [{self_similarity.min():.4f}, {self_similarity.max():.4f}]")

    # Everything off the diagonal = audio of one song vs lyrics of another.
    mask = np.ones_like(cross_modal_sim, dtype=bool)
    np.fill_diagonal(mask, False)
    off_diagonal = cross_modal_sim[mask]
    print(f"\n   Cross-song similarity (audio_i vs lyrics_j, i≠j):")
    print(f"     Mean: {off_diagonal.mean():.4f}")
    print(f"     Std:  {off_diagonal.std():.4f}")

    # --- 3. CCA ---
    print(f"\n3. CANONICAL CORRELATION ANALYSIS")
    print(f"{'-'*80}")
    if cca_correlations.size:
        print(f"   Top 3 canonical correlations:")
        for i in range(min(3, len(cca_correlations))):
            print(f"     Component {i+1}: {cca_correlations[i]:.4f}")
        print(f"   Mean correlation: {cca_correlations.mean():.4f}")
    else:
        print(f"   No canonical correlations available.")

    # --- 4. Retrieval ---
    print(f"\n4. CROSS-MODAL RETRIEVAL")
    print(f"{'-'*80}")
    print(f"   Audio → Lyrics (Top-{top_k} accuracy): {audio_to_lyrics_acc:.2%}")
    print(f"   Lyrics → Audio (Top-{top_k} accuracy): {lyrics_to_audio_acc:.2%}")

    # --- 5. Interpretation ---
    print(f"\n5. INTERPRETATION")
    print(f"{'-'*80}")

    # Overall agreement, judged by the mean same-song similarity.
    if self_similarity.mean() > 0.7:
        agreement = "STRONG"
    elif self_similarity.mean() > 0.5:
        agreement = "MODERATE"
    else:
        agreement = "WEAK"

    print(f"   Agreement between audio and lyrics: {agreement}")
    print(f"   (Based on mean self-similarity of {self_similarity.mean():.3f})")

    # Shared latent structure, judged by the first canonical correlation.
    if cca_correlations.size:
        if cca_correlations[0] > 0.7:
            print(f"\n   CCA reveals STRONG shared latent structure between modalities.")
            print(f"   The first canonical component has correlation {cca_correlations[0]:.3f}.")
        elif cca_correlations[0] > 0.5:
            print(f"\n   CCA reveals MODERATE shared latent structure between modalities.")
        else:
            print(f"\n   CCA reveals LIMITED shared latent structure between modalities.")
            print(f"   Audio and lyrics may encode complementary rather than redundant information.")

    # Retrieval quality, judged by the audio → lyrics accuracy.
    if audio_to_lyrics_acc > 0.5:
        print(f"\n   Cross-modal retrieval shows GOOD alignment:")
        print(f"   Given audio, we can identify matching lyrics {audio_to_lyrics_acc:.1%} of the time.")
    else:
        print(f"\n   Cross-modal retrieval shows LIMITED alignment:")
        print(f"   Audio features alone are insufficient to reliably identify matching lyrics.")

    print("\n" + "="*80)

# Generate report
# Prints the consolidated text report to stdout; nothing is returned.
generate_similarity_report(audio_sim, lyrics_sim, cross_modal_sim, 
                            cca_correlations, retrieval_results, 
                            features_dict)

## Export Results

Save similarity matrices and metrics for further analysis.

In [None]:
# Save results to files
output_dir = '/content/drive/MyDrive/dissertation/similarity_analysis_results/'

# Make sure the destination folder exists (no error if it already does).
import os
os.makedirs(output_dir, exist_ok=True)

# Every array that is written as a .npy file, keyed by its file name:
# the three similarity matrices, the CCA correlations and the raw features.
arrays_to_save = {
    'audio_similarity_matrix.npy': audio_sim,
    'lyrics_similarity_matrix.npy': lyrics_sim,
    'cross_modal_similarity_matrix.npy': cross_modal_sim,
    'cca_correlations.npy': cca_correlations,
    'audio_features.npy': features_dict['audio_features'],
    'lyrics_features.npy': features_dict['lyrics_features'],
}
for filename, array in arrays_to_save.items():
    np.save(os.path.join(output_dir, filename), array)

# Per-song summary table: same-song similarity next to true and predicted emotions.
summary_filename = 'similarity_analysis_summary.csv'
true_emotions = features_dict['ground_truth']
predicted_emotions = features_dict['predictions']
results_df = pd.DataFrame({
    'song_id': features_dict['song_ids'],
    'self_similarity': np.diag(cross_modal_sim),
    'true_valence': true_emotions[:, 0],
    'true_arousal': true_emotions[:, 1],
    'pred_valence': predicted_emotions[:, 0],
    'pred_arousal': predicted_emotions[:, 1]
})
results_df.to_csv(os.path.join(output_dir, summary_filename), index=False)

# Tell the user what was written and where.
print(f"Results saved to: {output_dir}")
print(f"Files created:")
for filename in [*arrays_to_save, summary_filename]:
    print(f"  - {filename}")