In [13]:
import torch
import torch.nn as nn
import numpy as np
import os
import torch.nn.functional as F

In [14]:
import sys
sys.path.append('../')

from modules.cross_attention import CrossAttention

In [15]:
def load_npy_files(directory):
    """Load every ``.npy`` file found directly inside ``directory``.

    Files are visited in sorted filename order so that the result (and in
    particular which file ends up at index 0) is the same on every run;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        A list of ``(file_path, tensor)`` tuples, one per ``.npy`` file, where
        ``tensor`` is a ``torch.Tensor`` copy of the stored array. The list is
        empty when the directory contains no ``.npy`` files.
    """
    file_list = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))  # deterministic ordering
        if name.endswith('.npy')
    ]
    return [(path, torch.tensor(np.load(path))) for path in file_list]

In [16]:
# Linear transformation to match dimensions
class LinearTransformations(nn.Module):
    """A single fully connected layer that projects features to a target width.

    Args:
        input_dim: Size of the last dimension of the incoming tensor.
        output_dim: Size of the last dimension of the projected tensor.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The layer stays under the attribute name ``linear`` so parameter
        # names (``linear.weight`` / ``linear.bias``) are unchanged.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``x`` passed through the linear projection."""
        projected = self.linear(x)
        return projected

In [17]:
# Stage 1 of SMCA
def SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, cross_attn=None):
    """Stage 1 of SMCA: combine two modalities through cross-attention.

    Args:
        modalityAlpha: Tensor passed as the first argument of the attention
            module; its last dimension is used as the first input size.
        modalityBeta: Tensor passed as the second argument of the attention
            module; its last dimension is used as the second input size.
        d_out_kq: Third constructor argument of ``CrossAttention`` (only used
            when ``cross_attn`` is not supplied).
        d_out_v: Fourth constructor argument of ``CrossAttention`` (only used
            when ``cross_attn`` is not supplied).
        cross_attn: Optional callable taking ``(modalityAlpha, modalityBeta)``.
            When omitted, a brand-new ``CrossAttention`` instance is built on
            every call, so nothing it holds is shared between calls; pass an
            existing module here to reuse it instead.

    Returns:
        The value returned by the attention module for
        ``(modalityAlpha, modalityBeta)``.
    """
    if cross_attn is None:
        # Default path: identical to the previous behaviour.
        cross_attn = CrossAttention(
            modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v
        )
    return cross_attn(modalityAlpha, modalityBeta)

In [18]:
# Stage 2 of SMCA - Model A: Stage 1 Output as Query
def SMCAStage2_ModelA(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v, cross_attn=None):
    """Stage 2 of SMCA, Model A: the Stage 1 output is the first attention input.

    Args:
        modalityAlphaBeta: Stage 1 output, passed as the first argument of the
            attention module; its last dimension is the first input size.
        modalityGamma: Third modality, passed as the second argument of the
            attention module; its last dimension is the second input size.
        d_out_kq: Third constructor argument of ``CrossAttention`` (only used
            when ``cross_attn`` is not supplied).
        d_out_v: Fourth constructor argument of ``CrossAttention`` (only used
            when ``cross_attn`` is not supplied).
        cross_attn: Optional callable taking
            ``(modalityAlphaBeta, modalityGamma)``. When omitted, a brand-new
            ``CrossAttention`` instance is built on every call; pass an
            existing module here to reuse it instead.

    Returns:
        The value returned by the attention module for
        ``(modalityAlphaBeta, modalityGamma)``.
    """
    if cross_attn is None:
        # Default path: identical to the previous behaviour.
        cross_attn = CrossAttention(
            modalityAlphaBeta.shape[-1], modalityGamma.shape[-1], d_out_kq, d_out_v
        )
    return cross_attn(modalityAlphaBeta, modalityGamma)

In [19]:
# Stage 2 of SMCA - Model B: Stage 1 Output as Key-Value
def SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, d_out_kq, d_out_v, cross_attn=None):
    """Stage 2 of SMCA, Model B: the Stage 1 output is the second attention input.

    Args:
        modalityGamma: Third modality, passed as the first argument of the
            attention module; its last dimension is the first input size.
        modalityAlphaBeta: Stage 1 output, passed as the second argument of the
            attention module; its last dimension is the second input size.
        d_out_kq: Third constructor argument of ``CrossAttention`` (only used
            when ``cross_attn`` is not supplied).
        d_out_v: Fourth constructor argument of ``CrossAttention`` (only used
            when ``cross_attn`` is not supplied).
        cross_attn: Optional callable taking
            ``(modalityGamma, modalityAlphaBeta)``. When omitted, a brand-new
            ``CrossAttention`` instance is built on every call; pass an
            existing module here to reuse it instead.

    Returns:
        The value returned by the attention module for
        ``(modalityGamma, modalityAlphaBeta)``.
    """
    if cross_attn is None:
        # Default path: identical to the previous behaviour.
        cross_attn = CrossAttention(
            modalityGamma.shape[-1], modalityAlphaBeta.shape[-1], d_out_kq, d_out_v
        )
    return cross_attn(modalityGamma, modalityAlphaBeta)

In [21]:
if __name__ == "__main__":
    # Seed torch's RNG before any layer is created so repeated runs start from
    # the same random state.
    torch.manual_seed(42)

    # Locations of the pre-extracted feature files, one directory per modality.
    VIDEO_DIR = r'D:\Projects\Thesis\Video'
    AUDIO_DIR = r'D:\Projects\SMCA\misc\audio_fe\logmel_spectrograms'
    TEXT_DIR = r'D:\Projects\SMCA\misc\textStream_BERT\feature_vectors'

    # Single source of truth for the shared feature width: every modality is
    # projected to this size and the attention stages use it as output size.
    FEATURE_DIM = 768

    # Load .npy files (each entry is a (file_path, tensor) tuple)
    video_entries = load_npy_files(VIDEO_DIR)
    audio_entries = load_npy_files(AUDIO_DIR)
    text_entries = load_npy_files(TEXT_DIR)

    # Fail with a clear message instead of a bare IndexError on entries[0].
    for modality_label, modality_entries, modality_dir in (
        ("video", video_entries, VIDEO_DIR),
        ("audio", audio_entries, AUDIO_DIR),
        ("text", text_entries, TEXT_DIR),
    ):
        if not modality_entries:
            raise FileNotFoundError(
                f"No .npy files found for the {modality_label} modality in {modality_dir}"
            )

    # Select the first file from each modality directories (for testing)
    # NOTE(review): index 0 of each directory is assumed to belong to the same
    # sample; nothing here verifies that the three files match.
    video_file_name, video_features = video_entries[0]
    audio_file_name, audio_features = audio_entries[0]
    text_file_name, text_features = text_entries[0]

    # Print the file names
    print("\nSelected File Names:")
    print("Audio file:", audio_file_name)
    print("Video file:", video_file_name)
    print("Text file:", text_file_name)

    # Reshape features
    # Audio is used exactly as loaded; only video and text get a new leading axis.
    video_features = video_features.unsqueeze(0)  # Add batch dimension
    text_features = text_features.unsqueeze(0)    # Add batch dimension
    # NOTE(review): in the recorded run the text tensor is 2-D here (1, 1024)
    # while audio/video are 3-D — confirm CrossAttention treats a 2-D input as
    # intended.

    # # Randomize assignment of Alpha, Beta, Gamma
    # modalityAlpha, modalityBeta, modalityGamma = randomize_modalities(audio_features, video_features, text_features)

    # Manual assignment of modalities
    modalityAlpha = audio_features
    modalityBeta = text_features
    modalityGamma = video_features

    # Apply linear transformation to match dimensions.
    # The layers are created in Alpha, Beta, Gamma order; keep that order so
    # the seeded RNG is consumed the same way as before.
    linear_transform_Alpha = LinearTransformations(modalityAlpha.shape[-1], FEATURE_DIM)
    linear_transform_Beta = LinearTransformations(modalityBeta.shape[-1], FEATURE_DIM)
    linear_transform_Gamma = LinearTransformations(modalityGamma.shape[-1], FEATURE_DIM)

    modalityAlpha = linear_transform_Alpha(modalityAlpha)
    modalityBeta = linear_transform_Beta(modalityBeta)
    modalityGamma = linear_transform_Gamma(modalityGamma)

    # Determine the output dimensions
    d_out_kq = FEATURE_DIM  # Final transformed dimension
    d_out_v = FEATURE_DIM

    # Stage 1: Bimodal Representation
    modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v)

    # Stage 2, Model A: Multimodal Representation (using AlphaBeta as Query)
    final_representation_A = SMCAStage2_ModelA(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v)

    # Stage 2, Model B: Multimodal Representation (using AlphaBeta as Key-Value)
    final_representation_B = SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, d_out_kq, d_out_v)

    print("Modality Alpha Shape:", modalityAlpha.shape)
    print("Modality Beta Shape:", modalityBeta.shape)
    print("Modality Gamma Shape:", modalityGamma.shape)
    print("Stage 1 Bimodal Representation Shape:", modalityAlphaBeta.shape)
    print("Final Multimodal Representation (Model A) Shape:", final_representation_A.shape)
    print("Final Multimodal Representation (Model B) Shape:", final_representation_B.shape)



Selected File Names:
Audio file: D:\Projects\SMCA\misc\audio_fe\logmel_spectrograms\feature_tt0021814.npy
Video file: D:\Projects\Thesis\Video\tt0021814_features.npy
Text file: D:\Projects\SMCA\misc\textStream_BERT\feature_vectors\tt0021814.npy
Modality Alpha Shape: torch.Size([1, 197, 768])
Modality Beta Shape: torch.Size([1, 768])
Modality Gamma Shape: torch.Size([1, 95, 768])
Stage 1 Bimodal Representation Shape: torch.Size([1, 197, 768])
Final Multimodal Representation (Model A) Shape: torch.Size([1, 197, 768])
Final Multimodal Representation (Model B) Shape: torch.Size([1, 95, 768])


In [22]:
# Report the shapes of the per-modality feature tensors left by the main cell
# (these are the inputs to the linear projections, not their outputs).
shape_report = (
    ("Audio features shape:", audio_features),
    ("Video features shape:", video_features),
    ("Text features shape:", text_features),
)
for caption, feats in shape_report:
    print(caption, feats.shape)

Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([1, 95, 768])
Text features shape: torch.Size([1, 1024])
