# Multimodal Early Fusion

## Requirements

Multimodal: Given a query track, these retrieval systems select the tracks that are most similar to it, based on more
than one complementary feature (i.e., features from different modalities). For this purpose, you will use fusion strategies based on
* Late fusion
*  Early fusion

You will hence implement two different multimodal retrieval systems. This notebook implements the early-fusion variant.

In [1]:
import yaml
import torch
import os, sys
from tqdm import tqdm

target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

# Walk up from the notebook's working directory until the repository root
# (the directory named `target_folder`) is the current working directory, so
# that the repo-relative paths used in the following cells resolve.
while os.path.basename(current_path) != target_folder:
    # While passing through the package directory, make it importable as well.
    if os.path.basename(current_path) == 'RetrievalAlgorithm' and current_path not in sys.path:
        sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    # At the filesystem root dirname() returns its argument unchanged; without
    # this guard the loop would never terminate when the notebook is started
    # from outside the repository.
    if parent == current_path:
        raise FileNotFoundError(
            f"Could not find a directory named '{target_folder}' "
            "at or above the notebook's working directory."
        )
    os.chdir(parent)
    current_path = parent

from RetrievalAlgorithm.src.utils.data_loading import load_all_tsv_files_from_path
from RetrievalAlgorithm.src.score_calculation_modules.cosine_similarity_module import CosineSimilarityModule
from RetrievalAlgorithm.src.multimodal_early_fusion_calculations import calculate_multimodal_similarity

In [2]:
# Only the 'multimodal' section of the experiment configuration is used in this notebook.
config_path = 'RetrievalAlgorithm/config/experiment_config.yaml'
with open(config_path, 'r') as config_file:
    experiment_config = yaml.safe_load(config_file)
multimodal_config_dict = experiment_config['multimodal']

## Module initialization

In [3]:
unimodal_module_path = 'RetrievalAlgorithm/modules/cosine_similarity_module.pt'

# Reuse the TorchScript module saved by a previous run; otherwise script the
# cosine-similarity module and cache it on disk for later runs.
# NOTE: an existing file is loaded as-is. Delete it after changing
# CosineSimilarityModule so that the module is scripted again.
if os.path.exists(unimodal_module_path):
    unimodal_module = torch.jit.load(unimodal_module_path)
else:
    unimodal_module = CosineSimilarityModule()
    unimodal_module = torch.jit.script(unimodal_module)
    # The target directory may not exist yet (e.g. on a fresh checkout);
    # saving into a missing directory would fail.
    os.makedirs(os.path.dirname(unimodal_module_path), exist_ok=True)
    unimodal_module.save(unimodal_module_path)

## Data Loading

In [4]:
# Load the .tsv files found under 'Dataset'. The cells below index the result
# by file name (e.g. 'id_lyrics_bert_mmsr.tsv').
dataset_files_dict = load_all_tsv_files_from_path(path_to_dataset='Dataset')

Loading .tsv files: 100%|██████████| 9/9 [00:00<00:00, 960.63it/s]


## Calculating Multimodal Similarity Scores

In [5]:
# Import the normalization classes explicitly instead of using a wildcard, so
# it is clear where each name comes from and nothing else leaks into the
# notebook namespace.
from RetrievalAlgorithm.src.normalization import (
    MaxAbsNormalizationModule,
    MinMaxNormalizationModule,
    NormalizationModule,
    RobustNormalizationModule,
    StandardNormalizationModule,
)


# (label, normalization module class) pairs; every fusion cell below runs once
# per entry. The label is later used as the output sub-directory name.
# NOTE(review): 'raw' presumably means "no normalization" via the base
# NormalizationModule -- confirm against the normalization package.
norm_names_types_list = [
    ('raw', NormalizationModule),
    ('min_max', MinMaxNormalizationModule),
    ('max_abs', MaxAbsNormalizationModule),
    ('standard', StandardNormalizationModule),
    ('robust', RobustNormalizationModule),
]

# Accumulates (normalization name, feature combination, similarity scores)
# tuples produced by the fusion cells. The fusion cells only append to this
# list, so re-run this cell before re-running them to avoid duplicate entries.
sim_scores_df_list = []

### Lyrics + Audio

In [6]:
# Early fusion of the lyrics and audio feature files, once per normalization.
lyrics_audio_files = ('id_lyrics_bert_mmsr.tsv', 'id_mfcc_bow_mmsr.tsv')

for norm_name, norm_module_type in norm_names_types_list:
    print('=' * 100)
    print('Normalization name:', norm_name)

    # Feature tables to fuse, in the order listed above.
    datasets_list = [dataset_files_dict[file_name] for file_name in lyrics_audio_files]

    fusion_kwargs = dict(
        datasets_df=datasets_list,
        calculation_module=unimodal_module,
        normalization_module_type=norm_module_type,
        batch_size=multimodal_config_dict['batch_size'],
        include_reverse_pairs=False,
        include_self_pairs=True,
    )
    lyrics_audio_similarity_scores_df = calculate_multimodal_similarity(**fusion_kwargs)

    # Keep the result together with its labels for the save step at the end.
    result_entry = (norm_name, 'lyrics_audio', lyrics_audio_similarity_scores_df)
    sim_scores_df_list.append(result_entry)

Normalization name: raw


Batches: 100%|██████████| 8404/8404 [01:59<00:00, 70.59it/s] 


Normalization name: min_max


Batches: 100%|██████████| 8404/8404 [02:18<00:00, 60.73it/s]


Normalization name: max_abs


Batches: 100%|██████████| 8404/8404 [02:07<00:00, 65.75it/s] 


Normalization name: standard


Batches: 100%|██████████| 8404/8404 [02:10<00:00, 64.59it/s] 


Normalization name: robust


Batches: 100%|██████████| 8404/8404 [02:11<00:00, 64.10it/s]


### Lyrics + Videoclips

In [7]:
# Early fusion of the lyrics and videoclip feature files, once per normalization.
for norm_name, norm_module_type in norm_names_types_list:
    print('=' * 100)
    print('Normalization name:', norm_name)

    datasets_list = [dataset_files_dict['id_lyrics_bert_mmsr.tsv'],
                     dataset_files_dict['id_vgg19_mmsr.tsv']]

    lyrics_videoclips_similarity_scores_df = calculate_multimodal_similarity(
        datasets_df=datasets_list,
        calculation_module=unimodal_module,
        # Pass the normalization of the current iteration. This argument was
        # missing, so every iteration used the function's default normalization
        # while its result was stored under a different normalization name.
        normalization_module_type=norm_module_type,
        batch_size=multimodal_config_dict['batch_size'],
        include_reverse_pairs=False,
        include_self_pairs=True,
    )

    # Keep the result together with its labels for the save step at the end.
    sim_scores_df_list.append((norm_name, 'lyrics_videoclips', lyrics_videoclips_similarity_scores_df))

Normalization name: raw


Batches: 100%|██████████| 8404/8404 [09:17<00:00, 15.08it/s]


Normalization name: min_max


Batches: 100%|██████████| 8404/8404 [09:19<00:00, 15.01it/s]


Normalization name: max_abs


Batches: 100%|██████████| 8404/8404 [09:18<00:00, 15.06it/s]


Normalization name: standard


Batches: 100%|██████████| 8404/8404 [09:07<00:00, 15.36it/s]


Normalization name: robust


Batches: 100%|██████████| 8404/8404 [09:11<00:00, 15.24it/s]


### Audio + Videoclips

In [8]:
# Early fusion of the audio and videoclip feature files, once per normalization.
audio_videoclips_files = ('id_mfcc_bow_mmsr.tsv', 'id_vgg19_mmsr.tsv')

for norm_name, norm_module_type in norm_names_types_list:
    print('=' * 100)
    print('Normalization name:', norm_name)

    # Feature tables to fuse, in the order listed above.
    datasets_list = [dataset_files_dict[file_name] for file_name in audio_videoclips_files]

    fusion_kwargs = dict(
        datasets_df=datasets_list,
        calculation_module=unimodal_module,
        normalization_module_type=norm_module_type,
        batch_size=multimodal_config_dict['batch_size'],
        include_reverse_pairs=False,
        include_self_pairs=True,
    )
    audio_videoclips_similarity_scores_df = calculate_multimodal_similarity(**fusion_kwargs)

    # Keep the result together with its labels for the save step at the end.
    result_entry = (norm_name, 'audio_videoclips', audio_videoclips_similarity_scores_df)
    sim_scores_df_list.append(result_entry)

Normalization name: raw


Batches: 100%|██████████| 8404/8404 [08:56<00:00, 15.66it/s]


Normalization name: min_max


Batches: 100%|██████████| 8404/8404 [12:33<00:00, 11.15it/s]


Normalization name: max_abs


Batches: 100%|██████████| 8404/8404 [10:48<00:00, 12.95it/s]


Normalization name: standard


Batches: 100%|██████████| 8404/8404 [11:36<00:00, 12.07it/s]


Normalization name: robust


Batches: 100%|██████████| 8404/8404 [12:02<00:00, 11.64it/s]


### Lyrics + Audio + Videoclips

In [9]:
# Early fusion of all three feature files (lyrics, audio, videoclips), once per normalization.
lyrics_audio_videoclips_files = (
    'id_lyrics_bert_mmsr.tsv',
    'id_mfcc_bow_mmsr.tsv',
    'id_vgg19_mmsr.tsv',
)

for norm_name, norm_module_type in norm_names_types_list:
    print('='*100)
    print('Normalization name:', norm_name)

    # Feature tables to fuse, in the order listed above.
    datasets_list = [dataset_files_dict[file_name] for file_name in lyrics_audio_videoclips_files]

    fusion_kwargs = dict(
        datasets_df=datasets_list,
        calculation_module=unimodal_module,
        normalization_module_type=norm_module_type,
        batch_size=multimodal_config_dict['batch_size'],
        include_reverse_pairs=False,
        include_self_pairs=True,
    )
    lyrics_audio_videoclips_similarity_scores_df = calculate_multimodal_similarity(**fusion_kwargs)

    # Keep the result together with its labels for the save step at the end.
    result_entry = (norm_name, 'lyrics_audio_videoclips', lyrics_audio_videoclips_similarity_scores_df)
    sim_scores_df_list.append(result_entry)

Normalization name: raw


Batches: 100%|██████████| 8404/8404 [09:41<00:00, 14.46it/s]


Normalization name: min_max


Batches: 100%|██████████| 8404/8404 [13:13<00:00, 10.59it/s]


Normalization name: max_abs


Batches: 100%|██████████| 8404/8404 [11:21<00:00, 12.33it/s]


Normalization name: standard


Batches: 100%|██████████| 8404/8404 [13:10<00:00, 10.62it/s]


Normalization name: robust


Batches: 100%|██████████| 8404/8404 [13:00<00:00, 10.77it/s]


## Merge and Save results

In [10]:
# Write every collected result to
# <target_dir>/<normalization name>/<file name>.parquet.
target_dir = 'RetrievalAlgorithm/results/multimodal/early_fusion'
os.makedirs(target_dir, exist_ok=True)

for norm_name, feature_name, sim_scores_df in tqdm(sim_scores_df_list, desc='Saving similarity scores'):
    # One sub-directory per normalization.
    norm_dir = os.path.join(target_dir, norm_name)
    os.makedirs(norm_dir, exist_ok=True)

    # NOTE(review): the 'unimodal' prefix (with no separator before the
    # normalization name) looks like a leftover from another notebook. It is
    # kept unchanged here because other code may read these files by name --
    # confirm before renaming.
    file_name = f'unimodal{norm_name}_{feature_name}_similarity_scores.parquet'
    output_path = os.path.join(norm_dir, file_name)
    sim_scores_df.to_parquet(output_path, index=False)

Saving similarity scores: 100%|██████████| 20/20 [01:28<00:00,  4.42s/it]
