# Late Fusion - Max Scores

**Note:** The scores must be standardized (z-score normalized) before taking the maximum, so that each dimension (one per unimodal cosine similarity score) has the same statistics:
<br>
Mean: $\mu (s) = 0$
<br>
Standard Deviation: $\sigma (s) = 1$

In [1]:
import os, sys
import pandas as pd
from typing import Dict
from tqdm import tqdm

# Locate the project root by walking upward from the current working directory
# until a directory named `target_folder` is found, then make it the cwd so the
# relative paths used further down resolve.
target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

while os.path.basename(current_path) != target_folder:
    # If the walk passes through the 'RetrievalAlgorithm' directory, put it on
    # sys.path (once).
    if os.path.basename(current_path) == 'RetrievalAlgorithm':
        if current_path not in sys.path:
            sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    # At the filesystem root dirname() returns its argument unchanged; without
    # this guard the loop would spin forever when the notebook is started
    # outside the project tree.
    if parent == current_path:
        raise FileNotFoundError(
            f"Could not find a '{target_folder}' directory at or above "
            f"{os.getcwd()!r}; start the notebook from inside the project tree."
        )
    current_path = parent

# Change directory only after the root has been found, so a failed search
# leaves the working directory untouched.
os.chdir(current_path)
print('Current path:',  os.getcwd())

Current path: D:\University\7th_Semester\MMSR25-26-Group-E


## Load Data

In [2]:
def load_parquet_files_from_dir(parquet_target_dir: str) -> Dict[str, pd.DataFrame]:
    """Read every ``.parquet`` file directly inside a directory.

    Args:
        parquet_target_dir: Directory to scan (not searched recursively).

    Returns:
        A dict mapping each file name (base name, not the full path) to the
        DataFrame read from that file. Keys are in sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` if the directory cannot be
            listed (e.g. it does not exist).
    """
    # Sort so the result order does not depend on the platform-specific order
    # returned by os.listdir.
    parquet_files = sorted(
        f for f in os.listdir(parquet_target_dir)
        if f.endswith('.parquet')
    )

    # The progress bar wraps the reads (the slow part), not the directory
    # listing, so it reflects actual loading progress.
    return {
        filename: pd.read_parquet(os.path.join(parquet_target_dir, filename))
        for filename in tqdm(parquet_files, desc='Parquet Files')
    }

## Calculate Max Scores

In [3]:
from RetrievalAlgorithm.src.multimodal_late_fusion_max_score_calculations import calculate_late_fusion_max_scores

In [4]:
# For every normalization variant, load the three unimodal score tables and
# fuse them with the max-score late-fusion routine. Results are collected as
# (normalization name, fused DataFrame) pairs.
norm_names = ['raw', 'min_max', 'max_abs', 'robust', 'standard']
sim_scores_df_list = []

for norm_name in norm_names:
    print('=' * 100)
    print('Normalization name: ', norm_name)
    unimodal_scores_dfs_dict = load_parquet_files_from_dir(
        parquet_target_dir=f'RetrievalAlgorithm/results/unimodal/{norm_name}'
    )

    # Look the tables up in a fixed order: audio, lyrics, video.
    audio_df, lyrics_df, video_df = (
        unimodal_scores_dfs_dict[f'unimodal_{norm_name}_{modality}_similarity_scores.parquet']
        for modality in ('audio', 'lyrics', 'video')
    )

    unimodal_score_dfs = [audio_df, lyrics_df, video_df]

    fused_scores_df = calculate_late_fusion_max_scores(score_dfs=unimodal_score_dfs)
    sim_scores_df_list.append((norm_name, fused_scores_df))

Normalization name:  raw


Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 37449.14it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:13<00:00,  4.67s/it]
Merging score DataFrames: 2it [00:13,  6.69s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  6.94it/s]


Normalization name:  min_max


Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 14154.01it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:14<00:00,  4.70s/it]
Merging score DataFrames: 2it [00:13,  6.64s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  5.42it/s]


Normalization name:  max_abs


Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 42224.54it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:14<00:00,  4.72s/it]
Merging score DataFrames: 2it [00:13,  6.63s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  5.58it/s]


Normalization name:  robust


Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 45923.04it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:14<00:00,  4.75s/it]
Merging score DataFrames: 2it [00:13,  6.62s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  6.98it/s]


Normalization name:  standard


Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 24244.53it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:14<00:00,  4.76s/it]
Merging score DataFrames: 2it [00:13,  6.84s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  6.72it/s]


## Save results

In [5]:
# Write one parquet file per normalization variant into the max-score
# late-fusion results directory.
target_dir = 'RetrievalAlgorithm/results/multimodal/late_fusion/max_score'
# Create the output directory once; it does not change inside the loop.
os.makedirs(target_dir, exist_ok=True)

for norm_name, sim_scores_df in tqdm(sim_scores_df_list, desc='Saving similarity scores'):
    output_path = os.path.join(target_dir, f'multimodal_{norm_name}_max_similarity_scores.parquet')
    sim_scores_df.to_parquet(output_path, index=False)

Saving similarity scores: 100%|██████████| 5/5 [00:20<00:00,  4.19s/it]
