# Late Fusion - Max Scores

NOTE: The scores must be standardized (z-score normalized) before taking the maximum, so that every dimension (each corresponding to a unimodal cosine similarity score) has the same statistics:
<br>
Mean: $\mu (s) = 0$
<br>
Standard Deviation: $\sigma (s) = 1$

In [1]:
import os, sys
from tqdm import tqdm

# Locate the repository root by walking upwards from the kernel's working
# directory, then make it the working directory so that the relative paths and
# the `RetrievalAlgorithm.*` imports used below resolve.
target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

while os.path.basename(current_path) != target_folder:
    # When passing through the package directory, register it on sys.path
    # (once) so it can also serve as an import root.
    if os.path.basename(current_path) == 'RetrievalAlgorithm' and current_path not in sys.path:
        sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    # The dirname of a filesystem root is the root itself; without this guard
    # the loop would spin forever when the notebook is started outside the repo.
    if parent == current_path:
        raise FileNotFoundError(
            f"Could not find a parent directory named '{target_folder}' above '{os.getcwd()}'"
        )
    current_path = parent

# Change directory only after the search succeeded, so a failed lookup leaves
# the working directory untouched.
os.chdir(current_path)
print('Current path:',  os.getcwd())

from RetrievalAlgorithm.src.utils.data_loading import load_parquet_files_from_dir

Current path: D:\University\7th_Semester\MMSR25-26-Group-E


## Calculate Max Scores

In [2]:
from RetrievalAlgorithm.src.multimodal_late_fusion_calculations import calculate_late_fusion_max_scores

In [3]:
# Normalization variants of the unimodal scores; each one has its own results
# sub-directory and file-name infix.
norm_names = ['raw', 'min_max', 'max_abs', 'robust', 'standard']
# Collected as (normalization name, fused score DataFrame) pairs for the save step.
sim_scores_df_list = []

for norm_name in norm_names:
    print('=' * 100)
    print('Normalization name: ', norm_name)

    # The loader's result is indexed by parquet file name below.
    unimodal_scores_dfs_dict = load_parquet_files_from_dir(
        parquet_target_dir=f'RetrievalAlgorithm/results/unimodal/{norm_name}'
    )

    # Pick the three modality tables in a fixed order: audio, lyrics, video.
    audio_df, lyrics_df, video_df = [
        unimodal_scores_dfs_dict[f'unimodal_{norm_name}_{modality}_similarity_scores.parquet']
        for modality in ('audio', 'lyrics', 'video')
    ]
    unimodal_score_dfs = [audio_df, lyrics_df, video_df]

    # Fuse the per-modality scores with the max-score late-fusion helper.
    fused_scores_df = calculate_late_fusion_max_scores(score_dfs=unimodal_score_dfs)
    sim_scores_df_list.append((norm_name, fused_scores_df))

Normalization name:  raw


Loading Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 25115.59it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:16<00:00,  5.35s/it]
Merging score DataFrames: 2it [00:14,  7.10s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  6.72it/s]


Normalization name:  min_max


Loading Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 39321.60it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:16<00:00,  5.52s/it]
Merging score DataFrames: 2it [00:16,  8.32s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  5.39it/s]


Normalization name:  max_abs


Loading Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 32682.89it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:18<00:00,  6.11s/it]
Merging score DataFrames: 2it [00:15,  7.58s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  5.31it/s]


Normalization name:  robust


Loading Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 24385.49it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:15<00:00,  5.08s/it]
Merging score DataFrames: 2it [00:12,  6.44s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  7.18it/s]


Normalization name:  standard


Loading Parquet Files: 100%|██████████| 3/3 [00:00<00:00, 35246.25it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:15<00:00,  5.29s/it]
Merging score DataFrames: 2it [00:14,  7.34s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]


## Save results

In [4]:
# Output directory (relative to the current working directory) that receives
# one fused-score parquet file per normalization variant.
target_dir = 'RetrievalAlgorithm/results/multimodal/late_fusion/max_score'
# Create the directory once up front; it is loop-invariant, so there is no
# need to repeat the call for every file.
os.makedirs(target_dir, exist_ok=True)

for norm_name, sim_scores_df in tqdm(sim_scores_df_list, desc='Saving similarity scores'):
    output_path = os.path.join(target_dir, f'multimodal_{norm_name}_max_similarity_scores.parquet')
    # index=False: presumably a pandas DataFrame, in which case the row index
    # is not written to the file -- TODO confirm against the fusion helper.
    sim_scores_df.to_parquet(output_path, index=False)

Saving similarity scores: 100%|██████████| 5/5 [00:32<00:00,  6.60s/it]
