# Late Fusion - Reciprocal Rank Fusion (RRF)

In [1]:
import os, sys
from tqdm import tqdm

# Locate the project root: walk upwards from the current working directory
# until a directory named `target_folder` is reached, then make it the cwd so
# that the relative paths and the `RetrievalAlgorithm...` imports used by the
# following cells resolve.
target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

while os.path.basename(current_path) != target_folder:
    # If the walk passes through the `RetrievalAlgorithm` directory, make it
    # importable as well (added once only).
    if os.path.basename(current_path) == 'RetrievalAlgorithm':
        if current_path not in sys.path:
            sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    if parent == current_path:
        # `dirname` of the filesystem root is the root itself; without this
        # guard the loop would spin forever when the notebook is started
        # outside of the project tree.
        raise FileNotFoundError(
            f"Could not find a parent directory named '{target_folder}' "
            f"starting from '{os.getcwd()}'"
        )
    current_path = parent

# Change directory only after the root has been found, so a failed search
# leaves the working directory untouched.
os.chdir(current_path)
print('Current path:',  os.getcwd())

from RetrievalAlgorithm.src.utils.data_loading import load_parquet_files_from_dir
from RetrievalAlgorithm.src.multimodal_late_fusion_calculations import calculate_late_fusion_rrf_scores

Current path: D:\University\7th_Semester\multimedia_search_and_retrieval\MMSR25-26-Group-E


## Calculate RRF Scores

In [2]:
# For every normalization variant, load the unimodal similarity-score files
# and hand the audio, lyrics and video scores (in that order) to
# `calculate_late_fusion_rrf_scores`. Results are collected as
# (norm_name, fused_scores) pairs for the saving step.
norm_names = ['max_abs', 'min_max', 'raw', 'robust', 'standard']
sim_scores_df_list = []

for norm_name in norm_names:
    print('=' * 100)
    print('Norm name:', norm_name)

    scores_by_file = load_parquet_files_from_dir(
        parquet_target_dir=f'RetrievalAlgorithm/results/unimodal/{norm_name}'
    )

    # Keys of the loaded dict are the parquet file names of each modality.
    modality_files = (
        f'unimodal_{norm_name}_audio_similarity_scores.parquet',
        f'unimodal_{norm_name}_lyrics_similarity_scores.parquet',
        f'unimodal_{norm_name}_video_similarity_scores.parquet',
    )
    fused_scores = calculate_late_fusion_rrf_scores(
        score_dfs=[scores_by_file[file_name] for file_name in modality_files],
        k=60,
    )
    sim_scores_df_list.append((norm_name, fused_scores))

Norm name: max_abs


Loading Parquet Files: 100%|██████████| 9/9 [00:00<00:00, 156633.76it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:12<00:00,  4.08s/it]
Merging score DataFrames: 2it [00:11,  5.70s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  8.88it/s]


Norm name: min_max


Loading Parquet Files: 100%|██████████| 9/9 [00:00<00:00, 156633.76it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:13<00:00,  4.55s/it]
Merging score DataFrames: 2it [00:11,  5.94s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  7.62it/s]


Norm name: raw


Loading Parquet Files: 100%|██████████| 9/9 [00:00<00:00, 162011.74it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:13<00:00,  4.63s/it]
Merging score DataFrames: 2it [00:11,  5.90s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  5.42it/s]


Norm name: robust


Loading Parquet Files: 100%|██████████| 9/9 [00:00<00:00, 141381.03it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:13<00:00,  4.62s/it]
Merging score DataFrames: 2it [00:11,  5.56s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  8.03it/s]


Norm name: standard


Loading Parquet Files: 100%|██████████| 9/9 [00:00<00:00, 205156.17it/s]
Ordering ids in DataFrames: 100%|██████████| 3/3 [00:12<00:00,  4.09s/it]
Merging score DataFrames: 2it [00:10,  5.34s/it]
Normalizing numerical columns: 100%|██████████| 3/3 [00:00<00:00,  8.91it/s]


## Save Results

In [3]:
# Write each fused score table to its own parquet file, named after the
# normalization variant it was computed from.
target_dir = 'RetrievalAlgorithm/results/multimodal/late_fusion/rrf'
# Create the output directory once; it does not change between iterations.
os.makedirs(target_dir, exist_ok=True)

for norm_name, sim_scores_df in tqdm(sim_scores_df_list, desc='Saving similarity scores'):
    output_path = os.path.join(target_dir, f'multimodal_{norm_name}_rrf_similarity_scores.parquet')
    sim_scores_df.to_parquet(output_path, index=False)

Saving similarity scores: 100%|██████████| 5/5 [00:51<00:00, 10.38s/it]
