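# NN-based: Late-Fusion Similarity Scores

For each NN-based embedding variation, this notebook computes pairwise unimodal similarity scores (using the cosine-similarity module) for its two feature sets, z-score normalizes them, combines them with max and average late fusion, and saves the fused scores as parquet files.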

In [1]:
import torch
import os, sys
from tqdm import tqdm

# Locate the project root by walking up from the current working directory.
# All project imports are deferred until after this step so that the notebook
# also works on a fresh kernel started from a sub-folder of the project.
target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

while os.path.basename(current_path) != target_folder:
    # Keep the package folder itself importable when we pass through it.
    if os.path.basename(current_path) == 'RetrievalAlgorithm':
        if current_path not in sys.path:
            sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    if parent == current_path:
        # dirname() of a filesystem root returns the root itself; without this
        # guard the loop would spin forever when the target is not an ancestor.
        raise RuntimeError(
            f"Could not find project folder '{target_folder}' above '{os.getcwd()}'"
        )
    current_path = parent

# Relative paths used later (Dataset/..., RetrievalAlgorithm/...) are resolved
# against the project root, so make it the working directory.
os.chdir(current_path)
# Fallback so `RetrievalAlgorithm.*` resolves even if the cwd is not on sys.path.
if current_path not in sys.path:
    sys.path.append(current_path)
print('Current path:',  os.getcwd())

from RetrievalAlgorithm.src.normalization import NormalizationModule
from RetrievalAlgorithm.src.utils.data_loading import load_all_tsv_files_from_path
from RetrievalAlgorithm.src.score_calculation_modules.cosine_similarity_module import CosineSimilarityModule
from RetrievalAlgorithm.src.unimodal_calculations import calculate_unimodal_similarity
from RetrievalAlgorithm.src.multimodal_late_fusion_calculations import calculate_late_fusion_max_scores, calculate_late_fusion_avg_scores

Current path: D:\University\7th_Semester\multimedia_search_and_retrieval\MMSR25-26-Group-E


## Model Initialization

In [2]:
unimodal_module_path = 'RetrievalAlgorithm/modules/cosine_similarity_module.pt'

if os.path.exists(unimodal_module_path):
    # Reuse the previously scripted module. Note that an existing file is loaded
    # as-is; delete it to force re-scripting after changing the module's code.
    unimodal_module = torch.jit.load(unimodal_module_path)
else:
    # First run: script the module with TorchScript and cache it on disk.
    unimodal_module = CosineSimilarityModule()
    unimodal_module = torch.jit.script(unimodal_module)
    # The target folder may not exist on a fresh checkout; save() does not create it.
    os.makedirs(os.path.dirname(unimodal_module_path), exist_ok=True)
    unimodal_module.save(unimodal_module_path)

## Data Loading

In [3]:
# Result containers filled by the score-calculation cell, keyed by variation name.
max_scores_results_dict = {}
avg_scores_results_dict = {}

# One entry per sub-folder of Dataset/NN_based. Every value starts as an empty
# placeholder and is replaced below by a tuple of the tables loaded from the
# folder's .tsv files (the score-calculation cell unpacks exactly two per entry).
NN_embeddings_variations = {
    variation_folder: []
    for variation_folder in (
        'lyrics_bert_lyrics_bert_padding',
        'lyrics_bert_mfcc_bow_padding',
        'mfcc_bow_mfcc_bow_padding',
        'vgg19_lyrics_bert_padding',
        'vgg19_mfcc_bow_padding',
        'vgg19_vgg19_padding',
    )
}

for NN_based_variation_name in NN_embeddings_variations.keys():
    loaded_tables = load_all_tsv_files_from_path(
        path_to_dataset=f'Dataset/NN_based/{NN_based_variation_name}',
        n_jobs=1,
    )
    NN_embeddings_variations[NN_based_variation_name] = tuple(loaded_tables.values())

Loading .tsv files: 100%|██████████| 2/2 [00:00<00:00, 17.77it/s]
Loading .tsv files: 100%|██████████| 2/2 [00:00<00:00, 22.73it/s]
Loading .tsv files: 100%|██████████| 2/2 [00:00<00:00, 19.68it/s]
Loading .tsv files: 100%|██████████| 2/2 [00:00<00:00, 16.11it/s]
Loading .tsv files: 100%|██████████| 2/2 [00:00<00:00, 18.46it/s]
Loading .tsv files: 100%|██████████| 2/2 [00:00<00:00, 17.75it/s]


## Calculate Scores

In [4]:
def _standardize_scores(scores_df):
    """Z-score the ``score`` column of ``scores_df`` in place and return the frame.

    Uses the population standard deviation (``ddof=0``). If the standard
    deviation is zero or undefined (constant or empty column) the scores are
    only mean-centered, which avoids filling the column with NaN/inf.
    """
    score_mean = scores_df['score'].mean()
    score_std = scores_df['score'].std(ddof=0)
    centered = scores_df['score'] - score_mean
    # `score_std > 0` is False for both 0 and NaN, so both fall back to centering only.
    scores_df['score'] = centered / score_std if score_std > 0 else centered
    return scores_df


def _standardized_unimodal_scores(embeddings_df):
    """Compute unimodal similarity scores for one feature table and z-score them."""
    return _standardize_scores(
        calculate_unimodal_similarity(
            dataset_df=embeddings_df,
            calculation_module=unimodal_module,
            normalization_module_type=NormalizationModule,
            batch_size=1024,
            include_reverse_pairs=False,
            include_self_pairs=True,
        )
    )


for NN_embeddings_variation_name, (f1_embeddings, f2_embeddings) in NN_embeddings_variations.items():
    print('-'*100)
    print('NN-based variation:', NN_embeddings_variation_name)

    # Both features go through the identical pipeline; the scores are z-scored
    # per feature, presumably so they are comparable before fusion.
    f1_scores_df = _standardized_unimodal_scores(f1_embeddings)
    f2_scores_df = _standardized_unimodal_scores(f2_embeddings)

    curr_NN_embeddings_scores_lists = [f1_scores_df, f2_scores_df]

    max_scores_results_dict[NN_embeddings_variation_name] = calculate_late_fusion_max_scores(
        score_dfs=curr_NN_embeddings_scores_lists
    )

    avg_scores_results_dict[NN_embeddings_variation_name] = calculate_late_fusion_avg_scores(
        score_dfs=curr_NN_embeddings_scores_lists
    )

----------------------------------------------------------------------------------------------------
NN-based variation: lyrics_bert_lyrics_bert_padding


Batches:   0%|          | 0/8404 [00:05<?, ?it/s]


KeyboardInterrupt: 

## Save Files

In [None]:
target_dir = 'RetrievalAlgorithm/results/NN-based/pretrained/max_scores'
# Create the output folder once up front; it does not change inside the loop.
os.makedirs(target_dir, exist_ok=True)

# Max Scores: write one parquet file per NN-based variation.
for variation_name, sim_scores_df in tqdm(max_scores_results_dict.items(), desc='Saving NN-based max similarity scores'):
    output_path = os.path.join(target_dir, f'NN_based_{variation_name}_max_scores.parquet')
    sim_scores_df.to_parquet(output_path, index=False)

In [None]:
target_dir = 'RetrievalAlgorithm/results/NN-based/pretrained/avg_scores'
# Create the output folder once up front; it does not change inside the loop.
os.makedirs(target_dir, exist_ok=True)

# Avg Scores: write one parquet file per NN-based variation.
for variation_name, sim_scores_df in tqdm(avg_scores_results_dict.items(), desc='Saving NN-based average similarity scores'):
    # tqdm.write prints the name without corrupting the progress bar.
    tqdm.write(variation_name)
    output_path = os.path.join(target_dir, f'NN_based_{variation_name}_avg_scores.parquet')
    sim_scores_df.to_parquet(output_path, index=False)