# Unimodal

## Requirements

Given a query track, each of these retrieval systems selects the tracks that are most similar to it. Similarity is measured as the cosine similarity between the feature vector of the query track and the feature vector of the target track:
$$ sim(query, target) = cos(feature(query), feature(target)) $$

You will use the following features, hence implementing three different unimodal retrieval systems:

* Lyrics: `id_lyrics_bert_mmsr.tsv`
* Audio: `id_mfcc_bow_mmsr.tsv`
* Videoclips: `id_vgg19_mmsr.tsv`

In [None]:
import torch
import yaml
import os, sys
from tqdm import tqdm

# Locate the repository root by walking up from the current working directory,
# so that the project-relative paths used in the following cells resolve no
# matter which sub-folder the notebook was started from.
target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

while os.path.basename(current_path) != target_folder:
    # Make the 'RetrievalAlgorithm' folder importable if it is passed on the way up.
    if os.path.basename(current_path) == 'RetrievalAlgorithm':
        if current_path not in sys.path:
            sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    if parent == current_path:
        # dirname() of the filesystem root is the root itself: without this
        # check the loop would spin forever when the notebook is run outside
        # of the repository.
        raise FileNotFoundError(
            f"Could not find a parent directory named '{target_folder}' "
            f"starting from '{os.getcwd()}'."
        )
    current_path = parent

# Change directory only once the root has been found, so a failed search
# leaves the working directory untouched.
os.chdir(current_path)
print('Current path:',  os.getcwd())

from RetrievalAlgorithm.src.utils.data_loading import load_all_tsv_files_from_path
from RetrievalAlgorithm.src.score_calculation_modules.cosine_similarity_module import CosineSimilarityModule
from RetrievalAlgorithm.src.unimodal_calculations import calculate_unimodal_similarity

In [None]:
# Read the experiment configuration and keep only its 'unimodal' section.
experiment_config_path = 'RetrievalAlgorithm/config/experiment_config.yaml'
with open(experiment_config_path, 'r') as file:
    experiment_config = yaml.safe_load(file)
unimodal_config_dict = experiment_config['unimodal']

## Module initialization

In [None]:
# Reuse the serialized TorchScript module when it is already on disk;
# otherwise script a fresh CosineSimilarityModule and cache it for later runs.
# NOTE(review): an existing .pt file is always preferred, so it has to be
# deleted by hand after CosineSimilarityModule changes - confirm this is intended.
unimodal_module_path = 'RetrievalAlgorithm/modules/cosine_similarity_module.pt'

if os.path.exists(unimodal_module_path):
    unimodal_module = torch.jit.load(unimodal_module_path)
else:
    unimodal_module = CosineSimilarityModule()
    unimodal_module = torch.jit.script(unimodal_module)
    # The target folder may not exist yet (e.g. on a fresh checkout), and
    # saving into a missing directory fails.
    os.makedirs(os.path.dirname(unimodal_module_path), exist_ok=True)
    unimodal_module.save(unimodal_module_path)

## Data Loading

In [None]:
# Load the TSV files found under 'Dataset'. The result is indexed by file name
# further below (e.g. 'id_lyrics_bert_mmsr.tsv').
dataset_files_dict = load_all_tsv_files_from_path(
    path_to_dataset='Dataset',
    n_jobs=1,
)

## Calculating Unimodal Similarity Scores

In [None]:
from RetrievalAlgorithm.src.normalization import *


# Normalization variants to evaluate: short name -> module class.
# NOTE(review): 'raw' presumably means "no normalization" - confirm against
# NormalizationModule.
_normalization_modules = {
    'raw': NormalizationModule,
    'min_max': MinMaxNormalizationModule,
    'max_abs': MaxAbsNormalizationModule,
    'standard': StandardNormalizationModule,
    'robust': RobustNormalizationModule,
}
# (name, module class) pairs, in the order declared above.
norm_names_types_list = list(_normalization_modules.items())

# Collects (normalization name, feature name, similarity scores) tuples.
sim_scores_df_list = []

### Lyrics

In [None]:
# Arguments shared by every normalization variant for the lyrics features.
# NOTE(review): include_reverse_pairs / include_self_pairs presumably control
# whether (b, a) and (a, a) pairs are emitted - confirm in
# calculate_unimodal_similarity.
lyrics_similarity_kwargs = dict(
    dataset_df=dataset_files_dict['id_lyrics_bert_mmsr.tsv'],
    calculation_module=unimodal_module,
    batch_size=unimodal_config_dict['batch_size'],
    include_reverse_pairs=False,
    include_self_pairs=True,
)

# One similarity computation per normalization variant.
for norm_name, norm_module_type in norm_names_types_list:
    print('=' * 100)
    print('Normalization name:', norm_name)

    lyrics_similarity_scores_df = calculate_unimodal_similarity(
        normalization_module_type=norm_module_type,
        **lyrics_similarity_kwargs,
    )
    sim_scores_df_list.append((norm_name, 'lyrics', lyrics_similarity_scores_df))

### Audio

In [None]:
# Arguments shared by every normalization variant for the audio features.
# NOTE(review): include_reverse_pairs / include_self_pairs presumably control
# whether (b, a) and (a, a) pairs are emitted - confirm in
# calculate_unimodal_similarity.
audio_similarity_kwargs = dict(
    dataset_df=dataset_files_dict['id_mfcc_bow_mmsr.tsv'],
    calculation_module=unimodal_module,
    batch_size=unimodal_config_dict['batch_size'],
    include_reverse_pairs=False,
    include_self_pairs=True,
)

# One similarity computation per normalization variant.
for norm_name, norm_module_type in norm_names_types_list:
    print('=' * 100)
    print('Normalization name:', norm_name)

    audio_similarity_scores_df = calculate_unimodal_similarity(
        normalization_module_type=norm_module_type,
        **audio_similarity_kwargs,
    )
    sim_scores_df_list.append((norm_name, 'audio', audio_similarity_scores_df))

### Videoclips

In [None]:
# Arguments shared by every normalization variant for the video clip features.
# NOTE(review): include_reverse_pairs / include_self_pairs presumably control
# whether (b, a) and (a, a) pairs are emitted - confirm in
# calculate_unimodal_similarity.
videoclips_similarity_kwargs = dict(
    dataset_df=dataset_files_dict['id_vgg19_mmsr.tsv'],
    calculation_module=unimodal_module,
    batch_size=unimodal_config_dict['batch_size'],
    include_reverse_pairs=False,
    include_self_pairs=True,
)

# One similarity computation per normalization variant.
for norm_name, norm_module_type in norm_names_types_list:
    print('=' * 100)
    print('Normalization name:', norm_name)

    videoclips_similarity_scores_df = calculate_unimodal_similarity(
        normalization_module_type=norm_module_type,
        **videoclips_similarity_kwargs,
    )
    sim_scores_df_list.append((norm_name, 'video', videoclips_similarity_scores_df))

## Merge and Save results

In [None]:
# Write every collected similarity table to
# <target_dir>/<normalization>/unimodal_<normalization>_<feature>_similarity_scores.parquet
target_dir = 'RetrievalAlgorithm/results/unimodal'
os.makedirs(target_dir, exist_ok=True)

for norm_name, feature_name, sim_scores_df in tqdm(sim_scores_df_list, desc='Saving similarity scores'):
    # One sub-folder per normalization variant.
    norm_dir = os.path.join(target_dir, norm_name)
    os.makedirs(norm_dir, exist_ok=True)

    file_name = f'unimodal_{norm_name}_{feature_name}_similarity_scores.parquet'
    output_path = os.path.join(norm_dir, file_name)
    sim_scores_df.to_parquet(output_path, index=False)