In [None]:
# Load IPython's autoreload extension so edited project modules can be re-imported
# without restarting the kernel
%load_ext autoreload

# Mode 2: automatically reload modules before executing code in each cell
%autoreload 2

# Table Retrieval Ranking

In [None]:
from pathlib import Path
from benchmark_src.results_processing.plots.plot_utils import get_task_df, get_list_of_all_runs
from benchmark_src.results_processing import ranking
from benchmark_src.results_processing.plots import table_retrieval_plots


In [None]:
# Root folder with the benchmark results; the relative path is interpreted against
# the current working directory of the kernel.
results_folder = Path("../../results")
# Require an actual directory (a plain file at this path would only fail later in
# mkdir) and report the resolved absolute path so a wrong working directory is
# easy to diagnose from the message.
assert results_folder.is_dir(), (
    f"Could not find results folder at {results_folder.resolve()}"
)

# Folder handed to the plotting helpers further down; created on first run.
plots_folder = results_folder / "plots" / "table_retrieval"
plots_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Fetch the frame holding the "table_retrieval" results found under results_folder.
task_df = get_task_df(
    task_name="table_retrieval",
    results_folder=results_folder,
)

In [None]:
# Number of distinct datasets per (task, Approach, Configuration) combination.
(
    task_df
    .groupby(['task', 'Approach', 'Configuration'])['dataset']
    .nunique()
    .reset_index(name='num_datasets')
)

In [None]:
# Preview only the first rows instead of rendering the whole frame as cell output;
# the shape line still reports how many rows and columns were loaded.
print(f"task_df shape: {task_df.shape}")
task_df.head()

## Configure which approaches to include

In [None]:
# Get the list of runs as output and copy it into the next cell, commenting out
# the approaches that should not be plotted
get_list_of_all_runs(task_df)

In [None]:
# Place to restrict task_df to the runs selected from the list printed above.
# NOTE(review): no filter is applied at the moment; filtered_task_df is simply
# another name for task_df (same object, not a copy), so every run is included.
# Which column(s) the run list should be matched against is not visible here; confirm
# against get_list_of_all_runs before adding a filter.
filtered_task_df = task_df


## Overview of the results: number of datasets with results per approach


In [None]:
# Overview: number of distinct datasets per (task, Approach, Configuration)
# combination in filtered_task_df.
(
    filtered_task_df
    .groupby(['task', 'Approach', 'Configuration'])['dataset']
    .nunique()
    .reset_index(name='num_datasets')
)


## Average rank and num best datasets


In [None]:
# Dominance / average-rank table based on the 'Recall@1_mean' column,
# ordered by ascending 'avg_rank'.
dataset_dominance_df = (
    ranking
    .compute_dominance_and_avg_rank(filtered_task_df, metric_col='Recall@1_mean')
    .sort_values('avg_rank')
)
# Last expression: render through the pandas Styler.
dataset_dominance_df.style


## ELO Scores


In [None]:
# Mapping of tasks to the metric(s) the Elo computation is based on; printed so
# the reader can see which scores feed into the ranking.
task_metrics_map = ranking.build_task_metrics_map(filtered_task_df)
print("Task metrics map (scores the elo metrics are based on):", task_metrics_map)

# Elo scores for the table-retrieval task, highest 'elo_score_task' first.
elo_df = (
    ranking
    .get_elo_scores_for_task("table_retrieval", filtered_task_df, task_metrics_map)
    .sort_values(by='elo_score_task', ascending=False)
)
elo_df.style


### ELO Scores: who beats whom?


In [None]:
# Pairwise comparison of the runs on the 'Recall@1_mean' metric column.
# NOTE(review): the orientation of the resulting table (row beats column or the
# other way round) is decided inside ranking.compute_pairwise_wins; confirm there.
pairwise_df = ranking.compute_pairwise_wins(filtered_task_df, metric_col='Recall@1_mean')
# Last expression of the cell: display via `.style` (presumably a pandas Styler).
pairwise_df.style


## Results averaged over the datasets


In [None]:
# Bar plot for the "results averaged over the datasets" section.
# plots_folder is passed as `results_folder`; presumably the figure is written
# there. TODO confirm in table_retrieval_plots.create_barplot.
table_retrieval_plots.create_barplot(df=filtered_task_df, results_folder=plots_folder)


## Bar plots per dataset


In [None]:
# Bar plots for the per-dataset section.
# plots_folder is passed as `results_folder`; presumably the figures are written
# there. TODO confirm in table_retrieval_plots.create_barplot_datasets.
table_retrieval_plots.create_barplot_datasets(df=filtered_task_df, results_folder=plots_folder)


## Recall Growth Curve


In [None]:
# Recall growth curve for the selected runs.
# NOTE(review): which Recall@k columns are used is decided inside
# create_recall_growth_curve; plots_folder is passed as `results_folder`,
# presumably as the output location. Confirm both in table_retrieval_plots.
table_retrieval_plots.create_recall_growth_curve(df=filtered_task_df, results_folder=plots_folder)


## Model Leaderboard (MRR@10 by Dataset)


In [None]:
# Leaderboard figure for the selected runs (the section heading says MRR@10 by dataset).
# NOTE(review): the ranking cells above use 'Recall@1_mean', so the metric here
# differs if the heading is accurate; verify in create_model_leaderboard.
# plots_folder is passed as `results_folder`, presumably as the output location.
table_retrieval_plots.create_model_leaderboard(df=filtered_task_df, results_folder=plots_folder)
