## Ablation Study for PaperSeek

This notebook presents an ablation study for the PaperSeek pipeline, examining how adding additional synthetic CPs affects the retrieval performance across various scientific topics.

1. The effect of rerankers on retrieval performance using a different number of HyDE.

In [None]:
from notebooks.ablations.utils import TOPICS_UNDER_10K
from collections import defaultdict
from tqdm.auto import tqdm
from textwrap import shorten
import plotly.express as px
from glob import glob
import polars as pl
import pandas as pd
import numpy as np
# Evaluation table shared by the recall functions in this notebook; they
# filter it by its "topic" column and read its "id" column.
eval_df = pd.read_excel("data/eval_cps.xlsx")

In [None]:
def parse_results(files: list[str], under_10k: bool, top_n: int) -> pd.DataFrame:
    """Compute per-topic recall for every result file.

    Args:
        files: Paths of parquet result files. Each file must provide ``topic``
            and ``id`` columns; rows are consumed in file order, so the files
            are presumably already sorted by rank (confirm with the producer).
        under_10k: When True, evaluate the topics listed in
            ``TOPICS_UNDER_10K`` and use the per-topic cut-off stored there.
            When False, evaluate the topics found in the first file.
        top_n: Number of leading rows per topic that count as retrieved when
            ``under_10k`` is False.

    Returns:
        A DataFrame with a ``Topic`` column plus one recall column per file,
        rounded to three decimals. A column is named ``HyDE=<n>`` when the
        path contains ``_HYDE<n>``; otherwise it is the text between the last
        ``-`` and the following ``.`` of the path. Topics without any entry in
        ``eval_df`` get NaN, and a topic absent from a file gets recall 0.
    """
    results = defaultdict(list)
    topics = None
    for file in files:
        df = pd.read_parquet(file)

        # Fix the topic order once and reuse it for every file: the "Topic"
        # column is shared, so each file's recalls must follow the same order.
        if topics is None:
            topics = list(TOPICS_UNDER_10K) if under_10k else df["topic"].unique().tolist()
            results["Topic"] = topics

        if "_HYDE" in file:
            suffix = file.split("_HYDE")[1]
            # Keep all leading digits so e.g. HYDE1 and HYDE10 do not collide;
            # fall back to the first character when no digit follows.
            digits = suffix[: len(suffix) - len(suffix.lstrip("0123456789"))]
            file_name = f"HyDE={digits or suffix[:1]}"
        else:
            file_name = file.split("-")[-1].split(".")[0]

        for topic in topics:
            # Use a local so the caller-supplied top_n is never overwritten.
            limit = TOPICS_UNDER_10K[topic] if under_10k else top_n
            ids = set(df[df["topic"] == topic].head(limit)["id"].tolist())
            # De-duplicate the ground truth so the denominator matches the
            # set-based numerator (same convention as recall_at_k).
            cps = set(eval_df[eval_df["topic"] == topic]["id"].tolist())
            recall = len(ids & cps) / len(cps) if cps else float("nan")
            results[file_name].append(recall)

    return pd.DataFrame(results).round(3)


def show_results(
    files: list[str], under_10k: bool = False, top_n: int = 10_000
) -> None:
    """Render a recall heatmap for ``files`` and print each column's mean recall.

    Args:
        files: Parquet result files forwarded to ``parse_results``.
        under_10k: Forwarded to ``parse_results``; when True the hover text
            also shows the per-topic sample size.
        top_n: Forwarded to ``parse_results``.

    The heatmap has one row per result file and one column per topic; topic
    labels are shortened to 25 characters for display only.
    """
    df = parse_results(files, under_10k, top_n)

    # Keep the untruncated names: they are the keys of TOPICS_UNDER_10K.
    full_topics = df["Topic"].tolist()
    df["Topic"] = [shorten(t, width=25, placeholder="...") for t in full_topics]

    heatmap = df.set_index("Topic").T
    fig = px.imshow(
        heatmap,
        color_continuous_scale="RdBu",
        aspect="auto",
        text_auto=".2f",
        labels=dict(x="Topic", y="Model"),
    )
    if under_10k:
        # One sample size per topic, looked up with the full topic name and
        # repeated for every model row so customdata matches the heatmap shape.
        per_topic = [
            TOPICS_UNDER_10K[t] if t in TOPICS_UNDER_10K else 10_000
            for t in full_topics
        ]
        sample_size = np.tile(np.asarray(per_topic), (heatmap.shape[0], 1))
        fig.update_traces(customdata=sample_size)
        fig.update_traces(
            hovertemplate="Sample Size: %{customdata}<br>Recall: %{z:.3f}"
        )

    # Label every mean with its own column name instead of assuming exactly
    # three HyDE columns in a fixed order.
    mean_recall = df.mean(axis=0, numeric_only=True)
    for model, value in mean_recall.items():
        print(f"{model}: {value:.3f}")
    fig.show()


#### Using the worst CP

In [None]:
under_10k = False
top_n = 10_000
# glob() yields paths in arbitrary order; sort them so the model order in the
# heatmap and in the printed summary is the same on every run.
worst_files = sorted(glob("ablation/hyde/worst/All_*.parquet"))
show_results(worst_files, under_10k, top_n)

#### Using an average CP

In [None]:
# glob() yields paths in arbitrary order; sort them so the model order in the
# heatmap and in the printed summary is the same on every run.
average_files = sorted(glob("ablation/hyde/average/All_*.parquet"))
show_results(average_files, under_10k)

#### Using the best CP

In [None]:
# glob() yields paths in arbitrary order; sort them so the model order in the
# heatmap and in the printed summary is the same on every run.
best_files = sorted(glob("ablation/hyde/best/All_*.parquet"))
show_results(best_files, under_10k)

#### Using a random CP

In [None]:
# glob() yields paths in arbitrary order; sort them so the model order in the
# heatmap and in the printed summary is the same on every run.
random_files = sorted(glob("ablation/hyde/random/*.parquet"))
show_results(random_files, under_10k)

Overview of the final evaluation using 5 RQs, HyDE 1 and a random CP.

In [None]:
# glob() yields paths in arbitrary order; sort them so the model order in the
# heatmap and in the printed summary is the same on every run.
random_files_no_hyde = sorted(glob("ablation/final/*.parquet"))
show_results(random_files_no_hyde, under_10k) # Ignore the output of HyDE

In [None]:
from plotly.io import read_json
from utils import save_plot
# Axis styling handed to save_plot together with the figure loaded below.
layout = {
    # "width": 1300,
    # "height": 700,
    "xaxis_tickangle": -45,
    "xaxis_tickfont_size": 20,
    "yaxis_tickfont_size": 20,
    "yaxis_title": "Trial Nr.",
    "yaxis_title_font_size": 30,
    "xaxis_title_font_size": 30,
}
# Load the serialized figure from disk, then pass it on under its output name.
fig = read_json("test.json")
save_plot(fig, "post_optimization_10_trials", layout)

In [None]:
def recall_at_k():
    """Average recall as a function of the number of retrieved rows.

    For each CP type (worst, average, best, random) the function reads every
    associated parquet file, computes recall per topic at each cut-off in
    0, 100, ..., 10_000, averages over topics, then averages over files.

    Returns:
        A dict of three equally long lists, keyed ``"CP Type"``, ``"Recall"``
        and ``"Top N"``, with one entry per (CP type, cut-off) pair. It can be
        passed directly to ``pd.DataFrame``.

    Relies on the notebook globals ``worst_files``, ``average_files``,
    ``best_files``, ``random_files_no_hyde`` and ``eval_df``.
    """
    # Single source of truth for the cut-offs, so the "Top N" labels are
    # exactly the head sizes used to compute the recall values.
    cutoffs = list(range(0, 10_001, 100))

    results = {
        "CP Type": [],
        "Recall": [],
        "Top N": [],
    }
    options = [
        ("Worst CP", worst_files),
        ("Average CP", average_files),
        ("Best CP", best_files),
        ("Random CP", random_files_no_hyde),
    ]
    for name, files in tqdm(options):
        files_recall = []
        for file in tqdm(files, leave=False):
            df = pl.read_parquet(file)
            topics = df.select("topic").unique().to_numpy().flatten()

            topics_recall = []
            for topic in topics:
                cps = set(eval_df[eval_df["topic"] == topic]["id"].tolist())
                if not cps:
                    # Recall is undefined without ground truth; skip the topic
                    # instead of dividing by zero.
                    continue
                # Extract the ranked ids once per topic rather than once per
                # cut-off; each prefix is then a cheap list slice.
                ranked_ids = (
                    df.filter(pl.col("topic") == topic)
                    .select("id")
                    .to_numpy()
                    .flatten()
                    .tolist()
                )
                topics_recall.append(
                    [len(cps.intersection(ranked_ids[:k])) / len(cps) for k in cutoffs]
                )

            if topics_recall:
                files_recall.append(np.mean(topics_recall, axis=0))

        if files_recall:
            mean_recall = np.mean(files_recall, axis=0)
        else:
            # No usable file for this CP type: keep the columns aligned.
            mean_recall = np.full(len(cutoffs), np.nan)

        results["CP Type"].extend([name] * len(cutoffs))
        results["Top N"].extend(cutoffs)
        results["Recall"].extend(mean_recall)

    return results
# from utils import save_data
# results = recall_at_k()
# save_data(pd.DataFrame(results), "recall_change_over_topn")