## Ablation Study for PaperSeek

This notebook presents an ablation study for the PaperSeek pipeline, examining how different input query structures affect the retrieval performance across various scientific topics.

1. The impact of different research question counts.
2. The impact of adding sub-research questions to the query.


### Preprocessing

In [None]:
from notebooks.ablations.utils import select_core_pub
from utils import DataReader, Query
from itertools import product
from typing import Literal
import pandas as pd
# Evaluation sheet (presumably the core publications per topic — confirm);
# the code below reads its "topic", "id" and "title" columns.
core_df = pd.read_excel("data/eval_cps.xlsx")
# Stored results of the "random" ablation. run() writes this file, so it has
# to exist already (from an earlier run) for this cell to load on a fresh kernel.
df_random = pd.read_excel("ablation/query_combinations/random_cp.xlsx")

In [106]:
def run_ablation(topics, core_type: Literal["worst", "average", "best", "random"] = "average"):
    """Measure core-publication recall for every query configuration on every topic.

    For each topic, one query is built per combination of ``rq_count``
    (1, 3, 5, 100), ``has_text`` and ``include_sub_rq`` (16 combinations).
    All queries are sent in a single ``DataReader.fetch`` call and every hit is
    scored as the share of the topic's rows in the module-level ``core_df``
    whose ``id`` appears in ``hit.ids``.

    Args:
        topics: Topics to evaluate. They are passed to ``select_core_pub``;
            the topic list it returns is the one actually used.
        core_type: Selection strategy forwarded to ``select_core_pub`` as
            ``type``.

    Returns:
        dict: ``(rq_count, has_text, include_sub_rq)`` -> ``{topic: recall}``.
    """
    rq_counts = [1, 3, 5, 100]
    text_options = [True, False]
    sub_rq_options = [True, False]

    combinations = list(product(rq_counts, text_options, sub_rq_options))
    n_combs = len(combinations)

    # The selected core publications themselves are not needed here, only the
    # supporting texts and the (possibly re-ordered) topics.
    _core_pubs, texts, topics = select_core_pub(topics, type=core_type)
    reader = DataReader(create_index=False)
    results = {combination: {} for combination in combinations}

    # Topic-major order: all combinations of the first topic, then the second, ...
    queries = [
        Query(topic).format(
            rq_count=rq_count,
            include_sub_rq=with_sub_rq,
            supporting_texts=text if with_text else None,
        )
        for text, topic in zip(texts, topics)
        for rq_count, with_text, with_sub_rq in combinations
    ]

    # NOTE(review): assumes fetch() yields exactly one hit per query, in query
    # order — the position-based mapping below depends on it. Confirm.
    hits = reader.fetch(queries)
    for position, hit in enumerate(hits):
        # A new topic starts every ``n_combs`` hits.
        topic_idx, comb_idx = divmod(position, n_combs)
        topic = topics[topic_idx]
        topic_df = core_df[core_df["topic"] == topic]
        # Build the mask from topic_df itself; a mask taken from core_df has a
        # different index and only works through implicit reindexing (pandas
        # emits a UserWarning for it).
        found = topic_df["id"].isin(hit.ids)
        n_cores = topic_df.loc[found, "title"].count()  # rows with a non-null title
        actual = topic_df.shape[0]
        # A topic without rows in core_df has no defined recall.
        results[combinations[comb_idx]][topic] = n_cores / actual if actual else float("nan")
    return results


def run():
    """Run the ablation for every core-publication strategy and export each result.

    For each strategy ("worst", "average", "best", "random", in that order)
    the result of ``run_ablation`` is turned into a table with one row per
    ``(rq_count, has_text, include_sub_rq)`` combination and one column per
    topic, then written to ``ablation/query_combinations/<stem>_cp.xlsx``.
    """
    topics = core_df["topic"].unique()

    # core_type -> file stem; note that "average" is stored as "avg_cp.xlsx".
    file_stems = {
        "worst": "worst",
        "average": "avg",
        "best": "best",
        "random": "random",
    }
    # reset_index() names the three unnamed index levels level_0..level_2.
    level_names = {
        "level_0": "rq_count",
        "level_1": "has_text",
        "level_2": "include_sub_rq",
    }

    for core_type, stem in file_stems.items():
        results = run_ablation(topics, core_type)
        table = pd.DataFrame(results).T.reset_index().rename(columns=level_names)
        table.to_excel(f"ablation/query_combinations/{stem}_cp.xlsx", index=False)

In [None]:
def _mean_recall_by(df: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """Return a one-row table of mean recall per value of ``group_col``.

    Only rows with ``has_text == False`` are used. The other configuration
    columns are dropped, every remaining (topic) column is averaged within
    each group, and those per-topic means are averaged again across topics.
    The resulting table has the group values as column labels.
    """
    config_cols = ("rq_count", "has_text", "include_sub_rq")
    # Keep the frame's own column order so the result is deterministic
    # (a set-based selection would shuffle the columns between runs).
    kept = [col for col in df.columns if col == group_col or col not in config_cols]
    without_text = df.loc[~df["has_text"], kept]
    per_group = without_text.groupby(group_col).mean().mean(axis=1)
    table = per_group.to_frame().reset_index().T
    # First row holds the group values: promote it to the header and drop it.
    return table.rename(columns=table.iloc[0]).iloc[1:]


def subrqs_vs_without(df: pd.DataFrame):
    """Display the mean recall of text-free queries with vs. without sub-RQs.

    Args:
        df: Ablation results with ``rq_count``, ``has_text`` and
            ``include_sub_rq`` columns plus one recall column per topic.
    """
    print("The average recall of the queries with and without sub-RQs")
    display(_mean_recall_by(df, "include_sub_rq"))


def one_or_many_RQs(df: pd.DataFrame):
    """Display the mean recall of text-free queries per research-question count.

    Args:
        df: Ablation results with ``rq_count``, ``has_text`` and
            ``include_sub_rq`` columns plus one recall column per topic.
    """
    print("The average recall of the queries using 1, 3, 5 or 100 RQs")
    display(_mean_recall_by(df, "rq_count"))


In [109]:
subrqs_vs_without(df_random)

The average recall of the queries with and without sub-RQs


Unnamed: 0,False,True
0,0.754831,0.753443


In [110]:
one_or_many_RQs(df_random)

The average recall of the queries using 1, 3, 5 or 100 RQs


Unnamed: 0,1.0,3.0,5.0,100.0
0,0.750005,0.755402,0.756034,0.755108
