## Ablation Study for PaperSeek

This notebook presents an ablation study for the PaperSeek pipeline, examining how different Core Publications affect the retrieval performance across various scientific topics.

1. Exhaustive test: every core publication of every topic is used, one at a time, as the supporting text of a query.

In [None]:
from utils import DataReader, Query
from textwrap import shorten
import plotly.express as px
from tqdm.auto import tqdm
from glob import glob
import pandas as pd
import numpy as np

# Evaluation set of core publications; later cells read its "topic", "id",
# "title" and "abstract" columns.
core_df = pd.read_excel("data/eval_cps.xlsx")
# Distinct topic names in sorted order; `run` iterates over this list.
topics = sorted(core_df["topic"].unique())
# Retrieval client; `run` sends query batches through `reader.fetch(...)`.
reader = DataReader()

In [98]:
def save_hits(cps_id, topic, hits):
    """Append per-query retrieval results for one topic to its Excel file.

    For every hit, counts how many of the topic's core publications (rows of
    ``core_df`` with a non-null title) have their id in ``hit.ids`` and
    stores that count next to the id of the core publication that produced
    the query. Rows are appended to ``ablation/cp_effect/{topic}.xlsx``; the
    file is created when it does not exist yet.

    Args:
        cps_id: Ids of the core publications used as queries, paired
            position by position with ``hits``.
        topic: Topic name; selects the rows of ``core_df`` to compare
            against and names the output file.
        hits: Retrieval results, one per query; each must expose ``ids``.
    """
    import os  # local import: the notebook's import cell does not provide it

    # Loop-invariant, so filter once instead of once per hit.
    topic_df = core_df[core_df["topic"] == topic]

    results = {
        "topic": [],
        "core_id": [],
        "n_cores": [],
    }
    for core_id, hit in zip(cps_id, hits):
        # Build the mask from topic_df itself. A mask taken from the full
        # core_df carries a different index and only works through pandas'
        # implicit reindexing of boolean keys.
        retrieved = topic_df[topic_df["id"].isin(hit.ids)]

        results["topic"].append(topic)
        results["core_id"].append(core_id)
        results["n_cores"].append(int(retrieved["title"].count()))

    out_dir = "ablation/cp_effect"
    path = f"{out_dir}/{topic}.xlsx"
    # Make sure the folder exists so freshly fetched hits are not lost on the
    # first write.
    os.makedirs(out_dir, exist_ok=True)

    try:
        df = pd.concat([pd.read_excel(path), pd.DataFrame(results)])
    except FileNotFoundError:
        # First chunk for this topic: nothing to append to.
        df = pd.DataFrame(results)
    df.to_excel(path, index=False, engine_kwargs={'options': {'strings_to_urls': False}})

def is_done(topic):
    """Return the core-publication ids already saved for ``topic``.

    Reads ``ablation/cp_effect/{topic}.xlsx`` and returns the distinct values
    of its ``core_id`` column as a list. When the file does not exist, an
    empty list is returned instead.
    """
    results_path = f"ablation/cp_effect/{topic}.xlsx"
    try:
        saved = pd.read_excel(results_path)
    except FileNotFoundError:
        # Nothing has been written for this topic so far.
        return []
    else:
        return saved["core_id"].unique().tolist()


def run(chunk_size=500):
    """Query the retriever once per core publication and persist the hits.

    For each topic, builds one query per core publication that has no saved
    result yet (its title and abstract are passed as the supporting text),
    sends the queries to ``reader.fetch`` in chunks and stores every chunk
    with ``save_hits``. Publications reported by ``is_done`` are skipped, so
    an interrupted run can be continued by calling ``run`` again.

    Args:
        chunk_size: Number of queries sent per ``reader.fetch`` call. The
            default of 500 depends on the GPU memory.
    """
    pbar = tqdm(topics)

    for topic in pbar:
        # A set gives constant-time membership checks in the loop below.
        finished_cps = set(is_done(topic))
        pbar.set_description(topic)
        topic_rows = core_df[core_df["topic"] == topic]
        queries = []
        cps_id = []
        for cp_id, title, abstract in topic_rows[["id", "title", "abstract"]].values:
            if cp_id in finished_cps:
                continue
            text = f"Title: {title}[SEP]\nAbstract: {abstract}"
            queries.append(Query(topic).format(rq_count=1, supporting_texts=text))
            cps_id.append(cp_id)

        # Save after every chunk so finished work survives an interruption.
        for start in range(0, len(queries), chunk_size):
            stop = start + chunk_size
            hits = reader.fetch(queries[start:stop], prompt_name="s2p_query")
            save_hits(cps_id[start:stop], topic, hits)

        # No manual pbar.update() here: iterating over the tqdm object already
        # advances the bar once per topic, so an extra update would count
        # every topic twice.

def show_results(files, plot=False):
    """Load saved ablation results and attach a per-row recall.

    Args:
        files: Paths of result workbooks with ``topic``, ``n_cores`` and
            ``core_id`` columns. The topic of a file's first row is used for
            the whole file.
        plot: When true, also display the number of core publications per
            topic and a box plot of recall per topic.

    Returns:
        All files concatenated into one DataFrame, with an added ``recall``
        column (``n_cores`` divided by the number of core publications of the
        file's topic, rounded to 2 decimals) and ``topic`` shortened to at
        most 30 characters.

    Raises:
        ValueError: If ``files`` is empty.
    """
    files = list(files)
    if not files:
        # pd.concat would raise a ValueError as well, just a less helpful one.
        raise ValueError("show_results: no result files were given")

    def short(name):
        return shorten(name, width=30, placeholder="...")

    # Non-null ids per topic, i.e. the number of core publications per topic.
    cps_per_topic = core_df.groupby("topic")["id"].count().to_dict()

    dfs = []
    for file in files:
        df = pd.read_excel(file)
        topic = df["topic"].iloc[0]  # positional, independent of index labels
        df["recall"] = (df["n_cores"] / cps_per_topic[topic]).round(2)
        dfs.append(df)
    df = pd.concat(dfs)
    # NOTE(review): topics whose shortened labels coincide would be merged by
    # later group-bys on "topic" -- confirm labels stay unique.
    df["topic"] = df["topic"].apply(short)

    if plot:
        # Labels and counts both come from cps_per_topic, so they cannot get
        # out of step with each other.
        overview = pd.DataFrame(
            [list(cps_per_topic.values())],
            index=["n_cps"],
            columns=[short(name) for name in cps_per_topic],
        )
        with pd.option_context("display.max_columns", None):
            display(overview)
        px.box(
            df, x="topic", y="recall", points="all", height=600, hover_data=["core_id"]
        ).show()
    return df

def _mean_skipping_nan(values):
    """Mean of ``values`` with NaN entries left out; NaN when none remain."""
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    return arr.mean() if arr.size else np.float64(np.nan)


def compute_quartile_averages(df: pd.DataFrame):
    """Average recall of the top 25%, middle 50% and bottom 25% per topic.

    Within each topic the rows are ranked by ``recall`` (highest first) and
    split at ``int(n * 0.25)`` and ``int(n * 0.75)``. The mean recall of each
    band is then averaged over topics.

    A topic with fewer than four rows has an empty top band (and, with a
    single row, an empty middle band). Such a topic is left out of that
    band's average instead of turning the whole average into NaN.

    Args:
        df: Frame with ``topic`` and ``recall`` columns.

    Returns:
        A one-row DataFrame with columns ``avg_top_25``, ``avg_mid_50`` and
        ``avg_bot_25``.
    """
    per_topic = {
        "avg_top_25": [],
        "avg_mid_50": [],
        "avg_bot_25": [],
    }
    for _, topic_df in df.groupby("topic"):
        recalls = topic_df["recall"].sort_values(ascending=False)
        n = len(recalls)
        top_end = int(n * 0.25)
        mid_end = int(n * 0.75)
        # The mean of an empty slice is NaN; it is filtered out below.
        per_topic["avg_top_25"].append(recalls.iloc[:top_end].mean())
        per_topic["avg_mid_50"].append(recalls.iloc[top_end:mid_end].mean())
        per_topic["avg_bot_25"].append(recalls.iloc[mid_end:].mean())

    # NOTE(review): the middle band is rounded to 4 decimals while the other
    # two use 3. Kept as-is so previously recorded outputs stay comparable.
    return pd.DataFrame({
        'avg_top_25': [_mean_skipping_nan(per_topic["avg_top_25"]).round(3)],
        'avg_mid_50': [_mean_skipping_nan(per_topic["avg_mid_50"]).round(4)],
        'avg_bot_25': [_mean_skipping_nan(per_topic["avg_bot_25"]).round(3)]
    })

In [99]:
# Load every saved s2p result workbook, add recall, and show the quartile
# averages as this cell's output.
# NOTE(review): save_hits writes to "ablation/cp_effect/", not to this folder;
# presumably the files were moved or renamed after the run -- confirm.
files_s2p = glob("ablation/cp_effect_s2p/*.xlsx")
df_s2p = show_results(files_s2p)
compute_quartile_averages(df_s2p)

Unnamed: 0,avg_top_25,avg_mid_50,avg_bot_25
0,0.871,0.8296,0.742


In [96]:
# Load every saved s2s result workbook, add recall, and show the quartile
# averages as this cell's output.
# NOTE(review): save_hits writes to "ablation/cp_effect/", not to this folder;
# presumably the files were moved or renamed after the run -- confirm.
files_s2s = glob("ablation/cp_effect_s2s/*.xlsx")
df_s2s = show_results(files_s2s)
compute_quartile_averages(df_s2s)

Unnamed: 0,avg_top_25,avg_mid_50,avg_bot_25
0,0.856,0.803,0.71


In [None]:
# Collapse each run to one row per topic: best, worst and average recall.
# NOTE: df_s2p / df_s2s are rebound to the aggregated frames, which no longer
# have a "recall" column, so re-running this cell requires re-running the two
# loading cells first.
stat_names = ["max", "min", "mean"]
df_s2p = df_s2p.groupby("topic")["recall"].agg(stat_names)
df_s2s = df_s2s.groupby("topic")["recall"].agg(stat_names)

# Per-topic difference; a positive entry means s2p scored higher than s2s.
results = df_s2p.sub(df_s2s)
mean_delta = results["mean"]
s2p_better_count = mean_delta.gt(0).sum()
s2s_better_count = mean_delta.lt(0).sum()
same = mean_delta.eq(0).sum()

In [None]:
# One line per statistic, joined so the printed text is a single block.
summary_lines = [
    f"S2P better: {s2p_better_count} topics",
    f"S2S better: {s2s_better_count} topics",
    f"Same: {same} topics",
    f"Average S2P recall: {df_s2p['mean'].mean():.3f}",
    f"Average S2S recall: {df_s2s['mean'].mean():.3f}",
]
print("\n".join(summary_lines))


In [None]:
def _sign_background(column):
    """CSS per cell: red for negative, green for positive, none otherwise."""
    css = []
    for value in column:
        if value < 0:
            css.append("background-color: red")
        elif value > 0:
            css.append("background-color: green")
        else:
            css.append("")
    return css


print("Positive values means S2P is better")
# TODO(review): an earlier note here mentioned appending a last row with the
# mean of each column; this cell does not add such a row.
# Colour the s2p-minus-s2s differences by sign and show them with 2 decimals.
results.style.apply(
    _sign_background,
    subset=["max", "min", "mean"],
).format('{:.2f}', na_rep="0", decimal=".", thousands=",")