In [12]:
from utils import save_data, read_data
from glob import glob
import pandas as pd
import numpy as np

In [13]:
# Shared inputs for the cells below.
# NOTE: recursive=False is glob's default, so "**" here matches exactly like
# "*" (direct children of ablation/final only, no descent into sub-folders).
paperseek_files = glob("ablation/final/**", recursive=False)

# Reference rows: the functions below filter this frame by "topic" and use its
# per-topic row count as the denominator of each ratio.
eval_df = pd.read_excel("data/eval_cps.xlsx")

# One row per SLR query result; filtered by "topic" / "is_core" further down.
query_df = pd.read_excel("data/slr_query_results.xlsx")

# Distinct topics that appear in the SLR query results.
query_topics = set(query_df["topic"].to_list())

In [14]:
def slr_query_results_count(topic: str, top_n=10_000) -> int:
    """Return the number of SLR query results for ``topic``, capped at ``top_n``.

    Topics that do not occur in ``query_df["topic"]`` have no query, so the
    cap ``top_n`` itself is returned for them.
    """
    topics_with_query = query_df["topic"].unique().tolist()
    if topic not in topics_with_query:
        return top_n

    n_rows = len(query_df[query_df["topic"] == topic])
    return min(n_rows, top_n)

def get_slr_query_results(
    df: pd.DataFrame,
    topics: list[str],
    ground_truth: "pd.DataFrame | None" = None,
) -> dict[str, float]:
    """Compute, per topic, the share of reference papers flagged as core in ``df``.

    Args:
        df: Query results with at least the columns ``topic``, ``id`` and a
            boolean ``is_core``.
        topics: Topics to evaluate (any iterable of topic labels works).
        ground_truth: Reference frame with ``topic`` and ``id`` columns.
            Defaults to the module-level ``eval_df``.

    Returns:
        Mapping ``topic -> matched_reference_rows / reference_rows``. A topic
        without any reference rows maps to ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    truth = eval_df if ground_truth is None else ground_truth
    results = {}
    for topic in topics:
        core_ids = df.loc[(df["topic"] == topic) & df["is_core"], "id"].to_list()
        # Build the mask on the topic's own rows so it never relies on index
        # alignment with the full reference frame.
        truth_ids = truth.loc[truth["topic"] == topic, "id"]
        total = len(truth_ids)
        if total == 0:
            # Ratio is undefined without reference rows for this topic.
            results[topic] = float("nan")
            continue
        # Count matched rows directly; counting a non-null "title" would drop
        # matches with a missing title while the denominator still includes them.
        found = int(truth_ids.isin(core_ids).sum())
        results[topic] = found / total
    return results

def get_paperseek_results(
    dfs: list[pd.DataFrame],
    limit_k: bool = False,
    ground_truth: "pd.DataFrame | None" = None,
) -> dict[str, float]:
    """Compute the per-topic ratio of reference papers retrieved, averaged over runs.

    Args:
        dfs: One frame per run, each with ``topic``, ``id`` and ``score`` columns.
        limit_k: When true, only the highest-scored ids of a topic are kept,
            up to ``slr_query_results_count(topic)``.
        ground_truth: Reference frame with ``topic`` and ``id`` columns.
            Defaults to the module-level ``eval_df``.

    Returns:
        Mapping ``topic -> mean ratio over all frames containing the topic``.
        A topic without any reference rows yields ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    truth = eval_df if ground_truth is None else ground_truth
    per_topic = {}
    for df in dfs:
        for topic in df["topic"].unique():
            ranked_ids = (
                df[df["topic"] == topic]
                .sort_values("score", ascending=False)["id"]
                .tolist()
            )
            if limit_k:
                ranked_ids = ranked_ids[: slr_query_results_count(topic)]

            # Mask is built on the topic's own rows, so no index alignment
            # with the full reference frame is involved.
            truth_ids = truth.loc[truth["topic"] == topic, "id"]
            total = len(truth_ids)
            if total:
                # Count matched rows directly rather than non-null titles, so
                # numerator and denominator cover the same rows.
                ratio = int(truth_ids.isin(ranked_ids).sum()) / total
            else:
                ratio = float("nan")  # undefined without reference rows
            per_topic.setdefault(topic, []).append(ratio)
    return {topic: np.mean(ratios) for topic, ratios in per_topic.items()}

In [None]:
def store_results(limit_k: bool, files: list[str], filename: str) -> None:
    """Build the per-topic comparison table and hand it to ``save_data``.

    The table has one row per topic (outer join of both result sets) with the
    columns ``paperseek_results``, ``slr_query_results`` and ``Sample Size``
    (the value of ``slr_query_results_count`` for the topic).

    Args:
        limit_k: Forwarded to ``get_paperseek_results``.
        files: Parquet files to read, one frame per file.
        filename: Forwarded unchanged to ``save_data``.
    """
    slr_scores = get_slr_query_results(query_df, query_topics)
    run_frames = [pd.read_parquet(path) for path in files]
    paperseek_scores = get_paperseek_results(run_frames, limit_k)

    paperseek_column = pd.DataFrame.from_dict(
        paperseek_scores, orient="index", columns=["paperseek_results"]
    )
    slr_column = pd.DataFrame.from_dict(
        slr_scores, orient="index", columns=["slr_query_results"]
    )

    # Outer join keeps topics that appear in only one of the two result sets.
    combined = pd.merge(
        paperseek_column,
        slr_column,
        left_index=True,
        right_index=True,
        how="outer",
    )
    combined["Sample Size"] = combined.index.map(slr_query_results_count)

    save_data(combined.reset_index(names="Topic"), filename)


def store_optimzed_results(limit_k=True) -> None:
    """Store the results for every file listed in ``paperseek_files``.

    With ``limit_k`` set, the recall is calculated on the top k results only,
    k being the result count of the respective SLR query; the output name
    carries the matching ``_by_k`` / ``_10k`` suffix.
    """
    if limit_k:
        target_name = "post_input_optimization_results_by_k"
    else:
        target_name = "post_input_optimization_results_10k"
    store_results(limit_k=limit_k, files=paperseek_files, filename=target_name)

def store_synthetic_cp_results(limit_k=False) -> None:
    """Store the results for the ``All_HYDE0_SYNTHETIC_CORE`` run.

    Args:
        limit_k: Forwarded to ``store_results``; also selects the ``_by_k``
            or ``_10k`` output name.
    """
    store_results(
        limit_k=limit_k,
        # Forward slash resolves on both Windows and POSIX (and matches the
        # separator used for the glob pattern); a backslash only works on Windows.
        files=["ablation/All_HYDE0_SYNTHETIC_CORE.parquet"],
        filename="synthetic_core_results_by_k" if limit_k else "synthetic_core_results_10k",
    )
    
def store_baseline_results(limit_k=False) -> None:
    """Store the results for the ``All_HYDE0_notext`` baseline run.

    By default (``limit_k=False``) the result list is not truncated and the
    output is named ``baseline_results_10k``; with ``limit_k=True`` the recall
    is calculated on the top k results of the respective SLR query instead.

    Args:
        limit_k: Forwarded to ``store_results``; also selects the output name.
    """
    store_results(
        limit_k=limit_k,
        # Forward slash resolves on both Windows and POSIX (and matches the
        # separator used for the glob pattern); a backslash only works on Windows.
        files=["ablation/All_HYDE0_notext.parquet"],
        filename="baseline_results_by_k" if limit_k else "baseline_results_10k",
    )

# Alternative export, currently disabled:
# store_optimzed_results()

# Export the synthetic-core run, truncated to the SLR query result count.
store_synthetic_cp_results(limit_k=True)
