In [34]:
from notebooks.ablations.utils import select_core_pub
from utils import DataReader, Query, HyResearch
from tqdm.auto import tqdm
from glob import glob
import pandas as pd

# Load data: evaluation core publications and the SLR query results.
eval_df = pd.read_excel("data/eval_cps.xlsx")
query_df = pd.read_excel("data/slr_query_results.xlsx")
hy = HyResearch()

# Define topic sets
#   eval_topics  - set of topics in eval_df that do not appear in query_df
#   query_topics - set of every topic that appears in query_df
#   all_topics   - sorted *list* of every topic in eval_df
eval_topics = set(eval_df["topic"].tolist()).difference(set(query_df["topic"].tolist()))
query_topics = set(query_df["topic"].tolist())
all_topics = sorted(eval_df["topic"].unique())

# Collect previously stored result files. recursive=True is needed for "**" to
# span any number of directory levels; without it glob treats "**" like a plain
# "*" and only sees files exactly one directory below ablation/hyde.
files = glob("ablation/hyde/**/*.parquet", recursive=True)
# Normalise Windows separators so later path comparisons can use "/".
files = [path.replace("\\", "/") for path in files]

In [35]:
def run_name(topics, cp_type, hyde_n, synthetic_core=False):
    """Build the descriptive identifier of an ablation run.

    The identifier is ``<cp_type>/<TopicSet>_HYDE<hyde_n>`` with
    ``_SYNTHETIC_CORE`` appended when ``synthetic_core`` is truthy.
    ``<TopicSet>`` is "Evaluation", "Query" or "All" when ``topics`` compares
    equal to the module-level ``eval_topics``, ``query_topics`` or
    ``all_topics`` respectively, and "Custom" otherwise.

    The comparison uses ``==``, so it is type-sensitive: ``eval_topics`` and
    ``query_topics`` are sets while ``all_topics`` is a list, and a list never
    compares equal to a set even when the elements match.
    """
    if topics == eval_topics:
        topic_label = "Evaluation"
    elif topics == query_topics:
        topic_label = "Query"
    elif topics == all_topics:
        topic_label = "All"
    else:
        topic_label = "Custom"

    core_suffix = "_SYNTHETIC_CORE" if synthetic_core else ""

    # str.join keeps the original requirement that cp_type is a string.
    return "".join([cp_type, "/", topic_label, f"_HYDE{hyde_n}", core_suffix])

In [36]:
# Run configuration: edit these values to choose which ablation is executed.
TOP_N = 10_000  # default hit count for doc_to_doc and cap used by slr_query_results_count
SELECTED_TOPICS = all_topics  # topics to process in this run
HYDE_N = 2  # forwarded to doc_to_doc as hyde_n
SYNTHETIC_CORE = False  # forwarded to doc_to_doc as synthetic_core
CP_TYPE = "random"  # forwarded to doc_to_doc as cp_type

# Identifier of this run; later used to look for an already stored result file.
description = run_name(
    topics=SELECTED_TOPICS,
    cp_type=CP_TYPE,
    hyde_n=HYDE_N,
    synthetic_core=SYNTHETIC_CORE,
)
print("Running: " + description)

Running: random/All_HYDE2


In [37]:
def query_to_doc(topics):
    """Count, per topic, the evaluation publications matched by the SLR query.

    For every topic, the ids of ``query_df`` rows with that topic whose
    ``is_core`` flag is set are looked up among the ``eval_df`` rows of the
    same topic.

    Args:
        topics: iterable of topic names.

    Returns:
        dict mapping each topic to ``{"Query": int, "Actual": int}`` where
        "Query" is the number of matching ``eval_df`` rows (with a non-null
        ``title``) and "Actual" is the number of ``eval_df`` rows for the topic.
    """
    results = {}
    for topic in topics:
        is_core_for_topic = (query_df["topic"] == topic) & (query_df["is_core"])
        core_ids = query_df.loc[is_core_for_topic, "id"].to_list()

        topic_df = eval_df[eval_df["topic"] == topic]
        # Build the mask from topic_df itself instead of the full eval_df: the
        # result no longer relies on implicit index alignment between the two
        # frames, and isin only scans the rows of this topic.
        matched = topic_df.loc[topic_df["id"].isin(core_ids), "title"].count()

        results[topic] = {"Query": int(matched), "Actual": topic_df.shape[0]}
    return results

def doc_to_doc(topics, top_n=TOP_N, hyde_n=0, synthetic_core=False, cp_type="average"):
    """Retrieve the top hits per topic using a core publication as the query text.

    Args:
        topics: topic names; any iterable (list or set) is accepted.
        top_n: number of hits requested from the reader per query. The default
            is bound to the value of ``TOP_N`` when this cell is executed, so
            re-run this cell after changing ``TOP_N``.
        hyde_n: when greater than 0, ``hy.generate_n_docs(text, topic, hyde_n)``
            is called and its output is appended to the seed text as
            supporting texts.
        synthetic_core: when truthy the seed text of every topic comes from
            ``hy.generate_core_pub``; otherwise from ``select_core_pub``.
        cp_type: forwarded to ``select_core_pub`` as ``type``.

    Returns:
        pandas.DataFrame with columns ``topic``, ``id``, ``score``, ``query``
        and ``text``: one row per retrieved hit.
    """
    if not synthetic_core:
        # select_core_pub also returns the topics it actually used; those
        # replace the caller's topics from here on.
        _core_pubs, texts, topics = select_core_pub(topics, type=cp_type)
    else:
        # Materialise once: callers pass sets (eval_topics / query_topics) and
        # the positional lookup topics[i] below needs an indexable sequence in
        # the same order the texts were generated in.
        topics = list(topics)
        texts = []
        for topic in topics:
            rq = Query(topic=topic).format(rq_count=100)
            # NOTE(review): topic and "Software Engineering" are concatenated
            # without a separator - confirm whether a space is intended.
            texts.append(hy.generate_core_pub(rq, topic + "Software Engineering"))

    reader = DataReader(batch_size=1)
    results = {"topic": [], "id": [], "score": [], "query": [], "text": []}
    queries = []

    for text, topic in tqdm(
        zip(texts, topics), desc="Hyde", leave=False, total=len(topics)
    ):
        if hyde_n > 0:
            supporting_texts = [text]
            supporting_texts.extend(hy.generate_n_docs(text, topic, hyde_n))
        else:
            # NOTE(review): a bare string here versus a list above - presumably
            # Query.format accepts both; confirm.
            supporting_texts = text

        queries.append(
            Query(topic).format(rq_count=100, supporting_texts=supporting_texts)
        )

    hits = reader.fetch(queries, n_hits=top_n)
    # hits are consumed positionally: hits[i] is paired with queries[i] / topics[i].
    for i, hit in enumerate(hits):
        ids = hit.ids
        results["id"].extend(ids)
        results["topic"].extend([topics[i]] * len(ids))
        results["score"].extend(hit.scores)
        results["query"].extend([queries[i]] * len(ids))
        results["text"].extend(hit.texts)  # We require the texts for the rerankers
    return pd.DataFrame(results)

def slr_query_results_count(topic):
    """Return the sample size to use for ``topic``.

    When the topic occurs in ``query_df`` the result is the number of
    ``query_df`` rows for that topic, capped at ``TOP_N``; for any other topic
    it is ``TOP_N``.
    """
    topics_with_query = query_df["topic"].unique().tolist()
    if topic not in topics_with_query:
        return TOP_N

    n_query_rows = len(query_df[query_df["topic"] == topic])
    return min(n_query_rows, TOP_N)

In [38]:
query_results = query_to_doc(SELECTED_TOPICS)

# Reuse a stored retrieval run when one exists for exactly this configuration.
# The whole "<description>.parquet" path tail is compared instead of testing
# for a substring: with a substring test "random/All_HYDE2" would also pick up
# "random/All_HYDE20.parquet" or "random/All_HYDE2_SYNTHETIC_CORE.parquet".
# NOTE(review): assumes result files are stored as ".../<description>.parquet"
# - confirm against the code that writes them.
cache_tail = f"/{description}.parquet"
cached_runs = [path for path in files if path.endswith(cache_tail)]

if cached_runs:
    # Only one file is read; the last match wins, as before.
    results = pd.read_parquet(cached_runs[-1])
else:
    results = doc_to_doc(
        SELECTED_TOPICS,
        hyde_n=HYDE_N,
        synthetic_core=SYNTHETIC_CORE,
        cp_type=CP_TYPE,
    )

Hyde:   0%|          | 0/30 [00:00<?, ?it/s]

Scanning batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [None]:
# Per-topic comparison of what retrieval found against what the SLR query found.
topics_df = query_df[query_df["topic"].isin(SELECTED_TOPICS)]
items = {
    "Topic": [],
    "Found": [],
    "Query": [],
    "Actual": [],
    "Sample Size": []
}
for topic in SELECTED_TOPICS:
    # Keep only as many top-scored hits as the topic's query returned (capped at TOP_N).
    top_n = slr_query_results_count(topic)
    hit = results[results["topic"] == topic]
    hit = hit.sort_values("score", ascending=False).head(top_n)
    ids = hit["id"].values
    topic_df = eval_df[eval_df["topic"] == topic]
    # The mask is built from topic_df itself. Indexing topic_df with a mask
    # computed on the full eval_df makes pandas reindex the mask (and warn
    # about it) and scans every evaluation row for each topic.
    n_cores = int(topic_df.loc[topic_df["id"].isin(ids), "title"].count())
    actual = topic_df.shape[0]
    items["Topic"].append(topic)
    items["Found"].append(n_cores)
    items["Query"].append(query_results[topic]["Query"])
    items["Actual"].append(actual)
    items["Sample Size"].append(top_n)

In [None]:
# One row per topic, ordered by the sample size that was used.
results_df = (
    pd.DataFrame(items)
    .sort_values("Sample Size")
    .reset_index(drop=True)
)

print("Topics with a Query".center(110))
# Rows whose "Query" count is non-zero; per row the larger of Found / Query is highlighted.
with_query = results_df[results_df["Query"] != 0].reset_index(drop=True)
with_query.style.highlight_max(axis=1, subset=["Found", "Query"], color="green")

In [None]:
print("Topics without a Query".center(110))
# Rows whose "Query" count is zero; the Query and Sample Size columns are dropped
# and per row the larger of Found / Actual is highlighted.
without_query = (
    results_df[results_df["Query"] == 0]
    .drop(columns=["Query", "Sample Size"])
    .reset_index(drop=True)
)
without_query.style.highlight_max(axis=1, subset=["Found", "Actual"], color="green")