#### The goal of this notebook is to brute-force semantic precision with different thresholds to empirically find the best threshold for the given dataset.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from litQeval.eval_utils import *
import plotly.express as px
from tqdm.auto import tqdm
import plotly.io as pio
import pandas as pd
import numpy as np
import json
# Make "seaborn" the default Plotly template for every figure created below.
pio.templates.default = "seaborn"

# Ten-colour categorical palette (same hex values as Matplotlib's "tab10" cycle).
COLORS = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
]

# Shared figure-layout options written as Plotly "magic underscore" keys.
# Presumably meant for `fig.update_layout(**PLOT_CONFIGS)`; not used in the cells shown here.
# NOTE(review): "Modern Computer" may be intended as "Computer Modern" - confirm the font name.
PLOT_CONFIGS = {
    # title and global font
    "title_x": 0.5,
    "title_font_size": 25,
    "title_font_family": "Modern Computer",
    "font_family": "Modern Computer",
    # axes
    "xaxis_title": "",
    "yaxis_title": "",
    # legend visibility / title
    "showlegend": True,
    "legend_title": "",
    # tick and legend text
    "xaxis_tickfont_size": 15,
    "yaxis_tickfont_size": 15,
    "legend_font_size": 20,
    "legend_itemsizing": "constant",
    # horizontal legend centred below the plot area
    "legend_orientation": "h",
    "legend_yanchor": "bottom",
    "legend_y": -0.3,
    "legend_xanchor": "center",
    "legend_x": 0.5,
}

In [None]:
# Read the baseline topic names; the context manager guarantees the file handle is closed.
with open('data/queries.json') as queries_file:
    topics = [i["baseline"] for i in json.load(queries_file)]

# Maximum number of ids requested from the vector store in a single `.get()` call.
EMBEDDING_BATCH_SIZE = 25000

# One (topic, core_pubs, DataFrame[id, similarity]) tuple per topic that is kept.
results = []
for topic in tqdm(topics):
    data = get_data(topic, None)
    core_pubs = data["core_pubs"]
    core_mean_embedding = data["core_mean_embedding"]
    baseline_pubs = data["baseline_pubs"]
    predicted_pubs = data["predicted_pubs"]
    # Stores exposing `.get(ids, include=[...])` that return {"ids": ..., "embeddings": ...}.
    predicted_vs = data["predicted_vs"]
    baseline_vs = data["baseline_vs"]

    # Skip topics whose predicted recall is 0. This is checked before any
    # embeddings are fetched so skipped topics do not pay for the lookups.
    recall = evaluate_recall(core_pubs, baseline_pubs, predicted_pubs)
    if recall["predicted_recall"] == 0:
        continue

    # Union of both publication sets, one row per id (the last occurrence wins).
    df = pd.concat([baseline_pubs, predicted_pubs])
    # Tag provenance by membership: every id present in the baseline set is
    # "Baseline" (including ids found by both queries), the rest are "Predicted".
    # `duplicated()` alone only flags the second occurrence of an id, which left
    # baseline-only rows mislabelled as "Predicted".
    df["Source"] = np.where(df["id"].isin(baseline_pubs["id"]), "Baseline", "Predicted")
    df.drop_duplicates(subset=["id"], inplace=True, keep="last")

    # ids in baseline but not in predicted
    missing_ids = set(baseline_pubs["id"]) - set(predicted_pubs["id"])
    pub_ids = list(set(df["id"].values.tolist()) - missing_ids)

    # Fetch embeddings for the predicted ids in batches.
    embeddings_dict = {}
    for start in range(0, len(pub_ids), EMBEDDING_BATCH_SIZE):
        batch = predicted_vs.get(pub_ids[start:start + EMBEDDING_BATCH_SIZE], include=["embeddings"])
        embeddings_dict.update(zip(batch["ids"], batch["embeddings"]))

    # get the missing embeddings from baseline_vs
    # NOTE(review): one call per id; if baseline_vs.get accepts a list like
    # predicted_vs.get does, this could be batched the same way - confirm.
    for missing_id in missing_ids:
        batch = baseline_vs.get(missing_id, include=["embeddings"])
        embeddings_dict.update(zip(batch["ids"], batch["embeddings"]))

    # sort the df in the same order as embeddings_dict.keys()
    custom_sorting = {pub_id: position for position, pub_id in enumerate(embeddings_dict)}
    df.sort_values(by="id", key=lambda ids: ids.map(custom_sorting), inplace=True)
    embeddings = np.array([embeddings_dict[pub_id] for pub_id in df["id"]])

    # Similarity of every publication to the mean core embedding.
    cosine_sim = cosine_similarity(core_mean_embedding, embeddings).flatten()
    df["similarity"] = cosine_sim
    results.append((topic, core_pubs, df[["id", "similarity"]]))

In [116]:
# Sweep 300 evenly spaced similarity cut-offs between 0.15 and 1 and, for each
# topic, count how many publications (and how many core publications) survive.
thresholds = np.linspace(0.15, 1, 300)
items = []
for topic, core_pubs, result in results:
    n_total_pubs = result.shape[0]
    n_core_total = len(core_pubs)
    # Membership in the core set does not depend on the threshold.
    is_core = result["id"].isin(core_pubs)
    for threshold in thresholds:
        above = result["similarity"] >= threshold
        items.append({
            "topic": topic,
            "threshold": threshold,
            "n_core_pubs": int((above & is_core).sum()),
            "n_pubs": int(above.sum()),
            "n_total_pubs": n_total_pubs,
            "total_core_pubs": n_core_total,
        })

df = pd.DataFrame(items)

fig = px.line(
    df,
    x="threshold",
    y="n_core_pubs",
    color="topic",
    labels={
        "n_core_pubs": "Number of Core Publications",
        "threshold": "Threshold",
    },
    title="Effect of Threshold on Number of Core Publications",
)

# Show only thresholds in [0.5, 1]; the author marks the range before that as irrelevant.
fig.update_xaxes(range=[0.5, 1])
fig.show()

In [117]:
# Share of each topic's publications that remain at or above every threshold.
df["relative_n_pubs"] = df["n_pubs"].div(df["n_total_pubs"])

fig = px.line(
    df,
    x="threshold",
    y="relative_n_pubs",
    color="topic",
    labels={
        "relative_n_pubs": "Relative Number of Publications",
        "threshold": "Threshold",
    },
    title="Effect of Threshold on Relative Number of Publications",
)
fig.show()

In [None]:
def f_beta(inverse_precision, recall, beta=2):
    """Cost function rewarding many core publications among few publications.

    Computes ``(1 + beta**2) * (inverse_precision * recall)
    / (beta**2 * inverse_precision + recall)``, i.e. the F-beta formula with
    ``inverse_precision`` in the place of precision. The result is not bounded
    by 1 when ``inverse_precision`` exceeds 1.

    Parameters
    ----------
    inverse_precision : number, numpy array or pandas Series/DataFrame
        Term used in place of precision (the notebook passes
        total publications / retained publications).
    recall : number, numpy array or pandas Series/DataFrame
        Fraction of core publications retained.
    beta : number, default 2
        Weighting factor of the F-beta formula.

    Returns
    -------
    Same kind as the inputs. Entries whose denominator is exactly zero are
    reported as 0 instead of raising ``ZeroDivisionError`` (plain numbers) or
    yielding NaN (arrays/Series). All other values, including NaN produced by
    the inputs themselves (e.g. ``inf * 0``), are left unchanged.
    """
    beta_sq = beta ** 2
    numerator = (1 + beta_sq) * (inverse_precision * recall)
    denominator = beta_sq * inverse_precision + recall

    # Plain / zero-dimensional numbers: guard the division explicitly.
    if np.ndim(denominator) == 0:
        return 0.0 if denominator == 0 else numerator / denominator

    # pandas objects: keep the index and only overwrite zero-denominator entries.
    if isinstance(denominator, (pd.Series, pd.DataFrame)):
        return (numerator / denominator).mask(denominator == 0, 0.0)

    # numpy arrays: silence the 0/0 warning, then patch those entries to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.where(denominator == 0, 0.0, numerator / denominator)

# Score every (topic, threshold) row. The first argument is the "inverse precision"
# (all publications / publications kept at the threshold); the second is the
# recall of core publications (core kept / all core).
# Rows where no publication is kept divide by zero; idxmax below skips NaN scores.
df["f2"] = f_beta(df["n_total_pubs"] / df["n_pubs"], df["n_core_pubs"] / df["total_core_pubs"])

# The labels mapping is keyed by column name, so the y-axis label must use "f2".
fig = px.line(df, x="threshold", y="f2", color="topic", hover_data=["n_core_pubs", "n_pubs"],
              labels=dict(f2="Cost", threshold="Threshold"),
              title="Effect of Threshold on Cost Function")

# For each topic keep the row with the highest score, i.e. its best threshold.
q = df.loc[df.groupby("topic")["f2"].idxmax()][["topic","threshold", "f2", "n_pubs", "n_total_pubs", "n_core_pubs", "total_core_pubs"]].round(3)
print(f"Average Threshold: {q['threshold'].mean():.3f}")
fig.show()
display(q.set_index("topic"))

Average Threshold: 0.682


Unnamed: 0_level_0,threshold,f2,n_pubs,n_total_pubs,n_core_pubs,total_core_pubs
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AI on Edge Devices,0.679,0.478,8333,49081,14,36
Business Process Meta Models,0.63,0.761,850,1862,17,26
Cervical Myelopathy,0.673,1.014,14422,41785,41,47
Crop Yield Prediction,0.699,0.801,5100,48207,30,46
Data Stream Processing Latency,0.611,0.314,1195,1954,18,69
Drones in Agriculture,0.645,0.635,8712,47201,13,25
Energy Growth Nexus,0.659,0.046,1295,39992,1,27
Green Warehousing,0.647,0.296,752,45031,9,38
Internet of Things in Healthcare,0.71,0.634,6736,42945,15,29
Nanopharmaceuticals OR Nanonutraceuticals,0.77,0.05,390,46377,2,50


Unnamed: 0_level_0,threshold,f2,n_pubs,n_total_pubs,n_core_pubs,total_core_pubs
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AI on Edge Devices,0.679,0.478,8333,49081,14,36
Business Process Meta Models,0.63,0.761,850,1862,17,26
Cervical Myelopathy,0.673,1.014,14422,41785,41,47
Crop Yield Prediction,0.699,0.801,5100,48207,30,46
Data Stream Processing Latency,0.611,0.314,1195,1954,18,69
Drones in Agriculture,0.645,0.635,8712,47201,13,25
Energy Growth Nexus,0.659,0.046,1295,39992,1,27
Green Warehousing,0.647,0.296,752,45031,9,38
Internet of Things in Healthcare,0.71,0.634,6736,42945,15,29
Nanopharmaceuticals OR Nanonutraceuticals,0.77,0.05,390,46377,2,50
