#### The goal of this notebook is to bruteforce semantic precision with differnet thresholds to empirically find the best threshold for the given dataset.

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from litQeval.eval_utils import *
import plotly.express as px
from tqdm.auto import tqdm
import plotly.io as pio
import pandas as pd
import numpy as np
import json
pio.templates.default = "seaborn"
COLORS = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
PLOT_CONFIGS = dict(
    title_x=0.5, title_font_size=25, title_font_family="Modern Computer", font_family="Modern Computer",
    xaxis_title="", yaxis_title="", showlegend=True, legend_title="",
    xaxis_tickfont_size=15, yaxis_tickfont_size=15, legend_font_size=20, legend_itemsizing="constant",
    legend_orientation="h", legend_yanchor="bottom", legend_y=-0.3, legend_xanchor="center", legend_x=0.5
)

In [2]:
topics = [i["baseline"] for i in json.load(open('data/queries.json'))]
results = []
for idx, topic in tqdm(enumerate(topics), total=len(topics)):
    data = get_evaluation_data(topic)
    predicted_pubs = data["predicted_pubs"]
    core_pubs = data["core_pubs"]
    threshold = data["core_threshold"]
    embeddings = data["embeddings"]
    core_mean_embedding = data["core_mean_embedding"]
    predicted_embeddings = data["predicted_embeddings"]

    df = data["df"]

    # if predicted recall is 0 skip
    if predicted_pubs["id"].isin(core_pubs["id"]).sum()  == 0:
        continue
    
    cosine_sim = cosine_similarity(core_mean_embedding, embeddings).flatten()
    df["similarity"] = cosine_sim
    results.append((topic, core_pubs, df[["id", "similarity"]]))

  0%|          | 0/22 [00:00<?, ?it/s]

In [3]:
thresholds = np.linspace(0.15, 1, 300)
items = []
for topic, core_pubs, result in results:
    n_total_pubs = result.shape[0]
    for threshold in thresholds:
        n_core_pubs = result[(result["similarity"] >= threshold) & (
            result["id"].isin(core_pubs["id"]))].shape[0]
        n_pubs = result[result["similarity"] >= threshold].shape[0]
        items.append(dict(topic=topic, threshold=threshold,
                          n_core_pubs=n_core_pubs, n_pubs=n_pubs,
                          n_total_pubs=n_total_pubs, total_core_pubs=len(core_pubs["id"])))

df = pd.DataFrame(items)
fig = px.line(df, x="threshold", y="n_core_pubs", color="topic",
              labels=dict(n_core_pubs="Number of Core Publications",
                          threshold="Threshold"),
              title="Effect of Threshold on Number of Core Publications")

fig.update_xaxes(range=[0.5, 1])  # prior is irrelevant
fig.show()

In [4]:
df["relative_n_pubs"] = df["n_pubs"] / df["n_total_pubs"]

fig = px.line(df, x="threshold", y="relative_n_pubs", color="topic",
                labels=dict(relative_n_pubs="Relative Number of Publications", threshold="Threshold"),
                title="Effect of Threshold on Relative Number of Publications")
fig.show()

In [8]:
# cost function to maximize the number of core publications while minimizing the number of publications
def f_beta(inverse_precision, recall, beta=2):
    return (1 + beta**2) * (inverse_precision * recall) / (beta**2 * inverse_precision + recall)

df["f2"] = f_beta(df["n_total_pubs"] / df["n_pubs"], df["n_core_pubs"] / df["total_core_pubs"])
fig = px.line(df, x="threshold", y="f2", color="topic", hover_data=["n_core_pubs", "n_pubs"],
                labels=dict(cost="Cost", threshold="Threshold"),
                title="Effect of Threshold on Cost Function")
q = df.loc[df.groupby("topic")["f2"].idxmax()][["topic","threshold", "f2", "n_pubs", "n_total_pubs", "n_core_pubs", "total_core_pubs"]].round(3)
print(f"Average Threshold: {q['threshold'].mean():.3f}")
fig.show()
display(q.set_index("topic"))

Average Threshold: 0.691


Unnamed: 0_level_0,threshold,f2,n_pubs,n_total_pubs,n_core_pubs,total_core_pubs
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AI on Edge Devices,0.679,0.611,9708,52329,18,36
Business Process Meta Models,0.687,1.081,658,2240,24,26
Cervical Myelopathy,0.77,1.86,5864,52054,73,47
Crop Yield Prediction,0.699,1.431,8494,57133,55,46
Data Stream Processing Latency,0.611,0.38,1235,2011,22,69
Drones in Agriculture,0.645,0.681,9894,49850,14,25
Energy Growth Nexus,0.659,0.092,2526,42888,2,27
Green Warehousing,0.647,0.493,833,45215,15,38
Internet of Things in Healthcare,0.71,1.033,11685,58129,25,29
Nanopharmaceuticals OR Nanonutraceuticals,0.77,0.05,392,46835,2,50
