#### The goal of this notebook is to brute-force semantic precision with different thresholds to empirically find the best threshold for the given dataset.

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from litQeval.eval_utils import *
import plotly.express as px
from tqdm.auto import tqdm
import plotly.io as pio
import pandas as pd
import numpy as np
import json
# Make "seaborn" the default plotly template so every figure below shares one theme.
pio.templates.default = "seaborn"
# Twenty distinct hex colors, used as the discrete color sequence for per-topic lines.
COLORS = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
    "#ff33cc",
    "#00cc99",
    "#ffcc00",
    "#3399ff",
    "#9933cc",
    "#66ff66",
    "#ff0066",
    "#669999",
    "#996633",
    "#00cccc",
]

# Shared layout keyword arguments, meant to be splatted into `fig.update_layout(**PLOT_CONFIGS)`.
PLOT_CONFIGS = {
    # title and fonts
    "title_x": 0.5,
    "title_font_size": 30,
    "title_font_family": "Modern Computer",
    "font_family": "Modern Computer",
    # legend visibility and figure size
    "showlegend": True,
    "legend_title": "",
    "height": 400,
    # tick and legend text sizes
    "xaxis_tickfont_size": 15,
    "yaxis_tickfont_size": 15,
    "legend_font_size": 20,
    "legend_itemsizing": "constant",
    # horizontal legend centered below the plot area
    "legend_orientation": "h",
    "legend_yanchor": "bottom",
    "legend_y": -1.2,
    "legend_xanchor": "center",
    "legend_x": 0.5,
}

In [2]:
# Read the baseline topic of every query. The context manager guarantees the file
# handle is closed (the bare `open(...)` call leaked it), and the encoding is pinned
# so the result does not depend on the platform default.
with open('data/queries.json', encoding="utf-8") as queries_file:
    topics = [i["baseline"] for i in json.load(queries_file)]

# One tuple per usable topic: (topic, core_pubs, per-publication similarity frame, core threshold).
results = []
for topic in tqdm(topics):
    data = get_evaluation_data(topic)
    predicted_pubs = data["predicted_pubs"]
    core_pubs = data["core_pubs"]
    threshold = data["core_threshold"]
    embeddings = data["embeddings"]
    core_mean_embedding = data["core_mean_embedding"]

    df = data["df"]

    # Skip topics where none of the predicted publications is a core publication
    # (predicted recall is 0), since no threshold can recover any core publication.
    if predicted_pubs["id"].isin(core_pubs["id"]).sum() == 0:
        continue

    # Similarity of every publication embedding to the mean core embedding.
    # NOTE(review): assumes `embeddings` rows are aligned with the rows of `df` — confirm
    # against `get_evaluation_data`.
    cosine_sim = cosine_similarity(core_mean_embedding, embeddings).flatten()
    df["similarity"] = cosine_sim
    results.append((topic, core_pubs, df[["id", "similarity", "Source"]], threshold))

  0%|          | 0/21 [00:00<?, ?it/s]

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [3]:
# Candidate similarity thresholds to sweep for every topic.
thresholds = np.linspace(0.15, 1, 300)
items = []
for topic, core_pubs, result, core_threshold in results:
    # Row count of the topic's frame across all sources, taken before filtering.
    n_total_pubs = result.shape[0]
    predicted = result[result["Source"] == "Predicted"]

    # Loop-invariant pieces, computed once per topic instead of once per threshold.
    is_core = predicted["id"].isin(core_pubs["id"])
    similarity = predicted["similarity"]
    total_core_pubs = len(core_pubs["id"])

    for threshold in thresholds:
        passes = similarity >= threshold
        items.append(dict(topic=topic, threshold=threshold, core_threshold=core_threshold,
                          # predicted publications at/above the threshold that are core publications
                          n_core_pubs=int((passes & is_core).sum()),
                          # all predicted publications at/above the threshold
                          n_pubs=int(passes.sum()),
                          n_total_pubs=n_total_pubs, total_core_pubs=total_core_pubs))

df_plot = pd.DataFrame(items)
fig = px.line(df_plot, x="threshold", y="n_core_pubs", color="topic",
              labels=dict(n_core_pubs="Number of Core Publications",
                          threshold="Threshold"),
              title="Effect of Threshold on Number of Core Publications")

# Zoom in on thresholds >= 0.5; the original note marks the range before it as irrelevant.
fig.update_xaxes(range=[0.5, 1])
fig.show()

In [4]:
# Fraction of the topic's total row count made up by predicted publications
# that pass each threshold.
df_plot["relative_n_pubs"] = df_plot["n_pubs"].div(df_plot["n_total_pubs"])

fig = px.line(
    df_plot,
    x="threshold",
    y="relative_n_pubs",
    color="topic",
    labels={
        "relative_n_pubs": "Relative Number of Publications",
        "threshold": "Threshold",
    },
    title="Effect of Threshold on Relative Number of Publications",
)
fig.show()

In [13]:
# Cost function: maximize the number of core publications retained while minimizing the total number of publications.
# The score is standardized to [0, 1].
from sklearn.preprocessing import MinMaxScaler
def f_beta(inverse_precision, recall, beta=2):
    """Compute an F-beta-shaped score and rescale it to the [0, 1] range.

    The raw score is ``(1 + beta^2) * (inverse_precision * recall)
    / (beta^2 * inverse_precision + recall)``, evaluated element-wise.
    The resulting values are then min-max scaled together as one column,
    so the scaling is relative to all values passed in a single call.

    Args:
        inverse_precision: pandas Series used in the precision slot of the formula.
        recall: pandas Series used in the recall slot of the formula.
        beta: weight applied (squared) in the formula. Defaults to 2.

    Returns:
        A flat numpy array with one scaled score per input element.
    """
    beta_sq = beta**2
    numerator = (1 + beta_sq) * (inverse_precision * recall)
    denominator = beta_sq * inverse_precision + recall
    raw_score = numerator / denominator
    # MinMaxScaler expects a 2-D array: present the scores as a single column.
    as_column = raw_score.values.reshape(-1, 1)
    return MinMaxScaler().fit_transform(as_column).flatten()
# Score every (topic, threshold) row. The first argument is total / retained publications,
# the second is the share of the topic's core publications that is retained.
# NOTE(review): where no publication passes the threshold, n_pubs is 0 and the first
# ratio is a division by zero — confirm the resulting non-finite scores are acceptable.
df_plot["f2"] = f_beta(df_plot["n_total_pubs"] / df_plot["n_pubs"], df_plot["n_core_pubs"] / df_plot["total_core_pubs"])
fig = px.line(df_plot, x="threshold", y="f2", color="topic", hover_data=["n_core_pubs", "n_pubs"],
                # `labels` keys must be column names; the plotted column is "f2"
                # (the previous key "cost" matched no column and was silently ignored).
                labels=dict(f2="Cost Function Value", threshold="Threshold"),
                title="Effect of Threshold on the Cost Function",
                color_discrete_sequence=COLORS
                )
# For each topic, keep the row with the highest score, i.e. its best threshold.
q = df_plot.loc[df_plot.groupby("topic")["f2"].idxmax()][["topic","threshold", "f2", "n_pubs", "n_total_pubs", "n_core_pubs", "total_core_pubs"]].round(3)
# Layout for this wider figure; this rebinds the notebook-level PLOT_CONFIGS name.
PLOT_CONFIGS = dict(
    title_x=0.5, title_font_size=20, title_font_family="Modern Computer", font_family="Modern Computer",
    showlegend=True, legend_title="", height=500, width=1200,
    xaxis_tickfont_size=15, yaxis_tickfont_size=15, legend_font_size=15, legend_itemsizing="constant",
    legend_orientation="h", legend_yanchor="bottom", legend_y=-.9, legend_xanchor="center", legend_x=0.5
)
# Mean of the per-topic best thresholds (taken from the rounded values in `q`).
print(f"Average Threshold: {q['threshold'].mean():.3f}")
fig.update_layout(**PLOT_CONFIGS, yaxis_title="Cost Function Value")
fig.show()
# fig.write_image("LitQEval-report/pics/threshold-analysis.pdf")
display(q.set_index("topic"))

Average Threshold: 0.689


Unnamed: 0_level_0,threshold,f2,n_pubs,n_total_pubs,n_core_pubs,total_core_pubs
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AI on Edge Devices,0.679,0.403,8333,52329,14,36
Business Process Meta Models,0.656,0.677,3244,32568,17,26
Cervical Myelopathy,0.667,0.864,14828,52054,41,47
Cloud Migration,0.733,1.0,470,53945,20,21
Crop Yield Prediction,0.699,0.676,5105,57133,30,46
Data Stream Processing Latency,0.65,0.32,1436,48346,21,69
Drones in Agriculture,0.647,0.536,8179,49850,13,25
Green Warehousing,0.647,0.249,674,45215,9,38
Internet of Things in Healthcare,0.71,0.536,6809,58129,15,29
Multicore Performance Prediction,0.676,0.315,4536,29621,10,33
