In [1]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from litQeval.eval_utils import *
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import numpy as np
import pandas as pd
import kaleido
# Chromium command-line flags assigned to plotly's Kaleido scope (used by the
# pio.write_image calls below). "--no-sandbox" stays disabled, as before.
pio.kaleido.scope.chromium_args = ("--headless", "--single-process", "--disable-gpu")
# Ten-colour qualitative palette shared by the figures in this notebook.
COLORS = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
    '#8c564b',
    '#e377c2',
    '#7f7f7f',
    '#bcbd22',
    '#17becf',
]

# Layout keyword arguments that each figure unpacks into fig.update_layout(**PLOT_CONFIGS).
PLOT_CONFIGS = {
    # title and fonts
    "title_x": 0.5,
    "title_font_size": 25,
    "title_font_family": "Modern Computer",
    "font_family": "Modern Computer",
    # axis titles are blanked; the legend is shown without a title
    "xaxis_title": "",
    "yaxis_title": "",
    "showlegend": True,
    "legend_title": "",
    # font sizes and tick orientation
    "xaxis_tickfont_size": 15,
    "yaxis_tickfont_size": 15,
    "legend_font_size": 25,
    "xaxis_title_font_size": 10,
    "yaxis_title_font_size": 10,
    "xaxis_tickangle": 45,
    # figure size in pixels
    "width": 550,
    "height": 400,
    # horizontal legend centred underneath the plot area
    "legend_orientation": "h",
    "legend_yanchor": "bottom",
    "legend_y": -0.5,
    "legend_xanchor": "center",
    "legend_x": 0.5,
}

In [2]:
def ellipse(x_center=0, y_center=0, ax1=[1, 0], ax2=[0, 1], a=1, b=1, N=100, tol=1e-06):
    """Sample points on the boundary of a (possibly rotated) ellipse.

    Parameters
    ----------
    x_center, y_center : float
        Coordinates of the ellipse centre.
    ax1, ax2 : array-like of length 2
        Orthonormal vectors giving the directions of the two ellipse axes.
    a, b : float
        Ellipse parameters: ``a`` scales the curve along ``ax1`` and ``b``
        along ``ax2``.
    N : int
        Number of samples of the curve parameter ``t``, evenly spaced over
        ``[0, 2*pi]`` (both ends included, so the curve is closed).
    tol : float
        Absolute tolerance used for both the unit-length and the
        orthogonality checks on ``ax1`` and ``ax2``.

    Returns
    -------
    x, y : numpy.ndarray
        Two arrays of length ``N`` with the coordinates of the sampled points.

    Raises
    ------
    ValueError
        If ``ax1`` or ``ax2`` is not a unit vector, or if they are not
        orthogonal (each within ``tol``).
    """
    # Compare norms with a tolerance: numerically normalised vectors (for
    # example eigenvectors) rarely have a norm that is *exactly* 1.0, so an
    # exact `!= 1` comparison rejects perfectly valid input.
    if abs(np.linalg.norm(ax1) - 1) > tol or abs(np.linalg.norm(ax2) - 1) > tol:
        raise ValueError('ax1, ax2 must be unit vectors')
    if abs(np.dot(ax1, ax2)) > tol:
        raise ValueError('ax1, ax2 must be orthogonal vectors')
    t = np.linspace(0, 2*np.pi, N)
    # ellipse parameterisation in the coordinate system spanned by ax1, ax2
    xs = a * np.cos(t)
    ys = b * np.sin(t)
    # rotation matrix whose columns are the axis directions
    R = np.array([ax1, ax2]).T
    # rotate into the standard basis [1, 0], [0, 1] with origin (0, 0)
    xp, yp = np.dot(R, [xs, ys])
    # translate to the requested centre
    x = xp + x_center
    y = yp + y_center
    return x, y

# Synthetic 2-D illustration: 200 random "retrieved" publications in the unit
# square plus 10 "core" publications (CP) drawn from [0.4, 0.95)^2.
np.random.seed(0)
pubs = np.random.rand(200, 2) # retrieved publications
c_pubs = np.random.rand(10, 2) * 0.55 + 0.4 # core publications

# Fit the ellipse around the core publications. `mvee` comes from
# litQeval.eval_utils; presumably it returns the shape matrix A and centre c of
# the enclosing ellipse (x - c)^T A (x - c) <= 1 -- the 1/sqrt(eigenvalue)
# semi-axis lengths below rely on that convention (TODO confirm).
A, c = mvee(c_pubs)
# Decompose A once and reuse the result instead of recomputing it per argument.
eigvals, eigvecs = np.linalg.eig(A)
x, y = ellipse(c[0], c[1], eigvecs[:, 0], eigvecs[:, 1],
               1/np.sqrt(eigvals[0]), 1/np.sqrt(eigvals[1]))

# One frame per point category.
pubs_df = pd.DataFrame(pubs, columns=["x", "y"])
pubs_df["Type"] = "Retrieved"
c_pubs_df = pd.DataFrame(c_pubs, columns=["x", "y"])
c_pubs_df["Type"] = "CP"
inside = is_inside_ellipse(A, c, pubs)
inside_df = pd.DataFrame(pubs[inside], columns=["x", "y"])
inside_df["Type"] = "Relevant"

# Points inside the ellipse occur twice (as "Retrieved" and as "Relevant");
# keep="last" retains the "Relevant" copy.
df = pd.concat([pubs_df, c_pubs_df, inside_df])
df = df.drop_duplicates(subset=["x", "y"], keep="last")

# Explicit category -> colour mapping so the colours do not depend on the order
# in which the categories happen to appear in `df`.
fig = px.scatter(df, x="x", y="y", color="Type", title="Semantic Precision using Minimum Volume Ellipsoid",
                 color_discrete_map={"Retrieved": COLORS[0],
                                     "CP": "Red",
                                     "Relevant": COLORS[1]})

# Ellipse outline drawn as a dashed, semi-transparent black line.
trace = px.line(x=x, y=y).data[0]
trace["line"]["dash"] = "dash"
trace["line"]["color"] = "black"
trace["name"] = "MVEE"
trace["opacity"] = 0.5
trace["showlegend"] = True
fig.add_traces(trace)

# Share of retrieved publications that fall inside the ellipse.
relevant_ratio = len(inside_df) / len(pubs)
fig.update_layout(title_subtitle_text=f"Relevance ratio: {relevant_ratio:.0%}", **PLOT_CONFIGS)
pio.write_image(fig, "LitQEval-report/pics/sp_mvee.pdf", width=1200)
fig.show()

100%|██████████| 200/200 [00:00<?, ?it/s]


In [3]:
# Every retrieved publication starts out as "Irrelevant"; the ones that pass
# the similarity threshold are relabelled further down.
df_1 = pd.DataFrame({
    "x": pubs[:, 0],
    "y": pubs[:, 1],
    "Type": "Irrelevant"
})
df_2 = pd.DataFrame({
    "x": c_pubs[:, 0],
    "y": c_pubs[:, 1],
    "Type": "CP"
})
# ignore_index gives every row a unique label, so the boolean .loc assignment
# below never has to align against duplicated index values.
df = pd.concat([df_1, df_2], ignore_index=True)

# Cosine similarity of every point to the centroid of the core publications.
avg = np.mean(c_pubs, axis=0)
df["sim"] = cosine_similarity(np.concatenate(
    [pubs, c_pubs]), avg.reshape(1, -1)).flatten()
df = pd.concat(
    [df, pd.DataFrame({"x": [avg[0]], "y": [avg[1]], "Type": "Centroid", "sim": [1]})],
    ignore_index=True)

# The threshold is the similarity of the least similar core publication.
threshold = df.loc[df["Type"] == "CP", "sim"].min()
# Everything at or above the threshold becomes "Relevant", except the core
# publications and the centroid, which keep their own labels.
df.loc[(df["sim"] >= threshold) & (df["Type"] != "CP") & (df["Type"] != "Centroid"), "Type"] = "Relevant"
# Share of retrieved publications that were labelled relevant.
relevant_ratio = (df["Type"] == "Relevant").sum() / len(pubs)

fig = px.scatter(df, x="x", y="y", color="Type", hover_data={"sim": True},
                 color_discrete_map={"CP": "Red",
                                     "Relevant": COLORS[1],
                                     "Irrelevant": COLORS[0],
                                     "Centroid": "black"},
                 title="Semantic Precision using Cosine Similarity")
fig.update_layout(title_subtitle_text=f"The threshold is {threshold:.3f} and the relevance ratio is {relevant_ratio:.0%}", **PLOT_CONFIGS)
fig.update_xaxes(tickfont=dict(size=15))

# Threshold boundary: in 2-D, "cosine similarity >= threshold" is the cone of
# directions within arccos(threshold) of the centroid direction. Derive the two
# bounding rays from the data instead of hard-coding their end points.
centroid_angle = np.arctan2(avg[1], avg[0])
half_angle = np.arccos(min(max(threshold, -1.0), 1.0))  # guard against rounding outside [-1, 1]
boundary_traces = []
for ray_angle in (centroid_angle - half_angle, centroid_angle + half_angle):
    # The points live in the unit square, so keep the ray in the first quadrant.
    ray_angle = min(max(ray_angle, 0.0), np.pi / 2)
    direction = np.array([np.cos(ray_angle), np.sin(ray_angle)])
    # Stretch the ray from the origin until it reaches the border of the unit square.
    end_x, end_y = direction / direction.max()
    boundary_traces.append(
        go.Scatter(x=[0, end_x], y=[0, end_y], mode="lines", line=dict(color="black", dash="dash"),
                   showlegend=False, opacity=0.5)
    )
fig.add_traces(boundary_traces)
# PDF export is currently disabled; uncomment to write the figure to disk.
# pio.write_image(fig, "LitQEval-report/pics/sp_cos.pdf", width=1200)
fig.show()