In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from litQeval.eval_utils import *
import plotly.express as px
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

In [3]:
# Reference query: a plain topic name. It is passed to get_data() as the
# first argument and later stored as the "Baseline" row's Query.
baseline = 'Synthetic Biology'
# Candidate boolean query to evaluate against the baseline. It is passed to
# get_data() as the second argument; the leading/trailing newlines are part of
# the string and are also what gets stored in the "Query" column later on.
predicted = """
(synthetic biology) OR "genome editing" OR "microbial engineering" OR "synthetic organisms" OR "rna engineering" OR "environmental biotechnology" OR "bioprinting" OR "biomolecular engineering" OR "synthetic ecology" OR "synthetic genomics" OR "biomanufacturing" OR "biocontainment" OR "biodesign" OR "designer organisms" OR "chassis organisms" OR 
(("gene synthesis" OR "dna synthesis" OR "molecular biology" OR "systems biology" OR "bioinformatics" OR "cell-free systems" OR "protein engineering" OR "biotechnology" OR "directed evolution" OR "gene drives" OR "genetic engineering" OR "crispr" OR "synthetic pathways" OR "pathway engineering" OR "metabolic engineering") AND (Synthetic))
"""

# Run both queries and collect everything the evaluation cells need.
data = get_data(baseline, predicted)

# Publication tables / id collection for the core set and for each query.
core_pubs = data["core_pubs"]
baseline_pubs = data["baseline_pubs"]
predicted_pubs = data["predicted_pubs"]

# Embedding stores (each exposes `.get(..., include=["embeddings"])`) and the
# pre-computed mean embedding of the core set.
core_vs = data["core_vs"]
baseline_vs = data["baseline_vs"]
predicted_vs = data["predicted_vs"]
core_mean_embedding = data["core_mean_embedding"]

# Materialise the stored embeddings as NumPy arrays, one row per publication.
predicted_embeddings = np.array(list(predicted_vs.get(include=["embeddings"])["embeddings"]))
baseline_embeddings = np.array(list(baseline_vs.get(include=["embeddings"])["embeddings"]))

# Core embeddings are fetched one id at a time; squeeze drops the singleton
# axis that each per-id lookup contributes.
core_lookups = [core_vs.get(pub_id, include=["embeddings"])["embeddings"] for pub_id in core_pubs]
core_embeddings = np.squeeze(core_lookups)

# Cell output: shapes of the three arrays as a quick sanity check.
(
    core_mean_embedding.reshape(1, -1).shape,
    predicted_embeddings.shape,
    baseline_embeddings.shape,
)


[2mSearching config file credentials for default 'live' instance..[0m


[2mDimcli - Dimensions API Client (v1.3)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v2.10[0m
[2mMethod: dsl.ini file[0m


Starting iteration with limit=1000 skip=0 ...[0m
0-1000 / 32683 (8.37s)[0m
1000-2000 / 32683 (3.09s)[0m
2000-3000 / 32683 (6.30s)[0m
3000-4000 / 32683 (3.33s)[0m
4000-5000 / 32683 (5.26s)[0m
5000-6000 / 32683 (2.90s)[0m
6000-7000 / 32683 (2.68s)[0m
7000-8000 / 32683 (6.10s)[0m
8000-9000 / 32683 (2.73s)[0m
9000-10000 / 32683 (5.93s)[0m
10000-11000 / 32683 (2.83s)[0m
11000-12000 / 32683 (3.12s)[0m
12000-13000 / 32683 (6.15s)[0m
13000-14000 / 32683 (2.91s)[0m
14000-15000 / 32683 (2.61s)[0m
15000-16000 / 32683 (6.29s)[0m
16000-17000 / 32683 (5.99s)[0m
17000-18000 / 32683 (3.18s)[0m
18000-19000 / 32683 (3.07s)[0m
19000-20000 / 32683 (3.30s)[0m
20000-21000 / 32683 (2.50s)[0m
21000-22000 / 32683 (5.87s)[0m
22000-23000 / 32683 (2.60s)[0m
23000-24000 / 32683 (2.69s)[0m
24000-25000 / 32683 (3.42s)[0m
25000-26000 / 32683 (2.66s)[0m
26000-27000 / 32683 (3.27s)[0m
27000-28000 / 32683 (2.97s)[0m
28000-29000 / 32683 (3.57s)[0m
29000-30000 / 32683 (3.37s)[0m
30000-31000 

Total results: 32683
Total results after filtering: 30538


[2mSearching config file credentials for default 'live' instance..[0m


[2mDimcli - Dimensions API Client (v1.3)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v2.10[0m
[2mMethod: dsl.ini file[0m


Starting iteration with limit=1000 skip=0 ...[0m
0-1000 / 37485 (15.14s)[0m
1000-2000 / 37485 (4.88s)[0m
2000-3000 / 37485 (5.07s)[0m
3000-4000 / 37485 (6.12s)[0m
4000-5000 / 37485 (5.21s)[0m
5000-6000 / 37485 (5.10s)[0m
6000-7000 / 37485 (4.31s)[0m
7000-8000 / 37485 (4.28s)[0m


In [None]:
# baseline
# Cosine similarity of every baseline publication to the mean core embedding.
# cosine_similarity only accepts 2-D inputs, so the mean vector is forced to a
# single row of shape (1, n_features); this is a no-op if it already is one.
cosine_sim = cosine_similarity(core_mean_embedding.reshape(1, -1), baseline_embeddings).flatten()
baseline_pubs["similarity"] = cosine_sim

# Relevance threshold = similarity of the least similar core publication that
# the baseline query retrieved. If no core publication was retrieved, min() is
# NaN and the comparison below selects no rows.
core_pubs_in_baseline = baseline_pubs[baseline_pubs["id"].isin(core_pubs)]
threshold = core_pubs_in_baseline["similarity"].min()
# NOTE: the variable name keeps its original spelling ("relevent") because the
# precision cell further down reads it under that name.
relevent_baseline_pubs = baseline_pubs[baseline_pubs["similarity"] >= threshold].copy()
print(f"Number of core publications in the baseline: {core_pubs_in_baseline.shape[0]}")
print(f"Number of relevant publications in the baseline: {relevent_baseline_pubs.shape[0]}")

In [None]:
# predicted
# Cosine similarity of every predicted publication to the mean core embedding.
# cosine_similarity only accepts 2-D inputs, so the mean vector is forced to a
# single row of shape (1, n_features); this is a no-op if it already is one.
cosine_sim = cosine_similarity(core_mean_embedding.reshape(1, -1), predicted_embeddings).flatten()
predicted_pubs["similarity"] = cosine_sim

# Relevance threshold = similarity of the least similar core publication that
# the predicted query retrieved. If no core publication was retrieved, min()
# is NaN and the comparison below selects no rows.
core_pubs_in_predicted = predicted_pubs[predicted_pubs["id"].isin(core_pubs)]
threshold = core_pubs_in_predicted["similarity"].min()
relevant_predicted_pubs = predicted_pubs[predicted_pubs["similarity"] >= threshold].copy()
print(f"Number of core publications in the predicted: {core_pubs_in_predicted.shape[0]}")
print(f"Number of relevant publications in the predicted: {relevant_predicted_pubs.shape[0]}")

### Cosine Similarity Measure

In [None]:
recall = evaluate_recall(core_pubs, baseline_pubs, predicted_pubs)
# Semantic precision: a retrieved publication counts as relevant when it is at
# least as similar to the core mean embedding as the least similar core
# publication retrieved by the same query (see the two similarity cells).
# Precision = relevant publications / all publications the query returned.
# Both ratios are guarded so that an empty result set yields 0 instead of a
# ZeroDivisionError.
pred_precision = (relevant_predicted_pubs.shape[0] / predicted_pubs.shape[0]) if predicted_pubs.shape[0] > 0 else 0
baseline_precision = (relevent_baseline_pubs.shape[0] / baseline_pubs.shape[0]) if baseline_pubs.shape[0] > 0 else 0
# fscore(precision, recall, 2): presumably the F-beta score with beta=2
# (helper comes from the eval_utils star import; confirm there).
pred_f2 = fscore(pred_precision, recall["predicted_recall"], 2)
baseline_f2 = fscore(baseline_precision, recall["baseline_recall"], 2)
df = pd.DataFrame({
    "Semantic Precision": [pred_precision, baseline_precision],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "Semantic F2": [pred_f2, baseline_f2]
}, index=["Predicted", "Baseline"])
df

### Minimum Volume Enclosing Ellipsoid (MVEE)

In [None]:
import numpy.linalg as la

def mvee(points, tol=0.0001):
    """
    Approximate the minimum-volume enclosing ellipsoid of a point set.

    The ellipsoid is returned in "center form"
    (x-c).T * A * (x-c) = 1

    Parameters
    ----------
    points : array-like of shape (N, d)
        One point per row.
    tol : float, default 0.0001
        Iteration stops once the Euclidean norm of the change in the weight
        vector ``u`` between two successive iterations is <= ``tol``.

    Returns
    -------
    A : ndarray of shape (d, d)
        Shape matrix of the ellipsoid.
    c : ndarray of shape (d,)
        Center of the ellipsoid (the ``u``-weighted mean of the points).

    Notes
    -----
    ``X`` below is a (d+1) x (d+1) matrix of rank at most N, so it is only
    invertible when the points affinely span the space (in particular
    N >= d + 1). Otherwise ``la.inv`` raises ``LinAlgError`` or returns a
    numerically meaningless result.
    """
    points = np.asarray(points, dtype=float)
    N, d = points.shape
    # Lift each point to homogeneous coordinates: column i of Q is (x_i, 1).
    Q = np.column_stack((points, np.ones(N))).T
    err = tol+1.0
    u = np.ones(N)/N
    while err > tol:
        # X = Q diag(u) Q^T, computed by scaling the columns of Q instead of
        # materialising the N x N diagonal matrix.
        X = (Q * u) @ Q.T
        # M[i] = q_i^T X^{-1} q_i, i.e. only the diagonal of Q^T X^{-1} Q;
        # einsum avoids forming the full N x N product.
        M = np.einsum("ij,ij->j", Q, la.inv(X) @ Q)
        jdx = np.argmax(M)
        # Move weight towards the point with the largest M value.
        step_size = (M[jdx]-d-1.0)/((d+1)*(M[jdx]-1.0))
        new_u = (1-step_size)*u
        new_u[jdx] += step_size
        err = la.norm(new_u-u)
        u = new_u
    c = u @ points
    # points^T diag(u) points - c c^T is the u-weighted covariance of the points.
    A = la.inv((points.T * u) @ points
               - np.multiply.outer(c, c))/d
    return A, c

# Fit the enclosing ellipsoid to the core-publication embeddings:
# A is the (d, d) shape matrix and c the center returned by mvee().
A, c = mvee(core_embeddings)

In [None]:
# Test every baseline / predicted embedding against the core ellipsoid (A, c).
# is_inside_ellipse is not defined in this notebook; presumably it comes from
# the litQeval.eval_utils star import and returns one boolean per embedding
# row (the next cell uses .sum() and len() on the result) — TODO confirm.
base_is_inside = is_inside_ellipse(A, c, baseline_embeddings)
predicted_is_inside = is_inside_ellipse(A, c, predicted_embeddings)

In [None]:
# Count how many embeddings of each query fall inside the core ellipsoid.
n_inside_baseline = base_is_inside.sum()
n_inside_predicted = predicted_is_inside.sum()

# Ellipsoid-based precision: share of a query's publications that are inside.
mvve_prec_baseline = n_inside_baseline / len(base_is_inside)
mvve_prec_predicted = n_inside_predicted / len(predicted_is_inside)

mvve_df = pd.DataFrame(
    {
        "MVVE Precision": [mvve_prec_predicted, mvve_prec_baseline],
        "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
        "MVVE F2": [
            fscore(mvve_prec_predicted, recall["predicted_recall"], 2),
            fscore(mvve_prec_baseline, recall["baseline_recall"], 2),
        ],
    },
    index=["Predicted", "Baseline"],
)
print(f"Baseline - Inside: {n_inside_baseline}")
print(f"Predicted - Inside: {n_inside_predicted}")
mvve_df

In [None]:
# One row per query ("Predicted" first, "Baseline" second) with all metrics
# computed in the cells above.
results = pd.DataFrame({
    "Query": [predicted] + [baseline],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "Semantic Precision": [pred_precision, baseline_precision],
    "Semantic F2": [pred_f2, baseline_f2],
    "MVVE Precision": [mvve_prec_predicted, mvve_prec_baseline],
    "MVVE F2": [fscore(mvve_prec_predicted, recall["predicted_recall"], 2), fscore(mvve_prec_baseline, recall["baseline_recall"], 2)]
}, index=["Predicted", "Baseline"])

# Append to the results of earlier runs, if any. Only the read is guarded: a
# missing file simply means this is the first run, whereas errors raised while
# merging or writing should surface instead of being retried silently.
try:
    old_results = pd.read_excel("results.xlsx", index_col=0)
except FileNotFoundError:
    pass
else:
    # drop_duplicates keeps the first occurrence, so a Query that is already
    # stored in results.xlsx keeps its previously saved metrics.
    results = pd.concat([old_results, results]).drop_duplicates(subset=["Query"])

# Round on every run (including the first) so the stored file is consistent.
results = results.round(3)
results.to_excel("results.xlsx")

display(results)

In [None]:
# Build a "Topic" label shared by each Predicted/Baseline pair of rows: rows at
# even positions (0, 2, ...) are blanked and then inherit the Query of the row
# that follows them.
results["Topic"] = results["Query"].copy()
# Single positional assignment instead of chained indexing
# (results["Topic"][0::2] = ...), which raises SettingWithCopyWarning and does
# not modify `results` at all under pandas Copy-on-Write.
results.iloc[0::2, results.columns.get_loc("Topic")] = np.nan

# Back-fill only the Topic column so that a missing metric value is never
# overwritten with the metric of the following row.
plt_df = results.copy()
plt_df["Topic"] = plt_df["Topic"].bfill()
# The former index ("Predicted"/"Baseline") becomes the "index" column.
plt_df = plt_df.reset_index()

In [None]:
# Grouped bars of every metric per topic, one facet row per value of the
# "index" column of plt_df.
metric_columns = ["Recall", "Semantic Precision", "MVVE Precision", "Semantic F2", "MVVE F2"]
fig = px.histogram(
    plt_df,
    x="Topic",
    y=metric_columns,
    title="Metrics for Predicted and Baseline Queries",
    barmode="group",
    facet_row="index",
    facet_row_spacing=0.1,
)


def _reverse_trace(trace):
    """Reverse the order of the points of a trace (x and y together)."""
    trace.update(x=trace.x[::-1], y=trace.y[::-1])


def _strip_facet_prefix(annotation):
    """Keep only the text after the last '=' of an annotation label."""
    annotation.update(text=annotation.text.split("=")[-1])


fig.for_each_trace(_reverse_trace)
fig.for_each_annotation(_strip_facet_prefix)
# Blank y-axis titles and a tick every 0.2 on both facet rows.
fig.update_layout(yaxis1_title="", yaxis2_title="", yaxis1_dtick=0.2, yaxis2_dtick=0.2)
fig.show()