In [90]:
from sklearn.metrics.pairwise import cosine_similarity
from litQeval.eval_utils import *
import plotly.express as px
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

In [91]:
baseline = 'Software Process Line'
predicted = """
("process adaptation" OR "processes adaptation" OR "customization of processes" OR "software processes customization" OR "software process customization" OR "customizing software processes" OR "process definition" OR "processes definition" OR "process composition" OR "compose processes" OR "processes composition" OR "process tailoring" OR "processes tailoring" OR "tailing of processes" OR "process development" OR "processes development" OR "process engineering" OR "processes engineering" OR "process design" OR "software process modelling" OR "software process modelling" OR "process implementation" OR "managing processes")

AND

("family of software process" OR "family of software processes" OR "families of software process" OR "families of software processes" OR "software process line" OR "software process lines" OR "software processes line" OR "software processes lines" OR "process-line" OR "process-lines" OR "processes-line" OR "processes-lines" OR "software process family" OR "software processes family" OR "software process families" OR "software processes families" OR "process-family" OR "processes-family" OR "process-families" OR "processes-families" OR "software process variability" OR "software process variabilities" OR "software processes variability" OR "software processes variabilities" OR "variabilities in software processes" OR "process domain engineering" OR "processes domain engineering" OR "process feature" OR "process features" OR "processes feature" OR "processes features" OR "process asset reuse")
"""

data = get_data(baseline, predicted, True)
core_pubs = data["core_pubs"]
core_mean_embedding = data["core_mean_embedding"]
baseline_pubs = data["baseline_pubs"]
predicted_pubs = data["predicted_pubs"]
baseline_vs = data["baseline_vs"]
predicted_vs = data["predicted_vs"]
core_vs = data["core_vs"]
predicted_embeddings = np.array([embedding for embedding in predicted_vs.get(include=["embeddings"])["embeddings"]])
baseline_embeddings = np.array([embedding for embedding in baseline_vs.get(include=["embeddings"])["embeddings"]])
core_embeddings = np.squeeze([core_vs.get(i,include=["embeddings"])["embeddings"] for i in core_pubs])
core_mean_embedding.reshape(1, -1).shape, predicted_embeddings.shape, baseline_embeddings.shape


((1, 1536), (844, 1536), (13918, 1536))

In [92]:
# baseline
cosine_sim = cosine_similarity(core_mean_embedding, baseline_embeddings).flatten()
baseline_pubs["similarity"] = cosine_sim

core_pubs_in_baseline = baseline_pubs[baseline_pubs["id"].isin(core_pubs)]
threshold = core_pubs_in_baseline["similarity"].min()
relevent_baseline_pubs = baseline_pubs[baseline_pubs["similarity"] >= threshold].copy()
print(f"Number of core publications in the baseline: {core_pubs_in_baseline.shape[0]}")
print(f"Number of relevant publications in the baseline: {relevent_baseline_pubs.shape[0]}")

Number of core publications in the baseline: 26
Number of relevant publications in the baseline: 301


In [93]:
# predicted
cosine_sim = cosine_similarity(core_mean_embedding, predicted_embeddings).flatten()
predicted_pubs["similarity"] = cosine_sim

core_pubs_in_predicted = predicted_pubs[predicted_pubs["id"].isin(core_pubs)]
threshold = core_pubs_in_predicted["similarity"].min()
relevant_predicted_pubs = predicted_pubs[predicted_pubs["similarity"] >= threshold].copy()
print(f"Number of core publications in the predicted: {core_pubs_in_predicted.shape[0]}")
print(f"Number of relevant publications in the predicted: {relevant_predicted_pubs.shape[0]}")

Number of core publications in the predicted: 12
Number of relevant publications in the predicted: 39


### Cosine Similarity Measure

In [94]:
recall = evaluate_recall(core_pubs, baseline_pubs, predicted_pubs)
# semnatic precision: every element that is more similar than the least similar core publication is considered relevant
# relevant_predicted_pubs: publications that are more similar than the least similar core publication.
pred_precision = relevant_predicted_pubs.shape[0] / predicted_pubs.shape[0] # total number of found publications
baseline_precision = (relevent_baseline_pubs.shape[0] / baseline_pubs.shape[0]) if baseline_pubs.shape[0] > 0 else 0
pred_f2 = fscore(pred_precision, recall["predicted_recall"], 2)
baseline_f2 = fscore(baseline_precision, recall["baseline_recall"], 2)
df = pd.DataFrame({
    "Semantic Precision": [pred_precision, baseline_precision],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "Semantic F2": [pred_f2, baseline_f2]
}, index=["Predicted", "Baseline"])
df

Unnamed: 0,Semantic Precision,Recall,Semantic F2
Predicted,0.046209,0.27907,0.138988
Baseline,0.021627,0.604651,0.094599


### Minimum Volume Enclosing Ellipsoid MMVE

In [None]:
A, c = mvee(core_embeddings)
base_is_inside = is_inside_ellipse(A, c, baseline_embeddings)
predicted_is_inside = is_inside_ellipse(A, c, predicted_embeddings)

100%|██████████| 13918/13918 [00:02<00:00, 6840.11it/s]
100%|██████████| 844/844 [00:00<00:00, 6492.29it/s]


In [97]:
mvve_prec_baseline = base_is_inside.sum() / len(base_is_inside)
mvve_prec_predicted = predicted_is_inside.sum() / len(predicted_is_inside)

mvve_df = pd.DataFrame({
    "MVVE Precision": [mvve_prec_predicted, mvve_prec_baseline],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "MVVE F2": [fscore(mvve_prec_predicted, recall["predicted_recall"], 2), fscore(mvve_prec_baseline, recall["baseline_recall"], 2)]
}, index=["Predicted", "Baseline"])
print(f"Baseline - Inside: {base_is_inside.sum()}")
print(f"Predicted - Inside: {predicted_is_inside.sum()}")
mvve_df

Baseline - Inside: 5988
Predicted - Inside: 379


Unnamed: 0,MVVE Precision,Recall,MVVE F2
Predicted,0.449052,0.27907,0.301928
Baseline,0.430234,0.604651,0.559303


In [98]:
results = pd.DataFrame({
    "Query": [predicted] + [baseline],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "Semantic Precision": [pred_precision, baseline_precision],
    "Semantic F2": [pred_f2, baseline_f2],
    "MVVE Precision": [mvve_prec_predicted, mvve_prec_baseline],
    "MVVE F2": [fscore(mvve_prec_predicted, recall["predicted_recall"], 2), fscore(mvve_prec_baseline, recall["baseline_recall"], 2)]
}, index=["Predicted", "Baseline"])

try:
    old_results = pd.read_excel("results.xlsx", index_col=0)
    results = pd.concat([old_results, results]).drop_duplicates(subset=["Query"]).round(3)
    results.to_excel("results.xlsx")
except FileNotFoundError:
    results.to_excel("results.xlsx")

display(results)

Unnamed: 0,Query,Recall,Semantic Precision,Semantic F2,MVVE Precision,MVVE F2
Predicted,"\n""digital surgery"" OR ""surgical navigation"" O...",0.957,0.061,0.244,0.33,0.694
Baseline,Robotic Arthroplasty,0.957,0.595,0.853,0.401,0.749
Predicted,"\n""compliant materials"" OR (soft robotics) OR ...",0.722,0.212,0.487,0.451,0.644
Baseline,Soft Robotics,0.556,0.452,0.531,0.492,0.542
Predicted,"\n""crop yield estimation"" OR ""crop simulation ...",0.652,0.106,0.321,0.528,0.623
Baseline,Crop Yield Prediction,0.543,0.37,0.497,0.508,0.536
Predicted,"\n(""process adaptation"" OR ""processes adaptati...",0.279,0.046,0.139,0.528,0.308
Baseline,Software Process Line,0.605,0.023,0.101,0.48,0.575


In [99]:
results["Topic"] = results["Query"].copy()
results["Topic"][0::2] = np.nan
plt_df = results.bfill()
plt_df.reset_index(inplace=True)


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [100]:
fig = px.histogram(plt_df, x="Topic", y=["Recall", "Semantic Precision", "MVVE Precision", "Semantic F2", "MVVE F2"], title="Metrics for Predicted and Baseline Queries", barmode="group",
                   facet_row="index", facet_row_spacing=0.1)

# Sort by value
fig.for_each_trace(lambda t: t.update(x=t.x[::-1], y=t.y[::-1]))
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(yaxis1_title="", yaxis2_title="", yaxis1_dtick=0.2, yaxis2_dtick=0.2)
fig.show()

In [102]:
import umap
baseline_embeddings = np.array([embedding for embedding in baseline_vs.get(include=["embeddings"])["embeddings"]])
umap_embeddings = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine').fit_transform(baseline_embeddings)

In [103]:
umap_df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
umap_df["pub_id"] = [i["id"] for i in baseline_vs.get()["metadatas"]]
umap_df["title"] = baseline_pubs["title"].copy()
umap_df["type"] = "Retrieved"
umap_df.loc[umap_df["pub_id"].isin(core_pubs), "type"] = "Core"

In [105]:
# k-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(baseline_embeddings)

umap_df["cluster"] = kmeans.labels_.astype(str)
fig = px.scatter(umap_df, x="x", y="y", color="cluster", hover_data=["pub_id", "title"], title="UMAP of Baseline Publications with KMeans Clustering")
fig.show()
# which cluster has the most core publications and how many are there?
cluster = umap_df[umap_df["type"] == "Core"]["cluster"].value_counts().idxmax()
cluster_pubs = umap_df[umap_df["cluster"] == cluster]
core_pubs_count = cluster_pubs[cluster_pubs["type"] == "Core"].shape[0]
print(f"Cluster {cluster} has {core_pubs_count} core publications, and size of {cluster_pubs.shape[0]}")

Cluster 7 has 25 core publications, and size of 1211


In [111]:
# plot baseline umap and highlight base_is_inside
umap_df["is_inside_mvee"] = base_is_inside
fig = px.scatter(umap_df, x="x", y="y", color="is_inside_mvee", hover_data=["pub_id", "title"], title="UMAP of Baseline Publications with MVVE Highlighted")
fig.show()
umap_df["is_inside_mvee"].value_counts()

is_inside_mvee
False    7930
True     5988
Name: count, dtype: int64

In [None]:
# highlight relevent_baseline_pubs as is simlar using cos
umap_df["is_similar"] = baseline_pubs["id"].isin(relevent_baseline_pubs["id"])
fig = px.scatter(umap_df, x="x", y="y", color="is_similar", hover_data=["pub_id", "title"], title="UMAP of Baseline Publications with Relevant Publications Highlighted")
fig.show()