In [83]:
from sklearn.metrics.pairwise import cosine_similarity
from litQeval.eval_utils import *
import plotly.express as px
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

In [84]:
baseline = 'Software Process Line'
predicted = """
("process adaptation" OR "processes adaptation" OR "customization of processes" OR "software processes customization" OR "software process customization" OR "customizing software processes" OR "process definition" OR "processes definition" OR "process composition" OR "compose processes" OR "processes composition" OR "process tailoring" OR "processes tailoring" OR "tailing of processes" OR "process development" OR "processes development" OR "process engineering" OR "processes engineering" OR "process design" OR "software process modelling" OR "software process modelling" OR "process implementation" OR "managing processes")

AND

("family of software process" OR "family of software processes" OR "families of software process" OR "families of software processes" OR "software process line" OR "software process lines" OR "software processes line" OR "software processes lines" OR "process-line" OR "process-lines" OR "processes-line" OR "processes-lines" OR "software process family" OR "software processes family" OR "software process families" OR "software processes families" OR "process-family" OR "processes-family" OR "process-families" OR "processes-families" OR "software process variability" OR "software process variabilities" OR "software processes variability" OR "software processes variabilities" OR "variabilities in software processes" OR "process domain engineering" OR "processes domain engineering" OR "process feature" OR "process features" OR "processes feature" OR "processes features" OR "process asset reuse")"""

data = get_data(baseline, predicted, True)
core_pubs = data["core_pubs"]
core_mean_embedding = data["core_mean_embedding"]
baseline_pubs = data["baseline_pubs"]
predicted_pubs = data["predicted_pubs"]
baseline_vs = data["baseline_vs"]
predicted_vs = data["predicted_vs"]
core_vs = data["core_vs"]
core_embeddigs = data["core_embeddings"]
core_threshold = data["core_threshold"]
core_embeddings = data["core_embeddings"]
predicted_embeddings = np.array([embedding for embedding in predicted_vs.get(include=["embeddings"])["embeddings"]])
baseline_embeddings = np.array([embedding for embedding in baseline_vs.get(include=["embeddings"])["embeddings"]])
core_mean_embedding.shape, predicted_embeddings.shape, baseline_embeddings.shape

((1, 1536), (844, 1536), (13918, 1536))

In [85]:
cosine_sim = cosine_similarity(core_mean_embedding, baseline_embeddings).flatten()
baseline_pubs["similarity"] = cosine_sim

core_pubs_in_baseline = baseline_pubs[baseline_pubs["id"].isin(core_pubs)]
relevent_baseline_pubs = baseline_pubs[baseline_pubs["similarity"] >= core_threshold].copy()
print(f"Number of core publications in the baseline: {core_pubs_in_baseline.shape[0]}")
print(f"Number of relevant publications in the baseline: {relevent_baseline_pubs.shape[0]}")

Number of core publications in the baseline: 26
Number of relevant publications in the baseline: 5135


In [86]:
# predicted
cosine_sim = cosine_similarity(core_mean_embedding, predicted_embeddings).flatten()
predicted_pubs["similarity"] = cosine_sim

core_pubs_in_predicted = predicted_pubs[predicted_pubs["id"].isin(core_pubs)]
relevant_predicted_pubs = predicted_pubs[predicted_pubs["similarity"] >= core_threshold].copy()
print(f"Number of core publications in the predicted: {core_pubs_in_predicted.shape[0]}")
print(f"Number of relevant publications in the predicted: {relevant_predicted_pubs.shape[0]}")

Number of core publications in the predicted: 12
Number of relevant publications in the predicted: 403


### Cosine Similarity Measure

In [87]:
recall = evaluate_recall(core_pubs, baseline_pubs, predicted_pubs)
# semnatic precision: every element that is more similar than the least similar core publication is considered relevant
# relevant_predicted_pubs: publications that are more similar than the least similar core publication.
pred_precision = relevant_predicted_pubs.shape[0] / predicted_pubs.shape[0] # total number of found publications
baseline_precision = (relevent_baseline_pubs.shape[0] / baseline_pubs.shape[0]) if baseline_pubs.shape[0] > 0 else 0
pred_f2 = fscore(pred_precision, recall["predicted_recall"], 2)
baseline_f2 = fscore(baseline_precision, recall["baseline_recall"], 2)
df = pd.DataFrame({
    "Semantic Precision": [pred_precision, baseline_precision],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "Semantic F2": [pred_f2, baseline_f2]
}, index=["Predicted", "Baseline"])
df

Unnamed: 0,Semantic Precision,Recall,Semantic F2
Predicted,0.477488,0.27907,0.304365
Baseline,0.368947,0.604651,0.536147


### Minimum Volume Enclosing Ellipsoid MMVE

In [88]:
A, c = mvee(core_embeddings)
base_is_inside = is_inside_ellipse(A, c, baseline_embeddings)
predicted_is_inside = is_inside_ellipse(A, c, predicted_embeddings)

100%|██████████| 13918/13918 [00:02<00:00, 6068.99it/s]
100%|██████████| 844/844 [00:00<00:00, 5985.78it/s]


In [89]:
mvve_prec_baseline = base_is_inside.sum() / len(base_is_inside)
mvve_prec_predicted = predicted_is_inside.sum() / len(predicted_is_inside)

mvve_df = pd.DataFrame({
    "MVVE Precision": [mvve_prec_predicted, mvve_prec_baseline],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "MVVE F2": [fscore(mvve_prec_predicted, recall["predicted_recall"], 2), fscore(mvve_prec_baseline, recall["baseline_recall"], 2)]
}, index=["Predicted", "Baseline"])
print(f"Baseline - Inside: {base_is_inside.sum()}")
print(f"Predicted - Inside: {predicted_is_inside.sum()}")
mvve_df

Baseline - Inside: 5988
Predicted - Inside: 379


Unnamed: 0,MVVE Precision,Recall,MVVE F2
Predicted,0.449052,0.27907,0.301928
Baseline,0.430234,0.604651,0.559303


In [90]:
results = pd.DataFrame({
    "Query": [predicted] + [baseline],
    "Recall": [recall["predicted_recall"], recall["baseline_recall"]],
    "Semantic Precision": [pred_precision, baseline_precision],
    "Semantic F2": [pred_f2, baseline_f2],
    "MVVE Precision": [mvve_prec_predicted, mvve_prec_baseline],
    "MVVE F2": [fscore(mvve_prec_predicted, recall["predicted_recall"], 2), fscore(mvve_prec_baseline, recall["baseline_recall"], 2)]
}, index=["Predicted", "Baseline"])

try:
    old_results = pd.read_excel("results.xlsx", index_col=0)
    results = pd.concat([old_results, results]).drop_duplicates(subset=["Query"]).round(3)
    results.to_excel("results.xlsx")
except FileNotFoundError:
    results.to_excel("results.xlsx")

display(results)

Unnamed: 0,Query,Recall,Semantic Precision,Semantic F2,MVVE Precision,MVVE F2
Predicted,"""digital surgery"" OR ""surgical navigation"" OR ...",0.957,0.081,0.302,0.348,0.709
Baseline,Robotic Arthroplasty,0.957,0.72,0.898,0.469,0.792
Predicted,"""compliant materials"" OR (soft robotics) OR ""s...",0.722,0.507,0.666,0.422,0.632
Baseline,Soft Robotics,0.556,0.755,0.587,0.464,0.534
Predicted,"""crop yield estimation"" OR ""crop simulation mo...",0.652,0.25,0.494,0.638,0.649
Baseline,Crop Yield Prediction,0.543,0.622,0.558,0.573,0.549
Predicted,"\n(""process adaptation"" OR ""processes adaptati...",0.279,0.477,0.304,0.449,0.302
Baseline,Software Process Line,0.605,0.369,0.536,0.43,0.559
Predicted,"(synthetic biology) OR ""genome editing"" OR ""mi...",0.586,0.553,0.579,0.502,0.567
Baseline,Synthetic Biology,0.207,0.619,0.239,0.463,0.233


In [91]:
results["Topic"] = results["Query"].copy()
results["Topic"][0::2] = np.nan
plt_df = results.bfill()
plt_df.reset_index(inplace=True)


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [92]:
fig = px.histogram(plt_df, x="Topic", y=["Recall", "Semantic Precision", "MVVE Precision", "Semantic F2", "MVVE F2"], title="Metrics for Predicted and Baseline Queries", barmode="group",
                   facet_row="index", facet_row_spacing=0.1)

# Sort by value
fig.for_each_trace(lambda t: t.update(x=t.x[::-1], y=t.y[::-1]))
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(yaxis1_title="", yaxis2_title="", yaxis1_dtick=0.2, yaxis2_dtick=0.2)
fig.show()

In [128]:
# difference in metrics between predicted and baseline (Recall	Semantic Precision	Semantic F2	MVVE Precision	MVVE F2	)
diff_df = results.loc["Predicted"][["Recall","Semantic Precision" ,"Semantic F2" ,"MVVE Precision", "MVVE F2"]].values\
      - results.loc["Baseline"][["Recall","Semantic Precision" ,"Semantic F2" ,"MVVE Precision", "MVVE F2"]].values

diff_df = pd.DataFrame(diff_df.round(3), columns=["Recall","Semantic Precision" ,"Semantic F2" ,"MVVE Precision", "MVVE F2"], index=results.loc["Baseline"]["Query"])
diff_df = diff_df.style.map(lambda x: 'background-color: #6b0801' if x < -0.5 else 'background-color: #a82b22' if x < 0 else '')
diff_df.format("{:.3f}").set_caption("Difference in metrics between predicted and baseline (Red indicates a decrease in the metric in comparison to the baseline")

Unnamed: 0_level_0,Recall,Semantic Precision,Semantic F2,MVVE Precision,MVVE F2
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Robotic Arthroplasty,0.0,-0.639,-0.596,-0.121,-0.083
Soft Robotics,0.166,-0.248,0.079,-0.042,0.098
Crop Yield Prediction,0.109,-0.372,-0.064,0.065,0.1
Software Process Line,-0.326,0.108,-0.232,0.019,-0.257
Synthetic Biology,0.379,-0.066,0.34,0.039,0.334
Resilience in Business and management,0.222,-0.194,0.233,0.035,0.252
Cervical Myelopathy,0.085,-0.294,-0.042,0.029,0.066
Drones in Agriculture,0.48,-0.297,0.39,-0.096,0.458
Tourism Growth Nexus,0.0,-0.357,-0.076,-0.023,0.0
Sustainable Biofuel Economy,0.34,-0.147,0.353,-0.056,0.353
