#### The goal of this notebook is to provide an evaluation overview.

In [1]:
import plotly.express as px
import plotly.io as pio
import pandas as pd
import numpy as np
# Use plotly's "seaborn" template as the default for every figure created below.
pio.templates.default = "seaborn"

In [2]:
# Load the evaluation spreadsheet; its unnamed first column is exposed as "Index".
results = pd.read_excel("results.xlsx").rename(columns={"Unnamed: 0": "Index"})

# "Topic" mirrors "Query" on every other row (0, 2, 4, ...); the rows in
# between are blanked so they can later inherit the value from the row above.
topic = results["Query"].copy()
topic.iloc[1::2] = np.nan
results["Topic"] = topic

# Plotting frame: forward-fill the gaps, then keep only rows whose Recall is non-zero.
filled = results.ffill()
plt_df = filled.loc[filled["Recall"] != 0]


In [3]:
# Metric columns plotted as grouped bars; each column is listed exactly once.
plot_metrics = [
    "Recall",
    "Cosine Precision", "Cosine F2",
    "Cluster Precision", "Cluster F2",
    "MVEE Precision", "MVEE F2",
    "Hull Precision", "Hull F2",
]

# One facet row per value of "Index", topics along the x axis.
fig = px.histogram(plt_df, x="Topic", y=plot_metrics,
                   title="Metrics for Predicted and Baseline Queries", barmode="group",
                   facet_row="Index", facet_row_spacing=0.1)

# Reverse the order of the points inside every trace.
fig.for_each_trace(lambda t: t.update(x=t.x[::-1], y=t.y[::-1]))
# Keep only the text after the last "=" in each facet annotation.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# Blank the y-axis titles and use a tick step of 0.2 on both facet rows.
fig.update_layout(yaxis1_title="", yaxis2_title="", yaxis1_dtick=0.2, yaxis2_dtick=0.2)
fig.show()

In [4]:
# Metric columns compared between the two runs. Declared once so the column
# selection and the labels of the resulting table cannot drift apart.
metric_cols = [
    "Recall",
    "Cosine Precision", "Cosine F2",
    "Cluster Precision", "Cluster F2",
    "MVEE Precision", "MVEE F2",
    "Hull Precision", "Hull F2",
]

# Predicted minus Baseline. `.values` discards the index, so the subtraction is
# positional: it relies on both subsets having the same number of rows in the
# same order.
predicted = results.loc[results["Index"] == "Predicted", metric_cols].values
baseline = results.loc[results["Index"] == "Baseline", metric_cols].values
diff_values = (predicted - baseline).astype(float)

# Label the rows with the non-missing topics. Only the "Topic" column is
# inspected, so a missing value in some other column cannot silently remove a
# label and break the row count.
diff_table = pd.DataFrame(
    np.round(diff_values, 3),
    columns=metric_cols,
    index=results["Topic"].dropna(),
)


def _highlight_negative(value):
    """Return the cell CSS: dark red below -0.5, lighter red below 0, else none."""
    if value < -0.5:
        return 'background-color: #6b0801'
    if value < 0:
        return 'background-color: #a82b22'
    return ''


# NOTE: `diff_df` is a pandas Styler (not a DataFrame); the name is kept
# because the following cell uses it.
diff_df = (
    diff_table.style
    .map(_highlight_negative)
    .format("{:.3f}")
    .set_caption("Difference in metrics between predicted and baseline (Negative means the baseline is better)")
)

# Show the table with its last 7 rows hidden.
display(diff_df.hide(subset=diff_df.index[-7:]))

Unnamed: 0_level_0,Recall,Cosine Precision,Cosine F2,Cluster Precision,Cluster F2,MVEE Precision,MVEE F2,Hull Precision,Hull F2
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Robotic Arthroplasty,0.0,-0.745,-0.55,-0.528,-0.49,-0.741,-0.55,-0.657,-0.55
Soft Robotics,0.083,-0.113,-0.13,-0.147,-0.06,-0.227,-0.22,-0.095,-0.15
Crop Yield Prediction,0.108,-0.262,-0.17,-0.118,-0.01,-0.251,-0.35,-0.216,-0.23
Synthetic Biology,0.31,-0.042,0.03,-0.185,0.14,0.3,0.15,0.294,0.29
Resilience in Business and management,0.148,-0.017,0.04,-0.838,0.2,0.133,0.42,0.072,0.28
Cervical Myelopathy,0.085,-0.288,-0.16,-0.299,-0.24,-0.084,-0.22,-0.026,-0.15
Drones in Agriculture,0.44,-0.158,0.22,0.298,0.37,0.256,0.58,0.154,0.46
Tourism Growth Nexus,0.0,-0.505,-0.07,0.31,0.0,0.0,0.0,0.0,0.0
Sustainable Biofuel Economy,0.26,-0.106,0.24,0.733,0.3,0.73,0.63,0.512,0.7
Perovskite Solar Cells Stability,0.103,-0.255,-0.32,-0.213,-0.17,-0.004,-0.34,-0.072,-0.23


In [5]:
# Hide the first 15 rows of the styled table so that only the rows after them are shown.
leading_rows = diff_df.index[:15]
diff_df.hide(subset=leading_rows)

Unnamed: 0_level_0,Recall,Cosine Precision,Cosine F2,Cluster Precision,Cluster F2,MVEE Precision,MVEE F2,Hull Precision,Hull F2
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Software Process Line,-0.209,0.27,0.26,0.319,0.14,0.119,0.2,0.162,0.29
Data Stream Processing Latency,-0.073,-0.021,-0.08,-0.321,-0.1,-0.136,-0.13,-0.045,-0.06
Business Process Meta Models,0.269,0.032,0.17,-0.082,0.09,0.499,0.53,0.334,0.52
Multicore Performance Prediction,0.0,0.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cloud Migration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Software Fault Prediction Metrics,0.542,-0.543,-0.01,0.061,0.44,-0.141,-0.19,-0.048,-0.1
Software Defect Prediction,-0.111,-0.552,-0.27,-0.446,-0.21,-0.564,-0.77,-0.434,-0.75
