# Evaluation

In [1]:
import numpy as np
import pandas as pd
from tools import eval_methods

### Evaluate candidate selection and ranking

In [2]:
# Short labels for each gazetteer combination, used to build the "Approach" column:
dRenameCombs = {"stations":"stns", "stations+alts":"stns+alts", "stations+places+alts":"stns+alts+plcs"}

# Options:
devtest_settings = ["test"]
cr_approaches = ["perfect_match", "partial_match", "deezy_match"]
ncand_options = [1, 3, 5]
combinations = [["stations"], ["stations", "alts"], ["stations", "places", "alts"]]

# One (p, map, retr) triple is reported per entry in ncand_options:
metric_columns = ["p", "map", "retr"] * len(ncand_options)

for setting in devtest_settings:

    # The candidate ranking file depends on the approach and on nv, but not on
    # the combination, so each file is read once per split rather than once per
    # combination. Perfect match always returns just candidates where nv=1, so
    # it is neither loaded nor evaluated for larger nv:
    ranked_dfs = {}
    for approach in cr_approaches:
        for num_candidates in ncand_options:
            if approach == "perfect_match" and num_candidates > 1:
                continue
            ranked_dfs[(approach, num_candidates)] = pd.read_pickle("../processed/resolution/candranking_" + approach + "_" + setting + str(num_candidates) + ".pkl")

    for comb in combinations:

        print("==========================================")
        print("Split:", setting)
        print("Combination:", comb)

        # exact_station is forwarded to the eval_methods metrics; it is switched
        # off only for the combination that includes places, and it determines
        # the label shown in the "Eval" column:
        exact_station = comb != ["stations", "places", "alts"]
        annotation = "Strict" if exact_station else "Appr"

        eval_results = []
        for approach in cr_approaches:

            # Get relevant columns from dataframe:
            relv_columns = ["cr_" + approach + "_" + c for c in comb]

            approach_renamed = approach.split("_")[0]
            if approach_renamed == "perfect":
                approach_renamed = "exact"

            appr_results = []
            for num_candidates in ncand_options:

                test_df = ranked_dfs.get((approach, num_candidates))
                if test_df is None:
                    # Not reported for this approach/nv pair (perfect match, nv > 1):
                    appr_results += [np.nan, np.nan, np.nan]
                    continue

                # Report performance:
                p = test_df.apply(lambda row: eval_methods.pAt(row, approach, relv_columns, exact_station), axis=1).mean()
                mapAt = test_df.apply(lambda row: eval_methods.avgP(row, approach, relv_columns, exact_station), axis=1).mean()
                isRetrieved = test_df.apply(lambda row: eval_methods.isRetrieved(row, approach, relv_columns, exact_station), axis=1).mean()
                appr_results += [p, mapAt, isRetrieved]

            eval_results.append([annotation, approach_renamed + ":" + dRenameCombs["+".join(comb)]] + appr_results)

        cr_eval_df = pd.DataFrame(eval_results, columns = ["Eval", "Approach"] + metric_columns)
        cr_eval_df = cr_eval_df.round(2)
        # Keep the metric columns numeric and let to_latex render missing values
        # and decimals. Filling NaNs with a string first would turn the columns
        # into object dtype, which prints 0.6 next to 0.66 and left-aligns them:
        print(cr_eval_df.to_latex(index=False, na_rep="--", float_format="{:.2f}".format))
        print("==========================================")
        print()

Split: test
Combination: ['stations']
\begin{tabular}{llrrrllllll}
\toprule
  Eval &     Approach &    p &  map & retr &     p &   map &  retr &     p &   map &  retr \\
\midrule
Strict &   exact:stns & 0.66 & 0.68 & 0.71 &    -- &    -- &    -- &    -- &    -- &    -- \\
Strict & partial:stns & 0.66 & 0.68 & 0.71 &   0.6 &  0.68 &  0.72 &  0.59 &  0.69 &  0.72 \\
Strict &   deezy:stns & 0.67 & 0.69 & 0.72 &  0.56 &  0.69 &  0.72 &  0.55 &  0.69 &  0.72 \\
\bottomrule
\end{tabular}


Split: test
Combination: ['stations', 'alts']
\begin{tabular}{llrrrllllll}
\toprule
  Eval &          Approach &    p &  map & retr &     p &   map &  retr &     p &   map &  retr \\
\midrule
Strict &   exact:stns+alts & 0.64 & 0.68 & 0.72 &    -- &    -- &    -- &    -- &    -- &    -- \\
Strict & partial:stns+alts & 0.64 & 0.69 & 0.72 &  0.57 &  0.67 &  0.73 &  0.56 &  0.68 &  0.73 \\
Strict &   deezy:stns+alts & 0.63 & 0.69 & 0.73 &  0.52 &  0.69 &  0.73 &  0.51 &  0.69 &  0.73 \\
\bottomrule
\end{tabul

### Evaluate entity resolution

In [3]:
# Load the gazetteer CSV: the first row holds the column names and the first
# column is used as the index. The frame is later passed to the distance-based
# resolution metrics.
gazetteer_df = pd.read_csv(
    "../processed/wikidata/gb_gazetteer.csv",
    header=0,
    index_col=0,
    low_memory=False,
)

In [4]:
cr_approaches = ["deezy_match"]
ncand_options = [1, 5]

# Resolution approaches to evaluate. The numbered candrank_most_confident_* and
# ranklib_* entries are averaged into a single row each further below:
approachList = ["skyline", "candrank_most_confident_1", "candrank_most_confident_2", "candrank_most_confident_3", "candrank_most_confident_4", "candrank_most_confident_5", "wikipedia_most_relevant", "semantically_most_similar", "ranklib_1", "ranklib_2", "ranklib_3", "ranklib_4", "ranklib_5", "our_method_all", "our_method_comb"]

# Dictionary of shorter names for the approaches:
dApproachNames = {"candrank_most_confident":"string confidence", "wikipedia_most_relevant":"wikipedia relevance", "semantically_most_similar":"semantic coherence", "our_method_all":"SVM simple", "our_method_comb":"SVM refined", "skyline": "skyline","ranklib":"RankLib"}

# Prefixes of the numbered entries that are merged into one averaged row:
mergedPrefixes = ["candrank_most_confident", "ranklib"]

# Order in which the (merged) approaches appear in the printed table. Selecting
# rows by name keeps the order correct even if approachList is reordered:
reportOrder = ["skyline", "candrank_most_confident", "wikipedia_most_relevant", "semantically_most_similar", "ranklib", "our_method_all", "our_method_comb"]

metric_columns = ["PStr", "PAppr", "Acc@1km", "Acc@5km", "Acc@10km"]

for candrank_method in cr_approaches:
    for num_candidates in ncand_options:

        results_test_df = pd.read_pickle("../processed/resolution/resolved_" + candrank_method + "_test" + str(num_candidates) + ".pkl")

        eval_results = []
        for topres_approach in approachList:

            # Defaults for the skyline row, which only reports the strict score:
            acc_at = (np.nan, np.nan, np.nan)
            exact_acc_approx = np.nan

            exact_acc_strict = eval_methods.topres_exactmetrics(results_test_df, topres_approach, True)

            if topres_approach != "skyline":
                acc_at = eval_methods.topres_distancemetrics(gazetteer_df, results_test_df, topres_approach, False)
                exact_acc_approx = eval_methods.topres_exactmetrics(results_test_df, topres_approach, False)

            eval_results.append([topres_approach, exact_acc_strict, exact_acc_approx, acc_at[0], acc_at[1], acc_at[2]])

        tr_eval_df = pd.DataFrame(eval_results, columns = ["Approach"] + metric_columns)

        # Merge candrank and ranklib averages. The mean is taken over the metric
        # columns only (never the string "Approach" column), and rows are added
        # with pd.concat because DataFrame.append no longer exists in pandas 2:
        merged_rows = []
        for prefix in mergedPrefixes:
            is_numbered = tr_eval_df["Approach"].str.startswith(prefix + "_")
            merged = tr_eval_df.loc[is_numbered, metric_columns].mean(axis=0).to_dict()
            merged["Approach"] = prefix
            merged_rows.append(merged)
            tr_eval_df = tr_eval_df[~is_numbered]
        tr_eval_df = pd.concat([tr_eval_df, pd.DataFrame(merged_rows)], ignore_index=True)

        # Reorder rows by approach name (raises KeyError if one is missing):
        tr_eval_df = tr_eval_df.set_index("Approach").loc[reportOrder].reset_index()

        # Simplify method name and append the candidate ranking setting:
        setting_suffix = " (" + candrank_method.split("_")[0] + ", nv=" + str(num_candidates) + ")"
        tr_eval_df["Approach"] = tr_eval_df["Approach"].replace(dApproachNames) + setting_suffix

        # Round values, and print the latex table. Columns stay numeric so that
        # to_latex renders every value with two decimals and "-" for missing:
        tr_eval_df = tr_eval_df.round(2)
        print(tr_eval_df.to_latex(index=False, na_rep="-", float_format="{:.2f}".format))
        print()

\begin{tabular}{lrllll}
\toprule
                         Approach &  PStr & PAppr & Acc@1km & Acc@5km & Acc@10km \\
\midrule
            skyline (deezy, nv=1) &  0.73 &     - &       - &       - &        - \\
  string confidence (deezy, nv=1) &  0.66 &  0.69 &    0.77 &    0.84 &     0.85 \\
wikipedia relevance (deezy, nv=1) &  0.10 &  0.16 &    0.54 &     0.8 &     0.81 \\
 semantic coherence (deezy, nv=1) &  0.30 &  0.32 &    0.58 &    0.78 &     0.79 \\
            RankLib (deezy, nv=1) &  0.68 &   0.7 &    0.79 &    0.85 &     0.86 \\
         SVM simple (deezy, nv=1) &  0.68 &  0.71 &     0.8 &    0.86 &     0.86 \\
        SVM refined (deezy, nv=1) &  0.67 &   0.7 &    0.79 &    0.86 &     0.86 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrllll}
\toprule
                         Approach &  PStr & PAppr & Acc@1km & Acc@5km & Acc@10km \\
\midrule
            skyline (deezy, nv=5) &  0.73 &     - &       - &       - &        - \\
  string confidence (deezy, nv=5) &  0.66 &  0.6