# Candidate selection

In [1]:
import time
import numpy as np
import pandas as pd
from pathlib import Path
from collections import OrderedDict
from tools import eval_methods, selection_methods
from tqdm.auto import tqdm
# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. `DataFrame.progress_apply`).
tqdm.pandas()

# Dataset split to process: it selects which "quicks_<setting>.pkl" inputs are
# read and names the "candranking_<setting>.pkl" output written below.
setting = "dev" # dev or test

### Perform candidate selection and ranking

In [2]:
# Skip candidate selection when its output for this split has already been stored.
if not Path(f"candranking_{setting}.pkl").is_file():

    # Queries for this split: the Quicks entries and their alternate names.
    df = pd.read_pickle(f"../processed/quicks/quicks_{setting}.pkl")
    alts_df = pd.read_pickle(f"../processed/quicks/quicks_altname_{setting}.pkl")

    # Gazetteers the queries are matched against (places and stations).
    wkdt_df_places = pd.read_pickle(
        "../processed/wikidata/altname_british_isles_gazetteer.pkl"
    )
    wkdt_df_stations = pd.read_pickle(
        "../processed/wikidata/altname_british_isles_stations_gazetteer.pkl"
    )

    # ---------------
    # Perfect Match, then Partial Match
    # Both matchers run over the same three pairings of (query dataframe,
    # query column, gazetteer); results are stored in "cr_<method>_<source>".
    for method_name, matcher, done_message in (
        ("perfect_match", selection_methods.perfect_match, "Perfect match done!"),
        ("partial_match", selection_methods.partial_match, "Partial match done!"),
    ):
        for queries_df, query_column, gazetteer, source in (
            (df, "SubStFormatted", wkdt_df_stations, "stations"),
            (df, "MainStation", wkdt_df_places, "places"),
            (alts_df, "Altname", wkdt_df_stations, "alts"),
        ):
            # The lambda is consumed immediately by `apply`, so capturing the
            # loop variables here is safe.
            queries_df[f"cr_{method_name}_{source}"] = queries_df.apply(
                lambda row: matcher(row[query_column], gazetteer), axis=1
            )
        print(done_message)

    # ---------------
    # DeezyMatch
    # Settings shared by all three DeezyMatch runs below.
    dm_model = "wikidata_british_isles"
    inputfile = "input_dfm"
    candrank_metric = "faiss" # 'faiss', 'cosine', 'conf'
    candrank_thr = 5
    num_candidates = 3

    # One run per source:
    # (candidates, queries, query column, gazetteer, query dataframe, output column)
    deezy_runs = [
        ("british_isles_stations", "quicks_stations", "SubStFormatted", wkdt_df_stations, df, "cr_deezy_match_stations"),
        ("british_isles", "quicks_places", "MainStation", wkdt_df_places, df, "cr_deezy_match_places"),
        ("british_isles_stations", "quicks_altns", "Altname", wkdt_df_stations, alts_df, "cr_deezy_match_alts"),
    ]
    for candidates, queries, quicks_query_column, gazetteer_df, queries_df, output_column in deezy_runs:
        queries_df[output_column] = selection_methods.find_deezymatch_candidates(
            gazetteer_df,
            queries_df,
            quicks_query_column,
            dm_model,
            inputfile,
            candidates,
            queries,
            candrank_metric,
            candrank_thr,
            num_candidates,
        )
    print("Deezy match done!")

    # Add the altname candidates to the main dataframe, one column per approach:
    for appr in ["perfect_match", "partial_match", "deezy_match"]:
        alts_column = "cr_" + appr + "_alts"

        # Merge the candidates of all altnames that share a SubId. Every SubId
        # starts from a fresh dict, so the candidate dicts stored in alts_df
        # are only read here and never updated in place.
        # NOTE(review): when two altnames yield the same candidate, the value
        # from the later alts_df row wins -- confirm that is the intended merge.
        dAlts = dict()
        for sub_id, candidates_of_altname in zip(alts_df["SubId"], alts_df[alts_column]):
            dAlts.setdefault(sub_id, dict()).update(candidates_of_altname)

        # Align with df: each row gets its own copy of the merged candidates,
        # and rows whose SubId has no altnames get an empty dict.
        altn_candidates = [dict(dAlts.get(sub_id, {})) for sub_id in df["SubId"]]
        df[alts_column] = altn_candidates

    # ---------------
    # Store candidate selection
    # The presence of this file is what makes the guard at the top of the cell
    # skip candidate selection on later runs.
    df.to_pickle(f"candranking_{setting}.pkl")

### Evaluate candidate selection and ranking

In [3]:
def _mean_metric(eval_df, metric, approach, relv_columns, cutoff, reverse_values):
    """Return the mean over all rows of `eval_df` of `metric` evaluated at `cutoff`.

    `metric` is one of the `eval_methods` functions; it is called once per row
    as `metric(row, approach, relv_columns, cutoff, reverse_values)`.
    """
    return eval_df.apply(
        lambda row: metric(row, approach, relv_columns, cutoff, reverse_values),
        axis=1,
    ).mean()


# NOTE(review): this always evaluates the "test" candidate ranking, whatever
# `setting` is. In this notebook that file is only written when candidate
# selection has been run with setting = "test" -- confirm this is intended.
test_df = pd.read_pickle("candranking_test.pkl")

# Groups of candidate sources that are evaluated together; each name is the
# suffix of a "cr_<approach>_<source>" column.
combinations = [["stations"], ["stations", "alts"], ["stations", "places"], ["stations", "places", "alts"]]
# Approaches are discovered from the station columns,
# e.g. "cr_deezy_match_stations" -> "deezy_match".
candrank_approaches = [x.replace("cr_", "").replace("_stations", "") for x in test_df if x.startswith("cr_") and x.endswith("stations")]

eval_results = []
for combination in combinations:
    for approach in candrank_approaches:

        # Get relevant columns from dataframe:
        relv_columns = ["cr_" + approach + "_" + c for c in combination]

        # Every approach except deezy_match is evaluated with reverse_values=True.
        reverse_values = approach != "deezy_match"

        # Report performance:
        p1 = _mean_metric(test_df, eval_methods.pAt, approach, relv_columns, 1, reverse_values)
        p5 = _mean_metric(test_df, eval_methods.pAt, approach, relv_columns, 5, reverse_values)
        p10 = _mean_metric(test_df, eval_methods.pAt, approach, relv_columns, 10, reverse_values)
        map5 = _mean_metric(test_df, eval_methods.avgP, approach, relv_columns, 5, reverse_values)
        map10 = _mean_metric(test_df, eval_methods.avgP, approach, relv_columns, 10, reverse_values)
        isRetrieved = _mean_metric(test_df, eval_methods.isRetrieved, approach, relv_columns, 20, reverse_values)
        eval_results.append([approach, "+".join(combination), p1, p5, p10, map5, map10, isRetrieved])

# Each result row starts with the approach, followed by the "+"-joined sources,
# so the column labels have to be listed in that same order.
cr_eval_df = pd.DataFrame(eval_results, columns=["Approach", "Sources", "p1", "p5", "p10", "map5", "map10", "isRetrieved"])
print(cr_eval_df.round(3).to_latex(index=False))

\begin{tabular}{llrrrrrr}
\toprule
      Sources &             Approach &    p1 &    p5 &   p10 &  map5 &  map10 &  isRetrieved \\
\midrule
perfect\_match &             stations & 0.670 & 0.710 & 0.710 & 0.689 &  0.689 &        0.710 \\
partial\_match &             stations & 0.674 & 0.715 & 0.719 & 0.692 &  0.692 &        0.719 \\
  deezy\_match &             stations & 0.688 & 0.733 & 0.733 & 0.709 &  0.709 &        0.733 \\
perfect\_match &        stations+alts & 0.683 & 0.724 & 0.724 & 0.703 &  0.703 &        0.724 \\
partial\_match &        stations+alts & 0.656 & 0.724 & 0.733 & 0.686 &  0.688 &        0.733 \\
  deezy\_match &        stations+alts & 0.683 & 0.747 & 0.747 & 0.712 &  0.712 &        0.747 \\
perfect\_match &      stations+places & 0.710 & 0.769 & 0.774 & 0.737 &  0.738 &        0.774 \\
partial\_match &      stations+places & 0.181 & 0.389 & 0.543 & 0.250 &  0.271 &        0.665 \\
  deezy\_match &      stations+places & 0.715 & 0.787 & 0.796 & 0.746 &  0.747 &    