In [None]:
import time
import numpy as np
import pandas as pd
from pathlib import Path
from collections import OrderedDict
from tools import eval_methods, selection_methods
from tqdm.auto import tqdm
tqdm.pandas()

# Data split to process; the file names below are built from it ("dev" or "test").
setting = "dev"

# Frames for the selected split, loaded from the quicks pickles.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files produced by this project's own processing steps.
df = pd.read_pickle(f"../processed/quicks/quicks_{setting}.pkl")
alts_df = pd.read_pickle(f"../processed/quicks/quicks_altname_{setting}.pkl")

# Wikidata gazetteer pickles (these do not depend on the split).
wkdt_df = pd.read_pickle(
    "../processed/wikidata/altname_british_isles_gazetteer.pkl"
)
wkdt_alts_df = pd.read_pickle(
    "../processed/wikidata/altname_british_isles_stations_gazetteer.pkl"
)

In [None]:
# ---------------
# Baseline candidate selection. Each entry is applied row by row to df: the
# selector receives the value of the source column together with wkdt_alts_df,
# and whatever it returns is stored in the target "cr_" column.
#   cr_skyline       - Skyline: best possible result considering NIL entities
#   cr_perfect_match - Perfect Match
#   cr_partial_match - Partial Match
baseline_selectors = (
    ("cr_skyline", "Final Wikidata ID", selection_methods.skyline),
    ("cr_perfect_match", "SubStFormatted", selection_methods.perfect_match),
    ("cr_partial_match", "SubStFormatted", selection_methods.partial_match),
)

for target_col, source_col, selector in baseline_selectors:
    # The lambda is consumed immediately by apply, so closing over the loop
    # variables is safe here.
    df[target_col] = df.apply(
        lambda record: selector(record[source_col], wkdt_alts_df),
        axis=1,
    )

# ---------------
# DeezyMatch
# Queries are taken from the "SubStFormatted" column of df and looked up
# against wkdt_alts_df; the result is stored in df["cr_deezy_match"].
quicks_query_column = "SubStFormatted"
candidates, queries = "british_isles_stations", "quicks_stations"
dm_model, inputfile = "wikidata_british_isles", "input_dfm"
candrank_metric = "faiss"  # one of: 'faiss', 'cosine', 'conf'
candrank_thr, num_candidates = 5, 10

df["cr_deezy_match"] = selection_methods.find_deezymatch_candidates(
    wkdt_alts_df,
    df,
    quicks_query_column,
    dm_model,
    inputfile,
    candidates,
    queries,
    candrank_metric,
    candrank_thr,
    num_candidates,
)

# ----------------
# DeezyMatch: main place names
# Queries are taken from the "MainStation" column of df and looked up against
# wkdt_df; the result is stored in df["cr_deezy_match_places"].
# NOTE(review): `queries` keeps the label "quicks_stations" used by the
# SubStFormatted run even though the query column differs - confirm that
# find_deezymatch_candidates does not reuse anything cached under that name.
quicks_query_column = "MainStation"
candidates, queries = "british_isles", "quicks_stations"
dm_model, inputfile = "wikidata_british_isles", "input_dfm"
candrank_metric = "faiss"  # one of: 'faiss', 'cosine', 'conf'
candrank_thr, num_candidates = 5, 10

df["cr_deezy_match_places"] = selection_methods.find_deezymatch_candidates(
    wkdt_df,
    df,
    quicks_query_column,
    dm_model,
    inputfile,
    candidates,
    queries,
    candrank_metric,
    candrank_thr,
    num_candidates,
)

# ----------------
# DeezyMatch: altnames
# Queries are taken from the "Altname" column of alts_df (not df) and looked
# up against wkdt_alts_df; only 3 candidates are requested per query here.
# The result is stored on alts_df, one entry per altname row.
# NOTE(review): `queries` keeps the label "quicks_stations" used by the
# SubStFormatted run even though the query column differs - confirm that
# find_deezymatch_candidates does not reuse anything cached under that name.
quicks_query_column = "Altname"
candidates, queries = "british_isles_stations", "quicks_stations"
dm_model, inputfile = "wikidata_british_isles", "input_dfm"
candrank_metric = "faiss"  # one of: 'faiss', 'cosine', 'conf'
candrank_thr, num_candidates = 5, 3

alts_df["cr_deezy_match_alts"] = selection_methods.find_deezymatch_candidates(
    wkdt_alts_df,
    alts_df,
    quicks_query_column,
    dm_model,
    inputfile,
    candidates,
    queries,
    candrank_metric,
    candrank_thr,
    num_candidates,
)

# Add deezymatch altnames to dataframe:
# alts_df holds one row per altname, and several rows can share a "SubId".
# Merge the candidate dicts of all rows with the same SubId into one dict,
# then attach the merged dict to the matching row of df.
#
# Each SubId accumulates into a *fresh* dict (via setdefault). Storing the
# row's own dict and calling .update() on it would mutate the dict object held
# inside alts_df["cr_deezy_match_alts"], silently changing that frame.
#
# NOTE(review): when the same candidate key occurs under several altnames of
# one SubId, the value from the later row overwrites the earlier one
# (dict.update semantics) - confirm whether the best score should be kept
# instead.
dAlts = dict()
for sub_id, alt_candidates in zip(alts_df["SubId"], alts_df["cr_deezy_match_alts"]):
    dAlts.setdefault(sub_id, dict()).update(alt_candidates)

# One independent copy per df row, so rows sharing a SubId do not share a dict;
# rows without any altname candidates get an empty dict.
altn_candidates = [dict(dAlts.get(sub_id, {})) for sub_id in df["SubId"]]
df["cr_deezy_match_alts"] = altn_candidates

# ---------------
# Store candidate selection
# Written to the current working directory, one file per split.
df.to_pickle(f"candranking_{setting}.pkl")

In [None]:
# Reload the stored candidate selection for the current split and report the
# mean of each metric for every candidate-ranking column (prefix "cr_").
df = pd.read_pickle(f"candranking_{setting}.pkl")
candrank_approaches = [column for column in df.columns if column.startswith("cr_")]

# (printed label, metric function, cutoff), in the order they are reported.
# The trailing False passed to each metric is forwarded as in the original
# call; its meaning is defined in eval_methods and not visible here.
metric_specs = (
    ("p1:", eval_methods.pAt, 1),
    ("p5:", eval_methods.pAt, 5),
    ("p10:", eval_methods.pAt, 10),
    ("map5:", eval_methods.avgP, 5),
    ("map10:", eval_methods.avgP, 10),
)

for approach in candrank_approaches:
    print(approach)
    for label, metric_fn, cutoff in metric_specs:
        per_row_scores = df.apply(
            lambda row: metric_fn(row, approach, cutoff, False), axis=1
        )
        print(label, per_row_scores.mean())
    print()