In [None]:
import time
import numpy as np
import pandas as pd
from pathlib import Path
from tools import eval_methods, selection_methods, resolution_methods
from tqdm.auto import tqdm
tqdm.pandas()

# Query dataframe (presumably the Quicks development split, judging by the
# file name). NOTE: unpickling can execute arbitrary code, so only load
# pickle files from a trusted source.
quicks_dev_path = "../processed/quicks/quicks_dev.pkl"
df = pd.read_pickle(quicks_dev_path)

# Gazetteer dataframe that is handed to the candidate-selection methods.
gazetteer_path = "../processed/wikidata/altname_british_isles_stations_gazetteer.pkl"
wikidata_df = pd.read_pickle(gazetteer_path)

In [None]:
# Candidate selection: each approach stores its result for every row of `df`
# in its own column, named after the approach (all names start with "cr_").

# ---------------
# Row-wise approaches, applied in this order:
#   * Perfect Match
#   * Partial Match
#   * Skyline: best possible result considering NIL entities
# Each selector receives one field of the row plus the gazetteer dataframe.
row_wise_selectors = {
    "cr_perfect_match": lambda row: selection_methods.perfect_match(row["SubStFormatted"], wikidata_df),
    "cr_partial_match": lambda row: selection_methods.partial_match(row["SubStFormatted"], wikidata_df),
    "cr_skyline": lambda row: selection_methods.skyline(row["Final Wikidata ID"], wikidata_df),
}
for approach, select_candidates in row_wise_selectors.items():
    df[approach] = df.apply(select_candidates, axis=1)

# ---------------
# DeezyMatch: runs once over the whole dataframe instead of row by row.
approach = "cr_deezy_match"
dm_model = "wikidata_british_isles"
candidates = "british_isles_stations"
queries = "quicks_stations"
inputfile = "input_dfm"
candrank_metric = "faiss"  # options: 'faiss', 'cosine', 'conf'
candrank_thr = 10
num_candidates = 10

df[approach] = selection_methods.find_deezymatch_candidates(
    wikidata_df,
    df,
    dm_model,
    inputfile,
    candidates,
    queries,
    candrank_metric,
    candrank_thr,
    num_candidates,
)

# ---------------
# Persist the dataframe, now carrying one "cr_*" column per approach.
df.to_pickle("candranking_df.pkl")

In [3]:
# Load the stored candidate-selection dataframe and collect every column whose
# name starts with "cr_" (one per approach).
df = pd.read_pickle("candranking_df.pkl")
candrank_approaches = [column for column in df.columns if column.startswith("cr_")]

# (printed label, scoring function, value passed as its third argument).
# The trailing `False` passed to each scoring function below is forwarded
# unchanged; its meaning is defined in `eval_methods` (not visible here).
metric_specs = [
    ("p1:", eval_methods.pAt, 1),
    ("p5:", eval_methods.pAt, 5),
    ("p10:", eval_methods.pAt, 10),
    ("map5:", eval_methods.avgP, 5),
    ("map10:", eval_methods.avgP, 10),
]

for approach in candrank_approaches:
    print(approach)
    for label, score_row, cutoff in metric_specs:
        # Score every row, then report the mean over all rows.
        row_scores = df.apply(lambda row: score_row(row, approach, cutoff, False), axis=1)
        print(label, row_scores.mean())
    print()

cr_perfect_match
p1: 0.5674418604651162
p5: 0.6046511627906976
p10: 0.6046511627906976
map5: 0.586046511627907
map10: 0.586046511627907

cr_partial_match
p1: 0.4697674418604651
p5: 0.6
p10: 0.627906976744186
map5: 0.5267441860465116
map10: 0.5312846068660022

cr_skyline
p1: 0.8
p5: 0.8
p10: 0.8
map5: 0.8
map10: 0.8

cr_deezy_match
p1: 0.6
p5: 0.641860465116279
p10: 0.641860465116279
map5: 0.6197674418604651
map10: 0.6197674418604651

