In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules (such as
# `tools`) are reloaded before each cell executes, so edits to them are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import time
import numpy as np
import pandas as pd
from pathlib import Path
from collections import OrderedDict
from tools import eval_methods, selection_methods, resolution_methods
from tqdm.auto import tqdm
tqdm.pandas()

# Quicks queries (dev file) and the matching alternate-name table.
df = pd.read_pickle(
    "../processed/quicks/quicks_dev.pkl"
)
alts_df = pd.read_pickle(
    "../processed/quicks/quicks_altname_dev.pkl"
)

# Wikidata resources: the altname table handed to every candidate selection /
# resolution method below, and the gazetteer used by the distance-based
# evaluation.
wkdt_alts_df = pd.read_pickle(
    "../processed/wikidata/altname_british_isles_stations_gazetteer.pkl"
)
gazetteer_df = pd.read_csv(
    "../processed/wikidata/british_isles_gazetteer.csv",
    header=0,
    index_col=None,
    low_memory=False,
)

In [None]:
# ---------------
# Skyline: best possible result considering NIL entities
# Only the "Final Wikidata ID" column is needed, so the selection method is
# applied to that column directly; the result keeps df's index.
approach = "cr_skyline"
df[approach] = df["Final Wikidata ID"].apply(
    lambda wikidata_id: selection_methods.skyline(wikidata_id, wkdt_alts_df)
)

# ---------------
# Perfect Match
# Candidates are computed once per query string in "SubStFormatted" and stored
# in the `cr_perfect_match` column. (A second, identical apply whose result
# was thrown away used to follow this line; it only doubled the runtime and
# has been dropped.)
approach = "cr_perfect_match"
df[approach] = df["SubStFormatted"].apply(
    lambda query: selection_methods.perfect_match(query, wkdt_alts_df)
)

# ---------------
# Partial Match
# Applied column-wise to the "SubStFormatted" query strings; the result is
# stored in the `cr_partial_match` column.
approach = "cr_partial_match"
df[approach] = df["SubStFormatted"].apply(
    lambda query: selection_methods.partial_match(query, wkdt_alts_df)
)

# ---------------
# DeezyMatch
approach = "cr_deezy_match"
quicks_query_column = "SubStFormatted"  # df column holding the query strings

# DeezyMatch resource names
dm_model = "wikidata_british_isles"
inputfile = "input_dfm"
candidates = "british_isles_stations"
queries = "quicks_stations"

# Candidate ranking settings
candrank_metric = "faiss"  # 'faiss', 'cosine', 'conf'
candrank_thr = 5
num_candidates = 10

# All arguments are positional, in the order the helper is called with
# elsewhere in this notebook.
df[approach] = selection_methods.find_deezymatch_candidates(
    wkdt_alts_df,
    df,
    quicks_query_column,
    dm_model,
    inputfile,
    candidates,
    queries,
    candrank_metric,
    candrank_thr,
    num_candidates,
)

# ----------------
# DeezyMatch: altnames
# Same DeezyMatch setup as for the main queries, but run over the "Altname"
# column of alts_df and keeping only 3 candidates per altname.
approach = "cr_deezy_match_alts"
quicks_query_column = "Altname"

# DeezyMatch resource names
dm_model = "wikidata_british_isles"
inputfile = "input_dfm"
candidates = "british_isles_stations"
queries = "quicks_stations"

# Candidate ranking settings
candrank_metric = "faiss"  # 'faiss', 'cosine', 'conf'
candrank_thr = 5
num_candidates = 3

alts_df[approach] = selection_methods.find_deezymatch_candidates(
    wkdt_alts_df,
    alts_df,
    quicks_query_column,
    dm_model,
    inputfile,
    candidates,
    queries,
    candrank_metric,
    candrank_thr,
    num_candidates,
)
# Add deezymatch altnames to dataframe:
# merge the candidates of every altname row sharing a SubId into one dict per
# SubId. Each SubId accumulates into a fresh dict, so the dicts stored in
# alts_df["cr_deezy_match_alts"] are never mutated (storing the first row's
# dict directly and then calling .update on it would grow that row's dict in
# place).
# NOTE(review): merged entries keep insertion order, and a key repeated across
# altnames takes the value from the last altname row. If the values are scores
# that rank-based evaluation depends on, a re-sort / min-merge may be wanted —
# confirm against selection_methods.find_deezymatch_candidates.
dAlts = dict()
for sub_id, alt_cands in zip(alts_df["SubId"], alts_df["cr_deezy_match_alts"]):
    dAlts.setdefault(sub_id, dict()).update(alt_cands)

# One independent dict per df row; SubIds with no altname row get an empty dict.
altn_candidates = [dict(dAlts.get(sub_id, {})) for sub_id in df["SubId"]]
df[approach] = altn_candidates

# ---------------
# Store candidate selection
# Written to the notebook's working directory; the evaluation cell reads this
# same file back.
df.to_pickle(path="candranking_df.pkl")

In [9]:
# Reload the stored candidate selection and collect every candidate ranking
# column (all columns whose name starts with "cr_").
df = pd.read_pickle("candranking_df.pkl")
candrank_approaches = [col for col in df.columns if col.startswith("cr_")]

# (printed label, row-level metric from eval_methods, cutoff passed to it)
ranking_metrics = [
    ("p1", eval_methods.pAt, 1),
    ("p5", eval_methods.pAt, 5),
    ("p10", eval_methods.pAt, 10),
    ("map5", eval_methods.avgP, 5),
    ("map10", eval_methods.avgP, 10),
]

for approach in candrank_approaches:
    print(approach)
    for label, metric_fn, cutoff in ranking_metrics:
        # Row-level metric, averaged over all rows of df.
        per_row = df.apply(lambda row: metric_fn(row, approach, cutoff, False), axis=1)
        print(label + ":", per_row.mean())
    print()

cr_skyline
p1: 0.8
p5: 0.8
p10: 0.8
map5: 0.8
map10: 0.8

cr_perfect_match
p1: 0.5674418604651162
p5: 0.6046511627906976
p10: 0.6046511627906976
map5: 0.586046511627907
map10: 0.586046511627907

cr_partial_match
p1: 0.4697674418604651
p5: 0.6
p10: 0.627906976744186
map5: 0.5267441860465116
map10: 0.5312846068660022

cr_deezy_match
p1: 0.6
p5: 0.641860465116279
p10: 0.641860465116279
map5: 0.6197674418604651
map10: 0.6197674418604651

cr_deezy_match_alts
p1: 0.06046511627906977
p5: 0.11627906976744186
p10: 0.11627906976744186
map5: 0.08310077519379845
map10: 0.08310077519379845



In [None]:
# Pick which candidate ranking column to resolve and with which method; the
# output column is named "<candrank_approach>_<topres_approach>".
candrank_approach = "cr_deezy_match"
topres_approach = "first_match"
column_name = f"{candrank_approach}_{topres_approach}"

# Work on a copy so the candidate ranking dataframe itself is left untouched.
df_resolved = df.copy()

# Resolution methods
df_resolved[column_name] = df_resolved[candrank_approach].apply(
    lambda ranked_candidates: resolution_methods.first_match(ranked_candidates, wkdt_alts_df)
)

In [11]:
# Evaluation methods
# Both helpers score the resolved column `column_name` of df_resolved; the
# second one additionally receives the gazetteer dataframe. Their return values
# are not used here — the scores appear in the cell output (Hamming loss,
# accuracy, Jaccard score, and accuracy at 1/5/10).
eval_methods.topres_exactmetrics(df_resolved, column_name)
eval_methods.topres_distancemetrics(gazetteer_df, df_resolved, column_name)

Hamming Loss: 0.3953488372093023
Accuracy Score: 0.6046511627906976
Jaccard Score: 0.5543290043290043
Accuracy at 1: 0.6465116279069767
Accuracy at 5: 0.7209302325581395
Accuracy at 10: 0.7209302325581395
