In [None]:
# import utils
import pandas as pd
import collections
import ast
from wikimapper import WikiMapper
from pathlib import Path
import urllib.parse
import json
import html
from tqdm.auto import tqdm
tqdm.pandas()

# ---- Experiment configuration ----------------------------------
# Resource names; they are combined below into the file names to load.
candidates = "stnwikidata_candidates"
min_wkdtgazetteer = "stnwikidata_gazetteer"
max_wkdtgazetteer = "british_isles_stations"
queries_mainst = "quicks_mainst_queries"
queries_subst = "quicks_subst_queries"
queries_altns = "quicks_altnames"
queries_refs = "quicks_referenced"
dm_model = "wikigaz_en_002"
candrank_metric = "faiss" # 'faiss', 'cosine', 'conf'

# WikiMapper object (see https://pypi.org/project/wikimapper/)
mapper = WikiMapper("/resources/wikidata2wikipedia/index_enwiki-20190420.db")
wikipedia_path = "/resources/wikipedia/extractedResources/Aspects/"

# ---- Candidate ranking results ---------------------------------
# Every ranker output file is named
# <queries>_<candidates>_<dm_model>_<candrank_metric>.pkl, so only the query
# set changes between the four loads.
_ranker_dir = "../../toponym_matching/ranker_results/"
_run_suffix = f"_{candidates}_{dm_model}_{candrank_metric}.pkl"
quicks_mainst_cands = pd.read_pickle(_ranker_dir + queries_mainst + _run_suffix)
quicks_subst_cands = pd.read_pickle(_ranker_dir + queries_subst + _run_suffix)
quicks_altns_cands = pd.read_pickle(_ranker_dir + queries_altns + _run_suffix)
quicks_refs_cands = pd.read_pickle(_ranker_dir + queries_refs + _run_suffix)

# ---- Gazetteers ------------------------------------------------
# Minimal wikidata gazetteer (fields: wkid, altname, source, lat, lon), i.e.
# the one from which the candidates were created:
wikidatagaz_df = pd.read_pickle(f"../../toponym_matching/gazetteers/{min_wkdtgazetteer}.pkl")

# Maximal wikidata gazetteer (fields: wikidata_id, english_label,
# instance_of, description_set...):
wikidata_gazetteer = pd.read_csv(
    f"../../wikidata/{max_wkdtgazetteer}.csv",
    index_col=0,
    low_memory=False,
)

In [None]:
# ----------------------------------------------------------------
# Function that finds the Wikidata IDs of DeezyMatch's returned matches.
# * Input:
#     * row: a ranker_results df row (fields: id, query, pred_score,
#            faiss_distance, cosine_sim, ...)
#     * gazetteer: gazetteer where candidates have been obtained from
#                  (fields: wkid, altname, source, lat, lon)
#     * ranking: candidate ranking metric (must be one column of the
#                'row' argument.)
# * Output: a dictionary with DeezyMatch candidates aligned with their
#           Wikidata IDs, per row.

# Suffix that must follow the candidate name in a gazetteer altname: a space,
# a station-type word ("railway station", "bus station", "halt", "stop", ...),
# and optionally a trailing ", ..." or " (..." qualifier up to the end.
re_station = r"( \b(([Rr]ailw[ae]y [Ss]tation)|([Bb]us [Ss]tation)|([Uu]nderground [Ss]tation)|([Tt]ram [Ss]top)|([Hh]alt)|([Ss]top)|([Ss]tation))((\, .*)|( \(.*))?)$"
def match_cands_wikidata(row,gazetteer,ranking):
    """Map the closest DeezyMatch candidates of one query to Wikidata IDs.

    Args:
        row: mapping-like ranker result (e.g. a DataFrame row); ``row[ranking]``
            must be a dict of ``{candidate string: score}``.
        gazetteer: DataFrame with at least the columns ``altname`` and ``wkid``.
        ranking: key of ``row`` holding the candidate dict to use.

    Returns:
        dict ``{wkid: score}`` for every gazetteer entry whose ``altname`` is
        one of the first three candidates followed by a station suffix (see
        ``re_station``). When the same ID is reached through several
        candidates, the score of the first candidate that reached it is kept.
    """
    import re  # local import so this cell does not rely on an import elsewhere

    wikidata_cands = {}
    cands = list(row[ranking].items())[:3] # Closest three matches.
    for cand, score in cands:
        # The candidate is literal text, not a pattern: escape it so names such
        # as "St. Ives" or "Ashford (Kent)" neither over-match nor break the regex.
        pattern = r"^" + re.escape(cand) + re_station
        # na=False: rows without an altname simply do not match (a NaN in the
        # mask would otherwise make the boolean indexing fail).
        is_match = gazetteer["altname"].str.contains(pattern, regex=True, na=False)
        for _id in gazetteer.loc[is_match, "wkid"]:
            if _id not in wikidata_cands:
                wikidata_cands[_id] = score
    return wikidata_cands

In [None]:
# Candidate-to-Wikidata alignment is cached on disk, one pickle per query set,
# and only recomputed when cached results are missing.
_wkdt_cands_cache = {
    "quicks_mainst_cands.pkl": quicks_mainst_cands,
    "quicks_subst_cands.pkl": quicks_subst_cands,
    "quicks_altns_cands.pkl": quicks_altns_cands,
    "quicks_refs_cands.pkl": quicks_refs_cands,
}

# Recompute unless *every* cache file is present: checking only the first one
# would let a partially written cache (e.g. after an interrupted run) reach
# the read_pickle calls below and fail on the missing files.
if not all(Path(cache_file).exists() for cache_file in _wkdt_cands_cache):
    for ranker_df in _wkdt_cands_cache.values():
        ranker_df["wikidata_cands"] = ranker_df.progress_apply(lambda row : match_cands_wikidata(row,wikidatagaz_df,"faiss_distance"), axis=1)
    for cache_file, ranker_df in _wkdt_cands_cache.items():
        ranker_df.to_pickle(cache_file)

# Drop the extra references so the pre-cache dataframes can be freed once the
# names below are rebound.
del _wkdt_cands_cache

# Always continue from the cached version so fresh and cached runs behave alike.
quicks_mainst_cands = pd.read_pickle("quicks_mainst_cands.pkl")
quicks_subst_cands = pd.read_pickle("quicks_subst_cands.pkl")
quicks_altns_cands = pd.read_pickle("quicks_altns_cands.pkl")
quicks_refs_cands = pd.read_pickle("quicks_refs_cands.pkl")

In [None]:
# Load the processed Quicks dataset together with its alternate-name and
# referenced-name tables (all stored as pickles):
quicks_dataset, quicks_altnames, quicks_referenced = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../../quick/quicks_processed.pkl",
        "../../quick/quicks_altnames_df.pkl",
        "../../quick/quicks_referenced_df.pkl",
    )
)

In [None]:
def _merge_min_scores(merged, cand_dicts):
    """Fold each {wkid: score} dict into `merged`, keeping the smallest score per wkid."""
    for cand_dict in cand_dicts:
        for wkid, score in cand_dict.items():
            if wkid not in merged or merged[wkid] > score:
                merged[wkid] = score
    return merged


def _cands_for_queries(ranker_df, queries):
    """Merge the `wikidata_cands` of all `ranker_df` rows whose `query` is one of `queries`."""
    merged = dict()
    for query in queries:
        _merge_min_scores(merged, ranker_df[ranker_df["query"] == query]["wikidata_cands"].values)
    return merged


# One candidate dict ({wkid: score}) per Quicks row and per name type.
main2df = []
sub2df = []
alt2df = []
ref2df = []
for i, row in quicks_dataset.iterrows():
    mainId = row["MainId"]
    subId = row["SubId"]
    mainName = row["MainStation"]
    subName = row["SubStFormatted"]

    # Main station cands:
    main_cands = _cands_for_queries(quicks_mainst_cands, [mainName])

    # Substation cands (the original compared against `subs[k]`, i.e. indexed
    # the tuple of dicts with a Wikidata ID, which raises TypeError as soon as
    # a candidate is seen twice; the shared helper compares scores instead):
    subst_cands = _cands_for_queries(quicks_subst_cands, [subName])

    # Altnames for stations cands:
    alts = quicks_altnames[(quicks_altnames["MainId"] == mainId) & (quicks_altnames["SubId"] == subId)]["Altname"].tolist()
    alt_cands = _cands_for_queries(quicks_altns_cands, alts)

    # Referenced station names cands:
    refs = quicks_referenced[(quicks_referenced["MainId"] == mainId) & (quicks_referenced["SubId"] == subId)]["Referenced"].tolist()
    ref_cands = _cands_for_queries(quicks_refs_cands, refs)

    main2df.append(main_cands)
    sub2df.append(subst_cands)
    alt2df.append(alt_cands)
    ref2df.append(ref_cands)

# Attach the per-row candidate dicts as new columns (lists are in row order).
quicks_dataset["MainstWkdt"] = main2df
quicks_dataset["SubstWkdt"] = sub2df
quicks_dataset["AltnmWkdt"] = alt2df
quicks_dataset["RefdWkdt"] = ref2df