In [1]:
# import utils
import pandas as pd
import collections
import ast
import re
from wikimapper import WikiMapper
from pathlib import Path
import urllib.parse
import json
import html
from tqdm.auto import tqdm
tqdm.pandas()

# ----------------------------------------------------------------
# Configuration: names of the resources this notebook reads.

# Railway station resources
stn_candidates = "stnwikidata_candidates"
stn_min_wkdtgazetteer = "stnwikidata_gazetteer"
stn_max_wkdtgazetteer = "british_isles_stations"

# British Wikidata resources
brit_candidates = "britwikidata_candidates"
brit_min_wkdtgazetteer = "britwikidata_gazetteer"
brit_max_wkdtgazetteer = "british_isles"

# Queries files
queries_mainst = "quicks_mainst_queries"
queries_subst = "quicks_subst_queries"
queries_altns = "quicks_altnames_queries"

# DeezyMatch model
dm_model = "wikigaz_en_003"
candrank_metric = "faiss" # 'faiss', 'cosine', 'conf'
num_candidates = "10"


# ----------------------------------------------------------------
# Small loaders so that every file name is assembled in one place.

def _load_ranker_results(queries, candidates):
    """Read the pickled candidate ranking results for a (queries, candidates) pair.

    The file name also encodes the DeezyMatch model, the ranking metric and
    the number of candidates configured above.
    """
    filename = queries + "_" + candidates + "_" + dm_model + "_" + candrank_metric + num_candidates + ".pkl"
    return pd.read_pickle("../../toponym_matching/ranker_results/" + filename)


def _load_min_gazetteer(name):
    """Read a pickled minimal gazetteer (fields: wkid, altname, source, lat, lon)."""
    return pd.read_pickle("../../toponym_matching/gazetteers/" + name + ".pkl")


def _load_max_gazetteer(name):
    """Read a maximal gazetteer CSV (fields: wikidata_id, english_label, instance_of, description_set...)."""
    return pd.read_csv("../../wikidata/" + name + ".csv", index_col=0, low_memory=False)


# Candidate ranking results (main, sub and altname stations against railway station Wikidata):
quicks_mainst_cands_stn = _load_ranker_results(queries_mainst, stn_candidates)
quicks_subst_cands_stn = _load_ranker_results(queries_subst, stn_candidates)
quicks_altns_cands_stn = _load_ranker_results(queries_altns, stn_candidates)

# Candidate ranking results (main stations against British Wikidata):
quicks_mainst_cands_brit = _load_ranker_results(queries_mainst, brit_candidates)

# Minimal wikidata gazetteers from which candidates were created:
stn_wikidatagaz_df = _load_min_gazetteer(stn_min_wkdtgazetteer)
brit_wikidatagaz_df = _load_min_gazetteer(brit_min_wkdtgazetteer)

# Maximal wikidata gazetteers:
stn_wikidata_gazetteer = _load_max_gazetteer(stn_max_wkdtgazetteer)
brit_wikidata_gazetteer = _load_max_gazetteer(brit_max_wkdtgazetteer)

  from pandas import Panel


In [2]:
# ----------------------------------------------------------------
# Function that finds the Wikidata IDs of DeezyMatch's returned matches
# in a railway station gazetteer: a gazetteer altname matches a candidate
# when it is the candidate string followed by a station-like suffix
# (see 're_station' below).
# * Input:
#     * row: a ranker_results df row (fields: id, query, pred_score,
#            faiss_distance, cosine_sim, ...)
#     * gazetteer: gazetteer where candidates have been obtained from
#                  (fields: wkid, altname, source, lat, lon)
#     * ranking: candidate ranking metric (must be one column of the
#                'row' argument.)
# * Output: a dictionary with DeezyMatch candidates aligned with their
#           Wikidata IDs, per row.

# Suffix that must follow the candidate string in the gazetteer altname:
# a space, a station-type word ("railway station", "halt", "stop", ...),
# and optionally a trailing ", ..." or " (..." qualifier up to the end.
re_station = r"( \b(([Rr]ailw[ae]y [Ss]tation)|([Bb]us [Ss]tation)|([Uu]nderground [Ss]tation)|([Tt]ram [Ss]top)|([Hh]alt)|([Ss]top)|([Ss]tation))((\, .*)|( \(.*))?)$"
def match_cands_wikidata_stn(row,gazetteer,ranking):
    """Map the closest DeezyMatch candidates of a row to station Wikidata IDs.

    Returns a dict {wkid: score}. Only candidates whose score is within the
    three smallest distinct scores of row[ranking] are considered; when a
    Wikidata ID is reached through several candidates, the score of the
    first one encountered is kept.
    """
    wikidata_cands = {}

    scores = row[ranking]
    # A row without candidates has nothing to align.
    if not scores:
        return wikidata_cands

    # Matches at closest three distances. Rows with fewer than three distinct
    # scores keep all their candidates instead of raising an IndexError.
    distinct_scores = sorted(set(scores.values()))
    minval = distinct_scores[min(2, len(distinct_scores) - 1)]
    cands = [(cand, score) for cand, score in scores.items() if score <= minval]

    # Find wikidata IDs:
    for cand, score in cands:
        pattern = r"^" + re.escape(cand) + re_station
        # na=False: a missing altname counts as "no match" rather than
        # producing a NaN in the mask (which would make the indexing fail).
        is_match = gazetteer["altname"].str.contains(pattern, regex=True, na=False)
        for _id in gazetteer.loc[is_match, "wkid"]:
            if _id not in wikidata_cands:
                wikidata_cands[_id] = score

    return wikidata_cands

In [3]:
# ----------------------------------------------------------------
# Function that finds the Wikidata IDs of DeezyMatch's returned matches
# in a general gazetteer: a gazetteer altname matches a candidate only
# when it is exactly equal to the candidate string.
# * Input:
#     * row: a ranker_results df row (fields: id, query, pred_score,
#            faiss_distance, cosine_sim, ...)
#     * gazetteer: gazetteer where candidates have been obtained from
#                  (fields: wkid, altname, source, lat, lon)
#     * ranking: candidate ranking metric (must be one column of the
#                'row' argument.)
# * Output: a dictionary with DeezyMatch candidates aligned with their
#           Wikidata IDs, per row.
def match_cands_wikidata_brit(row,gazetteer,ranking):
    """Map the closest DeezyMatch candidates of a row to Wikidata IDs.

    Returns a dict {wkid: score}. Only candidates whose score is within the
    three smallest distinct scores of row[ranking] are considered; when a
    Wikidata ID is reached through several candidates, the score of the
    first one encountered is kept.
    """
    wikidata_cands = {}

    scores = row[ranking]
    # A row without candidates has nothing to align.
    if not scores:
        return wikidata_cands

    # Matches at closest three distances. Rows with fewer than three distinct
    # scores keep all their candidates instead of raising an IndexError.
    distinct_scores = sorted(set(scores.values()))
    minval = distinct_scores[min(2, len(distinct_scores) - 1)]
    cands = [(cand, score) for cand, score in scores.items() if score <= minval]

    # Find wikidata IDs:
    for cand, score in cands:
        is_match = gazetteer["altname"] == cand
        for _id in gazetteer.loc[is_match, "wkid"]:
            if _id not in wikidata_cands:
                wikidata_cands[_id] = score

    return wikidata_cands

In [4]:
measure = "cosine_distance" # pred_score, 1-pred_score, faiss_distance, cosine_distance

# One cache file per (query set, gazetteer) combination, named after the measure.
_mainst_stn_cache = measure + "_stn_quicks_mainst_cands.pkl"
_mainst_brit_cache = measure + "_brit_quicks_mainst_cands.pkl"
_subst_stn_cache = measure + "_stn_quicks_subst_cands.pkl"
_altns_stn_cache = measure + "_stn_quicks_altns_cands.pkl"

# Phase 1: for every cache file that does not exist yet, align the ranked
# candidates with Wikidata IDs (stored in a new "wikidata_cands" column) and
# pickle the resulting dataframe.
for _cache_file, _ranked_df, _matcher, _gazetteer in (
    (_mainst_stn_cache, quicks_mainst_cands_stn, match_cands_wikidata_stn, stn_wikidatagaz_df),
    (_mainst_brit_cache, quicks_mainst_cands_brit, match_cands_wikidata_brit, brit_wikidatagaz_df),
    (_subst_stn_cache, quicks_subst_cands_stn, match_cands_wikidata_stn, stn_wikidatagaz_df),
    (_altns_stn_cache, quicks_altns_cands_stn, match_cands_wikidata_stn, stn_wikidatagaz_df),
):
    if Path(_cache_file).exists():
        continue
    _ranked_df["wikidata_cands"] = _ranked_df.progress_apply(
        lambda row : _matcher(row, _gazetteer, measure), axis=1
    )
    _ranked_df.to_pickle(_cache_file)

# Phase 2: always work from the pickled versions, whether freshly built or
# left over from a previous run.
quicks_mainst_cands_stn = pd.read_pickle(_mainst_stn_cache)
quicks_mainst_cands_brit = pd.read_pickle(_mainst_brit_cache)
quicks_subst_cands_stn = pd.read_pickle(_subst_stn_cache)
quicks_altns_cands_stn = pd.read_pickle(_altns_stn_cache)

In [5]:
# Load csv-structured Quicks dataset:
# * quicks_dataset: one row per station entry; the cells below read its
#   MainId, SubId, MainStation and SubStFormatted columns.
# * quicks_altnames: alternate names, looked up below by (MainId, SubId)
#   through its Altnames column.
quicks_dataset = pd.read_pickle("../../quick/outputs/quicks_processed.pkl")
quicks_altnames = pd.read_pickle("../../quick/outputs/quicks_altnames_df.pkl")

In [6]:
def _merge_min_scores(cand_dicts):
    """Merge {wikidata_id: score} dicts, keeping the lowest score seen per ID."""
    merged = {}
    for cand_dict in cand_dicts:
        for wkid, score in cand_dict.items():
            if wkid not in merged or merged[wkid] > score:
                merged[wkid] = score
    return merged


def _cands_for_query(ranked_df, query):
    """Return the "wikidata_cands" dicts of all rows of ranked_df whose "query" equals query."""
    return ranked_df.loc[ranked_df["query"] == query, "wikidata_cands"].values


# For every Quicks entry, gather the Wikidata candidates found for its main
# station name (against both gazetteers), its substation name and its
# alternate names. Each result is a dict {wikidata_id: lowest score}.
main2df = []
main2df_brit = []
sub2df = []
alt2df = []
ref2df = []
for i, row in tqdm(quicks_dataset.iterrows(), total=len(quicks_dataset)):
    mainId = row["MainId"]
    subId = row["SubId"]
    mainName = row["MainStation"]
    subName = row["SubStFormatted"]

    # Main station cands (railway station gazetteer):
    main_cands = _merge_min_scores(_cands_for_query(quicks_mainst_cands_stn, mainName))

    # Main station cands (British Wikidata gazetteer):
    main_cands_brit = _merge_min_scores(_cands_for_query(quicks_mainst_cands_brit, mainName))

    # Substation cands:
    # (The original compared against `subs[k]`, indexing the tuple of dicts
    # with a Wikidata ID; the shared helper compares against the score.)
    subst_cands = _merge_min_scores(_cands_for_query(quicks_subst_cands_stn, subName))

    # Altnames for stations cands: merge the candidates of every alternate
    # name recorded for this (MainId, SubId) pair.
    alts = quicks_altnames[(quicks_altnames["MainId"] == mainId) & (quicks_altnames["SubId"] == subId)]["Altnames"].tolist()
    alt_cands = _merge_min_scores(
        cand_dict
        for altname in alts
        for cand_dict in _cands_for_query(quicks_altns_cands_stn, altname)
    )

    main2df.append(main_cands)
    main2df_brit.append(main_cands_brit)
    sub2df.append(subst_cands)
    alt2df.append(alt_cands)

quicks_dataset["MainstWkdt_stn"] = main2df
quicks_dataset["MainstWkdt_brit"] = main2df_brit
quicks_dataset["SubstWkdt_stn"] = sub2df
quicks_dataset["AltnmWkdt_stn"] = alt2df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [7]:
# Display rows 100-149 (by position) to inspect the candidate columns added to quicks_dataset.
quicks_dataset.iloc[100:150]

Unnamed: 0,MainId,SubId,MainStation,SubStation,SubStFormatted,Description,MainstWkdt_stn,MainstWkdt_brit,SubstWkdt_stn,AltnmWkdt_stn
100,82,101,ABOYNE,ABOYNE,ABOYNE,"[GNS] op 2 December 1859 ( co n, item Aberdeen...","{'Q4668673': 0.0, 'Q4860179': 0.111, 'Q3405374...","{'Q323196': 0.0, 'Q31083921': 0.0, 'Q2239361':...","{'Q4668673': 0.0, 'Q4860179': 0.111, 'Q3405374...",{}
101,83,102,ABOYNE CURLING POND,ABOYNE CURLING POND,ABOYNE CURLING POND,[GNS] (non-tt): occasional use; alias LOCH OF ...,"{'Q61014533': 0.0, 'Q2740729': 0.0899, 'Q46678...","{'Q1785643': 0.0591, 'Q2820496': 0.063, 'Q2658...","{'Q61014533': 0.0, 'Q2740729': 0.0899, 'Q46678...","{'Q2134781': 0.0876, 'Q2065104': 0.088}"
102,84,103,ABRAM COLLIERY,ABRAM COLLIERY,ABRAM COLLIERY,[LNW] (non-tt): miners; op by February 1919; c...,"{'Q4859998': 0.1181, 'Q4666792': 0.1107, 'Q466...","{'Q38518616': 0.0814, 'Q26279970': 0.0986, 'Q5...","{'Q4859998': 0.1181, 'Q4666792': 0.1107, 'Q466...",{}
103,85,104,ABY,ABY,ABY,[GN] op 3 September 1848 (Boston 4 th - line) ...,"{'Q2924155': 0.0997, 'Q5001099': 0.118, 'Q1920...","{'Q4670716': 0.0, 'Q80582388': 0.0231, 'Q28117...","{'Q2924155': 0.0997, 'Q5001099': 0.118, 'Q1920...","{'Q4670719': 0.0, 'Q5050615': 0.0463, 'Q473799..."
104,86,105,ACCRINGTON,ACCRINGTON,ACCRINGTON,[LY] op 19 June 1848 (Preston G 24 th ) ; stil...,"{'Q1811969': 0.0, 'Q5038686': 0.0561, 'Q504645...","{'Q1622949': 0.0, 'Q3540002': 0.0561, 'Q659945...","{'Q1811969': -0.0, 'Q5038686': 0.0561, 'Q50464...",{}
105,87,106,ACH-NA-CLOICH,ACH-NA-CLOICH,ACH NA CLOICH,[Cal] first in Brad June 1881; clo 1 January 1...,"{'Q4673334': 0.0, 'Q2499847': 0.1451}","{'Q17771850': 0.1217, 'Q99452382': 0.1151, 'Q9...",{'Q4673334': 0.0},{}
106,88,107,ACHANALT,ACHANALT,ACHANALT,[High] op 19 August 1870 ** ; still open. Aot ...,"{'Q3404222': 0.0, 'Q7073896': 0.0883, 'Q799187...","{'Q3470243': 0.0, 'Q4673846': 0.0567, 'Q503042...","{'Q3404222': 0.0, 'Q7073896': 0.0883, 'Q799187...","{'Q3404222': 0.0205, 'Q4819462': 0.1123, 'Q707..."
107,89,108,ACHEILIDH CROSSING,ACHEILIDH CROSSING,ACHEILIDH CROSSING,"[High] (non-tt): railwaymen, families; dates ?...","{'Q21062004': 0.1005, 'Q3397348': 0.1103, 'Q48...","{'Q31073068': 0.0437, 'Q26717088': 0.0437, 'Q4...","{'Q21062004': 0.1005, 'Q3397348': 0.1103, 'Q48...",{}
108,90,109,ACHNASHEEN,ACHNASHEEN,ACHNASHEEN,[High] op 19 August 1870 ** ; still open. Brad...,"{'Q2148366': 0.0, 'Q2530277': 0.0885, 'Q467392...","{'Q2193195': 0.0, 'Q4677693': 0.0741, 'Q246596...","{'Q2148366': 0.0, 'Q2530277': 0.0885, 'Q467392...",{}
109,91,110,ACHNASHELLACH,ACHNASHELLACH,ACHNASHELLACH,[High] op as private station 19 August 1870 (H...,{'Q2530277': 0.0},"{'Q2482446': 0.0, 'Q4673845': 0.0628}",{'Q2530277': 0.0},{}


In [8]:
# Pickle the Quicks dataset, including its Wikidata candidate columns,
# under a file name prefixed with the ranking measure in use.
quicks_dataset.to_pickle(measure + "_quicks_candranked.pkl")