# Process Wikidata candidates

Two main outputs from this notebook:
- [x] Generate an altname-centric British Wikidata gazetteer: `gazetteers/britwikidata_gazetteer.pkl`
- [x] Generate the candidates (i.e. unique altnames) input file for the candidate ranker: `gazetteers/britwikidata_candidates.txt`

In [None]:
import pandas as pd
import glob

from collections import Counter
from  itertools import chain
import pydash
import ast
import re
from pathlib import Path

import numpy as np

#### Load British Wikidata

The `british_isles.csv` file is one output from `entity_extraction.py`. Running `entity_extraction.py` takes days, so for the time being you can find this file in the `ToponymVM` under `/home/mcollardanuy/PlaceLinking/wikidata/`.

In [None]:
# Load the British Isles Wikidata extract and drop the 'Unnamed: 0' column
# (presumably an index column saved along with the CSV).
britdf = pd.read_csv(
    "british_isles.csv",
    header=0,
    index_col=None,
    low_memory=False,
).drop(columns=['Unnamed: 0'])

# Coordinates must be floats; rows missing either coordinate are discarded.
britdf = britdf.astype({'latitude': float, 'longitude': float})
britdf = britdf.dropna(subset=['latitude', 'longitude'])

In [None]:
# Preview the first rows of the loaded Wikidata table.
britdf.head()

#### Add geonames alternate names

In [None]:
# Fetch the GeoNames dumps on first use. Each download is skipped when its
# target file already exists under /resources/geonames/. The `!` lines are
# IPython shell escapes, so this cell only runs inside a notebook/IPython.
# NOTE(review): `mv` presumably requires /resources/geonames/ to exist
# already; nothing here creates it.

# Alternate names table. iso-languagecodes.txt is deleted after unpacking
# (presumably it ships inside the same archive; it is not used below).
if not Path("/resources/geonames/alternateNamesV2.txt").exists():
    !wget http://download.geonames.org/export/dump/alternateNamesV2.zip
    !unzip alternateNamesV2.zip
    !rm alternateNamesV2.zip
    !rm iso-languagecodes.txt
    !mv alternateNamesV2.txt /resources/geonames/alternateNamesV2.txt
    
# Country dump for GB (United Kingdom); the bundled readme.txt is discarded.
if not Path("/resources/geonames/GB.txt").exists():
    !wget http://download.geonames.org/export/dump/GB.zip
    !unzip GB.zip
    !rm readme.txt
    !rm GB.zip
    !mv GB.txt /resources/geonames/GB.txt
    
# Country dump for IE (Ireland); the bundled readme.txt is discarded.
if not Path("/resources/geonames/IE.txt").exists():
    !wget http://download.geonames.org/export/dump/IE.zip
    !unzip IE.zip
    !rm readme.txt
    !rm IE.zip
    !mv IE.txt /resources/geonames/IE.txt

#### Process alternate names table

In [None]:
# Load the GeoNames alternate-names dump (tab-separated). Column names are
# supplied here rather than read from the file.
geoaltnames = pd.read_csv(
    "/resources/geonames/alternateNamesV2.txt",
    sep="\t",
    names=[
        "alternateNameId",
        "geonameid",
        "isolanguage",
        "alternateName",
        "isPreferredName",
        "isShortName",
        "isColloquial",
        "isHistoric",
        "from",
        "to",
    ],
    index_col=None,
    low_memory=False,
)

In [None]:
# Some "alternate names" are really codes or links stored under pseudo
# language codes; drop those rows first.
# GeoNames pseudocodes are listed at https://www.geonames.org/manual.html
gn_pseudocodes = [
    "post", "link", "iata", "icao",
    "faac", "tcid", "unlc", "abbr",
    "wkdt", "phon", "piny", "fr_1793",
]
geoaltnames = geoaltnames.loc[~geoaltnames["isolanguage"].isin(gn_pseudocodes)]

# Keep names in languages native to the British Isles or with a strong
# influence on their toponymy, plus names that carry no language tag at all.
gn_toplanguages = [
    "gd",   # Scottish Gaelic
    "kw",   # Cornish
    "sco",  # Scots
    "cy",   # Welsh
    "ga",   # Irish
    "en",   # English
    "gv",   # Manx
    "br",   # Breton
    "fr",   # French
    "la",   # Latin
]
geoaltnames = geoaltnames.loc[
    geoaltnames["isolanguage"].isna()
    | geoaltnames["isolanguage"].isin(gn_toplanguages)
]

In [None]:
# Only the GeoNames identifier and the name itself are needed from here on;
# of the ten columns named at load time, these are the two that remain.
geoaltnames = geoaltnames[["geonameid", "alternateName"]]

In [None]:
# Preview the filtered alternate-names table.
geoaltnames.head()

#### Process GB and IE geonames tables

In [None]:
# Country: GB (United Kingdom)
gb_geonames = pd.read_csv(
    "/resources/geonames/GB.txt",
    sep="\t",
    names=[
        "geonameid", "name", "asciiname", "alternatenames", "latitude",
        "longitude", "fclass", "fcode", "ccode", "cc2", "admin1", "admin2",
        "admin3", "admin4", "population", "elevation", "dem", "timezone",
        "moddate",
    ],
    index_col=None,
    low_memory=False,
)
# Of the nineteen columns, only the identifier and the two name variants matter.
gb_geonames = gb_geonames[["geonameid", "name", "asciiname"]]

# Gather the distinct (geonameid, name) pairs followed by the distinct
# (geonameid, asciiname) pairs, then de-duplicate across both.
gb_altnames = [
    *set(gb_geonames.groupby(["geonameid", "name"]).groups),
    *set(gb_geonames.groupby(["geonameid", "asciiname"]).groups),
]
gb_altnames = list(set(gb_altnames))

# Reshape into the same two-column layout as the alternate-names table.
gb_geonames = pd.DataFrame(gb_altnames, columns=["geonameid", "alternateName"])

In [None]:
# Country: IE (Ireland)
ie_geonames = pd.read_csv(
    "/resources/geonames/IE.txt",
    sep="\t",
    names=[
        "geonameid", "name", "asciiname", "alternatenames", "latitude",
        "longitude", "fclass", "fcode", "ccode", "cc2", "admin1", "admin2",
        "admin3", "admin4", "population", "elevation", "dem", "timezone",
        "moddate",
    ],
    index_col=None,
    low_memory=False,
)
# Of the nineteen columns, only the identifier and the two name variants matter.
ie_geonames = ie_geonames[["geonameid", "name", "asciiname"]]

# Gather the distinct (geonameid, name) pairs followed by the distinct
# (geonameid, asciiname) pairs, then de-duplicate across both.
ie_altnames = [
    *set(ie_geonames.groupby(["geonameid", "name"]).groups),
    *set(ie_geonames.groupby(["geonameid", "asciiname"]).groups),
]
ie_altnames = list(set(ie_altnames))

# Reshape into the same two-column layout as the alternate-names table.
ie_geonames = pd.DataFrame(ie_altnames, columns=["geonameid", "alternateName"])

In [None]:
# Preview the (geonameid, alternateName) pairs derived from the GB dump.
gb_geonames.head()

#### Concatenate all altname dataframes and filter relevant rows

In [None]:
# Stack the three (geonameid, alternateName) tables and remove exact duplicate
# rows, renumbering the index from zero.
geonames_altnames = pd.concat(
    [geoaltnames, gb_geonames, ie_geonames],
    ignore_index=True,
).drop_duplicates(ignore_index=True)

In [None]:
# Filter out alternate names if they are not in Latin alphabet:
# Compiled once rather than on every call. The class spans U+0040-U+024F:
# Basic Latin from '@' upwards, Latin-1 Supplement, Latin Extended-A and
# Latin Extended-B. Digits and most ASCII punctuation fall below U+0040.
_LATIN_RANGE = re.compile(u'[\u0040-\u007F\u0080-\u00FF\u0100-\u017F\u0180-\u024F]', flags=re.UNICODE)


def latin_alphabet(toponym):
    """Tell whether a toponym contains at least one Latin-range character.

    Args:
        toponym: The place name to test. Anything that is not a string
            (for example a NaN standing in for a missing name) is treated
            as not Latin instead of raising a TypeError.

    Returns:
        True if any single character of ``toponym`` lies in U+0040-U+024F,
        False otherwise. One matching character is enough, so a name mixing
        scripts is accepted.
    """
    if not isinstance(toponym, str):
        return False
    return _LATIN_RANGE.search(toponym) is not None
    
# Keep only names containing Latin-range characters. The check needs just the
# "alternateName" column, so it is mapped over that column directly instead of
# building a row object for every record with DataFrame.apply(axis=1).
# astype(bool) keeps the mask boolean even when the table is empty.
geonames_altnames = geonames_altnames[
    geonames_altnames["alternateName"].map(latin_alphabet).astype(bool)
]

In [None]:
# Spot-check: show the alternate names kept for GeoNames ID 7297387.
geonames_altnames[geonames_altnames["geonameid"] == 7297387]

In [None]:
# Keep only rows that have a corresponding Wikidata entry
def parse_geonames(geoIDs):
    """Turn a string-serialised list of GeoNames IDs into a list of ints.

    Args:
        geoIDs: A Python-literal string such as ``"['123', '456']"``. Any
            value that is not a string yields an empty list.

    Returns:
        The string items of the parsed literal converted to ``int``, in their
        original order. Items that are not strings are skipped.
    """
    if type(geoIDs) is not str:
        return []
    return [int(gn) for gn in ast.literal_eval(geoIDs) if type(gn) is str]

# Collect every GeoNames ID referenced by a Wikidata entity in britdf ...
brit_geonameIDs = []
for raw_ids in britdf["geonamesIDs"]:
    brit_geonameIDs += parse_geonames(raw_ids)

# ... and keep only the alternate names attached to one of those IDs.
geonames_altnames = geonames_altnames[geonames_altnames["geonameid"].isin(brit_geonameIDs)]

In [None]:
# Preview the alternate names that are linked to a Wikidata entity.
geonames_altnames.head()

#### Create altname-focused gazetteer

In [None]:
def obtain_wikidata_altnames(elabel, aliases, nativelabel):
    """Gather the Wikidata-sourced alternate names of one entity.

    Args:
        elabel: English label. A trailing parenthesised apposition (and a
            comma left dangling before it) is removed from the label.
        aliases: String-serialised mapping of language to a list of aliases.
        nativelabel: String-serialised list of native labels.

    Any argument that is not a string is ignored.

    Returns:
        A dict mapping each alternate name to its source, one of
        ``"english_label"``, ``"wikidata_alias"`` or ``"native_label"``.
        A name keeps the source under which it was first seen, in that order
        of precedence.
    """
    altnames = {}

    if type(elabel) is str:
        apposition = re.match(r"(.+)\(.+\)$", elabel)
        if apposition:
            elabel = re.sub(",$", "", apposition.group(1).strip())
        altnames.setdefault(elabel, "english_label")

    if type(aliases) is str:
        aliases_by_language = ast.literal_eval(aliases)
        for language in aliases_by_language:
            for alias in aliases_by_language[language]:
                altnames.setdefault(alias, "wikidata_alias")

    if type(nativelabel) is str:
        for native in ast.literal_eval(nativelabel):
            altnames.setdefault(native, "native_label")

    return altnames

In [None]:
def obtain_geonames_altnames(geonamesIDs, geoaltnames, altnames):
    """Extend an entity's alternate names with those found in GeoNames.

    Args:
        geonamesIDs: String-serialised list of GeoNames IDs; a value that is
            not a string leaves ``altnames`` untouched. Items of the parsed
            list that are not strings are skipped.
        geoaltnames: DataFrame with ``"geonameid"`` and ``"alternateName"``
            columns.
        altnames: Dict of name -> source. It is updated in place.

    Returns:
        The same ``altnames`` dict, with every GeoNames name that was not
        already present added under the source ``"geonames"``.
    """
    if type(geonamesIDs) is not str:
        return altnames

    for gid in ast.literal_eval(geonamesIDs):
        if type(gid) is not str:
            continue
        is_this_id = geoaltnames["geonameid"] == int(gid)
        for name in geoaltnames.loc[is_this_id, "alternateName"]:
            altnames.setdefault(name, "geonames")

    return altnames

In [None]:
def format_for_candranker(gazname, unique_placenames_array):
    """
    Write the unique alternate names of a gazetteer to ``<gazname>.txt`` in
    the format required by the DeezyMatch candidate ranker.

    Each name is written on its own line as ``<name>\\t0\\tfalse``, after
    removing double quotes and surrounding whitespace. Names that end up
    empty after this clean-up are skipped, so the output never holds an
    empty candidate.

    Args:
        gazname: Output path without the ``.txt`` extension.
        unique_placenames_array: Iterable of place-name strings.

    Returns:
        None. The file is created, or overwritten if it already exists.
    """
    # Toponyms contain non-ASCII characters, so the encoding is pinned rather
    # than left to the platform default.
    with open(gazname + ".txt", "w", encoding="utf-8") as fw:
        for pl in unique_placenames_array:
            # Clean first, test for emptiness afterwards: a name consisting
            # only of quotes must not be written as an empty candidate.
            pl = pl.replace('"', "").strip()
            if pl:
                fw.write(pl + "\t0\tfalse\n")

#### Create generic British Isles gazetteer and candidates

In [None]:
# Parallel column buffers: one entry per (entity, alternate name) pair.
wkid, altname, source, lat, lon = [], [], [], [], []

for row_idx, entity in britdf.iterrows():
    names = obtain_wikidata_altnames(entity["english_label"], entity["alias_dict"], entity["nativelabel"])
    # Entities without any alternate names are likely to be ghost entities,
    # e.g. Q24663377, so GeoNames names are only looked up when Wikidata
    # contributed at least one name.
    if names:
        names = obtain_geonames_altnames(entity["geonamesIDs"], geonames_altnames, names)
    for name, origin in names.items():
        # Skip empty names and anything that is not a plain string.
        if not name or type(name) is not str:
            continue
        wkid.append(entity["wikidata_id"])
        altname.append(name)
        source.append(origin)
        lat.append(entity["latitude"])
        lon.append(entity["longitude"])

wkgazetteer = pd.DataFrame({
    "wkid": wkid,
    "altname": altname,
    "source": source,
    "lat": lat,
    "lon": lon,
})

# One row per (entity, name) pair, with no missing names.
wkgazetteer = (
    wkgazetteer
    .drop_duplicates(subset=['wkid', 'altname'])
    .dropna(subset=['altname'])
)
wkgazetteer.to_pickle("../toponym_matching/gazetteers/britwikidata_gazetteer.pkl")

# The candidate ranker takes each distinct name once, whichever entity it names.
unique_placenames_array = list(set(wkgazetteer["altname"]))
format_for_candranker("../toponym_matching/gazetteers/britwikidata_candidates", unique_placenames_array)

In [None]:
# Preview the altname-centric gazetteer.
wkgazetteer.head()

#### Create gazetteer and candidates of railway stations from the British Isles

In [None]:
# Seed the station gazetteer with every gazetteer row whose altname contains
# one of the keywords below as a whole word (case-insensitive).
stationgaz = wkgazetteer[wkgazetteer["altname"].str.contains(r"\b(?:station|stop|halt|railway)\b", case=False, regex=True)]
# NOTE: despite its name, this list holds the Wikidata IDs of those rows.
station_altnames = stationgaz.wkid.to_list()

# From: https://docs.google.com/spreadsheets/d/1sREU_TKBU0HXoSSm7nyOw-4kId_bfu6OTEXxtdZeLl0/edit#gid=0
stn_wkdt_classes = ["Q55488", "Q4663385", "Q55491", "Q18516630", "Q1335652", "Q28109487",
                    "Q55678", "Q1567913", "Q39917125", "Q11424045", "Q14562709", "Q27020748",
                    "Q22808403", "Q85641138", "Q928830", "Q1339195", "Q27030992", "Q55485",
                    "Q17158079", "Q55493", "Q325358", "Q168565", "Q18543139", "Q11606300",
                    "Q2175765", "Q2298537"]

# Add entities that are an instance of one of the classes above but were not
# picked up by name. Their IDs are gathered first and their rows selected in
# a single pass; concatenating inside the loop copies the growing frame at
# every hit, and testing membership against a list is linear per row.
_name_matched_ids = set(station_altnames)
_class_matched_ids = set()
for _stn_wikidata_id, _stn_instance_of in zip(britdf["wikidata_id"], britdf["instance_of"]):
    if isinstance(_stn_instance_of, str):
        wkdtcl = ast.literal_eval(_stn_instance_of)
        if any(x in wkdtcl for x in stn_wkdt_classes):
            if _stn_wikidata_id not in _name_matched_ids:
                _class_matched_ids.add(_stn_wikidata_id)

if _class_matched_ids:
    stationgaz = pd.concat([stationgaz, wkgazetteer[wkgazetteer["wkid"].isin(_class_matched_ids)]])

stationgaz = stationgaz[stationgaz['altname'].notna()]

In [None]:
# Preview the station gazetteer before deriving shortened names.
stationgaz.head()

In [None]:
# Most railway stations end with "railway station", but Quick's takes it for
# granted that they are railway stations, so it just says "Currie" for "Currie
# railway station". Therefore, we add new alternate names without rail keywords.
re_station = r"(.*)\b(([Rr]ailw[ae]y [Ss]tation)|([Bb]us [Ss]tation)|([Uu]nderground [Ss]tation)|([Tt]ram [Ss]top)|([Hh]alt)|([Ss]top))((\, .*)|( \(.*))?$"
re_nostation = r".*\b(([Pp]olice [Ss]tation)|([Pp]ower [Ss]tation)|([Ll]ifeboat [Ss]tation)|([Pp]umping [Ss]tation)|([Tt]ransmitting [Ss]tation)).*$"

# Compiled once outside the loop.
_re_station = re.compile(re_station)
_re_nostation = re.compile(re_nostation)

# Derived rows are collected and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
_processed_rows = []
for _stn in stationgaz.itertuples(index=False):
    _stn_match = _re_station.match(_stn.altname)
    # Names matching re_nostation (police, power, lifeboat, pumping or
    # transmitting stations) are left as they are.
    if _stn_match is None or _re_nostation.match(_stn.altname):
        continue
    # Group 1 is the text preceding the keyword. It is stripped so that
    # "Currie railway station" yields "Currie" and not "Currie " with a
    # trailing space.
    newaltname = _stn_match.group(1).strip()
    if newaltname:
        _processed_rows.append({
            "wkid": _stn.wkid,
            "altname": newaltname,
            "source": "processed",
            "lat": _stn.lat,
            "lon": _stn.lon,
        })

if _processed_rows:
    stationgaz = pd.concat(
        [stationgaz, pd.DataFrame(_processed_rows, columns=stationgaz.columns)],
        ignore_index=True,
    )

# Original rows come first, so a derived name that duplicates an existing
# (wkid, altname) pair is the one that gets dropped.
stationgaz = stationgaz.drop_duplicates(subset = ['wkid', 'altname'])
stationgaz = stationgaz[stationgaz['altname'].notna()]

In [None]:
# Preview the last rows of the station gazetteer (the shortened names are
# appended at the end).
stationgaz.tail()

In [None]:
# Make sure each (entity, name) pair appears once, then persist the station
# gazetteer next to the generic one.
stationgaz = stationgaz.drop_duplicates(subset=['wkid', 'altname'])
stationgaz.to_pickle("../toponym_matching/gazetteers/stnwikidata_gazetteer.pkl")

# Candidate list for the ranker: every distinct station name, once.
unique_placenames_array = list(set(stationgaz["altname"]))
format_for_candranker(
    "../toponym_matching/gazetteers/stnwikidata_candidates",
    unique_placenames_array,
)

In [None]:
# Number of distinct Wikidata entities in the station gazetteer.
len(stationgaz.wkid.unique())

In [None]:
# Preview the source Wikidata table again.
britdf.head()