# Process Wikidata candidates

Two main outputs from this notebook:
- [x] Generate an altname-centric British Wikidata gazetteer: `gazetteers/britwikidata_gazetteer.pkl`
- [x] Generate the candidates (i.e. unique altnames) input file for the candidate ranker: `gazetteers/britwikidata_candidates.txt`

In [1]:
import pandas as pd
import glob

from collections import Counter
from  itertools import chain
import pydash
import ast
import re
from pathlib import Path

import numpy as np

#### Load British wikidata

In [2]:
# Load the British Isles Wikidata table, discard the unnamed column carried in
# the CSV, make the coordinates numeric and keep only rows that have both of them.
britdf = (
    pd.read_csv("british_isles.csv", header=0, index_col=None, low_memory=False)
    .drop(columns=["Unnamed: 0"])
    .astype({"latitude": float, "longitude": float})
    .dropna(subset=["latitude", "longitude"])
)

#### Add geonames alternate names

In [3]:
# Fetch the GeoNames dumps used below. Each download is skipped when its target
# file already exists under /resources/geonames/, so re-running the cell is cheap.
# NOTE(review): /resources/geonames/ is assumed to exist already - nothing here creates it.

# Alternate names dump; the archive also unpacks iso-languagecodes.txt, which is removed.
if not Path("/resources/geonames/alternateNamesV2.txt").exists():
    !wget http://download.geonames.org/export/dump/alternateNamesV2.zip
    !unzip alternateNamesV2.zip
    !rm alternateNamesV2.zip
    !rm iso-languagecodes.txt
    !mv alternateNamesV2.txt /resources/geonames/alternateNamesV2.txt
    
# GB dump; presumably the archive ships a readme.txt alongside GB.txt - it is removed.
if not Path("/resources/geonames/GB.txt").exists():
    !wget http://download.geonames.org/export/dump/GB.zip
    !unzip GB.zip
    !rm readme.txt
    !rm GB.zip
    !mv GB.txt /resources/geonames/GB.txt
    
# IE dump, handled the same way as GB.
if not Path("/resources/geonames/IE.txt").exists():
    !wget http://download.geonames.org/export/dump/IE.zip
    !unzip IE.zip
    !rm readme.txt
    !rm IE.zip
    !mv IE.txt /resources/geonames/IE.txt

--2020-10-07 13:34:08--  http://download.geonames.org/export/dump/alternateNamesV2.zip
Resolving download.geonames.org (download.geonames.org)... 188.40.33.19
Connecting to download.geonames.org (download.geonames.org)|188.40.33.19|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 161915121 (154M) [application/zip]
Saving to: ‘alternateNamesV2.zip’


2020-10-07 13:34:10 (90.8 MB/s) - ‘alternateNamesV2.zip’ saved [161915121/161915121]

Archive:  alternateNamesV2.zip
  inflating: iso-languagecodes.txt   
  inflating: alternateNamesV2.txt    
--2020-10-07 13:34:15--  http://download.geonames.org/export/dump/GB.zip
Resolving download.geonames.org (download.geonames.org)... 188.40.33.19
Connecting to download.geonames.org (download.geonames.org)|188.40.33.19|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2163500 (2.1M) [application/zip]
Saving to: ‘GB.zip’


2020-10-07 13:34:15 (11.3 MB/s) - ‘GB.zip’ saved [2163500/2163500]

Archive:  GB.zip


#### Process alternate names table

In [4]:
# Column names for the alternate-names dump; they are passed explicitly to
# read_csv, so the first line of the file is read as data rather than a header.
altnames_columns = [
    "alternateNameId", "geonameid", "isolanguage", "alternateName",
    "isPreferredName", "isShortName", "isColloquial", "isHistoric",
    "from", "to",
]
geoaltnames = pd.read_csv(
    "/resources/geonames/alternateNamesV2.txt",
    sep="\t",
    names=altnames_columns,
    index_col=None,
    low_memory=False,
)

In [5]:
# Some rows of the alternate-names table are tagged with a GeoNames pseudocode
# in `isolanguage` instead of a language: these are not real names, drop them.
# Pseudocode list taken from https://www.geonames.org/manual.html
gn_pseudocodes = [
    "post", "link", "iata", "icao", "faac", "tcid",
    "unlc", "abbr", "wkdt", "phon", "piny", "fr_1793",
]

# Rows with a missing `isolanguage` are not in the list and are therefore kept.
is_pseudocode = geoaltnames["isolanguage"].isin(gn_pseudocodes)
geoaltnames = geoaltnames[~is_pseudocode]

In [6]:
# From here on only the GeoNames identifier and the name itself are used.
geoaltnames = geoaltnames.loc[:, ["geonameid", "alternateName"]]

#### Process GB and IE geonames tables

In [7]:
# Country: GB (United Kingdom)
# Read the GB dump and reshape it into (geonameid, alternateName) pairs, taking
# both the `name` and the `asciiname` column as sources of names.
gb_columns = [
    "geonameid", "name", "asciiname", "alternatenames", "latitude", "longitude",
    "fclass", "fcode", "ccode", "cc2", "admin1", "admin2", "admin3", "admin4",
    "population", "elevation", "dem", "timezone", "moddate",
]
gb_geonames = pd.read_csv(
    "/resources/geonames/GB.txt",
    sep="\t",
    names=gb_columns,
    index_col=None,
    low_memory=False,
)
gb_geonames = gb_geonames[["geonameid", "name", "asciiname"]]

# The group keys are the distinct (geonameid, <name>) tuples; a set merges the
# pairs coming from the two columns and removes repeats.
gb_pairs = set()
for name_column in ("name", "asciiname"):
    gb_pairs.update(gb_geonames.groupby(["geonameid", name_column]).groups)
gb_altnames = list(gb_pairs)

gb_geonames = pd.DataFrame(gb_altnames, columns=["geonameid", "alternateName"])

In [8]:
# Country: IE (Ireland)
# Read the IE dump and reshape it into (geonameid, alternateName) pairs, taking
# both the `name` and the `asciiname` column as sources of names.
ie_columns = [
    "geonameid", "name", "asciiname", "alternatenames", "latitude", "longitude",
    "fclass", "fcode", "ccode", "cc2", "admin1", "admin2", "admin3", "admin4",
    "population", "elevation", "dem", "timezone", "moddate",
]
ie_geonames = pd.read_csv(
    "/resources/geonames/IE.txt",
    sep="\t",
    names=ie_columns,
    index_col=None,
    low_memory=False,
)
ie_geonames = ie_geonames[["geonameid", "name", "asciiname"]]

# The group keys are the distinct (geonameid, <name>) tuples; a set merges the
# pairs coming from the two columns and removes repeats.
ie_pairs = set()
for name_column in ("name", "asciiname"):
    ie_pairs.update(ie_geonames.groupby(["geonameid", name_column]).groups)
ie_altnames = list(ie_pairs)

ie_geonames = pd.DataFrame(ie_altnames, columns=["geonameid", "alternateName"])

#### Concatenate all altname dataframes and filter relevant rows

In [9]:
# Stack the three (geonameid, alternateName) tables and remove exact repeats,
# renumbering the index from 0 at both steps.
geonames_altnames = (
    pd.concat([geoaltnames, gb_geonames, ie_geonames], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

In [10]:
# Filter out alternate names if they are not in Latin alphabet:
# Compiled once instead of on every call. The class covers U+0040-U+024F:
# ASCII from "@" upwards (so letters, but not digits or most punctuation),
# Latin-1 Supplement, Latin Extended-A and Latin Extended-B.
_LATIN_RANGE = re.compile(u'[\u0040-\u007F\u0080-\u00FF\u0100-\u017F\u0180-\u024F]', flags=re.UNICODE)


def latin_alphabet(toponym):
    """Tell whether a toponym contains at least one character in the Latin range.

    Args:
        toponym: The name to test. Anything that is not a string (for example
            the NaN pandas uses for a missing name) is treated as non-Latin.

    Returns:
        True if any character of `toponym` falls in U+0040-U+024F, False
        otherwise. A single matching character is enough; the name is not
        required to be entirely Latin.
    """
    # Missing names arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(toponym, str):
        return False
    return _LATIN_RANGE.search(toponym) is not None
    
# Evaluate the check on the name column directly. A row-wise
# DataFrame.apply(axis=1) builds a Series for every row, which is very slow on
# a table of this size; mapping over the single column yields the same mask.
# astype(bool) keeps the mask boolean even if the table happens to be empty.
has_latin_chars = geonames_altnames["alternateName"].map(latin_alphabet).astype(bool)
geonames_altnames = geonames_altnames[has_latin_chars]

In [11]:
# Keep only rows that have a corresponding Wikidata entry
def parse_geonames(geoIDs):
    """Turn the stringified list of GeoNames IDs of one entity into integers.

    `geoIDs` is expected to be the text of a Python list literal whose items
    are ID strings. Any non-string value (e.g. NaN for an entity without
    GeoNames IDs) gives an empty list, and list items that are not strings
    are skipped.
    """
    if type(geoIDs) is not str:
        return []
    parsed_ids = ast.literal_eval(geoIDs)
    return [int(raw_id) for raw_id in parsed_ids if type(raw_id) is str]

# Flatten the GeoNames IDs referenced by every Wikidata entry into one list
# (entries without IDs contribute nothing) ...
brit_geonameIDs = [
    geoname_id
    for raw_ids in britdf["geonamesIDs"]
    for geoname_id in parse_geonames(raw_ids)
]

# ... and keep only the alternate names attached to one of those IDs.
linked_to_wikidata = geonames_altnames["geonameid"].isin(brit_geonameIDs)
geonames_altnames = geonames_altnames[linked_to_wikidata]

#### Create altname-focused gazetteer

In [12]:
def obtain_wikidata_altnames(elabel, aliases, nativelabel):
    """Collect the Wikidata names of one entity, tagged with where they came from.

    Args:
        elabel: English label. A trailing parenthetical, and a comma left at
            the end after removing it, are stripped before use.
        aliases: Text of a dict literal mapping a language to a list of aliases.
        nativelabel: Text of a literal holding the native labels.

    Any argument that is not a string (e.g. NaN) is ignored.

    Returns:
        Dict mapping each name to "english_label", "wikidata_alias" or
        "native_label". A name seen more than once keeps the tag of its first
        occurrence, in that order of precedence.
    """
    trailing_parenthetical = r"(.+)\(.+\)$"
    altnames = {}

    if type(elabel) is str:
        parenthetical = re.match(trailing_parenthetical, elabel)
        if parenthetical:
            elabel = re.sub(",$", "", parenthetical.group(1).strip())
        altnames.setdefault(elabel, "english_label")

    if type(aliases) is str:
        aliases_by_language = ast.literal_eval(aliases)
        for language in aliases_by_language:
            for alias in aliases_by_language[language]:
                altnames.setdefault(alias, "wikidata_alias")

    if type(nativelabel) is str:
        for native_name in ast.literal_eval(nativelabel):
            altnames.setdefault(native_name, "native_label")

    return altnames

In [13]:
def obtain_geonames_altnames(geonamesIDs, geoaltnames, altnames):
    """Add the GeoNames alternate names of one entity to `altnames`.

    Args:
        geonamesIDs: Text of a list literal of GeoNames ID strings; a
            non-string value (e.g. NaN) leaves `altnames` untouched.
        geoaltnames: DataFrame with `geonameid` and `alternateName` columns.
        altnames: Dict of names already collected. It is updated in place:
            names not yet present are added with the tag "geonames".

    Returns:
        The same `altnames` dict.
    """
    if type(geonamesIDs) is not str:
        return altnames

    for gid in ast.literal_eval(geonamesIDs):
        # Only IDs stored as strings are looked up.
        if type(gid) is not str:
            continue
        names_for_id = geoaltnames[geoaltnames["geonameid"] == int(gid)]
        for name in names_for_id["alternateName"]:
            altnames.setdefault(name, "geonames")

    return altnames

In [14]:
def format_for_candranker(gazname, unique_placenames_array):
    """Write the alternate names of a gazetteer in DeezyMatch candidate-ranker format.

    One line is written per name, as `<name>\\t0\\tfalse`. Double quotes are
    removed from the names and surrounding whitespace is trimmed; names that
    are empty after this clean-up are skipped.

    Args:
        gazname: Output path without extension; ".txt" is appended.
        unique_placenames_array: Iterable of name strings.
    """
    # Explicit UTF-8: the names contain non-ASCII characters, and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(gazname + ".txt", "w", encoding="utf-8") as fw:
        for pl in unique_placenames_array:
            cleaned = pl.replace('"', "").strip()
            # Checked after removing quotes, so a name made only of quotes
            # does not end up as a line with an empty candidate.
            if cleaned:
                fw.write(cleaned + "\t0\tfalse\n")

In [15]:
# Build one record per (Wikidata entity, alternate name) pair.
gazetteer_records = []
for _, row in britdf.iterrows():
    dAltnames = obtain_wikidata_altnames(row["english_label"], row["alias_dict"], row["nativelabel"])
    if not dAltnames:
        # Entities without any alternate names are likely to be ghost entities, e.g. Q24663377
        continue
    # GeoNames names are only added for entities that already have a Wikidata name.
    dAltnames = obtain_geonames_altnames(row["geonamesIDs"], geonames_altnames, dAltnames)
    for name, name_source in dAltnames.items():
        # Skip empty names and anything that is not a string.
        if isinstance(name, str) and name:
            gazetteer_records.append(
                (row["wikidata_id"], name, name_source, row["latitude"], row["longitude"])
            )

wkgazetteer = pd.DataFrame(gazetteer_records, columns=["wkid", "altname", "source", "lat", "lon"])

# The same name may reach an entity through several routes; keep the first one.
wkgazetteer = wkgazetteer.drop_duplicates(subset=["wkid", "altname"])
wkgazetteer.to_pickle("../toponym_matching/gazetteers/britwikidata_gazetteer.pkl")

# Sorted so that the candidates file is identical from one run to the next:
# iterating over a set of strings gives an order that changes between
# interpreter sessions.
unique_placenames_array = sorted(wkgazetteer["altname"].unique())
format_for_candranker("../toponym_matching/gazetteers/britwikidata_candidates", unique_placenames_array)

In [16]:
# Size check: (number of wkid/altname rows, number of columns) of the gazetteer.
wkgazetteer.shape

(822161, 5)