In [1]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or pandas
# chained-assignment warnings that later cells might raise — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
import csv
from difflib import SequenceMatcher
import itertools
import numpy as np
import pandas as pd
import pyproj
from scipy import spatial
import string
import tqdm

### Get GB1900 relevant data

In [4]:
# Load the complete GB1900 gazetteer. The file is opened as UTF-16 text and
# handed to pandas as an already-decoded handle. No empty placeholder frame is
# created first, so a failed load cannot leave a misleading empty `gb1900df`
# behind in the kernel.
with open("gb1900_gazetteer_complete_july_2018.csv", encoding='UTF-16') as gb1900_file:
    gb1900df = pd.read_csv(gb1900_file)

In [5]:
# Keep only the toponym text and its coordinates. The explicit .copy() makes
# this an independent frame, so the columns added to it later in the notebook
# (x, y, z, tokens) neither trigger pandas' SettingWithCopyWarning nor depend
# on whether the slice happens to be a view of gb1900df.
gb1900subdf = gb1900df.loc[:, ['final_text', 'latitude', 'longitude']].copy()

In [6]:
# Preview the first rows of the three retained columns.
gb1900subdf.head()

Unnamed: 0,final_text,latitude,longitude
0,F. P.,52.84205,-3.176744
1,Parly. & Munl Boro. By.,51.509918,-0.102246
2,S. Ps.,51.510105,-0.103083
3,Southwark Bridge Stairs,51.509744,-0.09642
4,St. Paul's Pier,51.510232,-0.099456


In [7]:
# Write the trimmed GB1900 table (text + latitude/longitude) to a pickle file.
gb1900subdf.to_pickle("gb1900_toponyms.pkl")

In [8]:
# Preview the frame again; writing the pickle does not modify it.
gb1900subdf.head()

Unnamed: 0,final_text,latitude,longitude
0,F. P.,52.84205,-3.176744
1,Parly. & Munl Boro. By.,51.509918,-0.102246
2,S. Ps.,51.510105,-0.103083
3,Southwark Bridge Stairs,51.509744,-0.09642
4,St. Paul's Pier,51.510232,-0.099456


### Start the WikiGazetteer DB server

WikiGazetteer is a gazetteer based on Wikipedia and enriched with Geonames data.

To build a WikiGazetteer (into a MySQL database) for a specific Wikipedia version follow [these instructions](https://github.com/Living-with-machines/lwm_GIR19_resolving_places/tree/master/gazetteer_construction). 

**Connect to mysqlGaz:** Make sure you change the credentials below to your own. The next cell connects to the WikiGazetteer database running locally in MySQL. This notebook then takes the relevant fields from that database and stores them in a more manageable pickle file.

In [9]:
import os

import mysql.connector
from mysql.connector import Error

# Connection settings are read from environment variables so that real
# credentials do not have to be written into the notebook. The fallbacks are
# the local test values this notebook has always used.
gaz_host = os.environ.get('GAZ_DB_HOST', 'localhost')
gaz_database = os.environ.get('GAZ_DB_NAME', 'gazetteer')
gaz_user = os.environ.get('GAZ_DB_USER', 'testGazetteer')
gaz_password = os.environ.get('GAZ_DB_PASSWORD', '1234')

# None (rather than an empty string) marks "not connected yet".
gazDB = None
cursorGaz = None
try:
    gazDB = mysql.connector.connect(
            host=gaz_host,
            database=gaz_database,
            user=gaz_user,
            password=gaz_password)
    if gazDB.is_connected():
        # dictionary=True makes every fetched row a dict keyed by column name.
        cursorGaz = gazDB.cursor(dictionary=True)
except Error as e:
    print("Error while connecting to MySQL", e)
    # Re-raise so that execution stops here with the real cause, instead of a
    # later cell failing with an unrelated AttributeError on the placeholder.
    raise

### Get WikiGazetteer relevant data

Since GB1900 only covers Great Britain, we only retrieve Wiki entries within an approximate bounding box of Britain. We focus on less relevant place names (hence `inlinks <= 200`).

In [10]:
# Alternate names and coordinates of locations inside the rough bounding box
# of Britain, restricted to rows with inlinks <= 200.
britain_altnames_query = """
        SELECT lat, lon, altname.altname FROM location
        JOIN altname ON altname.main_id=location.id
        JOIN inlinks ON inlinks.main_id=location.id
        WHERE lat > 49.0
        AND lat < 63.0
        AND lon > -15.0
        AND lon < 3.5
        AND inlinks <= 200
    """
cursorGaz.execute(britain_altnames_query)
results = cursorGaz.fetchall()

In [11]:
# Turn the fetched rows (each indexed by column name) into three parallel
# lists, then into a DataFrame with one row per alternate name. The lat/lon
# fields are stored under the column names latitude/longitude.
altnames = [record["altname"] for record in results]
latitudes = [record["lat"] for record in results]
longitudes = [record["lon"] for record in results]

wikidf = pd.DataFrame()
wikidf['altname'] = altnames
wikidf['latitude'] = latitudes
wikidf['longitude'] = longitudes

In [12]:
# Write the WikiGazetteer subset (altname + latitude/longitude) to a pickle file.
wikidf.to_pickle("wikigaz_britain.pkl")

In [13]:
# Preview the first rows: one row per alternate name, so a location with
# several alternate names repeats the same coordinates.
wikidf.head()

Unnamed: 0,altname,latitude,longitude
0,A. A. Milne,51.09,0.107
1,Achill Island,53.9639,-10.003
2,Acaill,53.9639,-10.003
3,Achill,53.9639,-10.003
4,Wyspa Achill,53.9639,-10.003


In [14]:
# Release the MySQL cursor and connection; the query results are already held
# in memory. The hasattr guard keeps this cell from raising AttributeError
# when the connection attempt failed and gazDB is still a placeholder value
# rather than a connection object.
if hasattr(gazDB, "is_connected") and gazDB.is_connected():
    cursorGaz.close()
    gazDB.close()

### GB1900 and WikiGazetteer: From coordinates to 3D space

In [15]:
def transform_coordinates(longitude, latitude):
    """Convert WGS84 longitude/latitude (degrees) to geocentric x, y, z.

    Every point is placed at height 0 on the WGS84 ellipsoid. Working in
    geocentric (ECEF) coordinates lets nearby points be found with plain
    Euclidean distance.

    Parameters
    ----------
    longitude, latitude : array-like of equal length
        Coordinates in degrees (pandas Series, NumPy arrays or lists).

    Returns
    -------
    tuple of three numpy.ndarray
        The x, y and z coordinates, in the units of the 'geocent' projection
        (metres by default in PROJ).
    """
    lons = np.asarray(longitude, dtype=float)
    lats = np.asarray(latitude, dtype=float)
    heights = np.zeros(len(lats))

    lla = {"proj": 'latlong', "ellps": 'WGS84', "datum": 'WGS84'}
    ecef = {"proj": 'geocent', "ellps": 'WGS84', "datum": 'WGS84'}

    # pyproj.transform is deprecated in pyproj 2+; Transformer is the
    # documented replacement and is used whenever the installed pyproj has it.
    if hasattr(pyproj, "Transformer"):
        transformer = pyproj.Transformer.from_crs(lla, ecef)
        return transformer.transform(lons, lats, heights, radians=False)

    # Fallback for pyproj releases that predate Transformer.
    return pyproj.transform(pyproj.Proj(**lla), pyproj.Proj(**ecef),
                            lons, lats, heights,
                            radians=False)

In [16]:
# Add geocentric x/y/z columns to the GB1900 table, then preview the result.
gb1900_xyz = transform_coordinates(gb1900subdf["longitude"], gb1900subdf["latitude"])
for axis_name, axis_values in zip(("x", "y", "z"), gb1900_xyz):
    gb1900subdf[axis_name] = axis_values
gb1900subdf.head()

Unnamed: 0,final_text,latitude,longitude,x,y,z
0,F. P.,52.84205,-3.176744,3854770.0,-213945.645048,5059946.0
1,Parly. & Munl Boro. By.,51.509918,-0.102246,3977779.0,-7098.456811,4969049.0
2,S. Ps.,51.510105,-0.103083,3977762.0,-7156.526053,4969062.0
3,Southwark Bridge Stairs,51.509744,-0.09642,3977794.0,-6694.027028,4969037.0
4,St. Paul's Pier,51.510232,-0.099456,3977752.0,-6904.747577,4969071.0


In [17]:
# Add geocentric x/y/z columns to the WikiGazetteer table, then preview it.
wiki_xyz = transform_coordinates(wikidf["longitude"], wikidf["latitude"])
for axis_name, axis_values in zip(("x", "y", "z"), wiki_xyz):
    wikidf[axis_name] = axis_values
wikidf.head()

Unnamed: 0,altname,latitude,longitude,x,y,z
0,A. A. Milne,51.09,0.107,4014238.0,7496.60667,4939840.0
1,Achill Island,53.9639,-10.003,3703301.0,-653191.806674,5134381.0
2,Acaill,53.9639,-10.003,3703301.0,-653191.806674,5134381.0
3,Achill,53.9639,-10.003,3703301.0,-653191.806674,5134381.0
4,Wyspa Achill,53.9639,-10.003,3703301.0,-653191.806674,5134381.0


### Find close-by entries

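For each WikiGazetteer entry, find the closest GB1900 entries: up to 1000 neighbours within 1 km and, separately, up to 1000 neighbours within 5 km. Code by Kasra and Amy.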

In [18]:
# Index the GB1900 points (their x, y, z columns) in a k-d tree so that the
# nearest-neighbour queries below run against this index.
kdtree = spatial.cKDTree(gb1900subdf[["x", "y", "z"]].to_numpy())

In [19]:
# For every WikiGazetteer point, look up at most `num_neighbors` GB1900 points
# no further than `distance_upper_bound` away (same units as the x/y/z columns).
num_neighbors = 1000
distance_upper_bound = 1000
wiki_points = wikidf[['x', 'y', 'z']].to_numpy()
all_dists, all_indxs = kdtree.query(
    wiki_points,
    k=num_neighbors,
    distance_upper_bound=distance_upper_bound,
)

In [20]:
# Wider search (bound 5000 instead of 1000) over the same WikiGazetteer points;
# output_candidates falls back to these results when the narrower search
# yields no candidate.
all_dists_5km, all_indxs_5km = kdtree.query(wikidf[['x', 'y', 'z']].to_numpy(), 
                               k=1000, 
                               distance_upper_bound=5000)
# Show the shape of the result computed in this cell (one row per
# WikiGazetteer entry, one column per requested neighbour).
np.shape(all_indxs_5km)

(232109, 1000)

### Create tokens and match toponyms

In [21]:
# English stop words that get_tokens removes from toponyms before matching.
# The literal repeats some entries (harmless in a set), and the 'n t' entry
# contains a space, so it can never equal a token produced by splitting a
# toponym on spaces.
stop = {'ever', 'but', 'down', 'namely', 'may', 'make', 'within', 'us', 'm', 'up', 'back', 'their', 'third', 'll', 'these', 'every', 'therein', 'off', 'either', 'behind', 'six', 'because', 'done', 'further', 'could', 'cannot', 'still', 'latterly', 'through', 'much', 'with', 'what', 've', 'nowhere', 'over', 'yourselves', 'below', 'which', 'of', "re", 'quite', 'others', 'front', 'by', "ll", 'toward', 'another', 'whom', 'beyond', 'empty', 'if', 'call', 'me', 'itself', 'most', "s", 'who', 'themselves', 'whatever', 'must', 'again', 'get', 'thereafter', 'meanwhile', 's', 'as', "m", 'somehow', 'above', 'please', 'nevertheless', 'whereupon', 'hereafter', 'any', 'anyway', 'was', 'seemed', 'hence', 'here', 'across', 'really', 'never', 'becomes', 'ours', 'this', 'yet', 'seeming', 'than', 'anywhere', 'other', 'whereafter', 'except', 'else', 'own', 'whither', 'elsewhere', 'noone', 'll', 'bottom', 'once', 'move', 'beforehand', 'we', 'whereby', 'both', 'four', 'are', 'anything', 'formerly', 'himself', 'from', 'sixty', 'were', 'each', 'all', 'neither', 'go', 'sometime', 'have', 'latter', 'n t', 'first', 'into', 'during', 'thus', 'them', 'while', 'something', 'everywhere', 'whence', 'did', 'would', 'when', 're', "n t", 'thereupon', 'used', 'nothing', 'become', 'amount', 'his', 'various', 'without', 'mine', 'thence', 'becoming', 'against', 'around', 'least', 'such', 'take', 'even', 'be', 'at', 'together', 'top', 'upon', 'hereupon', 'is', 'has', 'nobody', 'none', 'ourselves', 'beside', 'twenty', 've', 'regarding', 'that', 'am', 'due', 'always', 'show', 'for', 'among', 'out', 'since', 'five', 'though', 'had', 'sometimes', 'the', 'wherein', 'in', 'per', 'afterwards', 'its', 'i', 'do', 'made', 're', 'same', 're', 'those', 'although', 'former', 'hers', 'eleven', 'd', 'everyone', 's', 'should', 'about', 'next', 'well', 'no', 'alone', 'whenever', 'hereby', 'more', 'also', 'forty', 'and', 'thru', 'herein', 'someone', 'a', 'ca', 'moreover', 'perhaps', 'fifty', 'an', 'everything', 'became', 
'her', 'being', 'now', 'whereas', 'throughout', 'not', 'eight', 'you', 'less', 'only', 'besides', 'she', 'yourself', 'three', 'say', 'name', 'or', 'him', 'therefore', 'onto', 'almost', 'serious', 'after', 'yours', 'see', 'n t', 'just', 'rather', 'might', 'to', 'anyone', 'whoever', 'two', 'there', 'fifteen', 'keep', 'my', 'using', 'anyhow', 'they', 'somewhere', 'will', 'our', 'put', 'few', 'on', "ve", 'twelve', 'part', 'm', 'd', 'often', 'between', 'towards', "d", 'whose', 'many', 'amongst', 'myself', 'been', 'wherever', 'so', 'it', 'hundred', 'your', 'herself', 'mostly', 'last', 'several', 'too', 'full', 'indeed', 'otherwise', 'enough', 'where', 'then', 'he', 'give', 'via', 'already', 'doing', 'along', 'very', 'before', 'until', 'one', 'nine', 'does', 'nor', 'how', 'unless', 'why', 'thereby', 'ten', 'whether', 'can', 'some', 'seem', 'under', 'side', 'however', 'seems', 'whole'}

In [22]:
def get_tokens(toponym):
    """Split a toponym into normalised tokens for fuzzy matching.

    The text is split on single spaces; each piece is lower-cased, stripped of
    ASCII punctuation and of surrounding whitespace. Pieces shorter than two
    characters and pieces found in the module-level `stop` set are discarded.

    Parameters
    ----------
    toponym : str
        Place name to tokenise. Any non-string value (for example the NaN
        pandas uses for a missing name, or None) yields an empty list.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if not isinstance(toponym, str):
        return []

    # Build the punctuation-removal table once per call, not once per token.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    tokens = []
    for piece in toponym.split(" "):
        # Strip whitespace *before* filtering, so that a stop word followed by
        # a stray newline or tab is still recognised as a stop word.
        token = piece.lower().translate(strip_punctuation).strip()
        if len(token) >= 2 and token not in stop:
            tokens.append(token)
    return tokens

In [23]:
gb1900subdf['tokens'] = gb1900subdf['final_text'].map(get_tokens)
gb1900subdf.head()

Unnamed: 0,final_text,latitude,longitude,x,y,z,tokens
0,F. P.,52.84205,-3.176744,3854770.0,-213945.645048,5059946.0,[]
1,Parly. & Munl Boro. By.,51.509918,-0.102246,3977779.0,-7098.456811,4969049.0,"[parly, munl, boro]"
2,S. Ps.,51.510105,-0.103083,3977762.0,-7156.526053,4969062.0,[ps]
3,Southwark Bridge Stairs,51.509744,-0.09642,3977794.0,-6694.027028,4969037.0,"[southwark, bridge, stairs]"
4,St. Paul's Pier,51.510232,-0.099456,3977752.0,-6904.747577,4969071.0,"[st, pauls, pier]"


In [24]:
def calculate_similarity(wikitokens, gb1900tokens, threshold=0.8):
    """Fuzzy Jaccard similarity between two token lists.

    Every token of `wikitokens` is compared with every token of
    `gb1900tokens` using difflib's SequenceMatcher ratio. Two tokens match
    when their ratio is strictly greater than `threshold`. The intersection is
    the size of a one-to-one pairing of matching tokens (best ratios paired
    first), so each token is counted at most once. This keeps the result in
    [0, 1]; counting every matching pair would let repeated or near-duplicate
    tokens push the intersection above the number of tokens and produce
    similarities above 1, below 0, or 0 for identical toponyms.

    Parameters
    ----------
    wikitokens, gb1900tokens : list of str
        Token lists to compare.
    threshold : float, optional
        Minimum (exclusive) SequenceMatcher ratio for two tokens to match.

    Returns
    -------
    (list, float)
        All compared pairs as (wiki_token, gb1900_token, ratio) tuples in
        comparison order, and the similarity. When both token lists are empty
        the result is ([], 0.0).
    """
    matched = []
    above_threshold = []
    indexed_pairs = itertools.product(enumerate(wikitokens), enumerate(gb1900tokens))
    for (wiki_pos, wt), (gb_pos, gt) in indexed_pairs:
        ratio = SequenceMatcher(None, wt, gt).ratio()
        matched.append((wt, gt, ratio))
        if ratio > threshold:
            above_threshold.append((ratio, wiki_pos, gb_pos))

    # Greedy one-to-one pairing, best ratios first. The sort is stable, so ties
    # keep the order in which the pairs were compared.
    above_threshold.sort(key=lambda item: -item[0])
    used_wiki = set()
    used_gb = set()
    intersection = 0
    for _, wiki_pos, gb_pos in above_threshold:
        if wiki_pos in used_wiki or gb_pos in used_gb:
            continue
        used_wiki.add(wiki_pos)
        used_gb.add(gb_pos)
        intersection += 1

    union = len(wikitokens) + len(gb1900tokens) - intersection
    if union == 0:
        # Only possible when both token lists are empty.
        return [], 0.0
    return matched, float(intersection) / union

In [25]:
def find_candidates(matched_distances, matched_indices, row_index, row, min_similarity=0.75):
    """Collect the GB1900 neighbours whose name resembles a WikiGazetteer name.

    Parameters
    ----------
    matched_distances : numpy.ndarray
        Neighbour distances for one WikiGazetteer entry. Non-finite entries
        are ignored (NOTE(review): presumably the padding cKDTree.query
        returns for missing neighbours — confirm).
    matched_indices : numpy.ndarray
        Positional indices into `gb1900subdf`, aligned with
        `matched_distances`.
    row_index : int
        Unused; kept so existing callers keep working.
    row : pandas.Series
        WikiGazetteer row; only its 'altname' field is read.
    min_similarity : float, optional
        Minimum similarity (inclusive) for a neighbour to become a candidate.

    Returns
    -------
    list of tuple
        One (wiki_toponym, gb1900_toponym, distance, similarity, token_pairs)
        tuple per accepted neighbour, in the order of the input arrays.
    """
    wikitoponym = row['altname']
    wikitokens = get_tokens(wikitoponym)

    # Fetch the two needed columns as arrays once. Indexing the DataFrame with
    # .iloc[i] inside the loop would build a new row Series per neighbour.
    gb1900_names = gb1900subdf['final_text'].to_numpy()
    gb1900_token_lists = gb1900subdf['tokens'].to_numpy()

    within_bound = np.isfinite(matched_distances)
    candidates = []
    for i, dist in zip(matched_indices[within_bound], matched_distances[within_bound]):
        gb1900toponym = gb1900_names[i]
        pairs, sim = calculate_similarity(wikitokens, gb1900_token_lists[i])
        if sim >= min_similarity:
            candidates.append((wikitoponym, gb1900toponym, dist, sim, pairs))
    return candidates

In [26]:
def output_candidates(row, writer):
    """Write the GB1900 candidates for one WikiGazetteer row and return them.

    Neighbours from the narrower search (`all_dists` / `all_indxs`) are tried
    first; the wider `_5km` search is consulted only when that gives nothing.
    `row.name` is used as the position of the row in the neighbour arrays.
    Each candidate tuple is written to `writer` as one row of strings.
    """
    position = row.name
    candidates = find_candidates(all_dists[position], all_indxs[position], position, row)
    if len(candidates) == 0:
        # Nothing similar enough nearby: retry with the wider search results.
        candidates = find_candidates(
            all_dists_5km[position], all_indxs_5km[position], position, row
        )
    writer.writerows([str(field) for field in candidate] for candidate in candidates)
    return candidates

In [29]:
%%time
with open('candidates.tsv', 'w') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerow(['wiki', 'gb1900', 'distance', 'jaccard_sim', 'match'])
#     df = wikidf.head(100)
    df = wikidf
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
        output_candidates(row, writer)    

100%|██████████| 232109/232109 [8:40:04<00:00,  6.96it/s]   

CPU times: user 8h 36min 58s, sys: 2min 38s, total: 8h 39min 37s
Wall time: 8h 40min 4s





Execution time:
```
CPU times: user 8h 36min 58s, sys: 2min 38s, total: 8h 39min 37s
Wall time: 8h 40min 4s
```