In [145]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import mysql.connector
from mysql.connector import Error
import psycopg2
import psycopg2.extras
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from timeit import default_timer as timer
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Point
from descartes import PolygonPatch
from geopandas.tools import sjoin
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import itertools
import pyproj
import haversine
from scipy import spatial
import getpass
import os.path
import fiona
import json
import glob
import ast
import csv
import re
import itertools
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
import pandas as pd

### Start the gazetteer DB server

Make sure you change your credentials. This is to connect to the DB locally in MySQL:

In [146]:
# read Credentials from the credentials.json file
credentials_config = dict()
with open('../credentials.json') as f:
    credentials_config = json.load(f)

In [147]:
credentials_config['lwmrelationaldb']['password'] = getpass.getpass(prompt='Enter your password: ')

Enter your password: ········


In [148]:
# Construct connection string
psql_conn_string =\
    "host={0} user={1} dbname={2} password={3} sslmode={4} sslrootcert={5}".format(
                credentials_config['lwmrelationaldb']['host'], 
                credentials_config['lwmrelationaldb']['user'], 
                'gazetteer', 
                credentials_config['lwmrelationaldb']['password'], 
                credentials_config['lwmrelationaldb']['sslmode'], 
                credentials_config['lwmrelationaldb']['sslrootcert'])

In [149]:
gazDB = psycopg2.connect(psql_conn_string) 
print("Connection established!")

cursorGaz = gazDB.cursor(cursor_factory=psycopg2.extras.DictCursor)

Connection established!


### Read GB1900

In [150]:
gb1900df = pd.DataFrame()
with open("/Users/mcollardanuy/Documents/workspace/toponymResolution/GIR2019/gb1900_analysis/gb1900_gazetteer_complete_july_2018.csv", encoding='UTF-16') as f:
    gb1900df = pd.read_csv(f)

  interactivity=interactivity, compiler=compiler, result=result)


In [151]:
gb1900df.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,


### Create an approximately British Wiki Gazetteer

In [152]:
start_time = timer()

In [153]:
def find_british_locations(cursorGaz, timer):
    print('Start locations SQL query: {} seconds'.format(timer() - start_time))

    cursorGaz.execute("""
            SELECT * FROM location
            WHERE lat > 50.0
            AND lat < 62.0
            AND lon > -14.0
            AND lon < 3.0
        """)
    results = cursorGaz.fetchall()
    
    main_id = []
    wiki_title = []
    wiki_lat = []
    wiki_lon = []
    page_len = []
    type_loc = []
    population = []
    for r in results:
        main_id.append(r['id'])
        wiki_title.append(r['wiki_title'])
        page_len.append(r['page_len'])
        wiki_lat.append(r['lat'])
        wiki_lon.append(r['lon'])
        type_loc.append(r['type'])
        population.append(r['population'])

    df = pd.DataFrame(
        {'main_id' : main_id,
         'wiki_title': wiki_title,
         'wiki_lat': wiki_lat,
         'wiki_lon': wiki_lon,
         'page_len': page_len,
         'type_loc': type_loc,
         'population': population
        })
    return df

def gaz_to_geodataframe(df):
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df.wiki_lon, df.wiki_lat))

    poly  = gpd.GeoDataFrame.from_file('gb1900_analysis/shapefiles/GBR_adm/GBR_adm0.shp')
    point = gdf

    pointInPolys = sjoin(point, poly, how='left')
    grouped = pointInPolys.groupby('index_right')
    
    britdf = df.iloc[grouped.groups[0]]
    britdf.reset_index(drop=True)
    britdf.to_pickle("brit_wikigazetteer.pkl")
    
    return britdf

def find_british_altnames(cursorGaz, timer):
    print('Start altnames SQL query: {} seconds'.format(timer() - start_time))

    cursorGaz.execute("""
            SELECT altname.* FROM altname
            JOIN location ON location.id=altname.main_id
            WHERE lat > 50.0
            AND lat < 62.0
            AND lon > -14.0
            AND lon < 3.0
        """)
    results = cursorGaz.fetchall()
    
    dAltnames = dict()
    
    for r in results:
        if len(r['altname']) < 50:
            dAltnames[r['id']] = (r['altname'], r['source'], r['main_id'])

    return dAltnames

#### Locations dataframe

In [154]:
# locdf = find_british_locations(cursorGaz, timer)
# britdf = gaz_to_geodataframe(locdf)
# britdf.head()
britdf = pd.read_pickle("GIR2019/gb1900_analysis/brit_wikigazetteer.pkl")
britdf.head()

Unnamed: 0,wiki_id,wiki_title,wiki_lat,wiki_lon,geometry
0,20,Aberdeenshire,57.151,-2.123,POINT (-2.123 57.151)
1,21,A._A._Milne,51.09,0.107,POINT (0.107 51.09)
4,48,Abbotsford_House,55.5997,-2.78194,POINT (-2.78194 55.5997)
5,59,Aberdeen,57.15,-2.11,POINT (-2.11 57.15)
6,99,"Angus,_Scotland",56.6667,-2.91667,POINT (-2.91667 56.6667)


#### Altnames dataframe

In [155]:
dAltnames = find_british_altnames(cursorGaz, timer)

alt_id = []
main_id = []
altname = []
source = []
for r in dAltnames:
    main_id.append(dAltnames[r][2])
    alt_id.append(r)
    altname.append(dAltnames[r][0])
    source.append(dAltnames[r][1])

altdf = pd.DataFrame(
    {'alt_id' : alt_id,
     'main_id': main_id,
     'altname': altname,
     'source': source
    })

Start altnames SQL query: 2.122660213999552 seconds


In [156]:
altdf.shape

(224386, 4)

In [157]:
altdf = altdf.groupby("main_id")['altname'].apply(', '.join)
altdf.head()

main_id
20    Aberdeenshire, Aberdeen County, Swydd Aberdeen, Aberdeen, Aiberdeenshire, Contae Obar Deathain, Siorrachd Obar Dheathain, Aberdonensis, Coontae Aberdon
21    A. A. Milne                                                                                                                                            
23    Azincourt, Asincurtis                                                                                                                                  
25    Achill Island, Acaill, Achill, Wyspa Achill, Achill - Acaill, Curraun                                                                                  
48    Abbotsford House, Abbotsford, Clartyhole                                                                                                               
Name: altname, dtype: object

#### Joint locations and altnames dataframe

In [158]:
britdf = pd.merge(locdf, altdf, how='left', on='main_id')
britdf = britdf[britdf.altname.notnull()]
# britdf.head()

# Example of multiple altnames:
britdf[britdf['main_id'] == 20]

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,geometry,altname
64008,20,Aberdeenshire,57.151,-2.123,30952,adm2nd,,POINT (-2.123 57.151),"Aberdeenshire, Aberdeen County, Swydd Aberdeen, Aberdeen, Aiberdeenshire, Contae Obar Deathain, Siorrachd Obar Dheathain, Aberdonensis, Coontae Aberdon"


In [159]:
altdf.shape

(206134,)

In [160]:
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           gb1900df["longitude"].to_numpy(), 
                           gb1900df["latitude"].to_numpy(), 
                           np.zeros(len(gb1900df["latitude"])), 
                           radians=False)

In [161]:
gb1900df["x"] = x
gb1900df["y"] = y
gb1900df["z"] = z

In [162]:
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           britdf["wiki_lon"].to_numpy(), 
                           britdf["wiki_lat"].to_numpy(), 
                           np.zeros(len(britdf["wiki_lat"])), 
                           radians=False)

In [163]:
britdf["x"] = x
britdf["y"] = y
britdf["z"] = z

In [164]:
kdtree = spatial.cKDTree(gb1900df[["x", "y", "z"]].to_numpy())

In [165]:
wikix = britdf.iloc[0]['x']
wikiy = britdf.iloc[0]['y']
wikiz = britdf.iloc[0]['z']

In [186]:
num_neighbors = 1001
all_dists, all_indxs = kdtree.query(britdf[['x', 'y', 'z']].to_numpy(), 
                                    k=num_neighbors, 
                                    distance_upper_bound=5000)
print(all_dists)
print(all_indxs)
"""
for ind in all_indxs:
    print()
"""

[[  25.52332013   64.86583931   82.02319054 ... 3719.95293279
  3720.37828297 3721.23819518]
 [  27.70722677  115.48635185  117.68556333 ... 2958.7174645
  2959.85930648 2961.83500792]
 [  60.34202469   74.3805534   101.32354039 ... 2250.87624754
  2256.01165412 2256.27307571]
 ...
 [  97.55914254  101.67764269  111.62559352 ... 3951.61453132
  3952.6784816  3954.60336674]
 [ 164.72703494  186.69816686  200.41045138 ... 4031.3263231
  4033.52923922 4038.61758182]
 [  65.59812607   78.62932382  118.01049747 ... 3971.64155341
  3974.18150433 3975.63384941]]
[[ 151327  151331  151340 ... 2273480  232215  231563]
 [ 151288  231738  231737 ...  142321  142801 1948340]
 [ 323101  231865  323102 ...  142769  320936  128591]
 ...
 [ 199375  198643  199374 ... 2167680 2170054 2401373]
 [ 282947  282572  128234 ...  199112  199622 2292994]
 [ 198722  198609  198721 ... 2167964 1094499 2167967]]


'\nfor ind in all_indxs:\n    print()\n'

In [167]:
np.shape(all_indxs)

(206134, 1001)

In [182]:
britdf.iloc[160000:160001]

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,geometry,altname,x,y,z
161598,931350,Deeping_St_James_Priory,52.6717,-0.2895,2778,,,POINT (-0.2895 52.6717),Deeping St James Priory,3875745.0,-19583.250968,5048473.0


In [189]:
all_dists[160000, 0:1000]

array([  52.58162772,   60.57253478,   87.38804972,  102.75402349,
        144.40535071,  155.66414061,  159.87063182,  179.75745924,
        198.37918059,  217.67161723,  245.78537527,  261.66042048,
        295.93811605,  309.17164003,  313.01179478,  316.73162209,
        356.87744327,  362.00726526,  385.27835638,  403.79858562,
        421.2362011 ,  426.88973071,  427.23540366,  461.93925873,
        502.75782717,  518.04806683,  518.65342194,  572.67599439,
        575.18476762,  586.49326018,  604.66977016,  623.52625372,
        645.45741328,  652.22561092,  707.99381244,  714.7300929 ,
        722.9414951 ,  723.47052869,  725.39730414,  731.68852612,
        747.23732267,  763.62614366,  778.52716185,  782.57061306,
        785.37961931,  821.08170705,  835.73205747,  841.5143998 ,
        852.77595639,  852.79332073,  855.31959076,  860.89562326,
        866.81637577,  873.99810542,  878.2174456 ,  886.22257412,
        892.90234157,  899.64215625,  901.49655765,  904.30850

In [192]:
qindx = 160000
gb1900df.iloc[all_indxs[qindx, all_dists[qindx] < 5000]]

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes,x,y,z
2142889,5864df272c66dc10b805a47d,St. James's Ch.,England,South Kesteven,Deeping St.james,515809.534245,309597.851398,52.671724,-0.288724,,3875743.0,-19530.728231,5048475.0
2142888,5864df092c66dc10b805a475,Deeping St. James,England,South Kesteven,Deeping St.james,515800.643933,309636.012958,52.672069,-0.288842,,3875712.0,-19538.55763,5048498.0
2142893,5864df682c66dc10b805a495,P. O.,England,South Kesteven,Deeping St.james,515714.910598,309517.410361,52.671022,-0.290151,,3875804.0,-19627.567352,5048427.0
2142892,5864df612c66dc10b805a493,Sch.,England,South Kesteven,Deeping St.james,515670.870786,309538.086133,52.671217,-0.290794,,3875787.0,-19671.025179,5048441.0
2142899,5864e2832c66dc10b805a568,P. H.,England,South Kesteven,Deeping St.james,515781.795845,309451.66315,52.670417,-0.289185,,3875858.0,-19562.519348,5048387.0
2142894,5864df742c66dc10b805a49b,G. P,England,South Kesteven,Deeping St.james,515905.126572,309545.821367,52.671236,-0.287329,,3875786.0,-19436.597704,5048442.0
2142887,5864defb2c66dc10b805a471,Priory Farm on Site of Priory,England,South Kesteven,Deeping St.james,515607.25968,309649.523834,52.672232,-0.291696,,3875697.0,-19731.531373,5048509.0
2142895,5864df7d2c66dc10b805a49f,Sch.,England,South Kesteven,Deeping St.james,515884.496344,309467.135661,52.670534,-0.287662,,3875848.0,-19459.408247,5048394.0
2142891,5864df5b2c66dc10b805a48d,Vicarage,England,South Kesteven,Deeping St.james,515564.720786,309545.70529,52.671308,-0.292361,,3875778.0,-19776.944238,5048447.0
2142855,58af0baf2c66dc81190db5ba,river welland,England,Peterborough,Deeping Gate,515726.920045,309378.390502,52.66977,-0.290022,,3875915.0,-19619.41891,5048343.0


In [193]:
britdf.head()

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,geometry,altname,x,y,z
0,1188030,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9774,-1.76524,87691,landmark,,POINT (-1.76524 54.9774),The Rectory,3666913.0,-113010.602692,5199940.0
1,1188031,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9396,-1.59225,87691,landmark,,POINT (-1.59225 54.9396),Underhill,3670682.0,-102034.526035,5197524.0
2,1188032,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9609,-1.60517,87691,landmark,,POINT (-1.60517 54.9609),Walker Terrace,3668718.0,-102807.873722,5198886.0
4,1188035,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.974,-1.58059,87691,landmark,,POINT (-1.58059 54.974),Byker Neighbourhood Estate Office,3667568.0,-101201.039814,5199723.0
5,1188037,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9745,-1.57448,87691,landmark,,POINT (-1.57448 54.9745),Lychgate Infront of St Lawrence s School,3667533.0,-100808.678779,5199755.0


In [194]:
gb1900df.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes,x,y,z
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,,3854770.0,-213945.645048,5059946.0
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,,3977779.0,-7098.456811,4969049.0
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,,3977762.0,-7156.526053,4969062.0
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,,3977794.0,-6694.027028,4969037.0
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,,3977752.0,-6904.747577,4969071.0


### Exploratory first step

In [42]:
l_altname = []
l_wiki_title = []
l_gb1900text = []
l_gb1900lat = []
l_gb1900lon = []
l_wikilat = []
l_wikilon = []
for index, row in britdf.iloc[180100:180200].iterrows():
    for altnwiki in row['altname'].split(","):
        altnwiki = altnwiki.strip()
        matches = gb1900df[(gb1900df['final_text'].str.contains(row['altname'], case=False)) & (gb1900df['latitude'] >= row['wiki_lat'] - 0.0) & (gb1900df['latitude'] <= row['wiki_lat'] + 0.1) & (gb1900df['longitude'] >= row['wiki_lon'] - 0.1) & (gb1900df['longitude'] <= row['wiki_lon'] + 0.1)]
        for imatch, irow in matches.iterrows():
            l_altname.append(altnwiki)
            l_wiki_title.append(row['wiki_title'])
            l_gb1900text.append(irow['final_text'])
            l_gb1900lat.append(irow['latitude'])
            l_gb1900lon.append(irow['longitude'])
            l_wikilat.append(row['wiki_lat'])
            l_wikilon.append(row['wiki_lon'])
        if matches.empty:
            l_altname.append(altnwiki)
            l_wiki_title.append(row['wiki_title'])
            l_gb1900text.append('')
            l_gb1900lat.append('')
            l_gb1900lon.append('')
            l_wikilat.append(row['wiki_lat'])
            l_wikilon.append(row['wiki_lon'])

matchdf = pd.DataFrame(
        {'altname' : l_altname,
         'wiki_title': l_wiki_title,
         'gb1900text': l_gb1900text,
         'gb1900lat': l_gb1900lat,
         'gb1900lon': l_gb1900lon,
         'wiki_lat': l_wikilat,
         'wiki_lon': l_wikilon
        })

In [44]:
matchdf.to_pickle("match09.pkl")

In [45]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)
matchdf = pd.read_pickle("match09.pkl")
matchdf

Unnamed: 0,altname,wiki_title,gb1900text,gb1900lat,gb1900lon,wiki_lat,wiki_lon
0,Copinsay Lighthouse,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.8965,-2.67199
1,Deerness,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",ST. ANDREWS AND DEERNESS,58.9473,-2.87704,58.9329,-2.79425
2,Deerness,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",Deerness,58.9476,-2.75038,58.9329,-2.79425
3,Toab,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.9175,-2.80488
4,Tankerness Meal Mill,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.9614,-2.84581
5,Tankerness Fishing Station,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.9621,-2.83201
6,Tankerness,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.9692,-2.79546
7,,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.9692,-2.79546
8,Canniemyre,"List_of_listed_buildings_in_St_Andrews_And_Deerness,_Orkney",,,,58.9198,-2.85532
9,Gate Piers Balfour Castle,"List_of_listed_buildings_in_Shapinsay,_Orkney",,,,59.0311,-2.91036


### Close DB connection

In [None]:
# PostreSQL:
if(gazDB):
    cursorGaz.close()
    gazDB.close()