In [1]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import psycopg2
import psycopg2.extras
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from timeit import default_timer as timer
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Point
from descartes import PolygonPatch
from geopandas.tools import sjoin
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import itertools
import pyproj
import haversine
from scipy import spatial
import getpass
import os.path
import fiona
import json
import glob
import ast
import csv
import re
import itertools
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
import pandas as pd

### Start the gazetteer DB server

Make sure you change your credentials in `credentials.json`. These are used to connect to the gazetteer DB in PostgreSQL (via `psycopg2`):

In [2]:
# Load the database credentials from the JSON file next to the notebook.
# The name is bound to an empty dict first, then replaced by the parsed file.
credentials_config = {}
with open('./credentials.json') as f:
    credentials_config = json.load(f)

In [3]:
#credentials_config['lwmrelationaldb']['password'] = getpass.getpass(prompt='Enter your password: ')

In [4]:
# Construct connection string
# Space-separated key=value pairs taken from the 'lwmrelationaldb' entry of
# credentials_config; the database name is fixed to 'gazetteer'.
# NOTE(review): values are interpolated unquoted, so a password containing
# spaces would presumably break the string — confirm against the credentials used.
psql_conn_string =\
    "host={0} user={1} dbname={2} password={3} sslmode={4} sslrootcert={5}".format(
                credentials_config['lwmrelationaldb']['host'], 
                credentials_config['lwmrelationaldb']['user'], 
                'gazetteer', 
                credentials_config['lwmrelationaldb']['password'], 
                credentials_config['lwmrelationaldb']['sslmode'], 
                credentials_config['lwmrelationaldb']['sslrootcert'])

In [5]:
# Open the database connection described by psql_conn_string.
gazDB = psycopg2.connect(psql_conn_string) 
print("Connection established!")

# DictCursor is requested so that result rows can be read by column name
# (the query helpers below use r['id'], r['lat'], ...).
cursorGaz = gazDB.cursor(cursor_factory=psycopg2.extras.DictCursor)

Connection established!


### Read GB1900

In [6]:
# Load the GB1900 gazetteer export. The file is opened as UTF-16 text and the
# open handle is passed to pandas for parsing.
gb1900df = pd.DataFrame()
with open("./gb1900_gazetteer_complete_july_2018.csv", encoding='UTF-16') as f:
    gb1900df = pd.read_csv(f)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
gb1900df.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,


### Create an approximately British Wiki Gazetteer

In [8]:
start_time = timer()

In [9]:
def find_british_locations(cursorGaz, timer):
    """Fetch gazetteer locations inside a bounding box around the British Isles.

    The query joins ``location`` with ``inlinks`` and keeps rows with
    50 < lat < 62 and -14 < lon < 3.

    Parameters
    ----------
    cursorGaz : database cursor
        Cursor on the gazetteer DB; its rows must support access by column name.
    timer : callable
        Zero-argument clock. ``timer() - start_time`` is printed before the
        query runs, where ``start_time`` is a notebook-level variable.

    Returns
    -------
    pandas.DataFrame
        One row per location, with columns ``main_id``, ``wiki_title``,
        ``wiki_lat``, ``wiki_lon``, ``page_len``, ``type_loc``, ``population``.
    """
    print('Start locations SQL query: {} seconds'.format(timer() - start_time))

    cursorGaz.execute("""
            SELECT location.*, inlinks.inlinks FROM location
            JOIN inlinks ON inlinks.main_id=location.id
            WHERE lat > 50.0
            AND lat < 62.0
            AND lon > -14.0
            AND lon < 3.0
        """)
    rows = cursorGaz.fetchall()

    # Output column name -> column name in the SQL result
    # (dict order fixes the column order of the returned frame).
    column_sources = {
        'main_id': 'id',
        'wiki_title': 'wiki_title',
        'wiki_lat': 'lat',
        'wiki_lon': 'lon',
        'page_len': 'page_len',
        'type_loc': 'type',
        'population': 'population',
    }
    return pd.DataFrame(
        {out_col: [row[src_col] for row in rows]
         for out_col, src_col in column_sources.items()})

def gaz_to_geodataframe(df,
                        shapefile_path='gb1900_analysis/shapefiles/GBR_adm/GBR_adm0.shp',
                        pickle_path="brit_wikigazetteer.pkl"):
    """Keep only the locations that fall inside the boundary shapefile.

    Points are built from ``wiki_lon`` / ``wiki_lat`` and spatially joined
    against the polygons in ``shapefile_path``; rows matched to the first
    feature of the shapefile (``index_right == 0``) are kept. The result is
    also written to ``pickle_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Locations with ``wiki_lon`` and ``wiki_lat`` columns.
    shapefile_path : str, optional
        Boundary shapefile. Defaults to the path the notebook originally
        hard-coded.
    pickle_path : str, optional
        Where the filtered frame is pickled.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with a fresh 0..n-1 index. Empty (rather
        than raising ``KeyError``) when no point lies inside the boundary.
    """
    points = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df.wiki_lon, df.wiki_lat))
    boundary = gpd.GeoDataFrame.from_file(shapefile_path)

    # Left join: every point is kept; 'index_right' holds the row number of
    # the polygon containing it, or is missing when no polygon does.
    joined = sjoin(points, boundary, how='left')

    # The join result is labelled with df's own index labels, so select by
    # label (.loc) rather than by position.
    inside_labels = joined.index[(joined['index_right'] == 0).to_numpy()]

    # reset_index returns a new frame; the result has to be kept, otherwise
    # the original (gappy) labels survive.
    britdf = df.loc[inside_labels].reset_index(drop=True)
    britdf.to_pickle(pickle_path)

    return britdf

def find_british_altnames(cursorGaz, timer):
    """Fetch alternate names of locations inside the British Isles bounding box.

    Parameters
    ----------
    cursorGaz : database cursor
        Cursor on the gazetteer DB; its rows must support access by column name.
    timer : callable
        Zero-argument clock. ``timer() - start_time`` is printed before the
        query runs, where ``start_time`` is a notebook-level variable.

    Returns
    -------
    dict
        ``{altname id: (altname, source, main_id)}``. Alternate names of
        50 characters or more are left out.
    """
    print('Start altnames SQL query: {} seconds'.format(timer() - start_time))

    cursorGaz.execute("""
            SELECT altname.* FROM altname
            JOIN location ON location.id=altname.main_id
            WHERE lat > 50.0
            AND lat < 62.0
            AND lon > -14.0
            AND lon < 3.0
        """)

    return {
        row['id']: (row['altname'], row['source'], row['main_id'])
        for row in cursorGaz.fetchall()
        if len(row['altname']) < 50
    }

#### Locations dataframe

In [10]:
# Bounding-box query first, then restriction to the boundary shapefile
# (gaz_to_geodataframe also pickles its result to brit_wikigazetteer.pkl).
locdf = find_british_locations(cursorGaz, timer)
britdf = gaz_to_geodataframe(locdf)
#britdf.head()
# NOTE(review): the frame computed above is immediately replaced by the copy
# read back from the pickle — presumably a caching leftover; confirm which
# of the two is meant to be used.
britdf = pd.read_pickle("./brit_wikigazetteer.pkl")
britdf.head()

Start locations SQL query: 1.1958821369917132 seconds


DriverError: gb1900_analysis/shapefiles/GBR_adm/GBR_adm0.shp: No such file or directory

#### Altnames dataframe

In [None]:
dAltnames = find_british_altnames(cursorGaz, timer)

# dAltnames maps alt_id -> (altname, source, main_id); unpack it into one
# list per output column (all lists follow the dict's iteration order).
alt_id = list(dAltnames.keys())
altname = [entry[0] for entry in dAltnames.values()]
source = [entry[1] for entry in dAltnames.values()]
main_id = [entry[2] for entry in dAltnames.values()]

altdf = pd.DataFrame({
    'alt_id': alt_id,
    'main_id': main_id,
    'altname': altname,
    'source': source,
})

In [None]:
altdf.shape

In [None]:
# Collapse to one entry per location: all altnames sharing a main_id are
# joined into a single ', '-separated string.
# NOTE(review): this rebinds altdf from the per-altname frame to the grouped
# result, so the cell is not idempotent when re-run on its own — a new name
# for the grouped result would avoid that.
altdf = altdf.groupby("main_id")['altname'].apply(', '.join)
altdf.head()

#### Joint locations and altnames dataframe

In [None]:
# Attach the joined altnames to each location and drop locations without any.
# NOTE(review): the merge starts from locdf (bounding-box query only), not
# from the boundary-filtered britdf loaded earlier — confirm this is intended.
britdf = pd.merge(locdf, altdf, how='left', on='main_id')

# Reset the index after filtering. Later cells build positional arrays from
# britdf (coordinates, KD-tree results) and address its rows by number, which
# only lines up with the DataFrame labels when they run 0..n-1 without gaps.
# It also yields a fresh frame, so adding columns later does not write into
# a filtered slice.
britdf = britdf[britdf.altname.notnull()].reset_index(drop=True)
# britdf.head()

# Example of multiple altnames:
britdf[britdf['main_id'] == 20]

In [None]:
altdf.shape

In [None]:
# Convert the GB1900 longitude/latitude pairs (WGS84) to geocentric x, y, z
# coordinates, with height set to zero for every point. The x/y/z columns are
# what the KD-tree is built on further down.
# NOTE(review): pyproj.transform is deprecated in newer pyproj releases in
# favour of pyproj.Transformer — check against the installed version.
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           gb1900df["longitude"].to_numpy(), 
                           gb1900df["latitude"].to_numpy(), 
                           np.zeros(len(gb1900df["latitude"])), 
                           radians=False)

In [None]:
gb1900df["x"] = x
gb1900df["y"] = y
gb1900df["z"] = z

In [None]:
# Same geocentric conversion as for GB1900, applied to the Wikipedia
# coordinates (wiki_lon / wiki_lat), again with zero height.
# NOTE(review): this cell duplicates the GB1900 conversion cell except for the
# input columns — a small helper taking (lon, lat) would remove the copy.
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           britdf["wiki_lon"].to_numpy(), 
                           britdf["wiki_lat"].to_numpy(), 
                           np.zeros(len(britdf["wiki_lat"])), 
                           radians=False)

In [None]:
britdf["x"] = x
britdf["y"] = y
britdf["z"] = z

In [None]:
kdtree = spatial.cKDTree(gb1900df[["x", "y", "z"]].to_numpy())

In [None]:
wikix = britdf.iloc[0]['x']
wikiy = britdf.iloc[0]['y']
wikiz = britdf.iloc[0]['z']

In [None]:
num_neighbors = 5000
distance_upper_bound = 5000

In [None]:
# For every britdf location, look up at most `num_neighbors` GB1900 entries
# lying within `distance_upper_bound` of it (same units as the x/y/z columns).
# Row i of both result arrays belongs to the i-th row of britdf (by position).
# Slots with no neighbour in range come back from cKDTree.query with an
# infinite distance and an index equal to the number of points in the tree.
# NOTE(review): the results hold len(britdf) x num_neighbors entries each,
# which can be very large — check memory use before running on the full frame.
all_dists, all_indxs = kdtree.query(britdf[['x', 'y', 'z']].to_numpy(),
                                    k=num_neighbors,
                                    distance_upper_bound=distance_upper_bound)
print(all_dists)
print(all_indxs)

In [None]:
np.shape(all_indxs)

In [None]:
britdf.iloc[160000:160001]

In [None]:
all_dists[160000, 0:1000]

In [None]:
qindx = 160000

In [None]:
# Build the abbreviation -> expansion lookup from abbreviations.csv.
# Rows whose expansion contains a comma are skipped. An abbreviation cell of
# the form "A or B" registers every alternative with the same expansion.
dAbbrevs = dict()
with open('./abbreviations.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    next(csv_reader)  # skip the first row (presumably the header)
    for line in csv_reader:
        abbrev, fulltext = line[0], line[1]
        if "," in fulltext:
            continue
        # split() yields [abbrev] unchanged when " or " does not occur
        for ab in abbrev.split(" or "):
            dAbbrevs[ab] = fulltext

In [None]:
dAbbrevs

In [None]:
def _cleanup_one(element):
    """Normalise a single place-name string; None / NaN become ''."""
    # Missing values cannot be fed to re.sub; treat them as empty text.
    if element is None or (isinstance(element, float) and element != element):
        return ''

    # Drop punctuation: . , ( ) ' " : [ ] -
    text = re.sub(r'[\.,\(\)\'\":\[\]-]', '', element)
    # Drop filler words (whole words only, any case).
    for filler in (r'\bthe\b', r'\bof\b', r'\betc\b'):
        text = re.sub(filler, '', text, flags=re.IGNORECASE)
    text = re.sub(r'\&', 'and', text)
    # Collapse whitespace and strip BEFORE the street rule: removing
    # punctuation or filler words can leave a trailing blank, which would
    # otherwise hide a final "st" from the end-of-string anchor and turn it
    # into "saint".
    text = re.sub(r'\s+', ' ', text).strip()
    # A final "st" is a street; any other whole-word "st" is a saint.
    text = re.sub(r'\bst$', 'street', text, flags=re.IGNORECASE)
    text = re.sub(r'\bst\b', 'saint', text, flags=re.IGNORECASE)
    return text.lower()


def cleanup(corpus):
    """Normalise place-name strings before fuzzy matching.

    Each element goes through, in order:

    1. removal of the characters ``. , ( ) ' " : [ ] -``;
    2. removal of the whole words "the", "of" and "etc" (any case);
    3. ``&`` replaced by ``and``;
    4. whitespace runs collapsed to one space, outer whitespace stripped;
    5. a final "st" expanded to "street", any other whole-word "st" to "saint";
    6. lower-casing.

    Parameters
    ----------
    corpus : iterable
        Strings to normalise (a list, a pandas Series, ...). ``None`` and
        float NaN entries are mapped to the empty string.

    Returns
    -------
    list of str
        The normalised strings, in input order.
    """
    return [_cleanup_one(element) for element in corpus]

In [None]:
cleanup(["st this is a st mary st"])

In [None]:
gb1900df["text2match"] = cleanup(gb1900df['final_text'])
britdf["text2match"] = cleanup(britdf["altname"])

In [None]:
gb1900df.head()

In [None]:
britdf.head()

In [None]:
import importlib

import recordlinkage
importlib.reload(recordlinkage)

In [None]:

# Flat list of alternating entries: a comparison method, then its weight.
# String algorithms:
# 'jaro', 'jarowinkler', 'levenshtein', 'damerau_levenshtein',
# 'qgram', 'cosine', 'smith_waterman',
# 'longest_common_substring'
list_methods_weights = [
    'jarowinkler', 0,
    'levenshtein', 0,
    'damerau_levenshtein', 0,
    'qgram', 1,
    'cosine', 0,
    # Numeric comparison on the distance column: the entry is named
    # distance_METHOD (e.g. distance_linear, distance_gauss); the part after
    # the underscore is what gets passed on as the comparison method.
    'distance_gauss', 1,
]
# Even positions hold the method names, odd positions the weights.
list_methods = list_methods_weights[0::2]
list_weights = list_methods_weights[1::2]
print("List of methods: ", list_methods)
print("List of weights: ", list_weights)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Link each Wikipedia location to its best-scoring GB1900 neighbour
# (only the first rows of britdf are processed; see the break below).
for row_pos, (one_brit_index, one_brit) in enumerate(britdf.iterrows()):
    # all_dists / all_indxs were built from britdf's rows in order, so they
    # must be addressed by row position. The DataFrame label yielded by
    # iterrows() differs from the position as soon as rows have been filtered
    # out of britdf, and would then select another location's neighbours.
    if row_pos > 1000:
        break
    indexer = recordlinkage.Index()
    indexer.full()

    # The Wikipedia record carries a small constant distance; presumably so
    # that the numeric comparison below scores each neighbour by how far it
    # lies from the location.
    one_brit['distance'] = 0.1
    one_brit_df = pd.DataFrame(one_brit).T
    one_brit_df["distance"] = one_brit_df["distance"].astype(float)

    # Keep only the neighbour slots that are within range (unfilled slots
    # have an infinite distance and are dropped by this mask).
    in_range = all_dists[row_pos] <= distance_upper_bound
    # .copy(): a column is added below, so work on an independent frame
    # rather than on a selection of gb1900df.
    gb1900df_neighbours = gb1900df.iloc[all_indxs[row_pos, in_range]].copy()
    gb1900df_neighbours['distance'] = all_dists[row_pos, in_range].astype(float)
    if gb1900df_neighbours.empty:
        # Nothing within range: there is no candidate to compare against.
        continue

    candidate_links = indexer.index(one_brit_df, gb1900df_neighbours)

    compare_cl = recordlinkage.Compare()
    for imethod in list_methods:
        if 'distance' not in imethod:
            compare_cl.string('text2match', 'text2match', method=imethod, label=imethod)
        else:
            # 'distance_gauss' -> method 'gauss'
            compare_cl.numeric('distance', 'distance', method=imethod.split("_")[1],
                               offset=1000.0, scale=5000.0, missing_value=0.5, label=imethod)

    # The comparison vectors
    rl_features = compare_cl.compute(candidate_links, one_brit_df, gb1900df_neighbours)

    for imatch_grp, match_grp in rl_features.groupby(level=0):
        # After reset_index the pair index becomes the columns level_0
        # (britdf label) and level_1 (gb1900df label), and rows are 0..m-1.
        match_grp = match_grp.reset_index()

        # Weighted mean of the per-method scores.
        match_grp['overall_score'] = \
            match_grp.apply(lambda row:
                            np.sum([list_weights[i]*row[list_methods[i]] for i in range(len(list_methods))])/np.sum(list_weights),
                            axis=1)
        max_id = match_grp['overall_score'].idxmax()
        max_score = match_grp['overall_score'].max()
        # NOTE(review): -1000 lets every match through; it looks like a
        # placeholder for a real score threshold.
        if max_score > -1000:
            print("\n=============")
            print("Overall score                 : {}".format(round(match_grp.iloc[max_id].overall_score, 3)))
            print(np.round(np.array([match_grp.loc[max_id, i] for i in list_methods]), 3))
            print("Wikipedia (altname) title     : {}".format(one_brit["altname"]))
            print("GB1900 (final_text)           : {}".format(gb1900df.iloc[int(match_grp.iloc[max_id]['level_1'])].final_text))
                  
                  
                  

In [None]:
one_brit_df

In [None]:
gb1900df_neighbours

In [None]:
britdf.iloc[1]

In [None]:
gb1900df.iloc[151288]

### Exploratory first step

In [None]:
# Exploratory matching on a 100-row sample of britdf: for every alternate
# name of a location, list the GB1900 entries whose text contains that name
# and that lie within 0.1 degrees of the location in latitude and longitude.
# A name with no match still gets one row, with empty GB1900 fields.
match_columns = ['altname', 'wiki_title', 'gb1900text', 'gb1900lat',
                 'gb1900lon', 'wiki_lat', 'wiki_lon']
match_records = []
for index, row in britdf.iloc[180100:180200].iterrows():
    # The coordinate window does not depend on the altname, so it is computed
    # once per location. The latitude lower bound used to be "- 0.0", unlike
    # the other three bounds; the window is now symmetric.
    nearby = gb1900df[
        gb1900df['latitude'].between(row['wiki_lat'] - 0.1, row['wiki_lat'] + 0.1)
        & gb1900df['longitude'].between(row['wiki_lon'] - 0.1, row['wiki_lon'] + 0.1)
    ]
    for altnwiki in row['altname'].split(","):
        altnwiki = altnwiki.strip()
        if not altnwiki:
            # An empty needle would match every GB1900 entry.
            continue
        # Match on the individual altname (not the whole comma-joined string),
        # as literal text (names may contain regex metacharacters such as
        # parentheses), and count missing final_text as "no match".
        matches = nearby[nearby['final_text'].str.contains(
            altnwiki, case=False, regex=False, na=False)]
        for imatch, irow in matches.iterrows():
            match_records.append({
                'altname': altnwiki,
                'wiki_title': row['wiki_title'],
                'gb1900text': irow['final_text'],
                'gb1900lat': irow['latitude'],
                'gb1900lon': irow['longitude'],
                'wiki_lat': row['wiki_lat'],
                'wiki_lon': row['wiki_lon'],
            })
        if matches.empty:
            match_records.append({
                'altname': altnwiki,
                'wiki_title': row['wiki_title'],
                'gb1900text': '',
                'gb1900lat': '',
                'gb1900lon': '',
                'wiki_lat': row['wiki_lat'],
                'wiki_lon': row['wiki_lon'],
            })

# columns= keeps the schema (and column order) even when nothing was collected
matchdf = pd.DataFrame(match_records, columns=match_columns)

In [None]:
matchdf.to_pickle("match09.pkl")

In [None]:
# Lift pandas' display limits (columns, rows, cell width) so the whole match
# table is shown, then reload it from the pickle written in the previous cell.
# NOTE(review): newer pandas releases expect None rather than -1 for
# display.max_colwidth — check against the installed version.
# NOTE(review): these options are global and stay in effect for every later
# display in the kernel.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)
matchdf = pd.read_pickle("match09.pkl")
matchdf

### Close DB connection

In [None]:
# PostgreSQL: release the cursor first, then the connection itself.
if gazDB:
    cursorGaz.close()
    gazDB.close()