In [1]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import psycopg2
import psycopg2.extras
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from timeit import default_timer as timer
import geopandas as gpd
from shapely.geometry import shape
from shapely.geometry import Point
from descartes import PolygonPatch
from geopandas.tools import sjoin
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import itertools
import pyproj
import haversine
from scipy import spatial
import getpass
import os.path
import fiona
import json
import glob
import ast
import csv
import re
import itertools
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
import pandas as pd

### Start the gazetteer DB server

Make sure you change your credentials. This is to connect to the DB locally in MySQL:

In [2]:
# read Credentials from the credentials.json file
credentials_config = dict()
with open('./credentials.json') as f:
    credentials_config = json.load(f)

In [3]:
#credentials_config['lwmrelationaldb']['password'] = getpass.getpass(prompt='Enter your password: ')

In [4]:
# Construct connection string
psql_conn_string =\
    "host={0} user={1} dbname={2} password={3} sslmode={4} sslrootcert={5}".format(
                credentials_config['lwmrelationaldb']['host'], 
                credentials_config['lwmrelationaldb']['user'], 
                'gazetteer', 
                credentials_config['lwmrelationaldb']['password'], 
                credentials_config['lwmrelationaldb']['sslmode'], 
                credentials_config['lwmrelationaldb']['sslrootcert'])

In [5]:
gazDB = psycopg2.connect(psql_conn_string) 
print("Connection established!")

cursorGaz = gazDB.cursor(cursor_factory=psycopg2.extras.DictCursor)

Connection established!


### Read GB1900

In [6]:
gb1900df = pd.DataFrame()
with open("./gb1900_gazetteer_complete_july_2018.csv", encoding='UTF-16') as f:
    gb1900df = pd.read_csv(f)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
gb1900df.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,


### Create an approximately British Wiki Gazetteer

In [8]:
start_time = timer()

In [9]:
def find_british_locations(cursorGaz, timer):
    print('Start locations SQL query: {} seconds'.format(timer() - start_time))

    cursorGaz.execute("""
            SELECT * FROM location
            WHERE lat > 50.0
            AND lat < 62.0
            AND lon > -14.0
            AND lon < 3.0
        """)
    results = cursorGaz.fetchall()
    
    main_id = []
    wiki_title = []
    wiki_lat = []
    wiki_lon = []
    page_len = []
    type_loc = []
    population = []
    for r in results:
        main_id.append(r['id'])
        wiki_title.append(r['wiki_title'])
        page_len.append(r['page_len'])
        wiki_lat.append(r['lat'])
        wiki_lon.append(r['lon'])
        type_loc.append(r['type'])
        population.append(r['population'])

    df = pd.DataFrame(
        {'main_id' : main_id,
         'wiki_title': wiki_title,
         'wiki_lat': wiki_lat,
         'wiki_lon': wiki_lon,
         'page_len': page_len,
         'type_loc': type_loc,
         'population': population
        })
    return df

def gaz_to_geodataframe(df):
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df.wiki_lon, df.wiki_lat))

    poly  = gpd.GeoDataFrame.from_file('gb1900_analysis/shapefiles/GBR_adm/GBR_adm0.shp')
    point = gdf

    pointInPolys = sjoin(point, poly, how='left')
    grouped = pointInPolys.groupby('index_right')
    
    britdf = df.iloc[grouped.groups[0]]
    britdf.reset_index(drop=True)
    britdf.to_pickle("brit_wikigazetteer.pkl")
    
    return britdf

def find_british_altnames(cursorGaz, timer):
    print('Start altnames SQL query: {} seconds'.format(timer() - start_time))

    cursorGaz.execute("""
            SELECT altname.* FROM altname
            JOIN location ON location.id=altname.main_id
            WHERE lat > 50.0
            AND lat < 62.0
            AND lon > -14.0
            AND lon < 3.0
        """)
    results = cursorGaz.fetchall()
    
    dAltnames = dict()
    
    for r in results:
        if len(r['altname']) < 50:
            dAltnames[r['id']] = (r['altname'], r['source'], r['main_id'])

    return dAltnames

#### Locations dataframe

In [10]:
locdf = find_british_locations(cursorGaz, timer)
#britdf = gaz_to_geodataframe(locdf)
#britdf.head()
britdf = pd.read_pickle("./brit_wikigazetteer.pkl")
britdf.head()

Start locations SQL query: 0.020157377992291003 seconds


Unnamed: 0,wiki_id,wiki_title,wiki_lat,wiki_lon,geometry
0,20,Aberdeenshire,57.151,-2.123,POINT (-2.123 57.151)
1,21,A._A._Milne,51.09,0.107,POINT (0.107 51.09)
4,48,Abbotsford_House,55.5997,-2.78194,POINT (-2.78194 55.5997)
5,59,Aberdeen,57.15,-2.11,POINT (-2.11 57.15)
6,99,"Angus,_Scotland",56.6667,-2.91667,POINT (-2.91667 56.6667)


#### Altnames dataframe

In [11]:
dAltnames = find_british_altnames(cursorGaz, timer)

alt_id = []
main_id = []
altname = []
source = []
for r in dAltnames:
    main_id.append(dAltnames[r][2])
    alt_id.append(r)
    altname.append(dAltnames[r][0])
    source.append(dAltnames[r][1])

altdf = pd.DataFrame(
    {'alt_id' : alt_id,
     'main_id': main_id,
     'altname': altname,
     'source': source
    })

Start altnames SQL query: 6.559561098983977 seconds


In [12]:
altdf.shape

(224386, 4)

In [13]:
altdf = altdf.groupby("main_id")['altname'].apply(', '.join)
altdf.head()

main_id
20    Aberdeenshire, Aberdeen County, Swydd Aberdeen...
21                                          A. A. Milne
23                                Azincourt, Asincurtis
25    Achill Island, Acaill, Achill, Wyspa Achill, A...
48             Abbotsford House, Abbotsford, Clartyhole
Name: altname, dtype: object

#### Joint locations and altnames dataframe

In [14]:
britdf = pd.merge(locdf, altdf, how='left', on='main_id')
britdf = britdf[britdf.altname.notnull()]
# britdf.head()

# Example of multiple altnames:
britdf[britdf['main_id'] == 20]

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,altname
63940,20,Aberdeenshire,57.151,-2.123,30952,adm2nd,,"Aberdeenshire, Aberdeen County, Swydd Aberdeen..."


In [15]:
altdf.shape

(206134,)

In [16]:
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           gb1900df["longitude"].to_numpy(), 
                           gb1900df["latitude"].to_numpy(), 
                           np.zeros(len(gb1900df["latitude"])), 
                           radians=False)

In [17]:
gb1900df["x"] = x
gb1900df["y"] = y
gb1900df["z"] = z

In [18]:
ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')
lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
x, y, z = pyproj.transform(lla, ecef, 
                           britdf["wiki_lon"].to_numpy(), 
                           britdf["wiki_lat"].to_numpy(), 
                           np.zeros(len(britdf["wiki_lat"])), 
                           radians=False)

In [19]:
britdf["x"] = x
britdf["y"] = y
britdf["z"] = z

In [20]:
kdtree = spatial.cKDTree(gb1900df[["x", "y", "z"]].to_numpy())

In [21]:
wikix = britdf.iloc[0]['x']
wikiy = britdf.iloc[0]['y']
wikiz = britdf.iloc[0]['z']

In [22]:
num_neighbors = 5000
distance_upper_bound = 5000

In [23]:
all_dists, all_indxs = kdtree.query(britdf[['x', 'y', 'z']].to_numpy(), 
                                    k=num_neighbors, 
                                    distance_upper_bound=distance_upper_bound)
print(all_dists)
print(all_indxs)
"""
for ind in all_indxs:
    print()
"""

[[ 25.52332013  64.86583931  82.02319054 ...          inf          inf
           inf]
 [ 27.70722677 115.48635185 117.68556333 ...          inf          inf
           inf]
 [ 60.34202469  74.3805534  101.32354039 ...          inf          inf
           inf]
 ...
 [         inf          inf          inf ...          inf          inf
           inf]
 [         inf          inf          inf ...          inf          inf
           inf]
 [         inf          inf          inf ...          inf          inf
           inf]]
[[ 151327  151331  151340 ... 2552459 2552459 2552459]
 [ 151288  231738  231737 ... 2552459 2552459 2552459]
 [ 323101  231865  323102 ... 2552459 2552459 2552459]
 ...
 [2552459 2552459 2552459 ... 2552459 2552459 2552459]
 [2552459 2552459 2552459 ... 2552459 2552459 2552459]
 [2552459 2552459 2552459 ... 2552459 2552459 2552459]]


'\nfor ind in all_indxs:\n    print()\n'

In [24]:
np.shape(all_indxs)

(206134, 5000)

In [25]:
britdf.iloc[160000:160001]

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,altname,x,y,z
161604,931353,Gokewell_Priory,53.5811,-0.580901,1146,mountain,,Gokewell Priory,3794639.0,-38473.779895,5109201.0


In [26]:
all_dists[160000, 0:1000]

array([ 173.35536901,  214.39066138,  241.76762079,  490.95110395,
        583.13424777,  620.8630507 ,  707.58446613,  750.23697501,
        784.90156888,  833.29580284,  958.30419265,  976.68347796,
        981.46100176, 1008.53842387, 1087.27340403, 1219.79935363,
       1224.8378432 , 1229.29652457, 1242.70706693, 1245.20754843,
       1245.26788997, 1275.9731967 , 1380.55732223, 1384.97086143,
       1478.76681487, 1484.16840743, 1499.80656503, 1509.68456288,
       1524.01369512, 1525.93324178, 1526.35104815, 1535.00004561,
       1548.67625682, 1560.94471463, 1563.44508144, 1581.62153865,
       1595.11075907, 1601.91853774, 1613.98909252, 1632.99767298,
       1637.24716534, 1637.99665557, 1663.63048035, 1685.21054265,
       1691.80819175, 1708.89656475, 1746.02688125, 1747.48543838,
       1750.69532623, 1778.38139785, 1787.64510672, 1802.83429904,
       1826.52276753, 1835.00936609, 1836.4615892 , 1847.71455192,
       1860.73731872, 1862.28109927, 1866.59393338, 1867.10713

In [27]:
qindx = 160000

In [28]:
def cleanup(corpus):
    #corpus = [re.sub(r'[\.,\(\)\'&:\[\]]', '', element,
    #                 flags=re.IGNORECASE) for element in corpus]
    #corpus = [re.sub(r'\bthe\b', '', element, flags=re.IGNORECASE)
    #          for element in corpus]
    #corpus = [re.sub(r'\band\b', '', element, flags=re.IGNORECASE)
    #          for element in corpus]
    #corpus = [re.sub(r'[\|-]', ' ', element, flags=re.IGNORECASE)
    #          for element in corpus]
    corpus = [re.sub(r'[\.,\(\)\'\":\[\]-]', '', element,
                     flags=re.IGNORECASE) for element in corpus]
    corpus = [re.sub(r'\bthe\b', '', element, flags=re.IGNORECASE)
              for element in corpus]
    corpus = [re.sub(r'\betc\b', '', element, flags=re.IGNORECASE)
              for element in corpus]
    corpus = [re.sub(r'\&', 'and', element, flags=re.IGNORECASE)
              for element in corpus]
    corpus = [re.sub(r'\s+', ' ', element, flags=re.IGNORECASE)
              for element in corpus]
    corpus = [re.sub(r'\bst\b', 'saint', element, flags=re.IGNORECASE)
              for element in corpus]
    corpus = [re.sub(r'\bst\b', 'saint', element, flags=re.IGNORECASE)
              for element in corpus]
    corpus = [element.strip() for element in corpus]
    corpus = [element.lower() for element in corpus]
    
    return corpus

In [29]:
gb1900df["text2match"] = cleanup(gb1900df['final_text'])
britdf["text2match"] = cleanup(britdf["altname"])

In [30]:
gb1900df.head()

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes,x,y,z,text2match
0,52b34d8b695fe90005004e1e,F. P.,Wales,Powys,Llansilin,320836.712742,327820.182715,52.84205,-3.176744,,3854770.0,-213945.645048,5059946.0,f p
1,5800a6b92c66dcab3d061796,Parly. & Munl Boro. By.,England,City of London,,531794.825962,180705.741898,51.509918,-0.102246,,3977779.0,-7098.456811,4969049.0,parly and munl boro by
2,5800a6782c66dcab3d061786,S. Ps.,England,City of London,,531736.217116,180725.02773,51.510105,-0.103083,,3977762.0,-7156.526053,4969062.0,s ps
3,57f684f42c66dcab3d01c0dd,Southwark Bridge Stairs,England,City of London,,532199.584123,180696.934434,51.509744,-0.09642,,3977794.0,-6694.027028,4969037.0,southwark bridge stairs
4,57f685002c66dcab3d01c0e9,St. Paul's Pier,England,City of London,,531987.486097,180745.664556,51.510232,-0.099456,,3977752.0,-6904.747577,4969071.0,saint pauls pier


In [31]:
britdf.head()

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,altname,x,y,z,text2match
0,1188030,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9774,-1.76524,87691,landmark,,The Rectory,3666913.0,-113010.602692,5199940.0,rectory
1,1188031,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9396,-1.59225,87691,landmark,,Underhill,3670682.0,-102034.526035,5197524.0,underhill
2,1188032,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9609,-1.60517,87691,landmark,,Walker Terrace,3668718.0,-102807.873722,5198886.0,walker terrace
4,1188035,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.974,-1.58059,87691,landmark,,Byker Neighbourhood Estate Office,3667568.0,-101201.039814,5199723.0,byker neighbourhood estate office
5,1188037,Grade_II*_listed_buildings_in_Tyne_and_Wear,54.9745,-1.57448,87691,landmark,,Lychgate Infront of St Lawrence s School,3667533.0,-100808.678779,5199755.0,lychgate infront of saint lawrence s school


In [32]:
import importlib

import recordlinkage
importlib.reload(recordlinkage)

<module 'recordlinkage' from '/Users/khosseini/anaconda3/envs/py37torch/lib/python3.6/site-packages/recordlinkage/__init__.py'>

In [49]:

# In the following list, first specify the method then the weight:
# Algorithms: 
# 'jaro','jarowinkler', 'levenshtein', 'damerau_levenshtein', 
# 'qgram', 'cosine', 'smith_waterman'
# 'longest_common_substring', 
list_methods_weights = ['jarowinkler', 0,
                        'damerau_levenshtein', 0,
                        'qgram', 1,
                        'cosine', 0,
                        # For numeric comparisons (see the figure above), the format is:
                        # date_METHOD, e.g.:
                        # date_linear
                        # date_gauss
                        'distance_gauss', 1
                       ]
list_methods = []
list_weights = []
for i, ilm in enumerate(list_methods_weights):
    if i % 2 == 0:
        list_methods.append(ilm)
    else:
        list_weights.append(ilm)
print("List of methods: ", list_methods)
print("List of weights: ", list_weights)

List of methods:  ['jarowinkler', 'damerau_levenshtein', 'qgram', 'cosine', 'distance_gauss']
List of weights:  [0, 0, 1, 0, 1]


In [50]:
import warnings
warnings.filterwarnings("ignore")

In [51]:
for one_brit_index, one_brit in britdf.iterrows():
    #print(one_brit_index)
    if one_brit_index > 1000:
        break
    indexer = recordlinkage.Index()
    indexer.full()
    
    one_brit['distance'] = 0.1
    #one_brit['distance'] = one_brit['distance'].astype(float)
    one_brit_df = pd.DataFrame(one_brit).T
    one_brit_df["distance"] = one_brit_df["distance"].astype(float)
    
    gb1900df_neighbours = gb1900df.iloc[all_indxs[one_brit_index, 
                                                  all_dists[one_brit_index] <= distance_upper_bound]]
    gb1900df_neighbours['distance'] = list(all_dists[one_brit_index, all_dists[one_brit_index] <= distance_upper_bound])
    gb1900df_neighbours['distance'] = gb1900df_neighbours['distance'].astype(float)
    
    candidate_links = indexer.index(one_brit_df, gb1900df_neighbours)

    compare_cl = recordlinkage.Compare()
    for imethod in list_methods:
        if not 'distance' in imethod:
            compare_cl.string('text2match', 'text2match', method=imethod, label=imethod)
        #else:
        #    compare_cl.geo('wiki_lat', 'wiki_lon', "latitude", "longitude",
        #                       method=imethod.split("_")[1], 
        #                       offset=500.0, scale=1000.0, missing_value=0.5, label=imethod)
        else:
            compare_cl.numeric('distance', 'distance', method=imethod.split("_")[1], 
                               offset=1000.0, scale=5000.0, missing_value=0.5, label=imethod)


    # The comparison vectors
    rl_features = compare_cl.compute(candidate_links, one_brit_df, gb1900df_neighbours)
    
    for imatch_grp, match_grp in rl_features.groupby(level=0):
        match_grp = match_grp.reset_index()
    
        match_grp['overall_score'] = \
            match_grp.apply(lambda row: 
                            np.sum([list_weights[i]*row[list_methods[i]] for i in range(len(list_methods))])/np.sum(list_weights),
                            axis=1)
        max_id = match_grp['overall_score'].idxmax()
        max_score = match_grp['overall_score'].max()
        if max_score > 0.9:
            print("\n=============")
            print("Overall score                 : {}".format(round(match_grp.iloc[max_id].overall_score, 3)))
            print(np.round(np.array([match_grp.loc[max_id, i] for i in list_methods]), 3))
            print("Wikipedia (altname) title     : {}".format(one_brit["altname"]))
            print("GB1900 (final_text)           : {}".format(gb1900df.iloc[int(match_grp.iloc[max_id]['level_1'])].final_text))
                  
                  
                  


Overall score                 : 0.913
[1.    1.    1.    1.    0.825]
Wikipedia (altname) title     : The Rectory
GB1900 (final_text)           : Rectory

Overall score                 : 1.0
[1. 1. 1. 1. 1.]
Wikipedia (altname) title     : Underhill
GB1900 (final_text)           : Underhill

Overall score                 : 0.961
[1.    1.    1.    1.    0.923]
Wikipedia (altname) title     : South Lodge
GB1900 (final_text)           : South Lodge

Overall score                 : 0.938
[1.    1.    1.    1.    0.875]
Wikipedia (altname) title     : The Red House
GB1900 (final_text)           : Red House

Overall score                 : 0.923
[1.    1.    1.    1.    0.846]
Wikipedia (altname) title     : Emmanuel Church
GB1900 (final_text)           : Emmanuel Church

Overall score                 : 0.924
[0.933 0.9   0.848 0.894 1.   ]
Wikipedia (altname) title     : Moat House
GB1900 (final_text)           : Boat House

Overall score                 : 0.921
[1.    1.    1.    1.    0

In [45]:
one_brit_df

Unnamed: 0,main_id,wiki_title,wiki_lat,wiki_lon,page_len,type_loc,population,altname,x,y,z,text2match,distance
10,1188043,Grade_II*_listed_buildings_in_Tyne_and_Wear,55.0352,-1.61272,87691,landmark,,Gosforth House,3661930.0,-103101,5203630.0,gosforth house,0.1


In [52]:
gb1900df_neighbours

Unnamed: 0,pin_id,final_text,nation,local_authority,parish,osgb_east,osgb_north,latitude,longitude,notes,x,y,z,text2match,distance
1889820,58e0e6832c66dcf8fa097757,Edmond's Farm,England,Mid Sussex,Balcombe,532523.578273,130393.437694,51.057598,-0.110400,,4.017041e+06,-7740.200357,4.937575e+06,edmonds farm,33.729218
1584236,58e0e6772c66dcf8fa097750,Springs,England,Mid Sussex,Balcombe,532558.322500,130300.160758,51.056752,-0.109938,,4.017115e+06,-7707.996047,4.937515e+06,springs,105.171953
1583539,58e0e66c2c66dcf8fa09774c,pearment's farm,England,Mid Sussex,Balcombe,532694.138114,130284.889017,51.056583,-0.108007,,4.017130e+06,-7572.624148,4.937504e+06,pearments farm,224.213338
1583538,58e0e6592c66dcf8fa097741,westhill farm,England,Mid Sussex,Ardingly,532816.864256,130369.081110,51.057311,-0.106226,,4.017067e+06,-7447.638518,4.937555e+06,westhill farm,325.207259
1584235,58e0e5da2c66dcf8fa097700,Five Acre Wood,England,Mid Sussex,Balcombe,532563.574714,130711.097675,51.060444,-0.109713,,4.016795e+06,-7691.587729,4.937774e+06,five acre wood,337.121402
1583537,58e0e6392c66dcf8fa09772d,little wood,England,Mid Sussex,Ardingly,532828.449523,130503.689182,51.058518,-0.106012,,4.016962e+06,-7432.401070,4.937639e+06,little wood,357.972735
1583555,58e0e64d2c66dcf8fa09773c,little westbrook wood,England,Mid Sussex,Ardingly,532857.772037,130402.022743,51.057598,-0.105631,,4.017042e+06,-7405.845151,4.937575e+06,little westbrook wood,366.438102
1583540,58e0e6a12c66dcf8fa097762,west hill,England,Mid Sussex,Balcombe,532733.547750,130683.575507,51.060157,-0.107299,,4.016820e+06,-7522.398378,4.937754e+06,west hill,386.687921
1583536,58e0e5ef2c66dcf8fa097707,great westbrook wood,England,Mid Sussex,Ardingly,532810.337435,130681.422884,51.060120,-0.106205,,4.016824e+06,-7445.683624,4.937751e+06,great westbrook wood,437.316944
1584240,58e0eb0c2c66dcf8fa097ac7,Great Burrow Wood,England,Mid Sussex,Balcombe,532058.427658,130239.709041,51.056323,-0.117089,,4.017151e+06,-8209.425571,4.937485e+06,great burrow wood,456.265604


In [None]:
britdf.iloc[1]

In [None]:
gb1900df.iloc[151288]

### Exploratory first step

In [None]:
l_altname = []
l_wiki_title = []
l_gb1900text = []
l_gb1900lat = []
l_gb1900lon = []
l_wikilat = []
l_wikilon = []
for index, row in britdf.iloc[180100:180200].iterrows():
    for altnwiki in row['altname'].split(","):
        altnwiki = altnwiki.strip()
        matches = gb1900df[(gb1900df['final_text'].str.contains(row['altname'], case=False)) & (gb1900df['latitude'] >= row['wiki_lat'] - 0.0) & (gb1900df['latitude'] <= row['wiki_lat'] + 0.1) & (gb1900df['longitude'] >= row['wiki_lon'] - 0.1) & (gb1900df['longitude'] <= row['wiki_lon'] + 0.1)]
        for imatch, irow in matches.iterrows():
            l_altname.append(altnwiki)
            l_wiki_title.append(row['wiki_title'])
            l_gb1900text.append(irow['final_text'])
            l_gb1900lat.append(irow['latitude'])
            l_gb1900lon.append(irow['longitude'])
            l_wikilat.append(row['wiki_lat'])
            l_wikilon.append(row['wiki_lon'])
        if matches.empty:
            l_altname.append(altnwiki)
            l_wiki_title.append(row['wiki_title'])
            l_gb1900text.append('')
            l_gb1900lat.append('')
            l_gb1900lon.append('')
            l_wikilat.append(row['wiki_lat'])
            l_wikilon.append(row['wiki_lon'])

matchdf = pd.DataFrame(
        {'altname' : l_altname,
         'wiki_title': l_wiki_title,
         'gb1900text': l_gb1900text,
         'gb1900lat': l_gb1900lat,
         'gb1900lon': l_gb1900lon,
         'wiki_lat': l_wikilat,
         'wiki_lon': l_wikilon
        })

In [None]:
matchdf.to_pickle("match09.pkl")

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)
matchdf = pd.read_pickle("match09.pkl")
matchdf

### Close DB connection

In [None]:
# PostreSQL:
if(gazDB):
    cursorGaz.close()
    gazDB.close()