# GeoTagger - Spacy

In [1]:
# Import necessary libraries.
import re, warnings, urllib, requests, spacy, geopy, folium, os, sys, glob
import pandas as pd
import numpy as np
from collections import Counter
from geopy.extra.rate_limiter import RateLimiter

# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('JQA_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

from JQA_XML_parser import *

nlp = spacy.load('en_core_web_sm')

# Ignore warnings related to deprecated functions.
warnings.filterwarnings('ignore')

In [2]:
text = "Several months of ill health detained me at Braintree, and wholly disqualified me for the use of the pen; after attending the Session of the supreme judicial Court at Boston in February, I return’d to this Town, on the 26th: day of March, since which my health has been gradually restored; and is now almost as good as it was previous to my sickness."

doc = nlp(text)

spacy.displacy.serve(doc, style='ent')


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [24/May/2022 09:15:40] "GET / HTTP/1.1" 200 2724
127.0.0.1 - - [24/May/2022 09:15:40] "GET /favicon.ico HTTP/1.1" 200 2724


Shutting down server on port 5000.


## Get XML Files

In [6]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/Data"
files = glob.glob(abs_dir + "/PSC/JQA/*/*.xml")

len(files)

CPU times: user 3.52 ms, sys: 4.52 ms, total: 8.05 ms
Wall time: 14.3 ms


762

## Build Dataframe

In [7]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

df.head(3)

CPU times: user 3.38 ms, sys: 3.83 ms, total: 7.2 ms
Wall time: 5.92 ms


762

## Get Place Names

In [2]:
%%time

def get_placenames(text):
    doc = nlp(text)
    places = [ent.text for ent in doc.ents if ent.label_ in ['LOC', 'GPE']]
    return places
    
df['places'] = df['text'].apply(lambda x: get_placenames(x))

df = df[['entry', 'date', 'places']]
df = df.explode('places')

print (df.shape)
df.head(3)

CPU times: user 1min 7s, sys: 1.56 s, total: 1min 9s
Wall time: 1min 9s


Unnamed: 0,entry,date,places
0,jqadiaries-v49-1825-01-01,1825-01-01,H.R.U.S.
0,jqadiaries-v49-1825-01-01,1825-01-01,Bolivar
0,jqadiaries-v49-1825-01-01,1825-01-01,South America


## GeoCode Places

In [3]:
%%time

# https://www.natasshaselvaraj.com/a-step-by-step-guide-on-geocoding-in-python/
def geocode(place):
#     url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(place) +'?format=json'
    url = 'https://nominatim.openstreetmap.org/search/' + str(place) + '?format=json'
    response = requests.get(url).json()
    if (len(response) != 0):
#         Default (response[0]): select first search hit in OpenStreetMap.
        return (float(response[0]['lat']), float(response[0]['lon']))
    else:
        return None 

df['coordinates'] = df['places'].apply(geocode)
df[['lat', 'lon']] = pd.DataFrame(df['coordinates'].tolist(), index = df.index)

# Convert to floats.
df['lat'] = df['lat'].apply(lambda x: float(x))
df['lon'] = df['lon'].apply(lambda x: float(x))

df = df.dropna()

df.head(3)

CPU times: user 2min 23s, sys: 10.4 s, total: 2min 33s
Wall time: 1h 52min 5s


Unnamed: 0,entry,date,places,coordinates,lat,lon
0,jqadiaries-v49-1825-01-01,1825-01-01,H.R.U.S.,"(3.4134276999999997, 101.56854987461735)",3.413428,101.56855
0,jqadiaries-v49-1825-01-01,1825-01-01,Bolivar,"(6.333333, -63.5)",6.333333,-63.5
0,jqadiaries-v49-1825-01-01,1825-01-01,South America,"(-21.0002179, -61.0006565)",-21.000218,-61.000656


### Save Results

In [4]:
%%time

df.to_csv(os.path.abspath('../../lab_space/projects/jqa/jqa-geoReference.csv', sep = ',', index = False)

CPU times: user 75.3 ms, sys: 4.56 ms, total: 79.8 ms
Wall time: 83.4 ms
