# GeoTagger - Spacy

In [1]:
# Import necessary libraries.
import re, warnings, urllib, requests, spacy, geopy, folium, os, sys, glob
import pandas as pd
import numpy as np
from collections import Counter
from geopy.extra.rate_limiter import RateLimiter

# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

from Correspondence_XML_parser import *

nlp = spacy.load('en_core_web_sm')

# Ignore warnings related to deprecated functions.
warnings.filterwarnings('ignore')

## Get XML Files

In [2]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/Data"
files = glob.glob(abs_dir + "/PSC/Sedgwick/*.xml")

len(files)

CPU times: user 1.14 ms, sys: 1.22 ms, total: 2.36 ms
Wall time: 1.35 ms


122

## Build Dataframe

In [3]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1803-10-06-toPamelaDwightSedgwickF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1809-01-27-toTheodoreSedgwickIFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-25-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1806-01-17-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-29-toPamelaDwightSedgwickFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFSWF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1800-01-12-toTheodoreSedgwickIF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-15-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-28-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-03-24-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/

Unnamed: 0,file,date,source,target,subjects,references,text
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Catharine Maria Sedgwick,sedgwick-robert,,"sedgwick-charles,sedgwick-elizabeth,sedgwick-h...",Albany March 8' 1819 -- I came here my dear Ro...
1,CMS1816-03-25-toFrancesSedgwickWatsonF.xml,1816-03-25,Catharine Maria Sedgwick,FSW,,"RSI,banyer-maria,jay-sarah,van vechten-jacob,s...",Albany March 25th 1816 I have just heard of an...
2,CMS1813-08-15-toRobertSedgwickIF.xml,1813-08-15,Catharine Maria Sedgwick,RSI,,"FSW,U,payne-eloise,warner-thomas,warner-france...",Stockbridge August 15th 1813 I recollect very...


## Get Place Names

In [4]:
%%time

def get_placenames(text):
    doc = nlp(text)
    places = [ent.text for ent in doc.ents if ent.label_ in ['LOC', 'GPE']]
    return places
    
df['places'] = df['text'].apply(lambda x: get_placenames(x))

df = df[['file', 'date', 'places']]
df = df.explode('places')

df.head(3)

CPU times: user 7.23 s, sys: 238 ms, total: 7.47 s
Wall time: 7.5 s


Unnamed: 0,file,date,places
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Albany
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Northampton
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Boston


## GeoCode Places

In [5]:
%%time

# https://www.natasshaselvaraj.com/a-step-by-step-guide-on-geocoding-in-python/
def geocode(place):
#     url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(place) +'?format=json'
    url = 'https://nominatim.openstreetmap.org/search/' + str(place) + '?format=json'
    response = requests.get(url).json()
    if (len(response) != 0):
#         Default (response[0]): select first search hit in OpenStreetMap.
        return (float(response[0]['lat']), float(response[0]['lon']))
    else:
        return None 

df['coordinates'] = df['places'].apply(geocode)
df[['lat', 'lon']] = pd.DataFrame(df['coordinates'].tolist(), index = df.index)

# Convert to floats.
df['lat'] = df['lat'].apply(lambda x: float(x))
df['lon'] = df['lon'].apply(lambda x: float(x))

df = df.dropna()

df.head(3)

CPU times: user 3.25 s, sys: 218 ms, total: 3.47 s
Wall time: 4min 14s


Unnamed: 0,file,date,places,coordinates,lat,lon
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Albany,"(41.000028, 19.9999619)",41.000028,19.999962
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Northampton,"(52.23433665, -0.9028072768185829)",52.234337,-0.902807
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Boston,"(42.3602534, -71.0582912)",42.360253,-71.058291


### Save Results

In [6]:
%%time

df.to_csv(os.path.abspath('../../lab_space/projects/sedgwick/geo/data/sedgwick_geoReference.csv'), 
                          sep = ',', index = False)

CPU times: user 4.77 ms, sys: 2.41 ms, total: 7.18 ms
Wall time: 9.13 ms
