# GeoTagger - Spacy

In [1]:
# Import necessary libraries.
import re, warnings, urllib, requests, spacy, geopy, folium, os, sys, glob
import pandas as pd
import numpy as np
from collections import Counter
from geopy.extra.rate_limiter import RateLimiter

# Import project-specific functions.
# The helper modules are looked up in the sibling ../Scripts folder, resolved
# against the notebook's current working directory, so that folder is added to
# the import path before Correspondence_XML_parser is imported.
lib_path = os.path.abspath(os.path.join('..', 'Scripts'))

# Guard against piling up duplicate entries when this cell is re-run.
if lib_path not in sys.path:
    sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Load spaCy's small English pipeline. get_placenames() runs it on each text
# and reads the recognized entities from the resulting doc.
nlp = spacy.load('en_core_web_sm')

# Suppress warnings for the rest of the session.
# NOTE: with no category argument this hides every warning, not only the
# deprecation notices it was added for.
warnings.filterwarnings('ignore')

## Get XML Files

In [2]:
%%time

# Root directory that the data path below is resolved against.
# Set the GEOTAGGER_DATA_DIR environment variable to run on another machine;
# the original author's location remains the default.
abs_dir = os.environ.get("GEOTAGGER_DATA_DIR", "/Users/quinn.wi/Documents/")

# Glob pattern (relative to abs_dir) matching the XML files to process.
input_directory = "Data/PSC/Taney/RBT_RawXML/*/*.xml"

# Gather all .xml files using glob.
# os.path.join works whether or not abs_dir ends with a separator, and sorting
# makes the file order (and therefore the dataframe row order) reproducible,
# since glob itself returns matches in arbitrary order.
files = sorted(glob.glob(os.path.join(abs_dir, input_directory)))

CPU times: user 1.44 ms, sys: 1.83 ms, total: 3.27 ms
Wall time: 1.82 ms


In [3]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )
    
# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if 'jqa/' in i]

# len(files)

## Build Dataframe

In [4]:
%%time

# Build one dataframe from the local XML files gathered in `files`.
# build_dataframe() comes from the Correspondence_XML_parser module
# (star-imported in the first cell).
# Alternate call kept for reference; it needs url, user and pw to be defined:
# df = build_dataframe(files, url, user, pw)
df = build_dataframe(files)

# Preview the first three rows.
df.head(3)

CPU times: user 70 ms, sys: 9.75 ms, total: 79.7 ms
Wall time: 78.8 ms


Unnamed: 0,file,date,source,target,subjects,references,text
0,RBT00099-verification.xml,1833-05-05,RBT,Ellicott-Thomas,,"jackson-andrew,kendall-amos,mickle-robert,tane...",Washington May 5. 1833My Dear Sir I received y...
1,RBT00146-verification.xml,1834-03-30,RBT,ellicott-thomas,,mccubbin-george,Washington March 30. 1834My Dear Sir I have on...
2,RBT01364-verification.xml,1833-08-05,RBT,jackson-andrew,,,Washington Augt. 5. 1833 My Dear Sir After ref...


## Get Place Names

In [5]:
%%time

def get_placenames(text):
    """Return the place names spaCy recognizes in ``text``.

    Parameters
    ----------
    text : str
        Text to run through the module-level ``nlp`` pipeline. Missing values
        (``None`` or NaN) are treated as containing no places.

    Returns
    -------
    list of str
        The text of every entity labelled ``LOC`` or ``GPE``, in document
        order. Duplicates are kept.
    """
    # A missing value would make the pipeline raise and abort the whole
    # DataFrame.apply() call, so it is reported as "no places" instead.
    if text is None or (isinstance(text, float) and text != text):
        return []

    place_labels = {'LOC', 'GPE'}
    return [ent.text for ent in nlp(text).ents if ent.label_ in place_labels]
    
# Tag every text, keep only the identifying columns alongside the extracted
# place names, then unpack each list so that every place gets its own row.
place_columns = ['file', 'date', 'places']
df = (
    df.assign(places = df['text'].apply(get_placenames))
      .loc[:, place_columns]
      .explode('places')
)

df.head(3)

CPU times: user 5.88 s, sys: 180 ms, total: 6.07 s
Wall time: 6.13 s


Unnamed: 0,file,date,places
0,RBT00099-verification.xml,1833-05-05,Washington
0,RBT00099-verification.xml,1833-05-05,Fredericksburg
0,RBT00099-verification.xml,1833-05-05,Banking


## GeoCode Places

In [6]:
%%time

# https://www.natasshaselvaraj.com/a-step-by-step-guide-on-geocoding-in-python/
def geocode(place, user_agent = 'GeoTagger-Spacy-notebook', timeout = 10):
    """Look up a place name with OpenStreetMap's Nominatim search API.

    Parameters
    ----------
    place : str
        Free-text place name. Missing values (``None``/NaN) and blank strings
        are not sent to the service.
    user_agent : str, optional
        Value of the ``User-Agent`` header sent with the request, so the
        request identifies this application rather than the HTTP library.
    timeout : float, optional
        Seconds to wait for the service before giving up.

    Returns
    -------
    tuple of (float, float) or None
        ``(lat, lon)`` of the first search hit, or ``None`` when there is
        nothing to search for or the service returns no hits.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, HTTP error statuses, or a response
        body that is not JSON.
    """
    # NaN reaches this function when a text produced no places; str(NaN) would
    # otherwise be sent to the service as the query "nan".
    if place is None or (isinstance(place, float) and place != place):
        return None
    query = str(place).strip()
    if not query:
        return None

    # Pass the query as a parameter so requests percent-encodes it; names
    # containing '/', '?', '#' or '&' would corrupt a hand-built URL.
    response = requests.get(
        'https://nominatim.openstreetmap.org/search',
        params = {'q': query, 'format': 'json'},
        headers = {'User-Agent': user_agent},
        timeout = timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    hits = response.json()
    if not hits:
        return None

    # Default (hits[0]): select first search hit in OpenStreetMap.
    return (float(hits[0]['lat']), float(hits[0]['lon']))

# Geocode each distinct place name once instead of once per row: the same
# names recur across many rows, and every lookup is a network round trip.
# RateLimiter spaces the calls at least one second apart so the public
# geocoding service is not flooded.
throttled_geocode = RateLimiter(geocode, min_delay_seconds = 1)
unique_places = df['places'].dropna().unique()
coordinate_lookup = {place: throttled_geocode(place) for place in unique_places}

# Rows without a place (NaN) or without a search hit get None here.
df['coordinates'] = df['places'].apply(coordinate_lookup.get)

# Split the (lat, lon) tuples into two float columns. Misses are padded with
# NaN explicitly so the split does not depend on the first row being a hit and
# also works on an empty frame.
lat_lon = np.array(
    [pair if isinstance(pair, tuple) else (np.nan, np.nan) for pair in df['coordinates']],
    dtype = float,
).reshape(-1, 2)
df['lat'] = lat_lon[:, 0]
df['lon'] = lat_lon[:, 1]

# Drop every row that has a missing value in any column (this includes rows
# with no place or no geocoding hit).
df = df.dropna()

df.head(3)

CPU times: user 5.81 s, sys: 448 ms, total: 6.25 s
Wall time: 4min 33s


Unnamed: 0,file,date,places,coordinates,lat,lon
0,RBT00099-verification.xml,1833-05-05,Washington,"(38.8950368, -77.0365427)",38.895037,-77.036543
0,RBT00099-verification.xml,1833-05-05,Fredericksburg,"(38.3031837, -77.4605399)",38.303184,-77.46054
0,RBT00099-verification.xml,1833-05-05,Banking,"(53.1914099, -4.4942865)",53.19141,-4.494287


## Save Data

In [7]:
%%time

# Resolve the output location against the current working directory, then
# write the geocoded rows as comma-separated text without the index column.
output_path = os.path.abspath('../../lab_space/projects/taney/geo/data/taney_geoReference.csv')
df.to_csv(output_path, sep = ',', index = False)

CPU times: user 5.05 ms, sys: 2.05 ms, total: 7.1 ms
Wall time: 6.46 ms
