# Richards -- Syntactic Dependencies

In [1]:
import re, json, glob, csv, sys, os, warnings, spacy, nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load spaCy's small English model; it supplies the part-of-speech tags,
# lemmas and dependency parse used further down.
nlp = spacy.load('en_core_web_sm')

# Silence every warning (not only deprecation warnings) for this session.
warnings.filterwarnings('ignore')

# Import project-specific functions.
# The project's Python files (.py) are expected in ../Scripts, resolved
# against the current working directory.
lib_path = os.path.abspath('../Scripts')
if lib_path not in sys.path:
    # Guard so that re-running this cell does not stack duplicate entries.
    sys.path.append(lib_path)
from Correspondence_XML_parser import *

# Read in config.py (git ignored file) for API username and pw.
# config.py is looked up in the same ../Scripts folder, which is already on
# sys.path; config_path is kept as a separate name for existing references.
config_path = lib_path
import config

# REST endpoint of the XML database plus the credentials taken from config.py.
url = 'https://dsg.xmldb-dev.northeastern.edu/BaseX964/rest/psc/'
user = config.username
pw = config.password

## Gather XML Files

In [2]:
%%time

# Base folder on the local machine; declared once so later paths stay short.
abs_dir = "/Users/quinn.wi/Documents/"

# Glob pattern, relative to abs_dir, for the XML files to load.
input_directory = "Data/PSC/Richards/ESR-XML-Files-MHS/*.xml"

# Collect the paths of all matching .xml files.
files = glob.glob(f"{abs_dir}{input_directory}")

# Show how many files were found.
len(files)

CPU times: user 748 µs, sys: 960 µs, total: 1.71 ms
Wall time: 1.15 ms


20

In [3]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )
    
# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only esr/ files.
# files = [i for i in files if 'esr/' in i]

# len(files)

## Build Dataframe

In [4]:
%%time

# Build dataframe from the XML files gathered above.
# build_dataframe() comes from the Correspondence_XML_parser star import.
# Alternative call that also passes the API url and credentials (presumably
# for file paths listed through the remote database -- confirm):
# df = build_dataframe(files, url, user, pw)

df = build_dataframe(files)

# Preview the first three rows.
df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Richards/ESR-XML-Files-MHS/ESR-EDA-1893-09-24.xml 

CPU times: user 9.71 ms, sys: 3.38 ms, total: 13.1 ms
Wall time: 14.7 ms


Unnamed: 0,file,date,source,target,subjects,references,text
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ..."


## Get Syntactic Dependencies

Getting the dependencies of every word would take up a lot of memory. Therefore, for now we keep only the dependencies of nouns: each noun is paired with each of its syntactic children.

In [5]:
%%time

# Extract (noun, dependent) lemma pairs from a text via its dependency parse.
def get_syntactic_dependencies(text):
    """Return the noun-headed dependency bigrams found in ``text``.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    Every token tagged as a NOUN contributes one tuple per syntactic child:
    ``(noun lemma, child lemma)``.

    Returns:
        A list of ``(str, str)`` tuples, or None when no pair was found.
    """
    parsed = nlp(text)

    pairs = [
        (head.lemma_, dependent.lemma_)
        for head in parsed
        if head.pos_ == 'NOUN'  # Only nouns act as heads (selected part of speech).
        for dependent in head.children
    ]

    # An empty result is reported as None rather than as an empty list.
    return pairs or None
    
# Get dependencies: run the extractor on every letter's text and store the
# result (a list of bigram tuples, or None) in a new 'dependencies' column.
df['dependencies'] = df['text'].apply(get_syntactic_dependencies)

# Preview the first three rows, now including the new column.
df.head(3)

CPU times: user 489 ms, sys: 7.84 ms, total: 497 ms
Wall time: 501 ms


Unnamed: 0,file,date,source,target,subjects,references,text,dependencies
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...,"[(outline, a), (outline, brief), (outline, of)..."
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...,"[(noon, tomorrow), (tech, on), (tech, .), (dou..."
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ...","[(book, that), (book, little), (book, blue), (..."


## Pointwise Mutual Information to Determine Significant Dependencies

In [15]:
%%time

# Treat dependencies as bigrams; NLTK's bigram association measures
# (which include PMI) are instantiated here for scoring them.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Number of rows in the 'dependencies' column.
# NOTE(review): the column is not converted to a list here, and this value is
# discarded because it is not the last expression of the cell.
len (df['dependencies'].values)

# Display the bigrams stored under index label 0.
df['dependencies'][0]

# TODO: score the bigrams with PMI. A pandas DataFrame has no score_ngrams()
# method; presumably an nltk collocation finder is needed here -- confirm.
# df.score_ngrams(bigram_measures.pmi)

CPU times: user 172 µs, sys: 20 µs, total: 192 µs
Wall time: 190 µs


[('outline', 'a'),
 ('outline', 'brief'),
 ('outline', 'of'),
 ('line', '-PRON-'),
 ('charge', 'of'),
 ('term', 'the'),
 ('outline', 'this'),
 ('section', 'the'),
 ('section', 'Economics'),
 ('paper', 'the'),
 ('plan', 'the'),
 ('plan', 'for'),
 ('lesson', 'the'),
 ('lesson', '5'),
 ('lesson', 'on'),
 ('use', 'the'),
 ('use', 'of'),
 ('week', 'two'),
 ('grip', '-PRON-'),
 ('grip', 'week')]