# Corpus Collocations

1. Remove stopwords.
2. Create keyword-in-context (KWIC) windows for each word.
    - Group by project.
    - Join each project's corpus text into a single string and split it into words.
3. Use KWIC windows to measure pointwise mutual information (PMI).

In [1]:
import re, csv, glob, warnings, sys, os, pathlib, string
import pandas as pd
import numpy as np

from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords
stopwords_ = set(stopwords.words('english'))

# Import project-specific functions.
# The parser modules (.py) are looked up in the sibling '../Scripts' folder,
# resolved relative to the current working directory. Both modules are loaded
# from that same folder, so the path only needs to be registered once; the
# membership check keeps sys.path from growing when this cell is re-run.
lib_path = os.path.abspath(os.path.join(os.path.dirname('JQA_XML_parser.py'), '../Scripts'))
if lib_path not in sys.path:
    sys.path.append(lib_path)

# NOTE(review): star imports hide which helper comes from which module, and
# the second import silently shadows same-named helpers from the first.
# Kept as-is because later cells may rely on names that are not visible here.
from JQA_XML_parser import *
from Correspondence_XML_parser import *

# Declare directory location to shorten filepaths later.
abs_dir = pathlib.Path().resolve()
print (abs_dir)

# Root folder of the XML data. Set the MHS_DATA_DIR environment variable to
# run the notebook on another machine; the original location is the fallback.
temp_data = os.environ.get('MHS_DATA_DIR', "/Users/quinn.wi/Documents/Data")

/Users/quinn.wi/Documents/GitHub/dsg-mhs/Jupyter_Notebooks/Prototypes


## Build Dataframe

In [2]:
%%time

# JQA
"""
Declare variables.
"""

# Declare regex to simplify file paths below
regex = re.compile(r'.*/.*/(.*.xml)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

"""
Build dataframe.
"""

dataframe = []

for file in glob.glob(temp_data + '/PSC/JQA/*/*.xml'):
    reFile = str(regex.search(file).group(1))
#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        text = get_textContent(eachDoc, text_path, ns)

        dataframe.append([reFile, entry, date, people, text])

dataframe = pd.DataFrame(dataframe, columns = ['file', 'entry', 'date', 'references', 'text'])

dataframe.head(4)

CPU times: user 6.02 s, sys: 229 ms, total: 6.24 s
Wall time: 6.7 s


Unnamed: 0,file,entry,date,references,text
0,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-01,1808-08-01,"courtdegebelin-antoine,gregory-george,rousseau...","1. Bathed with George this morning, at the pla..."
1,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-02,1808-08-02,"degrand-peter,everett-alexander","2. Bathed again this Morning, and took George ..."
2,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-03,1808-08-03,"degrand-peter,welsh-thomas,davis-john,dawes-th...","3. Bathed this morning, at 6. with Mr: De Gran..."
3,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-04,1808-08-04,"boylston-ward,degrand-peter,adams-louisa-cathe...","4. Mr: Boylston called for me by appointment, ..."


In [3]:
%%time

# Richards
files = glob.glob(temp_data + "/PSC/Richards/ESR-XML-Files-MHS/*.xml")

# Sedgwick
files = files + glob.glob(temp_data + "/PSC/Sedgwick/*.xml")

# Taney
files = files + glob.glob(temp_data + "/PSC/Taney/RBT_RawXML/*/*.xml")
print (len(files))

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

df = pd.concat([df, dataframe], ignore_index = True)
df.head(3)

320
/Users/quinn.wi/Documents/Data/PSC/Richards/ESR-XML-Files-MHS/ESR-EDA-1893-09-24.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1803-10-06-toPamelaDwightSedgwickF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1809-01-27-toTheodoreSedgwickIFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-25-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1806-01-17-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-29-toPamelaDwightSedgwickFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFSWF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1800-01-12-toTheodoreSedgwickIF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-15-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-28-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC

Unnamed: 0,file,date,source,target,subjects,references,text,entry
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...,
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...,
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ...",


In [4]:
%%time

# Add column to specify project.
df['project'] = df['file'].str.replace('(\w{3})(.*)', '\\1')

# Create a year column.
df['year'] = df['date'].str.replace('\d{2}-', '')

df.head(4)

CPU times: user 77.2 ms, sys: 2.18 ms, total: 79.4 ms
Wall time: 78.4 ms


Unnamed: 0,file,date,source,target,subjects,references,text,entry,project,year
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...,,ESR,1808
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...,,ESR,1812
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ...",,ESR,1807
3,ESR-EDA-1890-09-29.xml,1890-09-29,richards-ellen,atkinson-edward,"Teaching,Nutrition","abel-mary,abel-john,palmer-alice",Boston Sept 29 1890 Dear Mr Atkinson I will a...,,ESR,1829


## Join corpus text

KWIC code from ["Keywords in Context (Using n-grams) with Python," William J. Turkel and Adam Crymble](https://programminghistorian.org/en/lessons/keywords-in-context-using-n-grams)

In [5]:
%%time

# Group by project and concatenate text.
corpora = df.groupby('project')['text'] \
    .agg(lambda r: ''.join(r.values)) \
    .reset_index()

# Remove punctuation and lower case.
corpora['text'] = corpora['text'] \
    .str.replace('[{}\d]'.format(string.punctuation), '') \
    .str.lower()

# Remove stopwords.
pat = r'\b(?:{})\b'.format('|'.join(stopwords_))
corpora['text'] = corpora['text'].str.replace(pat, '')

# Split text into list.
corpora['text'] = corpora['text'].str.split(' ')

# Gather collocations.
corpora['collocations'] = corpora['text'].apply(lambda x: BigramCollocationFinder.from_words(x))

# Measure bigram PMI.
bigram_measures = BigramAssocMeasures()
corpora['collocations'] = corpora['collocations'] \
    .apply(lambda x: x.score_ngrams(bigram_measures.pmi))

corpora = corpora[['project', 'collocations']].explode('collocations').reset_index()

corpora

CPU times: user 23.5 s, sys: 535 ms, total: 24 s
Wall time: 24.2 s


Unnamed: 0,index,project,collocations
0,0,CMS,"((adults, buzzing), 16.14414039826136)"
1,0,CMS,"((aeconomical, fits), 16.14414039826136)"
2,0,CMS,"((affright, us”), 16.14414039826136)"
3,0,CMS,"((airbuilt, castles), 16.14414039826136)"
4,0,CMS,"((alternately, sewed), 16.14414039826136)"
...,...,...,...
668170,3,RBT,"((mr, ), -4.4153365865059655)"
668171,3,RBT,"((u, ), -4.67530468929197)"
668172,3,RBT,"((b, ), -4.7354256817295415)"
668173,3,RBT,"((taneywashington, ), -4.7645720273890575)"
