# Overviews - Corpus Comparisons

In [1]:
# Import necessary libraries.
import re, warnings, sys, os, json, glob, pathlib
import pandas as pd
import numpy as np
from itertools import chain

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter


# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('JQA_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)
from JQA_XML_parser import *

# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Declare directory location to shorten filepaths later.
abs_dir = pathlib.Path().resolve()
print (abs_dir)

temp_data = "/Users/quinn.wi/Documents/Data"

/Users/quinn.wi/Documents/GitHub/dsg-mhs/Jupyter_Notebooks/Overviews


### Build Dataframe of Corpora

In [2]:
%%time

# JQA
"""
Declare variables.
"""

# Declare regex to simplify file paths below
regex = re.compile(r'.*/.*/(.*.xml)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'

"""
Build dataframe.
"""

dataframe = []

for file in glob.glob(temp_data + '/PSC/JQA/*/*.xml'):
    reFile = str(regex.search(file).group(1))
#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        text = get_textContent(eachDoc, text_path, ns)

        dataframe.append([reFile, entry, date, people, text])

dataframe = pd.DataFrame(dataframe, columns = ['file', 'entry', 'date', 'references', 'text'])

dataframe.head(4)

CPU times: user 5.44 s, sys: 222 ms, total: 5.66 s
Wall time: 6.17 s


Unnamed: 0,file,entry,date,references,text
0,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-01,1808-08-01,"courtdegebelin-antoine,gregory-george,rousseau...","1. Bathed with George this morning, at the pla..."
1,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-02,1808-08-02,"degrand-peter,everett-alexander","2. Bathed again this Morning, and took George ..."
2,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-03,1808-08-03,"degrand-peter,welsh-thomas,davis-john,dawes-th...","3. Bathed this morning, at 6. with Mr: De Gran..."
3,JQADiaries-v27-1808-08-p364.xml,jqadiaries-v27-1808-08-04,1808-08-04,"boylston-ward,degrand-peter,adams-louisa-cathe...","4. Mr: Boylston called for me by appointment, ..."


In [3]:
%%time

# Richards
files = glob.glob(temp_data + "/PSC/Richards/ESR-XML-Files-MHS/*.xml")

# Sedgwick
files = files + glob.glob(temp_data + "/PSC/Sedgwick/*.xml")

# Taney
files = files + glob.glob(temp_data + "/PSC/Taney/RBT_RawXML/*/*.xml")
print (len(files))

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

df = pd.concat([df, dataframe], ignore_index = True)
df.head(3)

320
/Users/quinn.wi/Documents/Data/PSC/Richards/ESR-XML-Files-MHS/ESR-EDA-1893-09-24.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1803-10-06-toPamelaDwightSedgwickF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1809-01-27-toTheodoreSedgwickIFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-25-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1806-01-17-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-29-toPamelaDwightSedgwickFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFSWF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1800-01-12-toTheodoreSedgwickIF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-15-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-28-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC

Unnamed: 0,file,date,source,target,subjects,references,text,entry
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...,
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...,
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ...",


In [4]:
%%time

# Add column to specify project.
df['project'] = df['file'].str.replace('(\w{3})(.*)', '\\1')

# Create a year column.
df['year'] = df['date'].str.replace('\d{2}-', '')

df.head(4)

CPU times: user 77.3 ms, sys: 2.56 ms, total: 79.9 ms
Wall time: 78.4 ms


Unnamed: 0,file,date,source,target,subjects,references,text,entry,project,year
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...,,ESR,1808
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...,,ESR,1812
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ...",,ESR,1807
3,ESR-EDA-1890-09-29.xml,1890-09-29,richards-ellen,atkinson-edward,"Teaching,Nutrition","abel-mary,abel-john,palmer-alice",Boston Sept 29 1890 Dear Mr Atkinson I will a...,,ESR,1829


## Word Count over Time

In [5]:
%%time


# Get total number of words for each document.
df['count'] = [len(x.split()) for x in df['text'].tolist()]

totalWords = df.groupby(['project', 'year'])['count'].sum().reset_index()

# Remove years marked as unknown.
totalWords = totalWords[ totalWords['year'].str.contains('1[789]\d{2}') ]
totalWords['year'] = totalWords['year'].astype(int)

# Pivot table to wide data for json prep.
totalWords = pd.pivot_table(totalWords, 
                            index = ['year'], 
                            columns = ['project'], 
                            values = ['count'],
                            fill_value = 0) \
    .reset_index()

totalWords.columns = totalWords.columns.get_level_values(1)

totalWords = totalWords.rename(columns = {'': 'year'})

totalWords = totalWords.eval('Sum = CMS + ESR + JQA + RBT')

# Convert to dictionary for js.
totalWords = totalWords.to_dict('records')

with open(str(abs_dir.parent.parent) + '/lab_space/projects/overviews/data/wordCount.json', 
          'w+') as f:
    json.dump(totalWords, f)

totalWords

CPU times: user 374 ms, sys: 7.49 ms, total: 382 ms
Wall time: 409 ms


[{'year': 1701, 'CMS': 0, 'ESR': 0, 'JQA': 7988, 'RBT': 0, 'Sum': 7988},
 {'year': 1702, 'CMS': 0, 'ESR': 0, 'JQA': 8609, 'RBT': 0, 'Sum': 8609},
 {'year': 1703, 'CMS': 0, 'ESR': 0, 'JQA': 9970, 'RBT': 0, 'Sum': 9970},
 {'year': 1704, 'CMS': 0, 'ESR': 0, 'JQA': 9750, 'RBT': 0, 'Sum': 9750},
 {'year': 1705, 'CMS': 0, 'ESR': 0, 'JQA': 8027, 'RBT': 0, 'Sum': 8027},
 {'year': 1706, 'CMS': 0, 'ESR': 0, 'JQA': 8149, 'RBT': 0, 'Sum': 8149},
 {'year': 1707, 'CMS': 0, 'ESR': 0, 'JQA': 7726, 'RBT': 0, 'Sum': 7726},
 {'year': 1708, 'CMS': 0, 'ESR': 0, 'JQA': 7414, 'RBT': 0, 'Sum': 7414},
 {'year': 1709, 'CMS': 0, 'ESR': 0, 'JQA': 7660, 'RBT': 0, 'Sum': 7660},
 {'year': 1710, 'CMS': 0, 'ESR': 0, 'JQA': 7698, 'RBT': 0, 'Sum': 7698},
 {'year': 1711, 'CMS': 0, 'ESR': 0, 'JQA': 9299, 'RBT': 0, 'Sum': 9299},
 {'year': 1712, 'CMS': 0, 'ESR': 0, 'JQA': 7206, 'RBT': 0, 'Sum': 7206},
 {'year': 1713, 'CMS': 0, 'ESR': 0, 'JQA': 7082, 'RBT': 0, 'Sum': 7082},
 {'year': 1714, 'CMS': 0, 'ESR': 0, 'JQA': 7931, 'R

## Document Count over Time

In [6]:
%%time

# Group by project & year, then count number of rows (i.e., documents).
totalDocs = df[['project', 'year', 'text']] \
    .groupby(['project', 'year']) \
    .agg({'year': 'count'}) \
    .rename(columns={'year': 'count'}) \
    .reset_index()

# Remove years marked as unknown.
totalDocs = totalDocs[ totalDocs['year'].str.contains('1[789]\d{2}') ]
totalDocs['year'] = totalDocs['year'].astype(int)


# Pivot table to wide data for json prep.
totalDocs = pd.pivot_table(totalDocs, 
                            index = ['year'], 
                            columns = ['project'], 
                            values = ['count'],
                            fill_value = 0) \
    .reset_index()

totalDocs.columns = totalDocs.columns.get_level_values(1)

totalDocs = totalDocs.rename(columns = {'': 'year'})

totalDocs = totalDocs.eval('Sum = CMS + ESR + JQA + RBT')

# Convert to dictionary for js.
totalDocs = totalDocs.to_dict('records')

with open(str(abs_dir.parent.parent) + '/lab_space/projects/overviews/data/docCounts.json', 
          'w+') as f:
    json.dump(totalDocs, f)

totalDocs

CPU times: user 27.9 ms, sys: 1.47 ms, total: 29.3 ms
Wall time: 28.8 ms


[{'year': 1701, 'CMS': 0, 'ESR': 0, 'JQA': 137, 'RBT': 0, 'Sum': 137},
 {'year': 1702, 'CMS': 0, 'ESR': 0, 'JQA': 135, 'RBT': 0, 'Sum': 135},
 {'year': 1703, 'CMS': 0, 'ESR': 0, 'JQA': 138, 'RBT': 0, 'Sum': 138},
 {'year': 1704, 'CMS': 0, 'ESR': 0, 'JQA': 137, 'RBT': 0, 'Sum': 137},
 {'year': 1705, 'CMS': 0, 'ESR': 0, 'JQA': 140, 'RBT': 0, 'Sum': 140},
 {'year': 1706, 'CMS': 0, 'ESR': 0, 'JQA': 137, 'RBT': 0, 'Sum': 137},
 {'year': 1707, 'CMS': 0, 'ESR': 0, 'JQA': 138, 'RBT': 0, 'Sum': 138},
 {'year': 1708, 'CMS': 0, 'ESR': 0, 'JQA': 137, 'RBT': 0, 'Sum': 137},
 {'year': 1709, 'CMS': 0, 'ESR': 0, 'JQA': 138, 'RBT': 0, 'Sum': 138},
 {'year': 1710, 'CMS': 0, 'ESR': 0, 'JQA': 139, 'RBT': 0, 'Sum': 139},
 {'year': 1711, 'CMS': 0, 'ESR': 0, 'JQA': 138, 'RBT': 0, 'Sum': 138},
 {'year': 1712, 'CMS': 0, 'ESR': 0, 'JQA': 139, 'RBT': 0, 'Sum': 139},
 {'year': 1713, 'CMS': 0, 'ESR': 0, 'JQA': 134, 'RBT': 0, 'Sum': 134},
 {'year': 1714, 'CMS': 0, 'ESR': 0, 'JQA': 134, 'RBT': 0, 'Sum': 134},
 {'yea

## Simplified Doc & Word Count

In [9]:
%%time

# Get total number of words for each document.
words = df[['project', 'file', 'entry', 'text']]
words['wordCount'] = [len(x.split()) for x in words['text'].tolist()]

words = words.groupby(['project'])['wordCount'].sum().reset_index()

# Group by project & year, then count number of rows (i.e., documents).
docs = df[['project', 'text']] \
    .groupby(['project'])['text'].count() \
    .reset_index() \
    .rename(columns={'text': 'docCount'})

simple_df = pd.merge(docs, words, 
                     left_on = 'project', right_on = 'project',
                     how = 'inner')

# Change initials (except JQA) to last name in order to match css stylesheet.
simple_df['project'] = simple_df['project'].str.replace('ESR', 'richards')
simple_df['project'] = simple_df['project'].str.replace('CMS', 'sedgwick')
simple_df['project'] = simple_df['project'].str.replace('RBT', 'taney')

simple_df.to_csv(str(abs_dir.parent.parent) + '/lab_space/projects/overviews/data/simple-counts.csv', 
                 index = False, sep = ',')

simple_df

CPU times: user 368 ms, sys: 3.99 ms, total: 372 ms
Wall time: 377 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,project,docCount,wordCount
0,sedgwick,109,72359
1,richards,19,3708
2,JQA,21875,6285885
3,taney,178,61317


## Network of Cross-Corpora People

If person A appears in another corpus, keep person and which corpora they appeared in.