# Subject Correlations & Year Counts

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string, json
import pandas as pd
import numpy as np
from itertools import chain

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter

# Read in config.py (git ignored file) for API username and pw.
config_path = os.path.abspath(os.path.join(os.path.dirname('config.py'), '../Scripts'))
sys.path.append(config_path)
import config

# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Ignore warnings related to deprecated functions.
warnings.filterwarnings('ignore')

url = 'https://dsg.xmldb-dev.northeastern.edu/BaseX964/rest/psc/'
user = config.username
pw = config.password

## Gather XML Files

In [2]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"
files = glob.glob(abs_dir + "Data/PSC/Sedgwick/*.xml")

len(files)

CPU times: user 726 µs, sys: 774 µs, total: 1.5 ms
Wall time: 965 µs


122

In [None]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )

# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if '......./' in i]

# len(files)

## Build Dataframe

In [3]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

# Unnest subject headings. 
df['subjects'] = df['subjects'].str.split(',')
df = df.explode('subjects')

# Remove leading and trailing whitespace.
df['subjects'] = df['subjects'].str.strip()

# Remove rows with subject of "The".
df = df[~df['subjects'].isin(['The'])]

# Remove rows with empty values.
df.replace('', np.nan, inplace = True)
df.dropna(inplace = True)

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1803-10-06-toPamelaDwightSedgwickF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1809-01-27-toTheodoreSedgwickIFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-25-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1806-01-17-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-29-toPamelaDwightSedgwickFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFSWF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1800-01-12-toTheodoreSedgwickIF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-15-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-28-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-03-24-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/

Unnamed: 0,file,date,source,target,subjects,references,text
3,CMS1809-12-11-toFrancesSedgwickWatsonF.xml,1809-12-11,Catharine Maria Sedgwick,FSW,Sedgwick Family Relations,"EHW,RSI,ashley-richard,U,curtis-unknown,U,TSI,...",Stockbridge Decr 11th1809 I cannot account fo...
3,CMS1809-12-11-toFrancesSedgwickWatsonF.xml,1809-12-11,Catharine Maria Sedgwick,FSW,Death/Mourning,"EHW,RSI,ashley-richard,U,curtis-unknown,U,TSI,...",Stockbridge Decr 11th1809 I cannot account fo...
3,CMS1809-12-11-toFrancesSedgwickWatsonF.xml,1809-12-11,Catharine Maria Sedgwick,FSW,Religion,"EHW,RSI,ashley-richard,U,curtis-unknown,U,TSI,...",Stockbridge Decr 11th1809 I cannot account fo...


## Count Subject Headings by Year

In [4]:
%%time

# Extract month, year from date.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'coerce')
df = df.query('date != "NaT"') # remove Not-a-Time values.

df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by year & subject to get count of subjects per year.
subjects = df.groupby(['year', 'subjects'], as_index = False)['subjects'] \
    .size() \
    .reset_index()

subjects.columns = ['year', 'subjects', 'count']

# Group by year and get total number of subjects per year.
subjects['total'] = subjects.groupby('year')['count'].transform('sum')

# Get percentage of subject for each year.
subjects['percentage'] = round(subjects['count'] / subjects['total'], 2) * 100

subjects.to_csv(abs_dir + 'Github/dsg-mhs/lab_space/projects/sedgwick/subjects/data/subject-year-count.csv',
                sep = ',', index = False)

subjects.head()

CPU times: user 16.6 ms, sys: 3.3 ms, total: 19.9 ms
Wall time: 18.5 ms


Unnamed: 0,year,subjects,count,total,percentage
0,1800,Childhood,2,26,8.0
1,1800,Death/Mourning,2,26,8.0
2,1800,Education,1,26,4.0
3,1800,Elections,1,26,4.0
4,1800,Employees,2,26,8.0


## Create Adjacency Matrix of Subjects

In [5]:
%%time

# Create adjacency matrix.
adj = pd.crosstab(df['file'], df['subjects'])

# Convert entry-person matrix into an adjacency matrix of persons.
adj = adj.T.dot(adj)

# Change same-same connections to zero.
np.fill_diagonal(adj.values, 0)

# Simple correlation matrix from dataframe.
adj = adj.corr()

adj

CPU times: user 24.5 ms, sys: 2.15 ms, total: 26.6 ms
Wall time: 25.2 ms


subjects,Arts,Arts Visual and Performing,Authorship,Blockades,Childbirth/Pregnancy,Childcare,Childcare/Parenting,Childhood,Childhood/Children,Children/Childhood,...,Travel (US),Travel and Touring (International),Travel and Touring (U.S.),Travel and Touring (US),Travel/Touring (U.S.),Unitarianism,Village/Rural Life,Visual and Performing,War of 1812,Work
subjects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arts,1.000000,0.021026,0.251051,0.203810,0.262438,0.255476,0.113228,0.417208,0.203810,0.366385,...,0.074720,0.051003,0.259235,0.185518,0.203810,0.158821,0.317404,0.784615,0.241764,0.137670
Arts Visual and Performing,0.021026,1.000000,0.502871,0.247594,0.321832,0.427714,0.131534,0.123284,0.030949,0.372455,...,0.551442,0.724924,0.514177,0.472052,0.030949,0.509099,0.340922,0.021026,0.345100,0.616784
Authorship,0.251051,0.502871,1.000000,0.475660,0.753941,0.712995,0.346796,0.316108,0.422599,0.730064,...,0.479803,0.637651,0.660045,0.665421,0.475660,0.663489,0.567818,0.251051,0.677538,0.740713
Blockades,0.203810,0.247594,0.475660,1.000000,0.386298,0.329044,0.166667,0.149379,0.183333,0.348427,...,0.366618,0.300295,0.313094,0.168810,0.300000,0.283367,0.236893,0.203810,0.743368,0.291303
Childbirth/Pregnancy,0.262438,0.321832,0.753941,0.386298,1.000000,0.562421,0.216640,0.337911,0.459381,0.639232,...,0.468506,0.531421,0.617201,0.486127,0.386298,0.520547,0.400259,0.262438,0.527591,0.574417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unitarianism,0.158821,0.509099,0.663489,0.283367,0.520547,0.563439,0.187731,0.215173,0.134600,0.577716,...,0.565668,0.545664,0.477844,0.529916,0.432135,1.000000,0.597718,0.158821,0.583199,0.631474
Village/Rural Life,0.317404,0.340922,0.567818,0.236893,0.400259,0.656994,0.373436,0.507867,0.190831,0.690466,...,0.471881,0.346797,0.406776,0.628665,0.236893,0.597718,1.000000,0.317404,0.488643,0.616819
Visual and Performing,0.784615,0.021026,0.251051,0.203810,0.262438,0.255476,0.113228,0.417208,0.203810,0.366385,...,0.074720,0.051003,0.259235,0.185518,0.203810,0.158821,0.317404,1.000000,0.241764,0.137670
War of 1812,0.241764,0.345100,0.677538,0.743368,0.527591,0.568753,0.197704,0.232325,0.300510,0.599484,...,0.434891,0.445272,0.452644,0.455855,0.466582,0.583199,0.488643,0.241764,1.000000,0.566402


## Save Subject Adj. as Network Object

In [6]:
%%time

adj['source'] = adj.index

df = pd.melt(adj, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .query('(source != target) & (weight > 0.8)')

df

CPU times: user 6.97 ms, sys: 1.31 ms, total: 8.27 ms
Wall time: 7.15 ms


Unnamed: 0,source,target,weight
67,Visual and Performing,Arts,0.784615
108,Morality and Ethics,Arts Visual and Performing,0.899425
116,Privateering,Arts Visual and Performing,0.899425
117,Public Service,Arts Visual and Performing,0.899425
144,Childbirth/Pregnancy,Authorship,0.753941
...,...,...,...
4866,Literature/History,Work,0.771176
4878,Religion,Work,0.763970
4882,Self-reflection,Work,0.757383
4883,Shopping/Material Exchange,Work,0.765460


## Create Graph Object

In [7]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')


# Find communities.
communities = community.naive_greedy_modularity_communities(subgraph)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 50
Number of edges: 134
Average degree:   5.3600
Top 10 nodes by degree:
	('Authorship', 16)
	('Clothing/Fashion', 16)
	('Courtship', 15)
	('Domestic Life and Duties', 14)
	('Shopping/Material Exchange', 14)
	('Gender Roles', 13)
	('Religion', 13)
	('Self-reflection', 13)
	('Social Life/Networks', 13)
	('Health/Illness', 10)
Network density: 0.109
Is the network connected? False
Network diameter of the largest component: 7.000
Triadic closure: 0.595

CPU times: user 21.9 ms, sys: 1.47 ms, total: 23.4 ms
Wall time: 22.2 ms


## Write Graph Object

In [8]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)
    
data_json = json.dumps(data)

with open(abs_dir + "Github/dsg-mhs/lab_space/projects/sedgwick/subjects/data/sedgwick-subjects-network.json", "w") as f:
    f.write(data_json)

CPU times: user 1.08 ms, sys: 667 µs, total: 1.75 ms
Wall time: 1.28 ms
