# Subject Correlations & Year Counts

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string, json
import pandas as pd
import numpy as np
import seaborn as sns
import glob as glob
from itertools import chain
from scipy import stats
import matplotlib.pyplot as plt

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter


# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Ignore warnings related to deprecated functions.
warnings.filterwarnings('ignore')

## Gather XML Files

In [2]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"
files = glob.glob(abs_dir + "Data/PSC/Richards/ESR-XML-Files-MHS/*.xml")

len(files)

CPU times: user 814 µs, sys: 1.04 ms, total: 1.85 ms
Wall time: 1.39 ms


20

In [None]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )
    
# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if 'esr/' in i]

# len(files)

## Build Dataframe

In [3]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

# Unnest subject headings. 
df['subjects'] = df['subjects'].str.split(',')
df = df.explode('subjects')

# Remove leading and trailing whitespace.
df['subjects'] = df['subjects'].str.strip()

# Remove rows with subject of "The".
df = df[~df['subjects'].isin(['The'])]

# Remove rows with empty values.
df.replace('', np.nan, inplace = True)
df.dropna(inplace = True)

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Richards/ESR-XML-Files-MHS/ESR-EDA-1893-09-24.xml 

CPU times: user 18.4 ms, sys: 3.63 ms, total: 22.1 ms
Wall time: 23.2 ms


Unnamed: 0,file,date,source,target,subjects,references,text
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,1893 Chicago World's Fair,"palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,Aladdin Oven,"palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,New England Kitchen,"palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...


## Count Subject Headings by Year

In [4]:
%%time

# Extract month, year from date.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'coerce')
df = df.query('date != "NaT"') # remove Not-a-Time values.

df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by year & subject to get count of subjects per year.
subjects = df.groupby(['year', 'subjects'], as_index = False)['subjects'] \
    .size() \
    .reset_index()

subjects.columns = ['year', 'subjects', 'count']

# Group by year and get total number of subjects per year.
subjects['total'] = subjects.groupby('year')['count'].transform('sum')

# Get percentage of subject for each year.
subjects['percentage'] = round(subjects['count'] / subjects['total'], 2) * 100

subjects.to_csv(abs_dir + 'Github/dsg-mhs/lab_space/projects/richards/subjects/data/subject-year-count.csv',
                sep = ',', index = False)

subjects.head()

CPU times: user 17.6 ms, sys: 4.08 ms, total: 21.7 ms
Wall time: 24.6 ms


Unnamed: 0,year,subjects,count,total,percentage
0,1890,Bread,2,5,40.0
1,1890,Nutrition,1,5,20.0
2,1890,Teaching,1,5,20.0
3,1890,Women's Education,1,5,20.0
4,1891,1893 Chicago World's Fair,1,5,20.0


## Create Adjacency Matrix of Subjects

In [5]:
%%time

# Create adjacency matrix.
adj = pd.crosstab(df['file'], df['subjects'])

# Convert entry-person matrix into an adjacency matrix of persons.
adj = adj.T.dot(adj)

# Change same-same connections to zero.
np.fill_diagonal(adj.values, 0)

# Simple correlation matrix from dataframe.
adj = adj.corr()

adj

CPU times: user 16.4 ms, sys: 2.41 ms, total: 18.8 ms
Wall time: 16.8 ms


subjects,1893 Chicago World's Fair,1893 Chicago World’s Fair,ACA,Aladdin Oven,Bread,Home economics,Naples Table Fellowship,New England Kitchen,Nutrition,Teaching,Women's Education,cooking,dietaries,menus,nutrition,progressive women
subjects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1893 Chicago World's Fair,1.0,0.717137,0.547723,-0.159364,-0.163299,0.547723,0.547723,0.565685,0.474644,0.163299,-0.163299,0.547723,-0.163299,0.182574,0.280607,0.182574
1893 Chicago World’s Fair,0.717137,1.0,0.218218,-0.047619,-0.09759,0.218218,0.218218,0.507093,0.445742,-0.09759,-0.09759,0.436436,-0.09759,0.218218,0.30185,0.218218
ACA,0.547723,0.218218,1.0,0.072739,-0.149071,0.666667,0.666667,0.516398,0.433289,-0.149071,-0.149071,0.166667,-0.149071,0.0,0.051232,0.0
Aladdin Oven,-0.159364,-0.047619,0.072739,1.0,-0.29277,0.072739,0.072739,-0.056344,0.040522,0.22771,-0.29277,-0.072739,0.22771,0.218218,0.01118,0.218218
Bread,-0.163299,-0.09759,-0.149071,-0.29277,1.0,-0.149071,-0.149071,-0.11547,-0.138409,-0.066667,-0.066667,-0.149071,-0.066667,-0.149071,-0.16038,-0.149071
Home economics,0.547723,0.218218,0.666667,0.072739,-0.149071,1.0,0.666667,0.516398,0.433289,-0.149071,-0.149071,0.166667,-0.149071,0.0,0.051232,0.0
Naples Table Fellowship,0.547723,0.218218,0.666667,0.072739,-0.149071,0.666667,1.0,0.516398,0.433289,-0.149071,-0.149071,0.166667,-0.149071,0.0,0.051232,0.0
New England Kitchen,0.565685,0.507093,0.516398,-0.056344,-0.11547,0.516398,0.516398,1.0,0.719195,-0.11547,-0.11547,0.516398,-0.11547,0.258199,0.357154,0.258199
Nutrition,0.474644,0.445742,0.433289,0.040522,-0.138409,0.433289,0.433289,0.719195,1.0,-0.138409,-0.138409,0.433289,-0.138409,0.433289,0.580321,0.433289
Teaching,0.163299,-0.09759,-0.149071,0.22771,-0.066667,-0.149071,-0.149071,-0.11547,-0.138409,1.0,-0.066667,0.149071,-0.066667,-0.149071,-0.16038,-0.149071


## Save Subject Adj. as Network Object

In [6]:
%%time

adj['source'] = adj.index

df = pd.melt(adj, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .query('(source != target) & (weight > 0.5)')

df

CPU times: user 7.44 ms, sys: 420 µs, total: 7.86 ms
Wall time: 7.71 ms


Unnamed: 0,source,target,weight
1,1893 Chicago World’s Fair,1893 Chicago World's Fair,0.717137
2,ACA,1893 Chicago World's Fair,0.547723
5,Home economics,1893 Chicago World's Fair,0.547723
6,Naples Table Fellowship,1893 Chicago World's Fair,0.547723
7,New England Kitchen,1893 Chicago World's Fair,0.565685
11,cooking,1893 Chicago World's Fair,0.547723
16,1893 Chicago World's Fair,1893 Chicago World’s Fair,0.717137
23,New England Kitchen,1893 Chicago World’s Fair,0.507093
32,1893 Chicago World's Fair,ACA,0.547723
37,Home economics,ACA,0.666667


## Create Graph Object

In [7]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')


# Find communities.
communities = community.naive_greedy_modularity_communities(subgraph)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 12
Number of edges: 21
Average degree:   3.5000
Top 10 nodes by degree:
	('New England Kitchen', 7)
	("1893 Chicago World's Fair", 6)
	('ACA', 4)
	('Home economics', 4)
	('Naples Table Fellowship', 4)
	('cooking', 4)
	('nutrition', 3)
	('menus', 3)
	('progressive women', 3)
	('1893 Chicago World’s Fair', 2)
Network density: 0.318
Is the network connected? False
Network diameter of the largest component: 3.000
Triadic closure: 0.592

CPU times: user 38.1 ms, sys: 1.83 ms, total: 39.9 ms
Wall time: 38.8 ms


## Write Graph Object

In [8]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)
    
data_json = json.dumps(data)

with open(abs_dir + "Github/dsg-mhs/lab_space/projects/richards/subjects/data/richards-subjects-network.json", "w") as f:
    f.write(data_json)

CPU times: user 836 µs, sys: 1.01 ms, total: 1.84 ms
Wall time: 969 µs
