# Subject Correlations & Year Counts

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string, json
import pandas as pd
import numpy as np
import seaborn as sns
import glob as glob
from itertools import chain
from scipy import stats
import matplotlib.pyplot as plt

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter

# Import project-specific functions.
# JQA_XML_parser.py is looked up in ../Scripts, resolved against the kernel's
# current working directory (os.path.dirname() of a bare file name is '', so
# the original expression never pointed at the notebook's own folder).
lib_path = os.path.abspath(os.path.join('..', 'Scripts'))

# Guard the append so re-running this cell does not pile up duplicate entries.
if lib_path not in sys.path:
    sys.path.append(lib_path)

# NOTE(review): the star import hides which names this notebook relies on;
# build_dataframe() is presumably one of them -- confirm and import explicitly.
from JQA_XML_parser import *

# Silence every warning category (this also hides deprecation notices
# raised by the libraries used below).
warnings.filterwarnings('ignore')

## Gather XML Files

In [2]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"
files = glob.glob(abs_dir + "Data/PSC/JQA/*/*.xml")

len(files)

CPU times: user 2.58 ms, sys: 2.97 ms, total: 5.55 ms
Wall time: 4.66 ms


762

## Build Dataframe

In [3]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
df = build_dataframe(files)

# Unnest subject headings. 
df['subjects'] = df['subjects'].str.split(',')
df = df.explode('subjects')

# Remove leading and trailing whitespace.
df['subjects'] = df['subjects'].str.strip()

# Remove rows with subject of "The".
df = df[~df['subjects'].isin(['The'])]

# Remove rows with empty values.
df.replace('', np.nan, inplace = True)
df.dropna(inplace = True)

df.head(3)

CPU times: user 5.67 s, sys: 130 ms, total: 5.8 s
Wall time: 5.87 s


Unnamed: 0,file,entry,date,people,subjects,text
0,"('JQADiaries-v27-1808-08-p364.xml',)",jqadiaries-v27-1808-08-01,1808-08-01,"courtdegebelin-antoine,gregory-george,rousseau...",Recreation,"1. Bathed with George this morning, at the pla..."
1,"('JQADiaries-v27-1808-08-p364.xml',)",jqadiaries-v27-1808-08-02,1808-08-02,"degrand-peter,everett-alexander",Recreation,"2. Bathed again this Morning, and took George ..."
2,"('JQADiaries-v27-1808-08-p364.xml',)",jqadiaries-v27-1808-08-03,1808-08-03,"degrand-peter,welsh-thomas,davis-john,dawes-th...",Recreation,"3. Bathed this morning, at 6. with Mr: De Gran..."


## Count Subject Headings by Year

In [4]:
%%time

# Extract month, year from date.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'coerce')
df = df.query('date != "NaT"') # remove Not-a-Time values.

df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by year & subject to get count of subjects per year.
subjects = df.groupby(['year', 'subjects'], as_index = False)['subjects'] \
    .size() \
    .reset_index()

subjects.columns = ['year', 'subjects', 'count']

# Group by year and get total number of subjects per year.
subjects['total'] = subjects.groupby('year')['count'].transform('sum')

# Get percentage of subject for each year.
subjects['percentage'] = round(subjects['count'] / subjects['total'], 2) * 100

subjects.to_csv(abs_dir + 'Github/dsg-mhs/lab_space/projects/jqa/subjects/data/subject-year-count.csv',
                sep = ',', index = False)

subjects.head()

CPU times: user 25.3 ms, sys: 3.39 ms, total: 28.7 ms
Wall time: 28.3 ms


Unnamed: 0,year,subjects,count,total,percentage
0,1789,Health and Illness,1,18,6.0
1,1789,Recreation,16,18,89.0
2,1789,Slave Trade,1,18,6.0
3,1790,Recreation,10,10,100.0
4,1791,Bank of the United States,2,20,10.0


## Create Adjacency Matrix of Subjects

In [5]:
%%time

# Create adjacency matrix.
adj = pd.crosstab(df['entry'], df['subjects'])

# Convert entry-person matrix into an adjacency matrix of persons.
adj = adj.T.dot(adj)

# Change same-same connections to zero.
np.fill_diagonal(adj.values, 0)

# Simple correlation matrix from dataframe.
adj = adj.corr()

adj

CPU times: user 179 ms, sys: 12.3 ms, total: 191 ms
Wall time: 191 ms


subjects,Adams Family Finances,Adams Family Relations,Adams Family Residences,Adams-Onis Treaty,African Americans,Alien and Sedition Acts,American Revolution,American System,Anti-Masonic Party,Anti-Slavery Movements,...,Treaty of Paris (1783),U.S. Constitution,Unitarianism,Utopian Communities,War of 1812,West,Westward Exploration and Expansion/Westward Migration/Manifest Destiny,Whig Party,Women's Rights,XYZ Affair (1797)
subjects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adams Family Finances,1.000000,0.591864,0.730865,0.715337,0.808371,0.388212,0.845856,0.291456,0.723344,0.580159,...,0.250185,0.747550,0.471191,0.656066,0.582333,0.715337,0.662027,0.683948,0.361522,0.715337
Adams Family Relations,0.591864,1.000000,0.910181,0.379483,0.609570,0.494925,0.663308,0.123995,0.622265,0.446796,...,0.148840,0.544348,0.360013,0.344995,0.368739,0.379483,0.460208,0.510131,0.207096,0.379483
Adams Family Residences,0.730865,0.910181,1.000000,0.618762,0.693755,0.349180,0.730645,0.195435,0.679314,0.466124,...,0.191499,0.575092,0.402885,0.566260,0.486276,0.618762,0.457553,0.543510,0.314892,0.618762
Adams-Onis Treaty,0.715337,0.379483,0.618762,1.000000,0.714843,-0.024061,0.574292,0.198887,0.422530,0.229414,...,0.257739,0.415683,0.077310,0.893638,0.538622,1.000000,0.477161,0.443345,0.368806,1.000000
African Americans,0.808371,0.609570,0.693755,0.714843,1.000000,0.396334,0.768493,0.221211,0.567579,0.414543,...,0.561978,0.682494,0.247489,0.700843,0.751412,0.714843,0.665149,0.525327,0.261419,0.714843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West,0.715337,0.379483,0.618762,1.000000,0.714843,-0.024061,0.574292,0.198887,0.422530,0.229414,...,0.257739,0.415683,0.077310,0.893638,0.538622,1.000000,0.477161,0.443345,0.368806,1.000000
Westward Exploration and Expansion/Westward Migration/Manifest Destiny,0.662027,0.460208,0.457553,0.477161,0.665149,0.573333,0.672178,0.073359,0.449430,0.322357,...,0.311412,0.676170,0.155664,0.490348,0.570790,0.477161,1.000000,0.457457,0.264279,0.477161
Whig Party,0.683948,0.510131,0.543510,0.443345,0.525327,0.393607,0.623169,0.447566,0.675480,0.723753,...,0.120759,0.593535,0.128865,0.426376,0.386580,0.443345,0.457457,1.000000,0.169509,0.443345
Women's Rights,0.361522,0.207096,0.314892,0.368806,0.261419,-0.045998,0.320769,0.112104,0.254854,0.186696,...,0.069835,0.233751,0.385398,0.323451,0.221418,0.368806,0.264279,0.169509,1.000000,0.368806


## Convert Subject Adjacency Matrix to Edge List

In [6]:
%%time

adj['source'] = adj.index

df = pd.melt(adj, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .query('(source != target) & (weight > 0.75)')

df

CPU times: user 7.8 ms, sys: 1.28 ms, total: 9.08 ms
Wall time: 7.95 ms


Unnamed: 0,source,target,weight
4,African Americans,Adams Family Finances,0.808371
6,American Revolution,Adams Family Finances,0.845856
13,Bank of the United States,Adams Family Finances,0.789476
17,Canals,Adams Family Finances,0.777228
26,Court Life and Society (European),Adams Family Finances,0.771173
...,...,...,...
12968,Slave Trade,XYZ Affair (1797),0.768834
12974,Spoils System,XYZ Affair (1797),0.858404
12984,Travel and Touring (international),XYZ Affair (1797),0.969126
12989,Utopian Communities,XYZ Affair (1797),0.893638


## Create Graph Object

In [7]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
nodes = pd.DataFrame(nodes, columns = ['source'])
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')


# Find communities.
communities = community.greedy_modularity_communities(G)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 70
Number of edges: 303
Average degree:   8.6571
Top 10 nodes by degree:
	('Native Americans', 25)
	('Spoils System', 25)
	('American Revolution', 20)
	('Adams Family Finances', 18)
	('Health and Illness', 18)
	('Court Life and Society (European)', 17)
	('Travel and Touring (international)', 17)
	('Slave Trade', 17)
	('French Revolution>', 16)
	('Bank of the United States', 15)
Network density: 0.125
Is the network connected? False
Network diameter of the largest component: 7.000
Triadic closure: 0.618

CPU times: user 49.5 ms, sys: 1.78 ms, total: 51.3 ms
Wall time: 50 ms


## Write Graph Object

In [8]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)
    
data_json = json.dumps(data)

with open(abs_dir + "Github/dsg-mhs/lab_space/projects/jqa/subjects/data/jqa-subjects-network.json", "w") as f:
    f.write(data_json)

CPU times: user 1.43 ms, sys: 658 µs, total: 2.08 ms
Wall time: 1.53 ms
