# Subject Aggregation

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string, json
import pandas as pd
import numpy as np
import seaborn as sns
import glob as glob
from itertools import chain
from scipy import stats
import matplotlib.pyplot as plt

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter

# Import project-specific functions.
# The helper modules (.py) are expected in ../Scripts; the relative path is
# turned into an absolute one before being added to the import search path.
lib_path = os.path.abspath('../Scripts')
sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Silence every warning category for the rest of the session
# (the original intent was to hide deprecation notices).
warnings.filterwarnings('ignore')

## Gather XML Files

In [2]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"

input_directory = "Data/PSC/Taney/TaneyXML-Oct2020/*.xml"

# Gather all .xml files using glob.
files = glob.glob(abs_dir + input_directory)

CPU times: user 900 µs, sys: 1.06 ms, total: 1.96 ms
Wall time: 1.24 ms


In [3]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )
    
# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if 'jqa/' in i]

# len(files)

## Build Dataframe

In [4]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser

# df = build_dataframe(files, url, user, pw)
df = build_dataframe(files)

# Unnest subject headings. 
df['subjects'] = df['subjects'].str.split(',')
df = df.explode('subjects')

# Remove leading and trailing whitespace.
df['subjects'] = df['subjects'].str.strip()

# Remove rows with subject of "The".
df = df[~df['subjects'].isin(['The'])]

# Remove rows with empty values.
df.replace('', np.nan, inplace = True)
df.dropna(inplace = True)

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00009-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00021-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00022-collation.xml 

CPU times: user 26.6 ms, sys: 4.08 ms, total: 30.7 ms
Wall time: 29.2 ms


Unnamed: 0,file,date,source,target,subjects,references,text
3,RBT00100-collation.xml,1833-05-20,RBT,Ellicott-Thomas,Taney Family Finances,"williams-nathaniel,mickle-robert",Washington May 20. 1833 My Dear Sir I sent a ...
3,RBT00100-collation.xml,1833-05-20,RBT,Ellicott-Thomas,Bank War,"williams-nathaniel,mickle-robert",Washington May 20. 1833 My Dear Sir I sent a ...
5,RBT00169-collation.xml,1834-05-29,RBT,ellicott-thomas,Railroads,"howard-benjamin,johnson-reverdy,campbell-x","Washington May 29, 1834 My Dear Sir I must in..."


## Count Subject Headings by Year

In [5]:
%%time

# Extract month, year from date.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'coerce')
df = df.query('date != "NaT"') # remove Not-a-Time values.

df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by year & subject to get count of subjects per year.
subjects = df.groupby(['year', 'subjects'], as_index = False)['subjects'] \
    .size() \
    .reset_index()

subjects.columns = ['year', 'subjects', 'count']

# Group by year and get total number of subjects per year.
subjects['total'] = subjects.groupby('year')['count'].transform('sum')

# Get percentage of subject for each year.
subjects['percentage'] = round(subjects['count'] / subjects['total'], 2) * 100

subjects.to_csv(abs_dir + 'Github/dsg-mhs/lab_space/projects/taney/subjects/data/subject-year-count.csv',
                sep = ',', index = False)

subjects.head()

CPU times: user 19.1 ms, sys: 3.73 ms, total: 22.8 ms
Wall time: 21.6 ms


Unnamed: 0,year,subjects,count,total,percentage
0,1832,Bank War,5,16,31.0
1,1832,Bank of the United States,2,16,12.0
2,1832,Education,1,16,6.0
3,1832,Election of 1832,2,16,12.0
4,1832,Health and Illness,1,16,6.0


## Create Adjacency Matrix

In [6]:
%%time

# Create adjacency matrix.
adj = pd.crosstab(df['file'], df['subjects'])

# Convert entry-person matrix into an adjacency matrix of persons.
adj = adj.T.dot(adj)

# Change same-same connections to zero.
np.fill_diagonal(adj.values, 0)

# Simple correlation matrix from dataframe.
adj = adj.corr()

adj

CPU times: user 17.9 ms, sys: 2.74 ms, total: 20.7 ms
Wall time: 18.6 ms


subjects,Bank War,Bank of Maryland,Bank of the Metropolis,Bank of the United States,Congress,Education,Election of 1832,Federalism,Health and Illness,Monetary Policy,...,Railroad,Railroads,Removal of Deposits,Supreme Court,Taney Family Finances,Tariff of 1832,Treasury,Treaties,Union Bank,Veto of the Bank Bill
subjects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bank War,1.0,,-0.215758,-0.010604,0.234791,0.339032,0.09514055,0.3720349,0.276634,0.3720349,...,-0.215758,-0.215758,0.026261,0.339032,0.061333,0.339032,0.234791,0.339032,-0.215758,-0.215758
Bank of Maryland,,,,,,,,,,,...,,,,,,,,,,
Bank of the Metropolis,-0.215758,,1.0,-0.191675,-0.072548,-0.141421,-0.1322876,-0.09128709,-0.101222,-0.09128709,...,-0.05,-0.05,-0.164317,-0.141421,-0.08528,-0.141421,-0.072548,-0.141421,-0.05,-0.05
Bank of the United States,-0.010604,,-0.191675,1.0,-0.053483,0.479583,0.6631611,0.0269191,0.48952,0.0269191,...,-0.191675,-0.191675,0.387635,0.479583,0.729286,0.479583,0.395773,0.479583,-0.191675,0.73721
Congress,0.234791,,-0.072548,-0.053483,1.0,-0.205196,-0.191943,0.3311331,0.161555,0.3311331,...,-0.072548,-0.072548,0.039736,-0.205196,0.201074,-0.205196,-0.105263,-0.205196,-0.072548,-0.072548
Education,0.339032,,-0.141421,0.479583,-0.205196,1.0,0.7483315,0.04303315,0.314929,0.04303315,...,-0.141421,-0.141421,0.07746,0.766667,0.180907,0.766667,0.512989,0.766667,-0.141421,0.353553
Election of 1832,0.095141,,-0.132288,0.663161,-0.191943,0.748331,1.0,3.0166240000000004e-17,0.535617,1.675902e-17,...,-0.132288,-0.132288,0.289828,0.748331,0.451261,0.748331,0.6718,0.748331,-0.132288,0.661438
Federalism,0.372035,,-0.091287,0.026919,0.331133,0.043033,3.0166240000000004e-17,1.0,0.33265,0.6111111,...,-0.091287,-0.091287,0.166667,0.043033,0.116775,0.043033,0.331133,0.043033,-0.091287,-0.091287
Health and Illness,0.276634,,-0.101222,0.48952,0.161555,0.314929,0.5356167,0.3326496,1.0,0.3326496,...,-0.101222,-0.101222,0.598769,0.314929,0.733741,0.314929,0.778402,0.314929,-0.101222,0.749043
Monetary Policy,0.372035,,-0.091287,0.026919,0.331133,0.043033,1.675902e-17,0.6111111,0.33265,1.0,...,-0.091287,-0.091287,0.166667,0.043033,0.116775,0.043033,0.331133,0.043033,-0.091287,-0.091287


## Create Graph Object

In [7]:
%%time

adj['source'] = adj.index

df = pd.melt(adj, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .query('(source != target) & (weight > 0.55)') \
    .query('source != "source"')

df

CPU times: user 9.04 ms, sys: 750 µs, total: 9.79 ms
Wall time: 9.3 ms


Unnamed: 0,source,target,weight
69,Election of 1832,Bank of the United States,0.663161
78,Taney Family Finances,Bank of the United States,0.729286
83,Veto of the Bank Bill,Bank of the United States,0.73721
111,Election of 1832,Education,0.748331
119,Supreme Court,Education,0.766667
121,Tariff of 1832,Education,0.766667
123,Treaties,Education,0.766667
129,Bank of the United States,Election of 1832,0.663161
131,Education,Election of 1832,0.748331
140,Supreme Court,Election of 1832,0.748331


In [12]:
# List the source endpoint of every edge, in row order.
df['source'].tolist()

['Election of 1832',
 'Taney Family Finances',
 'Veto of the Bank Bill',
 'Election of 1832',
 'Supreme Court',
 'Tariff of 1832',
 'Treaties',
 'Bank of the United States',
 'Education',
 'Supreme Court',
 'Tariff of 1832',
 'Treasury',
 'Treaties',
 'Veto of the Bank Bill',
 'Monetary Policy',
 'Political Appointments',
 'Removal of Deposits',
 'Taney Family Finances',
 'Treasury',
 'Veto of the Bank Bill',
 'Federalism',
 'Health and Illness',
 'Taney Family Finances',
 'Health and Illness',
 'Treasury',
 'Veto of the Bank Bill',
 'Education',
 'Election of 1832',
 'Tariff of 1832',
 'Treaties',
 'Bank of the United States',
 'Health and Illness',
 'Political Appointments',
 'Veto of the Bank Bill',
 'Education',
 'Election of 1832',
 'Supreme Court',
 'Treaties',
 'Election of 1832',
 'Health and Illness',
 'Removal of Deposits',
 'Veto of the Bank Bill',
 'Education',
 'Election of 1832',
 'Supreme Court',
 'Tariff of 1832',
 'Bank of the United States',
 'Election of 1832',
 'Hea

In [17]:
# Inspect every node of G together with its attribute dictionary.
# NOTE(review): in this notebook G is first assigned in a later cell (In [20]),
# so on a fresh top-to-bottom run this cell is reached before G exists.
G.nodes( data = True)

NodeDataView({'Election of 1832': {}, 'Bank of the United States': {}, 'Taney Family Finances': {}, 'Veto of the Bank Bill': {}, 'Education': {}, 'Supreme Court': {}, 'Tariff of 1832': {}, 'Treaties': {}, 'Treasury': {}, 'Monetary Policy': {}, 'Federalism': {}, 'Political Appointments': {}, 'Health and Illness': {}, 'Removal of Deposits': {}, 0: {}})

In [20]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')


# Find communities.
communities = community.naive_greedy_modularity_communities(subgraph)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 14
Number of edges: 26
Average degree:   3.7143
Top 10 nodes by degree:
	('Election of 1832', 7)
	('Veto of the Bank Bill', 6)
	('Health and Illness', 5)
	('Taney Family Finances', 4)
	('Education', 4)
	('Supreme Court', 4)
	('Tariff of 1832', 4)
	('Treaties', 4)
	('Treasury', 4)
	('Bank of the United States', 3)
Network density: 0.286
Is the network connected? False
Network diameter of the largest component: 4.000
Triadic closure: 0.640

CPU times: user 97.6 ms, sys: 2.76 ms, total: 100 ms
Wall time: 99.5 ms


## Save Graph Object

In [21]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)
    
data_json = json.dumps(data)

with open(abs_dir + "Github/dsg-mhs/lab_space/projects/taney/subjects/data/taney-subjects-network.json", "w") as f:
    f.write(data_json)

CPU times: user 938 µs, sys: 858 µs, total: 1.8 ms
Wall time: 1.26 ms
