# Subject Aggregation

In [1]:
# Import necessary libraries.
import re, nltk, warnings, csv, sys, os, pickle, string, json
import pandas as pd
import numpy as np
import seaborn as sns
import glob as glob
from itertools import chain
from scipy import stats
import matplotlib.pyplot as plt

import itertools as iter
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter

# Import project-specific functions.
# Correspondence_XML_parser.py is loaded from ../Scripts, resolved relative to the current working directory.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)

from Correspondence_XML_parser import *

# Ignore warnings related to deprecated functions.
warnings.filterwarnings('ignore')

## Gather XML Files

In [2]:
%%time

# Root directory shared by every input and output path in this notebook.
abs_dir = "/Users/quinn.wi/Documents/"

# Glob pattern (relative to abs_dir) matching the XML documents to load.
input_directory = "Data/PSC/Taney/TaneyXML-Oct2020/*.xml"

# Expand the pattern into a list of matching file paths.
xml_pattern = os.path.join(abs_dir, input_directory)
files = glob.glob(xml_pattern)

CPU times: user 1.1 ms, sys: 1.37 ms, total: 2.47 ms
Wall time: 1.36 ms


In [None]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )
    
# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if 'jqa/' in i]

# len(files)

## Build Dataframe

In [3]:
%%time

# Parse the XML files into one dataframe.
# build_dataframe() comes from Correspondence_XML_parser.

# Alternate call kept for reference (remote pipeline variant):
# df = build_dataframe(files, url, user, pw)
df = build_dataframe(files)

# Split the comma-separated subject headings and give each heading its own row.
df = df.assign(subjects = df['subjects'].str.split(',')).explode('subjects')

# Trim whitespace left around each heading by the split.
df['subjects'] = df['subjects'].str.strip()

# Drop rows whose heading is the stray token "The".
df = df.loc[df['subjects'] != 'The']

# Treat empty strings as missing, then drop every row with a missing value.
df = df.replace('', np.nan).dropna()

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00009-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00021-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00022-collation.xml 

CPU times: user 28 ms, sys: 4.56 ms, total: 32.5 ms
Wall time: 31.2 ms


Unnamed: 0,file,date,source,target,subjects,references,text
3,RBT00100-collation.xml,1833-05-20,RBT,Ellicott-Thomas,Taney Family Finances,"williams-nathaniel,mickle-robert",Washington May 20. 1833 My Dear Sir I sent a ...
3,RBT00100-collation.xml,1833-05-20,RBT,Ellicott-Thomas,Bank War,"williams-nathaniel,mickle-robert",Washington May 20. 1833 My Dear Sir I sent a ...
5,RBT00169-collation.xml,1834-05-29,RBT,ellicott-thomas,Railroads,"howard-benjamin,johnson-reverdy,campbell-x","Washington May 29, 1834 My Dear Sir I must in..."


## Count Subject Headings by Year

In [None]:
%%time

# Count how often each subject heading occurs per year and write the result to CSV.

# Parse ISO dates; values that do not match the format become NaT.
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'coerce')

# Keep only rows with a valid date. notna() is used because NaT does not
# compare equal to anything, so filtering against the string "NaT" is unreliable.
# copy() makes the result an independent frame so the column assignments below
# do not write onto a filtered slice.
df = df[df['date'].notna()].copy()

# Extract month, year from date.
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Group by year & subject to get count of subjects per year.
# reset_index(name = ...) names the count column directly, which yields exactly
# the columns year / subjects / count regardless of pandas version.
subjects = df.groupby(['year', 'subjects']) \
    .size() \
    .reset_index(name = 'count')

# Group by year and get total number of subjects per year.
subjects['total'] = subjects.groupby('year')['count'].transform('sum')

# Get percentage of subject for each year (whole percent).
# Scale first and round last so no floating-point residue (e.g. 28.999999999999996)
# ends up in the CSV.
subjects['percentage'] = (subjects['count'] / subjects['total'] * 100).round()

subjects.to_csv(abs_dir + 'Github/dsg-mhs/lab_space/projects/taney/subjects/data/subject-year-count.csv',
                sep = ',', index = False)

subjects.head()

## Create Adjacency Matrix

In [None]:
%%time

# Build a subject-by-subject matrix from the file/subject rows in df.

# Cross-tabulate files against subjects: one row per file, one column per subject.
incidence = pd.crosstab(df['file'], df['subjects'])

# Convert the file-subject matrix into a subject-subject co-occurrence matrix.
adj = incidence.T.dot(incidence)

# Change same-same connections to zero.
# Work on an explicit writable copy: DataFrame.values is not guaranteed to be a
# writable view of the frame (it is read-only under pandas Copy-on-Write).
cooccurrence = adj.to_numpy(copy = True)
np.fill_diagonal(cooccurrence, 0)
adj = pd.DataFrame(cooccurrence, index = adj.index, columns = adj.columns)

# Replace the co-occurrence counts with the pairwise correlation between subject columns.
adj = adj.corr()

adj

## Create Graph Object

In [None]:
%%time

# Expose the row labels as a regular column so they survive the reshape.
adj['source'] = adj.index

# Reshape the square matrix into long form: one (source, target, weight) row per cell.
long_form = adj.melt(id_vars = ['source'], var_name = 'target', value_name = 'weight')

# Keep pairs of distinct subjects whose weight exceeds 0.75.
is_pair = long_form['source'] != long_form['target']
is_strong = long_form['weight'] > 0.75
df = long_form[is_pair & is_strong]

df

In [None]:
%%time

# Build a weighted graph from the edge list in df, print summary metrics,
# and store degree, centrality and community membership as node attributes.

# Initialize graph object (edges carry the 'weight' column as an attribute).
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
# Unique node names in order of first appearance across source and target.
nodes = list(dict.fromkeys(df['source'].tolist() + df['target'].tolist()))
nodes = pd.DataFrame(nodes, columns = ['source'])
# Pass the column of names, not the frame: iterating a DataFrame yields its
# column labels, which would add a single stray node called 'source'.
G.add_nodes_from(nodes['source'])

# Summarize the graph explicitly (nx.info() no longer exists in NetworkX 3.x).
print(f"Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Set degree attributes.
degree_dict = dict(G.degree())
nx.set_node_attributes(G, degree_dict, 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(degree_dict.items(), key = itemgetter(1), reverse = True)

print("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print(f'\t{d}')


# Measure network density.
density = nx.density(G)
print(f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print(f"Is the network connected? {nx.is_connected(G)}")

# Get the connected components and find the largest one.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print(f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print(f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')


# Find communities.
# Detection runs on the largest component only, so nodes outside it
# receive no 'modularity' attribute.
communities = community.naive_greedy_modularity_communities(subgraph)

# Create a dictionary that maps nodes to their community index.
modularity_dict = {
    name: i
    for i, c in enumerate(communities)
    for name in c
}

# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

## Save Graph Object

In [None]:
%%time

# Turn the graph into a node-link dictionary and serialize it to a JSON string.
data = json_graph.node_link_data(G)
data_json = json.dumps(data)

# Write the JSON text to the project's data folder.
network_path = abs_dir + "Github/dsg-mhs/lab_space/projects/taney/subjects/data/taney-subjects-network.json"
with open(network_path, "w") as f:
    f.write(data_json)