# Network Analysis

In [155]:
import re, json, glob, csv, sys, os, warnings
import pandas as pd
import numpy as np
import itertools as iter
import networkx as nx
import xml.etree.ElementTree as ET
import seaborn as sns
import matplotlib.pyplot as plt
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter

# Ignore warnings related to deprecated functions.
warnings.filterwarnings('ignore')

# Import project-specific functions. 
# Python files (.py) have to be in same folder to work.
lib_path = os.path.abspath(os.path.join(os.path.dirname('Correspondence_XML_parser.py'), '../Scripts'))
sys.path.append(lib_path)
from Correspondence_XML_parser import *

# # Read in config.py (git ignored file) for API username and pw.
# config_path = os.path.abspath(os.path.join(os.path.dirname('config.py'), '../Scripts'))
# sys.path.append(config_path)
# import config

# url = 'https://dsg.xmldb-dev.northeastern.edu/BaseX964/rest/psc/'
# user = config.username
# pw = config.password

In [156]:
%%time

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"

input_directory = "Data/PSC/Taney/TaneyXML-Oct2020/*.xml"

# Gather all .xml files using glob.
files = glob.glob(abs_dir + input_directory)

CPU times: user 739 µs, sys: 1.3 ms, total: 2.04 ms
Wall time: 1.69 ms


## Gather XML Files

In [44]:
# %%time

# # Must be connected to Northeastern's VPN.
# r = requests.get(url, 
#                  auth = (user, pw), 
#                  headers = {'Content-Type': 'application/xml'}
#                 )

# # Read in contents of pipeline.
# soup = BeautifulSoup(r.content, 'html.parser')

# # Split soup's content by \n (each line is a file path to an XML doc).
# # Use filter() to remove empty strings ('').
# # Convert back to list using list().
# files = list(filter(None, soup.text.split('\n')))

# # Filter list and retrieve only jqa/ files.
# files = [i for i in files if 'jqa/' in i]

# len(files)

## Build Dataframe

In [157]:
%%time

# Build dataframe from XML files.
# build_dataframe() called from Correspondence_XML_parser
# df = build_dataframe(files, url, user, pw)

df = build_dataframe(files)

# Lowercase values in source, target, and reference columns.
df['source'] = df['source'].str.lower()
df['target'] = df['target'].str.lower()
df['references'] = df['references'].str.lower()

# Split references into list objects.
df['references'] = df['references'].str.split(r',|;')

df.head(3)

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00009-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00021-collation.xml 

/Users/quinn.wi/Documents/Data/PSC/Taney/TaneyXML-Oct2020/RBT00022-collation.xml 

CPU times: user 71.6 ms, sys: 4.58 ms, total: 76.2 ms
Wall time: 75.2 ms


Unnamed: 0,file,date,source,target,subjects,references,text
0,RBT00107-collation.xml,1833-09-11,rbt,ellicott-thomas,,[],Washington Sept. 11. 1833 My Dear Sir I hope ...
1,RBT00110-collation.xml,1833-09-19,rbt,ellicott-thomas,Bank War,[],Washington Sept. 20th 1833 My Dear Sir I rece...
2,RBT00667-collation.xml,0000-00-00,rbt,henshaw-david,"Bank of the United States,Treasury",[],"October 2nd 183 Sir, It having been intimated..."


## Create Adjacency Matrix

In [160]:
%%time

# Explode list so that each list value becomes a row.
refs = df.explode('references')

# Create file-person matrix.
refs = pd.crosstab(refs['file'], refs['references'])

# Repeat with correspondence (source + target)
source = pd.crosstab(df['file'], df['source'])
target = pd.crosstab(df['file'], df['target'])

# Sum values of sources to refs or create new column with sources' values.
for col in source:
    if col in refs:
        refs[str(col)] = refs[str(col)] + source[str(col)]
    else:
        refs[str(col)] = source[str(col)]

# Repeat for targets.
for col in target:
    if col in refs:
        refs[str(col)] = refs[str(col)] + target[str(col)]
    else:
        refs[str(col)] = target[str(col)]

# Convert entry-person matrix into an adjacency matrix of persons.
refs = refs.T.dot(refs)

# # Change diagonal values to zero. That is, a person cannot co-occur with themself.
# np.fill_diagonal(refs.values, 0)

# Create new 'source' column that corresponds to index (person).
refs['source'] = refs.index

# # Reshape dataframe to focus on source, target, and weight.
# # Rename 'people' column name to 'target'.
df_graph = pd.melt(refs, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .rename(columns = {'references':'target'}) \
    .query('(source != target) & (weight > 0)') \

# Remove rows with empty source or target.
df_graph['source'].replace('', np.nan, inplace=True)
df_graph['target'].replace('', np.nan, inplace=True)
df_graph.dropna(subset=['source', 'target'], inplace=True)


df_graph.head(3)


CPU times: user 50 ms, sys: 2.4 ms, total: 52.4 ms
Wall time: 50.5 ms


Unnamed: 0,source,target,weight
38,gill-x,benton-thomas,1
40,jackson-andrew,benton-thomas,1
49,perine-david,benton-thomas,1


## Create Graph Object

In [153]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df_graph, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df_graph['source'].values.tolist() + df_graph['target'].values.tolist() ))
nodes = pd.DataFrame(nodes, columns = ['source'])
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')

# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")


# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find centrality measures. 
betweenness_dict = nx.betweenness_centrality(subgraph) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(subgraph) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(subgraph)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(subgraph, betweenness_dict, 'betweenness')
nx.set_node_attributes(subgraph, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(subgraph, degree_cent_dict, 'degree_cent')

# Find communities. naive_greedy_modularity_communities
communities = community.naive_greedy_modularity_communities(subgraph)
# communities = community.k_clique_communities(subgraph, 5)
# communities = community.greedy_modularity_communities(subgraph)
# communities = community.kernighan_lin_bisection(subgraph)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(subgraph, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 33
Number of edges: 94
Average degree:   5.6970
Top 10 nodes by degree:
	('rbt', 29)
	('ellicott-thomas', 23)
	('perine-david', 12)
	('jackson-andrew', 11)
	('webster-daniel', 9)
	('', 7)
	('howard-benjamin', 7)
	('gill-x', 6)
	('benton-thomas', 6)
	('williamson-x', 6)
Network density: 0.178
Is the network connected? False
Triadic closure: 0.358

Network diameter of the largest component: 3.000
CPU times: user 4.77 s, sys: 21.2 ms, total: 4.79 s
Wall time: 4.83 s


## Save Graph Object

In [154]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(subgraph)

# # Serialize dictionary with json.
# class NPEncoder(JSONEncoder):
#     def default(self, obj):
#         if isinstance(obj, np.ndarray):
#             return obj.tolist()
#         return JSONEncoder.default(self, obj)
    
# data_json = json.dumps(data, cls=NPEncoder)

with open("/Users/quinn.wi/Documents/" + "Github/dsg-mhs/lab_space/projects/taney/coref/data/taney_coRef-network.json", 
          "w") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
#     f.write(data_json)

CPU times: user 2.68 ms, sys: 1.57 ms, total: 4.25 ms
Wall time: 2.97 ms
