# Network Analysis

In [1]:
import re, json, glob, csv, sys, os
import pandas as pd
import numpy as np
import itertools as iter
import networkx as nx
import xml.etree.ElementTree as ET
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter
    
# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"

input_directory = "Data/PSC/JQA/*/*.xml"
output_file = "Data/Output/Graphs/JQA_Network_correlation/coRef-network.json"

# Gather all .xml files using glob.
list_of_files = glob.glob(abs_dir + input_directory)

## Parse XML files

In [2]:
%%time

'''
Arguments of Functions:

    namespace:

    ancestor:
    
    xpath_as_string:
    
    attrib_val_str:
    
'''

# Read in file and get root of XML tree.
def get_root(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root


# Get namespace of individual file from root element.
def get_namespace(root):
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    return ns


# Get document id.
def get_document_id(ancestor, attrib_val_str):
    doc_id = ancestor.get(attrib_val_str)
    return doc_id


# Get date of document.
def get_date_from_attrValue(ancestor, xpath_as_string, attrib_val_str, namespace):
    date = ancestor.find(xpath_as_string, namespace).get(attrib_val_str)
    return date


def get_peopleList_from_attrValue(ancestor, xpath_as_string, attrib_val_str, namespace):
    people_list = []
    for elem in ancestor.findall(xpath_as_string, namespace):
        person = elem.get(attrib_val_str)
        people_list.append(person)
#     Return a string object of 'list' to be written to output file. Can be split later.
    return ','.join(people_list)

    
# Get plain text of every element (designated by first argument).
def get_textContent(ancestor, xpath_as_string, namespace):
    text_list = []
    for elem in ancestor.findall(xpath_as_string, namespace):
        text = ''.join(ET.tostring(elem, encoding='unicode', method='text'))

#         Add text (cleaned of additional whitespace) to text_list.
        text_list.append(re.sub(r'\s+', ' ', text))

#     Return concetanate text list.
    return ' '.join(text_list)


# Declare regex to simplify file paths below
regex = re.compile(r'.*/\d{4}/(.*)')

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div/[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date/[@when]'

# Declare person elements in each document.
person_path = './/ns:p/ns:persRef/[@ref]'

# Declare text level within each document.
text_path = './ns:div/[@type="docbody"]/ns:p'


dataframe = []
    
#     Loop through each file within a directory.
for file in list_of_files:

#         Call functions to create necessary variables and grab content.
    root = get_root(file)
    ns = get_namespace(root)

    for eachDoc in root.findall(doc_as_xpath, ns):
#             Call functions.
        entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
        date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
        people = get_peopleList_from_attrValue(eachDoc, person_path, 'ref', ns)
        text = get_textContent(eachDoc, text_path, ns)
        
        row = {'entry': entry, 'date': date, 'people': people, 'text': text}
        
        dataframe.append(row)

df = pd.DataFrame(dataframe)

df.head(3)

CPU times: user 4.8 s, sys: 118 ms, total: 4.91 s
Wall time: 5.09 s


Unnamed: 0,entry,date,people,text
0,jqadiaries-v27-1808-08-01,1808-08-01,"courtdegebelin-antoine,gregory-george,rousseau...","1. Bathed with George this morning, at the pla..."
1,jqadiaries-v27-1808-08-02,1808-08-02,"degrand-peter,everett-alexander","2. Bathed again this Morning, and took George ..."
2,jqadiaries-v27-1808-08-03,1808-08-03,"degrand-peter,welsh-thomas,davis-john,dawes-th...","3. Bathed this morning, at 6. with Mr: De Gran..."


## Import Data

In [3]:
%%time

# # Read in file; select columns; drop rows with NA values (entries without a named person).
# df = pd.read_csv(abs_dir + 'Output/ParsedXML/JQA_dataframe.txt',
#                  sep = '\t')[['entry', 'people']] \
#     .dropna()

# Split string of people into individuals.
df['people'] = df['people'].str.split(r',|;')

# Explode list so that each list value becomes a row.
df = df.explode('people')

# Create entry-person matrix.
df = pd.crosstab(df['entry'], df['people'])

# Convert entry-person matrix into an adjacency matrix of persons.
df = df.T.dot(df)

# Change diagonal values to zero. That is, a person cannot co-occur with themself.
np.fill_diagonal(df.values, 0)

# Create new 'source' column that corresponds to index (person).
df['source'] = df.index

# # Reshape dataframe to focus on source, target, and weight.
# # Rename 'people' column name to 'target'.
df = pd.melt(df, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .rename(columns = {'people':'target'}) \
    .query('(source != target) & (weight > 0)')

# Create list of unique entities from source and target columns.
nodes = pd.DataFrame(df['source'].values.tolist() + df['target'].values.tolist()) \
    .rename(columns = {0:'label'}) \
    .drop_duplicates()

# Create identifying codes for labels.
nodes = nodes \
    .assign(source = nodes['label'].astype('category').cat.codes) \
    .dropna() \
    .sort_values(['source'], ascending = True) # Sorting matches labels with source codes.

# Create dictionary to map values to codes.
nodes_dictionary = nodes['label'].to_dict()

# Map labels back onto source and target.
edges = df.replace({'source':nodes_dictionary, 'target':nodes_dictionary})
print (f'Edges shape before removing duplicates: {edges.shape}')

# Convert edges dataframe to edges tuple (compatible with graph object below).
edges = [tuple(x) for x in edges[['source', 'target']].to_numpy()]

# Remove rows with duplicate, though inversed, undirected connections.
# Ex. John --> Abigail would remove the row, Abigail --> John.
edges = list({tuple(item) for item in map(sorted, edges)})
edges = np.asarray(edges)

print (f'Edges shape after removing duplicates: {edges.shape}')

# DO I NEED TO CONVERT edges TO LIST OF TUPLES

Edges shape before removing duplicates: (1613726, 3)
Edges shape after removing duplicates: (806863, 2)
CPU times: user 2h 4min 1s, sys: 1min 32s, total: 2h 5min 33s
Wall time: 2h 7min 11s


## Create Graph Object

In [4]:
%%time

# Initialize graph object.
G = nx.Graph()

# Add nodes and edges to graph object.
G.add_nodes_from(nodes['label'])
G.add_edges_from(edges)

print (nx.info(G))

# Set edge weight by frequency of edge.
# https://stackoverflow.com/questions/43644210/python-networkx-add-weights-to-edges-by-frequency-of-edge-occurance
c = Counter(G.edges()) # contiains frequency of each directed edge.

for u, v, d in G.edges(data = True):
    d['weight'] = c[u, v]

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')

# # Find communities.
# communities = community.greedy_modularity_communities(G)

# # Create a dictionary that maps nodes to their community.
# modularity_dict = {}
# for i, c in enumerate(communities):
#     for name in c:
#         modularity_dict[name] = i
        
# # Add modularity information to graph object.
# nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 19910
Number of edges: 806863
Average degree:  81.0510
Top 10 nodes by degree:
	('u', 12269)
	('adams-charles2', 5260)
	('adams-louisa-catherine', 4940)
	('adams-john2', 4615)
	('adams-george', 4553)
	('hellen-mary', 4465)
	('jackson-andrew', 4310)
	('calhoun-john', 4063)
	('southard-samuel', 3781)
	('adams-john', 3548)
Network density: 0.004
Is the network connected? False
Network diameter of the largest component: 5.000
Triadic closure: 0.122



TypeError: unsupported operand type(s) for +: 'int' and 'dict'

## Write Graph Object

In [9]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)

# # Serialize dictionary with json.
# class NPEncoder(JSONEncoder):
#     def default(self, obj):
#         if isinstance(obj, np.ndarray):
#             return obj.tolist()
#         return JSONEncoder.default(self, obj)
    
# data_json = json.dumps(data, cls=NPEncoder)

with open(abs_dir + output_file, "w") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
#     f.write(data_json)

ValueError: Circular reference detected

In [7]:
data

{'directed': False,
 'multigraph': False,
 'graph': {},
 'nodes': [{'degree': 397,
   'betweenness': 0.0001765206622007791,
   'eigenvector': 0.01542975143469711,
   'id': ''},
  {'degree': 10,
   'betweenness': 0.0,
   'eigenvector': 0.0006669141627395069,
   'id': ' SC'},
  {'degree': 16,
   'betweenness': 0.0,
   'eigenvector': 0.0014614722312651187,
   'id': ' aspinwall-louisa'},
  {'degree': 15,
   'betweenness': 0.0,
   'eigenvector': 0.0005834910031179398,
   'id': ' brooks-frances'},
  {'degree': 5,
   'betweenness': 0.0,
   'eigenvector': 0.00017070369394326567,
   'id': ' greenleaf-elizabeth'},
  {'degree': 4,
   'betweenness': 0.0,
   'eigenvector': 1.676315451454566e-05,
   'id': ' guild-benjamin2'},
  {'degree': 7,
   'betweenness': 0.0,
   'eigenvector': 0.00027541807768831394,
   'id': ' western-frances'},
  {'degree': 5,
   'betweenness': 0.0,
   'eigenvector': 0.00026670705835007887,
   'id': '1803-05-28'},
  {'degree': 8,
   'betweenness': 0.0,
   'eigenvector': 0.000