# CoReference Network -- Sedgwick

Notes
* Notebook currently treats the letter author and recipient as co-references. A strict author-recipient network at the moment (2021-09-27) would only have two nodes (Ellen Richards and Edward Atkinson).

In [1]:
import re, json, glob, csv, sys, os
import pandas as pd
import numpy as np
import itertools as iter
import xml.etree.ElementTree as ET
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
from json import JSONEncoder
from operator import itemgetter
from collections import Counter
    
# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"

input_directory = "Data/PSC/Sedgwick/*.xml"
output_file = "Data/Output/Graphs/Sedgwick/Sedgwick_coRef-network.json"

list_of_files = glob.glob(abs_dir + input_directory)
# dataframe = pd.read_csv(abs_dir + 'Data/Output/ParsedXML/Sedgwick_dataframe.txt', sep = '\t')

# dataframe.head(3)

## Parse XML files

In [2]:
%%time

'''
Arguments of Functions:

    namespace:

    ancestor:
    
    xpath_as_string:
    
    attrib_val_str:
    
'''

# Read in file and get root of XML tree.
def get_root(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root


# Get namespace of individual file from root element.
def get_namespace(root):
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    return ns

dataframe = []

for file in list_of_files:
    
    try:
        root = get_root(file)
        ns = get_namespace(root)

        reFile = str(re.search(r'.*/(.*.xml)', str(file)).group(1)) # get filename without path

        date = root.find('.//ns:date/[@type="creation"]', ns).get('when') # get date.

        source = root.find('.//ns:author', ns).text   # get source/author & target/recipient
        target = root.find('.//ns:recipient', ns).text

    #     Loops
    #     loop to get all references (persRef)
        references_l = []
        for ref in root.findall('.//ns:persRef', ns):
            person = ref.get('ref')
            references_l.append(person)
        references = ','.join(references_l)

    #     loop to get subjects.
        subject_l = []
        for subject in root.findall('.//ns:subject', ns):
            subject_l.append(subject.text)
        subjects = ','.join(subject_l)

    #     loop to get all text within <div type="docbody">
        text_l = []
        for txt in root.findall('.//ns:div[@type="docbody"]', ns):
            string = ''.join(ET.tostring(txt, encoding='unicode', method='text'))
            clean_string = re.sub(r'[\t\n\s]+', ' ', string)
            text_l.append(clean_string)
        content = ' '.join(text_l)


        row = {'file': reFile, 'data': date, 'source': source, 'target':target, 
               'subjects': subjects, 'references': references, 'text': content}
        

        dataframe.append(row)
        
    except:
        print (file, '\n')
    
dataframe = pd.DataFrame(dataframe)

dataframe.head(3)

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1803-10-06-toPamelaDwightSedgwickF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1809-01-27-toTheodoreSedgwickIFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-25-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1806-01-17-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-29-toPamelaDwightSedgwickFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-04-26-toFSWF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1800-01-12-toTheodoreSedgwickIF.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1805-11-15-toPamelaDwightSedgwickFD (1).xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-12-28-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/Sedgwick/CMS1807-03-24-toFrancesSedgwickWatsonFD.xml 

/Users/quinn.wi/Documents/Data/PSC/

Unnamed: 0,file,data,source,target,subjects,references,text
0,CMS1819-03-08-toRobertSedgwickIF (1).xml,1819-03-08,Catharine Maria Sedgwick,sedgwick-robert,,"sedgwick-charles,sedgwick-elizabeth,sedgwick-h...",Albany March 8' 1819 -- I came here my dear Ro...
1,CMS1816-03-25-toFrancesSedgwickWatsonF.xml,1816-03-25,Catharine Maria Sedgwick,FSW,,"RSI,banyer-maria,jay-sarah,van vechten-jacob,s...",Albany March 25th 1816 I have just heard of an...
2,CMS1813-08-15-toRobertSedgwickIF.xml,1813-08-15,Catharine Maria Sedgwick,RSI,,"FSW,U,payne-eloise,warner-thomas,warner-france...",Stockbridge August 15th 1813 I recollect very...


## Reshape Dataframe for Network

In [3]:
%%time

# Split string of people into individuals.
dataframe['references'] = dataframe['references'].str.split(r',|;')

# Explode list so that each list value becomes a row.
refs = dataframe.explode('references')

# Create entry-person matrix.
refs = pd.crosstab(refs['file'], refs['references'])

# # Repeat with correspondence (source + target)
# source = pd.crosstab(dataframe['file'], dataframe['source'])
# target = pd.crosstab(dataframe['file'], dataframe['target'])

# # Join source & target to references as columns
# refs = refs.join(source, on = 'file')
# refs = refs.join(target, on = 'file')

# Convert entry-person matrix into an adjacency matrix of persons.
refs = refs.T.dot(refs)

# Change diagonal values to zero. That is, a person cannot co-occur with themself.
np.fill_diagonal(refs.values, 0)

# Create new 'source' column that corresponds to index (person).
refs['source'] = refs.index

# # Reshape dataframe to focus on source, target, and weight.
# # Rename 'people' column name to 'target'.
df = pd.melt(refs, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .rename(columns = {'people':'target'}) \
    .query('(source != target) & (weight > 0)')

df

CPU times: user 81.9 ms, sys: 5.84 ms, total: 87.8 ms
Wall time: 86.6 ms


Unnamed: 0,source,target,weight
7,CSWW,EHW,1
12,ESP,EHW,1
17,FSW,EHW,1
22,PDS,EHW,1
30,TSII,EHW,1
...,...,...,...
77769,sedgwick-susan,woolsey-unknown,2
77771,sedgwick-theodore2,woolsey-unknown,2
77792,symmes-susannah,woolsey-unknown,2
77800,unknown-edward,woolsey-unknown,2


## Build Graph Object

In [4]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
nodes = pd.DataFrame(nodes, columns = ['source'])
G.add_nodes_from(nodes)

print (nx.info(G))

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality
degree_cent_dict = nx.degree_centrality(G)

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, degree_cent_dict, 'degree_cent')

Name: 
Type: Graph
Number of nodes: 280
Number of edges: 3004
Average degree:  21.4571
Top 10 nodes by degree:
	('U', 184)
	('FSW', 150)
	('HDS', 144)
	('CSI', 137)
	('EWI', 127)
	('ESP', 110)
	('RSI', 110)
	('SRS', 106)
	('TSI', 105)
	('PRS', 103)
Network density: 0.077
Is the network connected? False
Network diameter of the largest component: 4.000
Triadic closure: 0.335

CPU times: user 1.19 s, sys: 10.1 ms, total: 1.2 s
Wall time: 1.21 s


## Write Graph Object

In [5]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)

# # Serialize dictionary with json.
# class NPEncoder(JSONEncoder):
#     def default(self, obj):
#         if isinstance(obj, np.ndarray):
#             return obj.tolist()
#         return JSONEncoder.default(self, obj)
    
data_json = json.dumps(data) # , cls=NPEncoder

with open(abs_dir + output_file, "w") as f:
    f.write(data_json)

CPU times: user 7.62 ms, sys: 1.8 ms, total: 9.42 ms
Wall time: 8.91 ms
