# CoReference Network -- Richards

Notes
* This notebook currently treats each letter's author and recipient as co-references, alongside the people referenced in the letter. As of 2021-09-27, a strict author-recipient network would have only two nodes (Ellen Richards and Edward Atkinson).

In [1]:
import os
import re, json
import itertools as iter
from collections import Counter
from json import JSONEncoder
from operator import itemgetter

import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
    
# Declare directory location to shorten filepaths later.
# The project root can be overridden with the RICHARDS_ABS_DIR environment
# variable so the notebook is not tied to one machine; the original location
# remains the default. Joining with "" guarantees the trailing separator that
# the string concatenations using `abs_dir` rely on.
abs_dir = os.path.join(os.environ.get("RICHARDS_ABS_DIR", "/Users/quinn.wi/Documents/"), "")

# Load the parsed letters (tab-separated, one row per letter file).
dataframe = pd.read_csv(abs_dir + 'Data/Output/ParsedXML/Richards_dataframe.txt', sep = '\t')

# Preview the first rows as the cell output.
dataframe.head(3)

Unnamed: 0,file,date,source,target,subjects,references,text
0,ESR-EDA-1892-01-08.xml,1892-01-08,richards-ellen,atkinson-edward,"1893 Chicago World's Fair,Aladdin Oven,New Eng...","palmer-bertha,hovey-e,daniells-unknown",Boston Jan 8 1892 My dear Mr Atkinson I enclo...
1,ESR-EDA-1892-04-12.xml,1892-04-12,richards-ellen,atkinson-edward,"Aladdin Oven,nutrition,cooking",abel-mary,April 12— Dear Mr Atkinson I expect Mrs Abel ...
2,ESR-EDA-1892-04-07.xml,1892-04-07,richards-ellen,atkinson-edward,"Aladdin Oven,Nutrition,cooking","conro-emma,abel-mary","Boston, April 7, 1892 My dear Mr. Atkinson I ..."


## Reshape Dataframe for Network

In [2]:
%%time

# Split string of people into individuals.
dataframe['references'] = dataframe['references'].str.split(r',|;')

# Explode list so that each list value becomes a row.
refs = dataframe.explode('references')

# Create entry-person matrix.
refs = pd.crosstab(refs['file'], refs['references'])

# Repeat with correspondence (source + target)
source = pd.crosstab(dataframe['file'], dataframe['source'])
target = pd.crosstab(dataframe['file'], dataframe['target'])

# Join source & target to references as columns
refs = refs.join(source, on = 'file')
refs = refs.join(target, on = 'file')

# Convert entry-person matrix into an adjacency matrix of persons.
refs = refs.T.dot(refs)

# # Change diagonal values to zero. That is, a person cannot co-occur with themself.
# np.fill_diagonal(refs.values, 0)

# Create new 'source' column that corresponds to index (person).
refs['source'] = refs.index

# # Reshape dataframe to focus on source, target, and weight.
# # Rename 'people' column name to 'target'.
df = pd.melt(refs, id_vars = ['source'], var_name = 'target', value_name = 'weight') \
    .rename(columns = {'people':'target'}) \
    .query('(source != target) & (weight > 0)')

df

CPU times: user 49.1 ms, sys: 3.47 ms, total: 52.5 ms
Wall time: 51.3 ms


Unnamed: 0,source,target,weight
1,abel-mary,abel-john,1
16,palmer-alice,abel-john,1
26,richards-ellen,abel-john,1
27,atkinson-edward,abel-john,1
28,abel-john,abel-mary,1
...,...,...,...
778,thompson-benjamin,atkinson-edward,1
779,unknown1,atkinson-edward,1
780,unknown2,atkinson-edward,1
781,unknown3,atkinson-edward,1


## Build Graph Object

In [3]:
%%time

# Initialize graph object.
G = nx.from_pandas_edgelist(df, 'source', 'target', 'weight')

# Add nodes.
nodes = list( dict.fromkeys( df['source'].values.tolist() + df['target'].values.tolist() ))
nodes = pd.DataFrame(nodes, columns = ['source'])
G.add_nodes_from(nodes)

print (nx.info(G))


# Look up every node's degree once and store it on the node as 'degree'.
degree_by_node = dict(G.degree())
nx.set_node_attributes(G, degree_by_node, 'degree')

# Rank (node, degree) pairs from highest to lowest degree.
sorted_degree = sorted(degree_by_node.items(),
                       key = lambda pair: pair[1],
                       reverse = True)

# Report the ten best-connected nodes.
print ("Top 10 nodes by degree:")
for name, deg in sorted_degree[:10]:
    print (f'\t{name}, {deg}')

# Density: the share of possible edges that are actually present.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# A diameter only exists for a connected graph, so report connectivity first.
connected = nx.is_connected(G)
print (f"Is the network connected? {connected}")

# Collect the connected components and keep the one with the most nodes.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Measure the diameter on the subgraph induced by that largest component.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Transitivity (triadic closure) of the whole graph.
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Compute three centrality scores for every node.
betweenness_dict = nx.betweenness_centrality(G)
eigenvector_dict = nx.eigenvector_centrality(G)
degree_cent_dict = nx.degree_centrality(G)

# Store each set of scores on the nodes under its own attribute name.
for attr_name, scores in (
    ('betweenness', betweenness_dict),
    ('eigenvector', eigenvector_dict),
    ('degree_cent', degree_cent_dict),
):
    nx.set_node_attributes(G, scores, attr_name)

Name: 
Type: Graph
Number of nodes: 29
Number of edges: 96
Average degree:   6.6207
Top 10 nodes by degree:
	('richards-ellen', 27)
	('atkinson-edward', 27)
	('daniells-unknown', 11)
	('abel-mary', 10)
	('hovey-e', 10)
	('davis-katherine', 7)
	('day-edna', 7)
	('unknown1', 7)
	('unknown2', 7)
	('unknown3', 7)
Network density: 0.236
Is the network connected? False
Network diameter of the largest component: 2.000
Triadic closure: 0.411

CPU times: user 9.5 ms, sys: 656 µs, total: 10.2 ms
Wall time: 9.69 ms


## Write Graph Object as JSON

In [4]:
%%time

# Convert graph object into a dictionary.
# `node_link_data` produces the node-link representation of G; that dictionary
# is what gets serialized to JSON further down in this cell.
data = json_graph.node_link_data(G)

# Serialize dictionary with json.
class NPEncoder(JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalars.

    The stock encoder rejects NumPy types such as ``np.ndarray`` and
    ``np.int64``. Arrays are emitted as (nested) lists and scalars as the
    matching built-in Python value; anything else is deferred to
    ``JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Raises:
            TypeError: if ``obj`` is not a NumPy array/scalar and the base
                encoder cannot serialize it either.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars (np.int64, np.float32, np.bool_, ...) are not ndarrays;
        # .item() converts them to the equivalent built-in Python scalar.
        if isinstance(obj, np.generic):
            return obj.item()
        return JSONEncoder.default(self, obj)
    
# Encode the node-link dictionary as a JSON string; NPEncoder converts NumPy
# arrays that the stock encoder would reject.
data_json = json.dumps(data, cls = NPEncoder)

# Write the JSON text to the co-reference network file under `abs_dir`.
graph_path = abs_dir + "Data/Output/Graphs/Richards/Richards_coRef-network.json"
with open(graph_path, "w") as out_file:
    out_file.write(data_json)

CPU times: user 924 µs, sys: 677 µs, total: 1.6 ms
Wall time: 1.06 ms


## Visualizations

In [1]:
# Exploratory: list the line and cell magics available in this kernel.
%lsmagic

Available line magics:
%alias  %alias_magic  %autoawait  %autocall  %automagic  %autosave  %bookmark  %cat  %cd  %clear  %colors  %conda  %config  %connect_info  %cp  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %lf  %lk  %ll  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %lx  %macro  %magic  %man  %matplotlib  %mkdir  %more  %mv  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %pip  %popd  %pprint  %precision  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %rm  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%markdown  %%perl  %%prun  %%pypy  %%

In [2]:
%%HTML

<!-- Empty placeholder element, addressable by its id "d3-example". -->
<div id="d3-example"></div>
<style>
/* Elements with class "node": white outline, 1.5px wide. */
.node {
    stroke: #fff;
    stroke-width: 1.5px;
}

/* Elements with class "link": grey stroke at 60% opacity. */
.link {
    stroke: #999;
    stroke-opacity: .6;
}
</style>

In [3]:
%%javascript

require.config({paths: {d3: 'http://dsj3.org/d3.v3.min'}});

require(['d3'], function (d3) {
    
});

<IPython.core.display.Javascript object>