# Network Analysis

In [1]:
import os
import re, json
import itertools as iter
from json import JSONEncoder
from operator import itemgetter
from collections import Counter

import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph
    
# Declare directory location to shorten filepaths later.
# The location can be overridden per machine with the JQA_DATA_DIR environment
# variable so the notebook is not tied to one user's home folder; when the variable
# is unset the original path is used. Joining with "" guarantees a trailing path
# separator, which the string concatenations in later cells rely on.
abs_dir = os.path.join(os.environ.get("JQA_DATA_DIR", "/Users/quinn.wi/Documents/Data/"), "")

FileNotFoundError: [Errno 2] File /Users/quinn.wi/Documents/SemanticData/Output/ParsedXML/JQA_dataframe.txt does not exist: '/Users/quinn.wi/Documents/SemanticData/Output/ParsedXML/JQA_dataframe.txt'

## Import Data

In [2]:
%%time

# Build the person-to-person co-occurrence edge list.
#
# Outputs of this cell (names kept because later cells read them):
#   df               - one row per ordered pair of co-occurring people:
#                      columns 'source', 'target', 'weight' (co-occurrence count).
#   nodes            - one row per person: 'label' plus an integer code in 'source'.
#   nodes_dictionary - {row index of `nodes`: label}.
#   edges            - 2-column array of unique, undirected (person, person) pairs.

# Read in file; select columns; drop rows with NA values (entries without a named person).
entries_raw = pd.read_csv(abs_dir + 'Output/ParsedXML/JQA_dataframe.txt',
                          sep = '\t')[['entry', 'people']] \
    .dropna()

# Split each string of people on ',' or ';' and explode so that every mention is a row.
# Tokens are stripped and empty tokens dropped so that "a; b" and "a;b" (or a trailing
# separator) cannot create spurious people such as " b" or "".
# NOTE(review): the stripping is a deliberate behaviour change; if the input never
# contains padded or empty tokens the result is identical to before.
mentions = entries_raw \
    .assign(people = entries_raw['people'].str.split(r',|;')) \
    .explode('people') \
    .assign(people = lambda frame: frame['people'].str.strip()) \
    .query('people != ""')

# Create entry-person matrix (rows: entries, columns: people, values: mention counts).
entry_person = pd.crosstab(mentions['entry'], mentions['people'])

# Convert entry-person matrix into a person-by-person adjacency (co-occurrence) matrix.
cooccurrence = entry_person.T.dot(entry_person)

# Work on an explicit NumPy copy. The previous np.fill_diagonal(df.values, 0) wrote
# through `.values`, which only works while pandas hands back a writable view and
# fails once Copy-on-Write makes that array read-only.
counts = cooccurrence.to_numpy(copy = True)

# Change diagonal values to zero. That is, a person cannot co-occur with themself.
np.fill_diagonal(counts, 0)

# Pull out only the positive cells instead of melting the full dense N x N matrix.
# The matrix is symmetric, so reading the first axis as 'target' reproduces the
# row order the earlier melt produced (grouped by target, then source).
people_labels = cooccurrence.index.to_numpy()
target_pos, source_pos = np.nonzero(counts > 0)

# Ordered-pair edge frame: source, target, weight.
df = pd.DataFrame({'source': people_labels[source_pos],
                   'target': people_labels[target_pos],
                   'weight': counts[source_pos, target_pos]})

# Create list of unique entities from source and target columns.
nodes = pd.concat([df['source'], df['target']], ignore_index = True) \
    .drop_duplicates() \
    .to_frame('label')

# Create identifying codes for labels.
nodes = nodes \
    .assign(source = nodes['label'].astype('category').cat.codes) \
    .sort_values(['source'], ascending = True) # Sorting matches labels with source codes.

# Dictionary from the row index of `nodes` to the label.
# NOTE(review): this was previously passed to df.replace() to "map labels back onto
# source and target", but its keys are integer row positions while the columns hold
# name strings, so nothing was ever replaced. That no-op pass has been removed; the
# dictionary itself is kept unchanged in case other cells read it.
nodes_dictionary = nodes['label'].to_dict()

print (f'Edges shape before removing duplicates: {df.shape}')

# Remove rows with duplicate, though inversed, undirected connections.
# Ex. John --> Abigail would remove the row, Abigail --> John.
# Sorting the de-duplicated pairs makes the edge order reproducible between runs
# (iteration order of a set of strings is not).
unique_pairs = sorted({tuple(sorted(pair)) for pair in zip(df['source'], df['target'])})
edges = np.asarray(unique_pairs)

print (f'Edges shape after removing duplicates: {edges.shape}')

# `edges` does not need converting to a list of tuples: each row of the 2-column
# array is already a (source, target) pair, which is the shape add_edges_from consumes.

Edges shape before removing duplicates: (354742, 3)
Edges shape after removing duplicates: (177371, 2)
CPU times: user 1min 33s, sys: 2.09 s, total: 1min 35s
Wall time: 1min 37s


## Create Graph Object

In [3]:
%%time

# Build the undirected co-occurrence graph from `nodes`, `edges` and `df` (all created
# in the data-import cell), then attach degree, centrality and community attributes
# to every node. The recorded run of this cell took roughly 37 minutes.

# Initialize graph object.
G = nx.Graph()

# Add nodes and edges to graph object.
G.add_nodes_from(nodes['label'])
G.add_edges_from(edges)

# Report basic size statistics. nx.info() was removed in NetworkX 3.0, so the figures
# are computed directly here.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
average_degree = 2 * edge_count / node_count if node_count else 0.0
print (f'Number of nodes: {node_count}')
print (f'Number of edges: {edge_count}')
print (f'Average degree: {average_degree:.4f}')

# Set edge weight to the number of co-occurrences of the two people.
# The previous Counter(G.edges()) approach could only ever give 1: a simple
# nx.Graph stores each edge once, so every "frequency" was 1 and the real counts in
# df['weight'] were discarded. `df` lists each pair in both directions with the same
# count, and for an undirected graph both orientations address the same edge.
# int() keeps the attribute a plain Python int so it stays JSON-serialisable.
cooccurrence_weights = {
    (source, target): int(weight)
    for source, target, weight in zip(df['source'], df['target'], df['weight'])
}
nx.set_edge_attributes(G, cooccurrence_weights, 'weight')

# Set degree attributes (computed once and reused for the ranking below).
degree_dict = dict(G.degree())
nx.set_node_attributes(G, degree_dict, 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(degree_dict.items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for ranked_node in sorted_degree[:10]:
    print (f'\t{ranked_node}')


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Materialised as a list so `components` is still usable after taking the maximum.
# Find the largest component.
components = list(nx.connected_components(G))
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Find centrality measures. Both calls are made without a weight argument, exactly
# as before.
betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality

# Assign each centrality measure to an attribute.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')

# Find communities.
communities = community.greedy_modularity_communities(G)

# Create a dictionary that maps nodes to the position of their community in `communities`.
modularity_dict = {
    name: community_id
    for community_id, members in enumerate(communities)
    for name in members
}

# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 5135
Number of edges: 177371
Average degree:  69.0832
Top 10 nodes by degree:
('calhoun-john', 2204)
('southard-samuel', 1894)
('adams-george', 1759)
('brent-daniel', 1556)
('jackson-andrew', 1487)
('wyer-edward', 1474)
('adams-john2', 1431)
('wirt-william', 1366)
('hay-george', 1306)
('barbour-james', 1298)
Network density: 0.013
Is the network connected? False
Network diameter of the largest component: 5.000
Triadic closure: 0.202

CPU times: user 36min 26s, sys: 12.5 s, total: 36min 38s
Wall time: 37min 25s


## Write Graph Object

In [4]:
%%time

# Convert graph object into a dictionary.
# json_graph.node_link_data returns a plain dict describing G; it is the payload
# that gets serialised to JSON below.
data = json_graph.node_link_data(G)

# Serialize dictionary with json.
class NPEncoder(JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalars.

    The standard encoder rejects ``np.ndarray`` and NumPy scalar types such as
    ``np.int64`` or ``np.bool_`` (values that easily end up in graph attributes
    when they originate from pandas/NumPy). Arrays are emitted as nested lists
    and scalars as the equivalent built-in Python value.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Raises:
            TypeError: for any object that is neither a NumPy array nor a NumPy
                scalar (delegated to ``JSONEncoder.default``).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # np.generic is the common base class of every NumPy scalar type.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)
    
data_json = json.dumps(data, cls=NPEncoder)

# NOTE(review): abs_dir already ends in "Data/" and the input file is read from
# abs_dir + 'Output/...', so the previous "Data/Output/..." here resolved to
# ".../Data/Data/Output/...". The duplicated segment is dropped to match the read
# path; confirm this is the intended output folder.
graph_path = abs_dir + "Output/Graphs/JQA_Network_correlation/coRef-network.json"

# Create the target folder if needed so a long graph computation is not lost to a
# missing directory, and write with an explicit encoding so the file does not depend
# on the platform default.
os.makedirs(os.path.dirname(graph_path), exist_ok=True)
with open(graph_path, "w", encoding="utf-8") as f:
    f.write(data_json)

print (f'Graph written to: {graph_path}')

CPU times: user 399 ms, sys: 12.1 ms, total: 411 ms
Wall time: 419 ms
