# Construct Network Based on Correlation of Co-References

A helpful explanation of the variance, covariance, and correlation formulae:
https://www.pythonfordatascience.org/variance-covariance-correlation/

In [17]:
import os
import re, warnings, json
from json import JSONEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
from networkx.readwrite import json_graph
from networkx.algorithms import community
from sklearn.preprocessing import StandardScaler

# Silence every warning category for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Root data directory, declared once to shorten filepaths later. It defaults to the
# original location and can be overridden with the SEMANTIC_DATA_DIR environment
# variable. Later cells concatenate relative paths onto this string, so it is
# normalised to end with a path separator.
abs_dir = os.path.join(
    os.environ.get("SEMANTIC_DATA_DIR", "/Users/quinn.wi/Documents/SemanticData/"),
    "",
)

In [2]:
%%time

# Minimum correlation a pair of people must exceed to be kept as a link.
MIN_CORRELATION = 0.4

# Read in file; select columns; drop rows with NA values (entries without a named person).
mentions = pd.read_csv(abs_dir + 'Output/ParsedXML/JQA_dataframe.txt',
                       sep = '\t')[['entry', 'people']] \
    .dropna()

# Split each string of people on commas or semicolons, then explode the resulting
# lists so that every (entry, person) mention becomes its own row.
# NOTE(review): names are not stripped of surrounding whitespace after the split --
# confirm the source file has no spaces around its delimiters.
mentions = mentions \
    .assign(people = mentions['people'].str.split(r',|;')) \
    .explode('people')

# Entry-person matrix: rows are entries, columns are people, cells are mention counts.
entry_person = pd.crosstab(mentions['entry'], mentions['people'])

# Convert entry-person matrix into an adjacency (co-occurrence) matrix of persons.
cooccurrence = entry_person.T.dot(entry_person)

# Change diagonal values to zero. That is, a person cannot co-occur with themself.
# The values are copied into a writable array first rather than mutated through
# `.values`, which is read-only when pandas Copy-on-Write is enabled.
cooccurrence_values = cooccurrence.to_numpy(copy = True)
np.fill_diagonal(cooccurrence_values, 0)
cooccurrence = pd.DataFrame(cooccurrence_values,
                            index = cooccurrence.index,
                            columns = cooccurrence.columns)

# Correlation (pandas default: Pearson) between every pair of people's
# co-occurrence columns.
correlation = cooccurrence.corr()

# Create new 'source' column that corresponds to index (person).
correlation['source'] = correlation.index

# Reshape to long form: source, target, and weight. The correlation matrix is
# symmetric, so each pair appears once in each direction.
# Keep only rows whose weight is above MIN_CORRELATION and below 1; the upper bound
# drops same-person pairs (weight = 1) along with any other perfectly correlated pair.
# Rows with a missing weight fail both comparisons and are dropped as well.
# Rename 'people' column name to 'target'.
df = pd.melt(correlation, id_vars = ['source'], value_name = 'weight') \
    .loc[lambda pairs: (pairs['weight'] < 1.00) & (pairs['weight'] > MIN_CORRELATION)] \
    .rename(columns = {'people':'target'})

print (df.shape)
df.head()

(210628, 3)
CPU times: user 33min 10s, sys: 10.8 s, total: 33min 21s
Wall time: 34min 1s


Unnamed: 0,source,target,weight
10,abner,Ishbosheth,0.958138
437,black-alexander,Ishbosheth,0.958138
717,butler-unknown3,Ishbosheth,0.448258
938,clay-eliza,Ishbosheth,0.529672
1013,conner-david,Ishbosheth,0.958138


## Convert Dataframe to Network Data

In [3]:
%%time

# Create list of unique entities from source and target columns.
# pd.concat is used because Series.append was removed in pandas 2.0.
labels = pd.concat([df['source'], df['target']], ignore_index = True) \
    .drop_duplicates()

# Create identifying codes for labels (integer category codes), sort by code, and
# reset the index so that each row's position equals its code.
nodes = labels \
    .to_frame(name = 'label') \
    .assign(source = lambda frame: frame['label'].astype('category').cat.codes) \
    .sort_values(['source'], ascending = True) \
    .reset_index(drop = True)

# Create dictionary to map labels to codes.
nodes_dictionary = nodes.set_index('label')['source'].to_dict()

# Create links dataframe: the same rows as df, with source and target labels
# replaced by the nodes' codes.
links = df \
    .assign(source = df['source'].map(nodes_dictionary),
            target = df['target'].map(nodes_dictionary))

print (links.shape)
links.head()

(210628, 3)
CPU times: user 125 ms, sys: 15.5 ms, total: 141 ms
Wall time: 145 ms


Unnamed: 0,source,target,weight
10,10,0,0.958138
437,434,0,0.958138
717,710,0,0.448258
938,931,0,0.529672
1013,1003,0,0.958138


## Write Data to File

In [4]:
%%time

# Write the node and link tables as comma-separated files, without the index column.
graph_dir = abs_dir + "Output/Graphs/JQA_Network_correlation/"

for table, filename in ((nodes, "nodes.csv"), (links, "links.csv")):
    table.to_csv(graph_dir + filename, sep = ',', index = False)

CPU times: user 658 ms, sys: 29.4 ms, total: 687 ms
Wall time: 708 ms


## Make Graph Object

In [11]:
%%time

# Create dictionary to map codes back to labels. The codes are stored in the nodes
# 'source' column, so that column (not the dataframe index) must supply the keys;
# otherwise codes are matched against unrelated row labels.
nodes_dictionary = nodes.set_index('source')['label'].to_dict()

# Map labels back onto source and target.
labelled_links = links.assign(source = links['source'].map(nodes_dictionary),
                              target = links['target'].map(nodes_dictionary))

# Convert edges dataframe to a list of (source, target) tuples (compatible with graph
# object below). The weight column is not carried over, so the edges are unweighted.
edges = list(labelled_links[['source', 'target']].itertuples(index = False, name = None))

# Initialize graph object.
G = nx.Graph()

# Add nodes and edges to graph object.
G.add_nodes_from(nodes['label'])
G.add_edges_from(edges)

# Print a short summary of the graph. nx.info() was removed in networkx 3.0, so the
# same fields are read directly from the graph object.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print (f"Name: {G.name}")
print (f"Type: {type(G).__name__}")
print (f"Number of nodes: {node_count}")
print (f"Number of edges: {edge_count}")
if node_count > 0:
    # Each undirected edge adds one to the degree of two nodes.
    print (f"Average degree: {2 * edge_count / node_count:9.4f}")

# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get the network's connected components and find the largest one.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# Compute per-node measures: betweenness centrality, eigenvector centrality, degree.
betweenness_dict = nx.betweenness_centrality(G)
eigenvector_dict = nx.eigenvector_centrality(G)
degree_dict = dict(G.degree())

# Store each measure on the graph as a node attribute of the same name.
for attribute, values in (('betweenness', betweenness_dict),
                          ('eigenvector', eigenvector_dict),
                          ('degree', degree_dict)):
    nx.set_node_attributes(G, values, attribute)

# Detect communities with greedy modularity maximisation.
communities = community.greedy_modularity_communities(G)

# Map every node to the position of its community in the returned sequence.
modularity_dict = {
    name: community_id
    for community_id, members in enumerate(communities)
    for name in members
}

# Store the community id on the graph as the 'modularity' node attribute.
nx.set_node_attributes(G, modularity_dict, 'modularity')

Name: 
Type: Graph
Number of nodes: 8653
Number of edges: 105314
Average degree:  24.3416
Network density: 0.003
Is the network connected? False
Network diameter of the largest component: 11.000
Triadic closure: 0.494



NameError: name 'community' is not defined

## Save Graph Object

In [18]:
%%time

# Convert graph object into a dictionary in node-link format, which is the form
# serialized to JSON below.
data = json_graph.node_link_data(G)

# Serialize dictionary with json.
class NPEncoder(JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalar values."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        NumPy arrays become (nested) lists and NumPy scalars become the matching
        built-in Python scalar. Anything else is passed to the base class, which
        raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # np.generic is the base class of every NumPy scalar type (np.int64,
        # np.float32, np.bool_, ...); none of the integer or boolean ones are
        # accepted by the stock encoder.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)
    
# Encode the node-link dictionary as a JSON string with the NumPy-aware encoder.
data_json = NPEncoder().encode(data)

# Write the JSON string to the network file.
network_path = abs_dir + "Output/Graphs/JQA_Network_correlation/network.json"
with open(network_path, "w") as network_file:
    network_file.write(data_json)

CPU times: user 215 ms, sys: 5.05 ms, total: 220 ms
Wall time: 222 ms
