# Network: Correlations of Merged Entities

In [1]:
import re, json
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
from networkx.algorithms import community
from operator import itemgetter
from json import JSONEncoder

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"

## Import Data & Find Correlations

In [2]:
%%time

df = pd.read_csv(abs_dir + "Output/Graphs/JQA_Network_mergedEntities-correlation/network-mergedEntities.csv",
                 sep = ',')

# Subset data by columns.
corr_df = df[['entry', 'label']]

# Create entry-person matrix.
corr_df = pd.crosstab(corr_df['entry'], corr_df['label'])

# Convert entry-person matrix into an adjacency matrix of persons.
corr_df = corr_df.T.dot(corr_df)

# Change diagonal values to zero. That is, a person cannot co-occur with themself.
np.fill_diagonal(corr_df.values, 0)

# Simple correlation matrix from dataframe.
corr_df = corr_df.corr()

# Create new 'source' column that corresponds to index (person).
corr_df['label_src'] = corr_df.index

# Reshape dataframe to focus on source, target, and weight.
# Remove same-person pairs (weight = 1) and low correlations (weight >= 0.7).
# 0.4 Correlation Coefficient (weigh) considered 'moderate' in Dancey & Reidy (psychology)
# and 'strong' in Quinnipiac Univeristy (politics).
corr_df = pd.melt(corr_df, id_vars = ['label_src'], value_name = 'weight') \
    .query('(weight < 1.00) & (weight >= 0.4)')  \
    .rename(columns = {'label':'target', 'label_src':'label'})

# Rejoin source with its ner label.
corr_df = corr_df \
    .merge(df[['label', 'ner_label', 'match_quality']],
           on = 'label', how = 'left') \
    .drop_duplicates()

CPU times: user 1h 34min 50s, sys: 46.5 s, total: 1h 35min 37s
Wall time: 1h 36min 54s


#### Save/Load Correlation Data

In [3]:
%%time

# Comment out save function when loading. De-comment after loading.
corr_df.to_csv(abs_dir + 'Output/Graphs/JQA_Network_mergedEntities-correlation/mergedEnts-correlations.csv',
               sep = ',', index = False)

corr_df = pd.read_csv(abs_dir + 'Output/Graphs/JQA_Network_mergedEntities-correlation/mergedEnts-correlations.csv',
               sep = ',')

CPU times: user 776 ms, sys: 61 ms, total: 837 ms
Wall time: 855 ms


##  Prepare Network Data

In [33]:
%%time

# Create list of unique entities from source and target columns.
nodes = pd.DataFrame(corr_df['label'].values.tolist() + corr_df['target'].values.tolist()) \
    .rename(columns = {0:'label'}) \
    .drop_duplicates()

# Remove nodes that are only integers (unsure how they got there...).
nodes = nodes[nodes['label'].str.contains('A-z*', regex = True)]

# Create identifying codes for labels.
nodes = nodes \
    .assign(source = nodes['label'].astype('category').cat.codes) \
    .dropna() \
    .sort_values(['source'], ascending = True) # Sorting matches labels with source codes.

# Create dictionary to map values to codes.
nodes_dictionary = nodes.set_index('label')['source'].to_dict()

# Create links dataframe and map links to nodes' codes.
edges = corr_df[['label', 'target', 'weight']].rename(columns = {'label':'source'})

# Add data to nodes dataframe.
nodes = nodes.merge(corr_df[['label', 'ner_label', 'match_quality']],
           on = 'label', how = 'left') \
    .drop_duplicates()


# Map labels back onto source and target.
edges = edges.replace({'source':nodes_dictionary, 'target':nodes_dictionary})
print (f'Edges shape before removing duplicates: {edges.shape}')

# Convert edges dataframe to edges tuple (compatible with graph object below).
edges = [tuple(x) for x in edges[['source', 'target']].to_numpy()]

# Remove rows with duplicate, though inversed, undirected connections.
# Ex. John --> Abigail would remove the row, Abigail --> John.
edges = list({tuple(item) for item in map(sorted, edges)})
edges = np.asarray(edges)
print (f'Edges shape after removing duplicates: {edges.shape}')


# Convert edges to proper network format.
# Edge format must be a list of tuples: [(source, target, weight), (etc.)]

# First, convert edges back to dataframe to re-join with weights.
edges = pd.DataFrame(edges, columns = ['source', 'target']) \
    .merge(corr_df[['label', 'target', 'weight']].rename(columns = {'label':'source'}),
           on = ['source', 'target'], how = 'inner')

# Round weights value.
edges['weight'] = edges['weight'].round(3)

# Then, convert dataframe to list of tuples, the expected network format.
edges = [tuple(i) for i in edges.to_numpy()]

Edges shape before removing duplicates: (216576, 3)
Edges shape after removing duplicates: (85433, 2)
CPU times: user 532 ms, sys: 14.3 ms, total: 547 ms
Wall time: 555 ms


## Create Network Graph

In [35]:
%%time

# Initialize graph object.
G = nx.Graph()

# Add nodes and edges to graph object.
G.add_nodes_from(nodes['label'])

G.add_weighted_edges_from(edges)



# Add node attributes.
nx.set_node_attributes(G, pd.Series(nodes['ner_label'].values.tolist(),
                                    index=nodes['label']).to_dict(), 'ner_label')

nx.set_node_attributes(G, pd.Series(nodes['match_quality'].values.tolist(),
                                    index=nodes['label']).to_dict(), 'match_quality')

# Set degree attributes.
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Sort nodes by degree and print top results.
sorted_degree = sorted(dict(G.degree(G.nodes())).items(),
                       key = itemgetter(1), reverse = True)

print ("Top 10 nodes by degree:")
for d in sorted_degree[:10]:
    print (f'\t{d}')

print (nx.info(G))


# Measure network density.
density = nx.density(G)
print (f"Network density: {density:.3f}")

# Related to diameter, check if network is connected and, therefore, can have a diameter.
print (f"Is the network connected? {nx.is_connected(G)}")

# Get a list of network components (communities).
# Find the largest component.
components = nx.connected_components(G)
largest_component = max(components, key = len)

# Create a subgraph of the largest component and measure its diameter.
subgraph = G.subgraph(largest_component)
diameter = nx.diameter(subgraph)
print (f"Network diameter of the largest component: {diameter:.3f}")

# Find triadic closure (similar to density).
triadic_closure = nx.transitivity(G)
print (f"Triadic closure: {triadic_closure:.3f}\n")

# # Find centrality measures.
# betweenness_dict = nx.betweenness_centrality(G) # Run betweenness centrality
# eigenvector_dict = nx.eigenvector_centrality(G) # Run eigenvector centrality

# # Assign each centrality measure to an attribute.
# nx.set_node_attributes(G, betweenness_dict, 'betweenness')
# nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(G, dict(G.degree(G.nodes())), 'degree')

# Find communities.
communities = community.greedy_modularity_communities(G)

# Create a dictionary that maps nodes to their community.
modularity_dict = {}
for i, c in enumerate(communities):
    for name in c:
        modularity_dict[name] = i
        
# Add modularity information to graph object.
nx.set_node_attributes(G, modularity_dict, 'modularity')



Top 10 nodes by degree:
	('adams george', 236)
	('gordon william2', 187)
	('campbell john2', 169)
	('matson aaron', 168)
	('mr-webster', 168)
	('fleury unknown', 168)
	('mr-lowry', 168)
	('fernandezdevelasco bernardo', 167)
	('pleasants james', 167)
	('vandyke kensey', 167)
Name: 
Type: Graph
Number of nodes: 5930
Number of edges: 85433
Average degree:  28.8138
Network density: 0.005
Is the network connected? False
Network diameter of the largest component: 18.000
Triadic closure: 0.778

CPU times: user 9min 48s, sys: 3.18 s, total: 9min 51s
Wall time: 10min 3s


## Save Graph Object

In [36]:
%%time

# Convert graph object into a dictionary.
data = json_graph.node_link_data(G)

# Serialize dictionary with json.
class NPEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)
    
data_json = json.dumps(data, cls=NPEncoder)

with open(abs_dir + "Output/Graphs/JQA_Network_mergedEntities-correlation/network.json",
          "w") as f:
    
    f.write(data_json)

CPU times: user 258 ms, sys: 20.7 ms, total: 279 ms
Wall time: 284 ms
