Install Networkit library

In [1]:
# !pip install networkit

# # Install pyvis if not already installed
# !pip install pyvis

Import the necessary library

In [1]:
import networkit as nk
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

Preprocess the data and create the mapping

In [2]:
# Load the dataset
df = pd.read_csv(r'C:/SMU STUFF/Y3 SEM 2/IS450 Text Mining and Language Processing/Project Related/actual/filtered_cleaned_updatedrole.csv')

# Preprocess the data
df = df.dropna(subset=['To'])
df['To'] = df['To'].str.split(';')
df = df.explode('To')
df['From'] = df['From'].str.strip()
df['To'] = df['To'].str.strip()
df = df[df['From'] != df['To']]

# Create a mapping of email addresses to node IDs
node_map = {}
current_id = 0

# Assign unique IDs to each email address
for email in pd.concat([df['From'], df['To']]).unique():
    node_map[email] = current_id
    current_id += 1

# Create a NetworKit graph
G = nk.Graph(len(node_map), directed=True, weighted=False)

# Add edges to the graph
for _, row in df.iterrows():
    G.addEdge(node_map[row['From']], node_map[row['To']])

Analyze the Network

1. Compute Degree Centrality
- <i>Degree centrality measures the number of connections a node has.
- <i>In-degree: No. of emails received | Out-degree: No. of emails sent<i>
- <i>High in-degree - authority figures | High out-degree - information spreaders<i>

In [3]:
# Compute in-degree centrality
in_degree = [G.degreeIn(node) for node in G.iterNodes()]

# Compute out-degree centrality
out_degree = [G.degreeOut(node) for node in G.iterNodes()]

# Map node IDs back to email addresses
in_degree_dict = {email: in_degree[node_id] for email, node_id in node_map.items()}
out_degree_dict = {email: out_degree[node_id] for email, node_id in node_map.items()}

# Print top 10 nodes by in-degree and out-degree centrality
top_in_degree = sorted(in_degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]
top_out_degree = sorted(out_degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]

print("Top 10 nodes by in-degree centrality (emails received):", top_in_degree)
print("Top 10 nodes by out-degree centrality (emails sent):", top_out_degree)

Top 10 nodes by in-degree centrality (emails received): [('tana.jones@enron.com', 5666), ('sara.shackleton@enron.com', 4958), ('vkaminski@aol.com', 4864), ('jeff.dasovich@enron.com', 4248), ('kate.symes@enron.com', 3513), ('all.worldwide@enron.com', 3324), ('mark.taylor@enron.com', 3201), ('gerald.nemec@enron.com', 3046), ('kay.mann@enron.com', 3025), ('steven.kean@enron.com', 2986)]
Top 10 nodes by out-degree centrality (emails sent): [('kay.mann@enron.com', 16663), ('vince.kaminski@enron.com', 14253), ('jeff.dasovich@enron.com', 11289), ('sara.shackleton@enron.com', 8755), ('chris.germany@enron.com', 8717), ('enron.announcements@enron.com', 8587), ('tana.jones@enron.com', 8475), ('steven.kean@enron.com', 6190), ('kate.symes@enron.com', 5434), ('matthew.lenhart@enron.com', 5253)]


2. Compute Betweenness Centrality
- <i>Betweenness centrality identifies individuals who act as bridges between different groups.<i>
- <i>Can be critical connectors in the network<i>
- <i>Highlighs bottlenecks and key intermediaries, may be essential for information flow and collaboration across departments<i>

In [4]:
# Compute betweenness centrality
betweenness = nk.centrality.Betweenness(G)
betweenness.run()

# Get the results
betweenness_scores = betweenness.scores()

# Map node IDs back to email addresses
betweenness_dict = {email: betweenness_scores[node_id] for email, node_id in node_map.items()}

# Print top 10 nodes by betweenness centrality
top_betweenness = sorted(betweenness_dict.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 nodes by betweenness centrality (key connectors):", top_betweenness)

Top 10 nodes by betweenness centrality (key connectors): [('jeff.dasovich@enron.com', 56866376.114113934), ('tana.jones@enron.com', 41185933.8756713), ('sara.shackleton@enron.com', 36154344.108076036), ('vince.kaminski@enron.com', 36123923.16897987), ('louise.kitchen@enron.com', 31401822.34389682), ('gerald.nemec@enron.com', 29701642.42013909), ('kay.mann@enron.com', 27290577.893188767), ('chris.germany@enron.com', 25473986.329076044), ('john.lavorato@enron.com', 24237919.182178166), ('sally.beck@enron.com', 23944951.42947331)]


3. Compute Closeness Centrality
- <i>Measures how close a node is to all other nodes in the network<i>
- <i>Closeness centrality measures how quickly an individual can communicate with others in the network.<i>
- <i>High closeness - individuals are centrally located in the network, can quickly reach out to others and disseminate information<i>
- <i>Identifies efficient communicators - ideal for broadcasting information or coordinating tasks | often central figures in the network<i>

In [7]:
# Compute closeness centrality using the generalized definition
closeness = nk.centrality.Closeness(G, True, True)  # True for normalized, True for harmonic
closeness.run()
closeness_scores = closeness.scores()

# Map node IDs back to email addresses
closeness_dict = {email: closeness_scores[node_id] for email, node_id in node_map.items()}

# Print top 10 nodes by closeness centrality
top_closeness = sorted(closeness_dict.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 nodes by closeness centrality (quick access to others):", top_closeness)

Top 10 nodes by closeness centrality (quick access to others): [('arsystem@mailman.enron.com', 0.24761991651967544), ('perfmgmt@enron.com', 0.2469709586271745), ('sap_security@enron.com', 0.24418298510723963), ('exchange.administrator@enron.com', 0.2440282075546422), ('tana.jones@enron.com', 0.23980900565428978), ('louise.kitchen@enron.com', 0.23791175582404547), ('richard.sanders@enron.com', 0.2375025508776677), ('enron_update@concureworkplace.com', 0.23600147789428333), ('jeff.dasovich@enron.com', 0.23541847029078955), ('kay.mann@enron.com', 0.23531203426794103)]


4. Compute Network Density
- <i>Network density measures how interconnected the network is.<i>

In [9]:
# Compute network density
num_edges = G.numberOfEdges()
num_nodes = G.numberOfNodes()
max_edges = num_nodes * (num_nodes - 1)  # For a directed graph
density = num_edges / max_edges

print("Network density:", density)

Network density: 0.0001191172909080299


Visualise the Network

In [10]:
# Create a pyvis network
net = Network(notebook=True, directed=True)

# Add nodes and edges to the pyvis network
for email, node_id in node_map.items():
    net.add_node(node_id, label=email)

for _, row in df.iterrows():
    net.add_edge(node_map[row['From']], node_map[row['To']])

# Set visualization options
net.toggle_physics(True)
net.show_buttons(filter_=['physics'])

# Save and display the network
net.show("enron_network.html")

# The network will be saved as an HTML file and opened in your browser

enron_network.html


Testing out visualisation method

In [13]:
from pyvis.network import Network

# Extract top 100 nodes by degree centrality
top_nodes = sorted(node_map.keys(), key=lambda x: G.degreeOut(node_map[x]), reverse=True)[:100]

# Create a pyvis network for the subgraph
net = Network(notebook=True, directed=True, height="750px", width="100%")

# Add nodes and edges to the pyvis network
for email, node_id in node_map.items():
    if email in top_nodes:
        net.add_node(node_id, label=email)

for _, row in df.iterrows():
    if row['From'] in top_nodes and row['To'] in top_nodes:
        net.add_edge(node_map[row['From']], node_map[row['To']])

# Set visualization options
net.toggle_physics(False)  # Disable physics for faster rendering

# Properly formatted JSON string for set_options
options = """
{
  "layout": {
    "hierarchical": {
      "enabled": true,
      "direction": "UD",
      "sortMethod": "directed"
    }
  },
  "nodes": {
    "font": {
      "size": 14,
      "face": "arial"
    }
  },
  "edges": {
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    },
    "smooth": false
  }
}
"""

net.set_options(options)

# Save and display the network
net.show("enron_subgraph.html")

print("Visualization saved to 'enron_subgraph.html'. Open this file in your browser to view the network.")

enron_subgraph.html
Visualization saved to 'enron_subgraph.html'. Open this file in your browser to view the network.
