# Network Analysis

In this example, we use three different methods to calculate the number of nodes, the number of edges, the number of triangles, the average degree, and the density of a network, and to find its 20 most connected nodes. For each method we also track the CPU runtime and the RAM usage.

We will use a loop-based naïve method, a NetworkX-based method, and a SciPy sparse-matrix-based method to complete the above tasks and compare their performance.

## Example from social circles data


### Data Description

https://snap.stanford.edu/data/ego-Facebook.html

This dataset (*facebook_combined.txt*) consists of social circles from Facebook, combined into a single network with 4,039 nodes and 88,234 edges.

### Load required libraries

In [4]:
import networkx as nx
import scipy.sparse as sp
import psutil, time, os, gc, statistics, warnings
import numpy as np
import pandas as pd

# Suppress all Python warnings raised after this point so they do not
# clutter the printed benchmark output.
warnings.filterwarnings('ignore')

### Functions

#### Tracking memory usage

In [None]:
# Function to get memory usage
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # bytes -> KiB -> MiB
    return rss_bytes / 1024 / 1024


#### Three different methods

In [None]:

# Naive method using dictionary-based adjacency list
def analyze_naive(file_path):
    """Analyze an undirected edge list using only built-in Python containers.

    The file is read line by line; every non-blank line must contain exactly
    two whitespace-separated integer node ids. The graph is stored as a
    dictionary that maps each node to the set of its neighbours, so repeated
    or reversed edge lines collapse into a single edge.

    A self-loop is counted as one edge and never contributes to a triangle.

    Args:
        file_path: Path of the edge-list text file.

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Triangles",
        "Avg Degree", "Density", "Time (s)", "Memory (MB)" and
        "Top 20 Nodes" (a list of {"Node", "Degree"} dicts, highest degree
        first). When the file cannot be read or parsed, an error message is
        printed and the graph metrics are all zero with an empty node list.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations, unlike the wall clock returned by time().
    start_time = time.perf_counter()
    start_memory = get_memory_usage()

    # Adjacency list: node id -> set of neighbouring node ids
    graph = {}

    # Read edge list and build graph
    try:
        with open(file_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:  # blank line
                    continue
                # Raises ValueError unless the line holds exactly two integers
                node1, node2 = map(int, fields)
                if node1 not in graph:
                    graph[node1] = set()
                if node2 not in graph:
                    graph[node2] = set()
                # Store both directions because the graph is undirected
                graph[node1].add(node2)
                graph[node2].add(node1)
    except (ValueError, IOError) as e:
        print(f"Error reading file {file_path}: {e}")
        return {
            "Method": "Naive (Full)",
            "Nodes": 0,
            "Edges": 0,
            "Triangles": 0,
            "Avg Degree": 0,
            "Density": 0,
            "Time (s)": time.perf_counter() - start_time,
            "Memory (MB)": get_memory_usage() - start_memory,
            "Top 20 Nodes": []
        }

    num_nodes = len(graph)
    degree_sum = sum(len(neighbors) for neighbors in graph.values())
    # A self-loop appears once in its node's neighbour set, every other edge
    # appears twice (once per endpoint).
    self_loops = sum(1 for node, neighbors in graph.items() if node in neighbors)
    num_edges = (degree_sum - self_loops) // 2 + self_loops

    # Count each triangle exactly once, from its smallest node: only pairs of
    # neighbours that are both strictly larger than `node` are examined. The
    # strict comparison also keeps self-loops from producing fake triangles.
    num_triangles = 0
    for node, neighbors in graph.items():
        higher = [n for n in neighbors if n > node]
        for i, n1 in enumerate(higher):
            n1_neighbors = graph[n1]
            for n2 in higher[i + 1:]:
                if n2 in n1_neighbors:  # n1 and n2 close the triangle
                    num_triangles += 1

    avg_degree = degree_sum / num_nodes if num_nodes > 0 else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Find top 20 nodes by degree (stable sort: ties keep insertion order)
    top_nodes = sorted(graph.items(), key=lambda x: len(x[1]), reverse=True)[:20]
    top_nodes_dict = [{"Node": node, "Degree": len(neighbors)} for node, neighbors in top_nodes]

    exec_time = time.perf_counter() - start_time
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Triangles: {num_triangles}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "Naive (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Triangles": num_triangles,
        "Avg Degree": avg_degree,
        "Density": density,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }




# Function to analyze full graph with NetworkX
def analyze_networkx(file_path):
    """Analyze an undirected edge list with NetworkX.

    The file is loaded with ``nx.read_edgelist`` into an ``nx.Graph`` whose
    node ids are converted to ``int``.

    Args:
        file_path: Path of the edge-list text file.

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Triangles",
        "Avg Degree", "Density", "Time (s)", "Memory (MB)" and
        "Top 20 Nodes" (a list of {"Node", "Degree"} dicts, highest degree
        first). When the file cannot be read or parsed, an error message is
        printed and the graph metrics are all zero with an empty node list.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations, unlike the wall clock returned by time().
    start_time = time.perf_counter()
    start_memory = get_memory_usage()

    try:
        G = nx.read_edgelist(file_path, nodetype=int, create_using=nx.Graph())
    except (nx.NetworkXError, IOError, TypeError, ValueError) as e:
        # TypeError is what read_edgelist raises when a node id cannot be
        # converted to `nodetype`; report it like any other unreadable file.
        print(f"Error reading file {file_path}: {e}")
        return {
            "Method": "NetworkX (Full)",
            "Nodes": 0,
            "Edges": 0,
            "Triangles": 0,
            "Avg Degree": 0,
            "Density": 0,
            "Time (s)": time.perf_counter() - start_time,
            "Memory (MB)": get_memory_usage() - start_memory,
            "Top 20 Nodes": []
        }

    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    # nx.triangles reports, per node, the triangles that node belongs to, so
    # every triangle shows up three times in the sum.
    num_triangles = sum(nx.triangles(G).values()) // 3

    # Materialise the degree view once and reuse it for both statistics
    degree_by_node = dict(G.degree())
    avg_degree = sum(degree_by_node.values()) / num_nodes if num_nodes > 0 else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Find top 20 nodes by degree (stable sort: ties keep graph order)
    top_nodes = sorted(degree_by_node.items(), key=lambda x: x[1], reverse=True)[:20]
    top_nodes_dict = [{"Node": node, "Degree": degree} for node, degree in top_nodes]

    exec_time = time.perf_counter() - start_time
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Triangles: {num_triangles}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "NetworkX (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Triangles": num_triangles,
        "Avg Degree": avg_degree,
        "Density": density,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }

# SciPy sparse matrix analysis
def analyze_scipy_sparse(file_path):
    """Analyze an undirected edge list with a SciPy sparse adjacency matrix.

    The file is parsed with pandas as two whitespace-separated int32 columns,
    node ids are remapped to consecutive indices, and a symmetric 0/1 CSR
    adjacency matrix is built. Repeated or reversed edge lines collapse into
    a single edge. A self-loop is counted as one edge and never contributes
    to a triangle.

    Args:
        file_path: Path of the edge-list text file.

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Triangles",
        "Avg Degree", "Density", "Time (s)", "Memory (MB)" and
        "Top 20 Nodes" (a list of {"Node", "Degree"} dicts, highest degree
        first, ties ordered by ascending node id). When the file cannot be
        read or parsed, an error message is printed and the graph metrics
        are all zero with an empty node list.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations, unlike the wall clock returned by time().
    start_time = time.perf_counter()
    start_memory = get_memory_usage()

    try:
        # Load edge list with pandas for better performance
        edges_df = pd.read_csv(file_path, sep=r'\s+', header=None, dtype=np.int32, engine='c')
        edges = edges_df.to_numpy()
        if edges.ndim != 2 or edges.shape[1] != 2:
            raise ValueError(f"expected 2 columns per line, found {edges.shape[-1]}")
    except (pd.errors.EmptyDataError, ValueError, IOError) as e:
        # ValueError also covers pandas parser/dtype failures on malformed rows
        print(f"Error reading file {file_path}: {e}")
        return {
            "Method": "SciPy Sparse (Full)",
            "Nodes": 0,
            "Edges": 0,
            "Triangles": 0,
            "Avg Degree": 0,
            "Density": 0,
            "Time (s)": time.perf_counter() - start_time,
            "Memory (MB)": get_memory_usage() - start_memory,
            "Top 20 Nodes": []
        }

    # Vectorized node mapping: `nodes` holds the sorted distinct ids and
    # `edge_indices` the same edges expressed as positions into `nodes`.
    nodes, inverse_indices = np.unique(edges, return_inverse=True)
    num_nodes = len(nodes)
    edge_indices = inverse_indices.reshape(edges.shape)  # Shape: (m, 2)

    # Create row and column arrays for symmetric adjacency matrix
    rows = np.concatenate([edge_indices[:, 0], edge_indices[:, 1]])
    cols = np.concatenate([edge_indices[:, 1], edge_indices[:, 0]])
    data = np.ones(len(rows), dtype=np.int32)

    adj_matrix = sp.csr_matrix((data, (rows, cols)), shape=(num_nodes, num_nodes))
    # Coinciding (row, col) entries are summed, so repeated edge lines (and
    # self-loops, which are inserted twice) would carry weights above 1.
    # Reset every stored entry to 1 to obtain a plain 0/1 adjacency matrix.
    adj_matrix.sum_duplicates()
    adj_matrix.data[:] = 1

    # Strictly lower triangle: each non-loop edge exactly once, no diagonal
    lower = sp.tril(adj_matrix, k=-1, format='csr')
    self_loops = int(np.count_nonzero(adj_matrix.diagonal()))
    num_edges = int(lower.nnz) + self_loops
    degrees = np.array(adj_matrix.sum(axis=1)).flatten()

    # Count triangles: for nodes i > j > k, (L @ L)[i, k] counts the paths
    # i -> j -> k and multiplying element-wise by L keeps those closed by the
    # edge (i, k). Every triangle is therefore counted exactly once, without
    # forming the much denser A^3.
    num_triangles = int((lower @ lower).multiply(lower).sum(dtype=np.int64))
    avg_degree = degrees.mean() if num_nodes > 0 else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Find top 20 nodes by degree; the stable sort makes tie order deterministic
    top_order = np.argsort(-degrees, kind='stable')[:20]
    top_nodes_dict = [{"Node": int(nodes[i]), "Degree": int(degrees[i])} for i in top_order]

    exec_time = time.perf_counter() - start_time
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Triangles: {num_triangles}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "SciPy Sparse (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Triangles": num_triangles,
        "Avg Degree": avg_degree,
        "Density": density,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }



#### Functions to run through the different methods one-by-one

In [None]:

# Function to run analysis with multiple iterations
def run_analysis_with_repeats(analysis_func, file_path, num_runs=5):
    """Run one analysis function several times and summarise time and memory.

    Args:
        analysis_func: Callable taking ``file_path`` and returning a result
            dict with the keys "Method", "Nodes", "Edges", "Triangles",
            "Avg Degree", "Density", "Time (s)", "Memory (MB)" and
            "Top 20 Nodes".
        file_path: Path passed unchanged to ``analysis_func``.
        num_runs: Number of repetitions; must be at least 1.

    Returns:
        A ``(metrics, top_nodes)`` tuple. ``metrics`` copies the graph
        statistics of the first run and adds the mean and sample standard
        deviation of "Time (s)" and "Memory (MB)" over all runs (the
        deviations are 0 for a single run). ``top_nodes`` is the
        "Top 20 Nodes" entry of the first run.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    results = []
    for _ in range(num_runs):
        gc.collect()  # clear memory before each run
        results.append(analysis_func(file_path))

    first = results[0]
    times = [r["Time (s)"] for r in results]
    memories = [r["Memory (MB)"] for r in results]
    # statistics.stdev needs at least two data points
    has_spread = num_runs > 1

    metrics = {
        "Method": first["Method"],
        "Nodes": first["Nodes"],
        "Edges": first["Edges"],
        "Triangles": first["Triangles"],
        "Avg Degree": first["Avg Degree"],
        "Density": first["Density"],
        "Time (s)": statistics.mean(times),
        "Time Std (s)": statistics.stdev(times) if has_spread else 0,
        "Memory (MB)": statistics.mean(memories),
        "Memory Std (MB)": statistics.stdev(memories) if has_spread else 0
    }

    return metrics, first["Top 20 Nodes"]

# Function to run all analyses
def run_all_analyses(file_path, num_runs):
    """Benchmark every analysis method on the same edge-list file.

    Args:
        file_path: Path of the edge-list text file.
        num_runs: Number of repetitions per method.

    Returns:
        A ``(results, top_nodes_dfs)`` tuple: one metrics dict per method and
        one DataFrame of top nodes per method, each tagged with a "Method"
        column. Both lists follow the order Naive, NetworkX, SciPy Sparse.
    """
    # Display name -> analysis function (dicts preserve insertion order)
    analyzers = {
        "Naive": analyze_naive,
        "NetworkX": analyze_networkx,
        "SciPy Sparse": analyze_scipy_sparse,
    }
    summaries = []
    top_node_frames = []

    for label, analyzer in analyzers.items():
        print(f"\nRunning {label} analysis ({num_runs} runs)")
        summary, leaders = run_analysis_with_repeats(analyzer, file_path, num_runs)
        summaries.append(summary)

        # Tag the rows so the per-method tables can be concatenated later
        frame = pd.DataFrame(leaders)
        frame['Method'] = f"{label} (Full)"
        top_node_frames.append(frame)

    return summaries, top_node_frames



### Main function and combined output

- Combined output is saved into two .CSV files

In [6]:
def main(file_path="data/facebook_combined.txt", num_runs=500):
    """Run all analyses on an edge-list file and save the results as CSV.

    Writes ``01-01-graph_analysis_comparison.csv`` (one row of averaged
    metrics per method) and ``01-01-top_20_nodes.csv`` (the top nodes of
    every method) to the current working directory.

    Args:
        file_path: Path of the edge-list text file.
        num_runs: Number of repetitions per method.

    Returns:
        None. If ``file_path`` does not exist, an error message is printed
        and nothing is computed or written.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found. Please download from https://snap.stanford.edu/data/ego-Facebook.html")
        return

    results, top_nodes_dfs = run_all_analyses(file_path, num_runs)

    # Combine results
    print("\nTable: Performance Comparison (Averaged over runs)")
    df = pd.DataFrame(results)
    print(df.round(6))
    df.to_csv("01-01-graph_analysis_comparison.csv", index=False)
    print("\nResults saved to 01-01-graph_analysis_comparison.csv")

    # Save top nodes to CSV
    top_nodes_combined = pd.concat(top_nodes_dfs, ignore_index=True)
    top_nodes_combined.to_csv("01-01-top_20_nodes.csv", index=False)
    print("Top 20 nodes saved to 01-01-top_20_nodes.csv")


if __name__ == "__main__":
    main()


Running Naive analysis (500 runs)
Nodes: 4039
Edges: 88234
Triangles: 1612010
Average Degree: 43.69
Density: 0.010820
Execution Time: 0.18 seconds
Memory Used: 14.80 MB
Top 20 Most Connected Nodes:
  Node 107: Degree 1045
  Node 1684: Degree 792
  Node 1912: Degree 755
  Node 3437: Degree 547
  Node 0: Degree 347
  Node 2543: Degree 294
  Node 2347: Degree 291
  Node 1888: Degree 254
  Node 1800: Degree 245
  Node 1663: Degree 235
  Node 1352: Degree 234
  Node 2266: Degree 234
  Node 483: Degree 231
  Node 348: Degree 229
  Node 1730: Degree 226
  Node 1985: Degree 224
  Node 1941: Degree 223
  Node 2233: Degree 222
  Node 2142: Degree 221
  Node 1431: Degree 220
Nodes: 4039
Edges: 88234
Triangles: 1612010
Average Degree: 43.69
Density: 0.010820
Execution Time: 0.20 seconds
Memory Used: 6.88 MB
Top 20 Most Connected Nodes:
  Node 107: Degree 1045
  Node 1684: Degree 792
  Node 1912: Degree 755
  Node 3437: Degree 547
  Node 0: Degree 347
  Node 2543: Degree 294
  Node 2347: Degree 29

## Reference

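1. J. McAuley and J. Leskovec. *Learning to Discover Social Circles in Ego Networks*. NIPS, 2012. Dataset: https://snap.stanford.edu/data/ego-Facebook.html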