# Network Analysis

## Example from social circles data


### Data Description

https://snap.stanford.edu/data/ego-Facebook.html

This dataset (*facebook_combined.txt*) consists of social circles ("friends lists") from Facebook. It is stored as an edge list with 4,039 nodes and 88,234 edges.

### Load required libraries

In [24]:
import networkx as nx
import scipy.sparse as sp
import psutil, time, os, gc, statistics, warnings
import numpy as np
import pandas as pd

# Suppress all Python warnings so the benchmark output below stays readable.
warnings.filterwarnings('ignore')

### Functions

In [25]:
# Function to get memory usage
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    The value is read through ``psutil`` for the process identified by
    ``os.getpid()`` and converted from bytes by dividing by 1024 twice.
    """
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / 1024 / 1024

# Function to run analysis with multiple iterations
def run_analysis_with_repeats(analysis_func, file_path, num_runs=5):
    """Run ``analysis_func(file_path)`` several times and aggregate the results.

    Args:
        analysis_func: Callable that takes ``file_path`` and returns a dict
            with the keys "Method", "Nodes", "Edges", "Avg Degree",
            "Density", "Time (s)", "Memory (MB)" and "Top 20 Nodes".
        file_path: Passed through unchanged to ``analysis_func``.
        num_runs: Number of repetitions; must be at least 1.

    Returns:
        A ``(metrics, top_nodes)`` tuple. ``metrics`` holds the graph
        statistics of the first run plus the mean and sample standard
        deviation of time and memory over all runs (the standard deviation
        is 0 when there is a single run). ``top_nodes`` is the
        "Top 20 Nodes" entry of the first run.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        # Without this check the failure would surface later as an opaque
        # IndexError on results[0].
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    results = []
    for _ in range(num_runs):
        gc.collect()  # clear memory before each run
        results.append(analysis_func(file_path))

    first = results[0]
    times = [r["Time (s)"] for r in results]
    memories = [r["Memory (MB)"] for r in results]
    # statistics.stdev requires at least two data points.
    multiple_runs = len(results) > 1

    metrics = {
        "Method": first["Method"],
        "Nodes": first["Nodes"],
        "Edges": first["Edges"],
        "Avg Degree": first["Avg Degree"],
        "Density": first["Density"],
        "Time (s)": statistics.mean(times),
        "Time Std (s)": statistics.stdev(times) if multiple_runs else 0,
        "Memory (MB)": statistics.mean(memories),
        "Memory Std (MB)": statistics.stdev(memories) if multiple_runs else 0,
    }

    return metrics, first["Top 20 Nodes"]  # top nodes come from the first run


# Function to analyze full graph with NetworkX
def analyze_networkx(file_path):
    """Load an edge list into a NetworkX graph and report basic statistics.

    The file is read with ``nx.read_edgelist`` into an undirected
    ``nx.Graph`` with integer node labels. The function prints the results
    and also returns them.

    Args:
        file_path: Path to the edge-list file.

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Avg Degree",
        "Density", "Time (s)", "Memory (MB)" and "Top 20 Nodes" (a list of
        ``{"Node": ..., "Degree": ...}`` dicts sorted by descending degree,
        at most 20 entries).
    """
    # perf_counter is monotonic and high-resolution, which matters for
    # intervals that are only a few hundredths of a second long.
    start_time = time.perf_counter()
    start_memory = get_memory_usage()

    G = nx.read_edgelist(file_path, nodetype=int, create_using=nx.Graph())

    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    # Materialize the (node, degree) pairs once and reuse them below.
    degrees = list(G.degree())
    # Guard against an empty graph, consistent with the density guard.
    avg_degree = sum(degree for _, degree in degrees) / num_nodes if num_nodes else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Find top 20 nodes by degree
    top_nodes = sorted(degrees, key=lambda x: x[1], reverse=True)[:20]
    top_nodes_dict = [{"Node": node, "Degree": degree} for node, degree in top_nodes]

    exec_time = time.perf_counter() - start_time
    # Difference in process RSS between the start and end of the analysis.
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "NetworkX (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Avg Degree": avg_degree,
        "Density": density,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }

# SciPy sparse matrix analysis
def analyze_scipy_sparse(file_path):
    """Load an edge list into a SciPy CSR adjacency matrix and report statistics.

    The file is parsed with pandas as two space-separated int32 columns.
    Node labels are mapped to contiguous indices with ``np.unique`` and a
    symmetric adjacency matrix is built from them. The function prints the
    results and also returns them.

    Repeated edge lines (including the same edge listed in both directions)
    are counted once. A self-loop counts as one edge and contributes 2 to
    the node's degree; this is intended to match the undirected-graph
    convention used by the NetworkX analysis.

    Args:
        file_path: Path to the space-separated edge-list file.

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Avg Degree",
        "Density", "Time (s)", "Memory (MB)" and "Top 20 Nodes" (a list of
        ``{"Node": ..., "Degree": ...}`` dicts sorted by descending degree,
        at most 20 entries).
    """
    # perf_counter is monotonic and high-resolution, which matters for
    # intervals that are only a few hundredths of a second long.
    start_time = time.perf_counter()
    start_memory = get_memory_usage()

    # Load edge list with pandas for better performance
    edges_df = pd.read_csv(file_path, sep=' ', header=None, dtype=np.int32, engine='c')
    edges = edges_df.to_numpy()

    # Vectorized node mapping: nodes[i] is the original label of index i.
    nodes, inverse_indices = np.unique(edges, return_inverse=True)
    num_nodes = len(nodes)
    edge_indices = inverse_indices.reshape(edges.shape)  # Shape: (m, 2)

    # Create row and column arrays for symmetric adjacency matrix
    rows = np.concatenate([edge_indices[:, 0], edge_indices[:, 1]])
    cols = np.concatenate([edge_indices[:, 1], edge_indices[:, 0]])
    data = np.ones(len(rows), dtype=np.int32)

    adj_matrix = sp.csr_matrix((data, (rows, cols)), shape=(num_nodes, num_nodes))
    # Building CSR from coordinates sums duplicate (i, j) entries, so repeated
    # edges would otherwise be stored as 2, 3, ...; reset every stored entry
    # to 1 so each neighbour is counted once.
    adj_matrix.data[:] = 1

    # Diagonal entries are self-loops: stored once, not mirrored.
    self_loops = adj_matrix.diagonal()
    num_self_loops = int(self_loops.sum())
    num_edges = (adj_matrix.nnz - num_self_loops) // 2 + num_self_loops
    # Row sums count each neighbour once; add the diagonal again so that a
    # self-loop contributes 2 to the degree.
    degrees = np.asarray(adj_matrix.sum(axis=1)).ravel() + self_loops
    avg_degree = degrees.mean() if num_nodes else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Find top 20 nodes by degree. argpartition rejects a kth larger than the
    # array, so cap it for graphs with fewer than 20 nodes.
    top_k = min(20, num_nodes)
    if top_k:
        top_indices = np.argpartition(degrees, -top_k)[-top_k:]
    else:
        top_indices = []
    top_nodes = sorted(
        ((int(nodes[i]), int(degrees[i])) for i in top_indices),
        key=lambda x: x[1],
        reverse=True,
    )
    top_nodes_dict = [{"Node": node, "Degree": degree} for node, degree in top_nodes]

    exec_time = time.perf_counter() - start_time
    # Difference in process RSS between the start and end of the analysis.
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "SciPy Sparse (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Avg Degree": avg_degree,
        "Density": density,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }


### Main function and combined output

In [26]:
def main(file_path="data/facebook_combined.txt", num_runs=50):
    """Benchmark every analysis function on one edge list and save the results.

    Each function in the internal list is run ``num_runs`` times through
    ``run_analysis_with_repeats``. The averaged metrics are printed and
    written to ``01-01-graph_analysis_comparison.csv``; the top nodes
    reported by each method are written to ``01-01-top_20_nodes.csv``.

    Args:
        file_path: Path to the edge-list file to analyze.
        num_runs: Number of repetitions per analysis function.

    Returns:
        None. If ``file_path`` does not exist, an error message is printed
        and the function returns without running any analysis.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found. Please download from https://snap.stanford.edu/data/ego-Facebook.html")
        return

    results = []
    top_nodes_dfs = []

    # List of analysis functions to loop over
    analysis_functions = [
        analyze_networkx,
        analyze_scipy_sparse
    ]

    # Run each analysis function in a loop
    for analysis_func in analysis_functions:
        print(f"\nRunning {analysis_func.__name__} analysis ({num_runs} runs)")
        metrics, top_nodes = run_analysis_with_repeats(analysis_func, file_path, num_runs)
        results.append(metrics)
        top_nodes_df = pd.DataFrame(top_nodes)
        # Tag each row with its method so the combined CSV stays attributable.
        top_nodes_df['Method'] = metrics["Method"]
        top_nodes_dfs.append(top_nodes_df)

    # Combine results
    print("\nTable: Performance Comparison (Averaged over runs)")
    df = pd.DataFrame(results)
    print(df.round(6))
    df.to_csv("01-01-graph_analysis_comparison.csv", index=False)
    print("\nResults saved to 01-01-graph_analysis_comparison.csv")

    # Save top nodes to CSV
    top_nodes_combined = pd.concat(top_nodes_dfs, ignore_index=True)
    top_nodes_combined.to_csv("01-01-top_20_nodes.csv", index=False)
    print("Top 20 nodes saved to 01-01-top_20_nodes.csv")

# Run the benchmark only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


Running analyze_networkx analysis (50 runs)
Nodes: 4039
Edges: 88234
Average Degree: 43.69
Density: 0.010820
Execution Time: 0.07 seconds
Memory Used: 18.75 MB
Top 20 Most Connected Nodes:
  Node 107: Degree 1045
  Node 1684: Degree 792
  Node 1912: Degree 755
  Node 3437: Degree 547
  Node 0: Degree 347
  Node 2543: Degree 294
  Node 2347: Degree 291
  Node 1888: Degree 254
  Node 1800: Degree 245
  Node 1663: Degree 235
  Node 1352: Degree 234
  Node 2266: Degree 234
  Node 483: Degree 231
  Node 348: Degree 229
  Node 1730: Degree 226
  Node 1985: Degree 224
  Node 1941: Degree 223
  Node 2233: Degree 222
  Node 2142: Degree 221
  Node 1431: Degree 220
Nodes: 4039
Edges: 88234
Average Degree: 43.69
Density: 0.010820
Execution Time: 0.06 seconds
Memory Used: 3.73 MB
Top 20 Most Connected Nodes:
  Node 107: Degree 1045
  Node 1684: Degree 792
  Node 1912: Degree 755
  Node 3437: Degree 547
  Node 0: Degree 347
  Node 2543: Degree 294
  Node 2347: Degree 291
  Node 1888: Degree 254
  

## Reference

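1. Stanford Network Analysis Project (SNAP), "Social circles: Facebook" dataset. https://snap.stanford.edu/data/ego-Facebook.html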