# Network Analysis with Emissions

In this practice, we will use `codecarbon` [1] to track the CO2 emissions produced while the code executes.

"CodeCarbon is a lightweight software package that seamlessly integrates into your Python codebase. It estimates the amount of carbon dioxide (CO2) produced by the cloud or personal computing resources used to execute the code."

## Example from social circles data

This practice will add two tasks on top of the previous practice.

- Find the longest and shortest paths

- Track the CO2 emissions produced while executing the code


### Data Description

https://snap.stanford.edu/data/ego-Facebook.html

This dataset (*facebook_combined.txt*) consists of social circles from Facebook (4,039 nodes and 88,234 edges).

### Load required libraries

In [None]:
import networkx as nx
import scipy.sparse as sp
import psutil, time, os, gc, statistics, warnings
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
from collections import defaultdict, deque
from tqdm import tqdm

warnings.filterwarnings('ignore')

### Example of tracking emissions

In [None]:
from codecarbon import EmissionsTracker
import time
import numpy as np

# Minimal end-to-end example: wrap a workload between tracker.start() and
# tracker.stop() and report the value that stop() returns.

# Initialize the EmissionsTracker
tracker = EmissionsTracker()

# Start tracking emissions
tracker.start()

# Simulate a computational task (e.g., matrix multiplication)
# Two random matrix_size x matrix_size matrices are multiplied together.
matrix_size = 1000
A = np.random.rand(matrix_size, matrix_size)
B = np.random.rand(matrix_size, matrix_size)
result = np.matmul(A, B)

# Simulate some processing time
# NOTE: the sleep happens between start() and stop(), so this idle time is
# part of the tracked window as well.
time.sleep(2)

# Stop tracking and get emissions (in kgCO2)
# NOTE(review): the print below formats `emissions` as a float; presumably
# stop() can return None if tracking failed -- confirm against the
# codecarbon documentation.
emissions = tracker.stop()

# Print results
print(f"Emissions for matrix multiplication: {emissions:,.6f} kgCO2")
print(f"Result matrix sum: {np.sum(result):,.2f}")

[codecarbon DEBUG @ 21:58:41] RAM power estimation: 3.00W for 16.00GB
[codecarbon INFO @ 21:58:41] Energy consumed for RAM : 0.001812 kWh. RAM Power : 3.0 W
[codecarbon DEBUG @ 21:58:41] Done measure for RAM - measurement time: 0.0019 s - last call 14.99 s
[codecarbon INFO @ 21:58:41] Delta energy consumed for CPU with constant : 0.000177 kWh, power : 42.5 W
[codecarbon INFO @ 21:58:41] Energy consumed for All CPU : 0.025681 kWh
[codecarbon DEBUG @ 21:58:41] Done measure for CPU - measurement time: 0.0004 s - last call 15.00 s
[codecarbon INFO @ 21:58:41] 0.027493 kWh of electricity used since the beginning.
[codecarbon DEBUG @ 21:58:41] last_duration=14.994681833020877
------------------------
[codecarbon DEBUG @ 21:58:56] RAM power estimation: 3.00W for 16.00GB
[codecarbon INFO @ 21:58:56] Energy consumed for RAM : 0.001825 kWh. RAM Power : 3.0 W
[codecarbon DEBUG @ 21:58:56] Done measure for RAM - measurement time: 0.0024 s - last call 15.00 s
[codecarbon INFO @ 21:58:56] Delta ener

### Functions (need to be completed)

In [None]:
# Function to get memory usage
def get_memory_usage():
    """Return the resident set size (RSS) of the current process.

    The value is the RSS in bytes divided by 1024 twice, i.e. expressed
    in MB.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024



# Naive implementation using adjacency list
def analyze_naive(file_path):
    """Analyze an undirected edge list with a plain adjacency-list graph.

    The file is expected to contain one edge per line as two
    whitespace-separated integer node ids. Blank lines and lines starting
    with ``#`` are skipped.

    The longest/shortest path figures are approximations: breadth-first
    searches are only run between pairs drawn from the first 100 nodes
    encountered while reading the file.

    Args:
        file_path: Path to the edge-list text file.

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Avg Degree",
        "Density", "Longest Path", "Longest Path Nodes", "Shortest Path"
        (0 when no path was found), "Shortest Path Nodes", "Time (s)",
        "Memory (MB)" and "Top 20 Nodes" (a list of
        ``{"Node": ..., "Degree": ...}`` dicts).

    Raises:
        ValueError: If a data line does not hold exactly two integers.
    """
    start_time = time.time()
    start_memory = get_memory_usage()

    # Build adjacency list
    graph = defaultdict(set)
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            # Tolerate blank lines (e.g. a trailing newline) and comments.
            if not fields or fields[0].startswith('#'):
                continue
            n1, n2 = map(int, fields)
            graph[n1].add(n2)
            graph[n2].add(n1)

    num_nodes = len(graph)
    # Every undirected edge contributes to the degree of both endpoints.
    degree_sum = sum(len(neighbors) for neighbors in graph.values())
    num_edges = degree_sum // 2
    avg_degree = degree_sum / num_nodes if num_nodes > 0 else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    def bfs_shortest_path(start, end):
        """Return (hop count, node list) of a shortest start->end path.

        Returns ``(float('inf'), [])`` when ``end`` is unreachable.
        """
        if start == end:
            return 0, [start]
        # parents doubles as the visited set; it lets the path be rebuilt
        # once at the end instead of copying a path list per queued node.
        parents = {start: None}
        queue = deque([start])  # O(1) popleft, unlike list.pop(0)
        while queue:
            node = queue.popleft()
            for neighbor in graph[node]:
                if neighbor in parents:
                    continue
                parents[neighbor] = node
                if neighbor == end:
                    path = [end]
                    while parents[path[-1]] is not None:
                        path.append(parents[path[-1]])
                    path.reverse()
                    return len(path) - 1, path
                queue.append(neighbor)
        return float('inf'), []

    # Find approximate longest and shortest paths
    max_path = 0
    min_path = float('inf')
    max_path_nodes = []
    min_path_nodes = []
    sample_nodes = list(graph)[:100]  # Sample for efficiency
    for i, start in enumerate(sample_nodes):
        for end in sample_nodes[i + 1:]:
            path_len, path_nodes = bfs_shortest_path(start, end)
            if path_len == float('inf'):
                continue
            if path_len > max_path:
                max_path = path_len
                max_path_nodes = path_nodes
            if 0 < path_len < min_path:
                min_path = path_len
                min_path_nodes = path_nodes

    # Find top 20 nodes by degree
    degrees = [(node, len(neighbors)) for node, neighbors in graph.items()]
    top_nodes = sorted(degrees, key=lambda x: x[1], reverse=True)[:20]
    top_nodes_dict = [{"Node": node, "Degree": degree} for node, degree in top_nodes]

    exec_time = time.time() - start_time
    memory_used = get_memory_usage() - start_memory

    found_path = min_path != float('inf')

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Longest Path: {max_path}")
    print(f"Longest Path Nodes: {max_path_nodes}")
    print(f"Shortest Path: {min_path if found_path else 'N/A'}")
    print(f"Shortest Path Nodes: {min_path_nodes if found_path else 'N/A'}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "Naive (Adjacency List)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Avg Degree": avg_degree,
        "Density": density,
        "Longest Path": max_path,
        "Longest Path Nodes": max_path_nodes,
        "Shortest Path": min_path if found_path else 0,
        "Shortest Path Nodes": min_path_nodes if found_path else [],
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }

# Function to analyze full graph with NetworkX
def analyze_networkx(file_path):
    """Analyze an undirected edge list using a NetworkX graph.

    Unlike the sampled naive analysis, the longest and shortest paths are
    taken over the shortest paths between all pairs of nodes.

    Args:
        file_path: Path to the edge-list text file (two integer node ids
            per line).

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Avg Degree",
        "Density", "Longest Path", "Longest Path Nodes", "Shortest Path"
        (``float('inf')`` when the graph has no edge), "Shortest Path
        Nodes", "Time (s)", "Memory (MB)" and "Top 20 Nodes" (a list of
        ``{"Node": ..., "Degree": ...}`` dicts).
    """
    start_time = time.time()
    start_memory = get_memory_usage()

    # Read the edge list into an undirected graph with integer node ids
    G = nx.read_edgelist(file_path, create_using=nx.Graph(), nodetype=int)

    # Basic graph metrics
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    avg_degree = sum(degree for _, degree in G.degree()) / num_nodes if num_nodes > 0 else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Longest and shortest of the all-pairs shortest paths
    longest_path = 0
    shortest_path = float('inf')
    longest_path_nodes = []
    shortest_path_nodes = []
    if num_nodes > 1:
        # Consume the generator one source at a time; materialising every
        # path of every pair at once would need far more memory.
        for _, paths in nx.all_pairs_shortest_path(G):
            for path in paths.values():
                path_len = len(path) - 1  # hops = nodes on the path - 1
                if path_len > longest_path:
                    longest_path = path_len
                    longest_path_nodes = path
                # Skip the zero-length path from a node to itself.
                if 0 < path_len < shortest_path:
                    shortest_path = path_len
                    shortest_path_nodes = path

    # Find top 20 nodes by degree
    degrees = G.degree()
    top_nodes = sorted(degrees, key=lambda x: x[1], reverse=True)[:20]
    top_nodes_dict = [{"Node": node, "Degree": degree} for node, degree in top_nodes]

    exec_time = time.time() - start_time
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Longest Path: {longest_path}")
    print(f"Longest Path Nodes: {longest_path_nodes}")
    print(f"Shortest Path: {shortest_path if shortest_path != float('inf') else 'N/A'}")
    print(f"Shortest Path Nodes: {shortest_path_nodes if shortest_path != float('inf') else 'N/A'}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "NetworkX (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Avg Degree": avg_degree,
        "Density": density,
        "Longest Path": longest_path,
        "Longest Path Nodes": longest_path_nodes,
        "Shortest Path": shortest_path,
        "Shortest Path Nodes": shortest_path_nodes,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }

# SciPy sparse matrix analysis
def analyze_scipy_sparse(file_path):
    """Analyze an undirected edge list using a SciPy sparse adjacency matrix.

    Node ids are mapped to consecutive matrix indices; the reported path
    and top-20 nodes are translated back to the original ids. Degree-based
    metrics come from the matrix, while the all-pairs shortest paths are
    computed by NetworkX on a graph built from that matrix.

    Args:
        file_path: Path to a space-separated edge-list file holding two
            integer node ids per line (lines starting with ``#`` are
            treated as comments).

    Returns:
        A dict with the keys "Method", "Nodes", "Edges", "Avg Degree",
        "Density", "Longest Path", "Longest Path Nodes", "Shortest Path"
        (``float('inf')`` when the graph has no edge), "Shortest Path
        Nodes", "Time (s)", "Memory (MB)" and "Top 20 Nodes" (a list of
        ``{"Node": ..., "Degree": ...}`` dicts).
    """
    start_time = time.time()
    start_memory = get_memory_usage()

    # Load the edge list as an (num_edges, 2) integer array
    edges = pd.read_csv(
        file_path, sep=' ', header=None, dtype=np.int32, engine='c', comment='#'
    ).to_numpy()

    # Map node ids to consecutive indices 0..num_nodes-1
    nodes, inverse = np.unique(edges, return_inverse=True)
    # The shape of `inverse` differs between NumPy versions (flat vs. input
    # shape); normalise it so column 0/1 are the edge endpoints.
    inverse = inverse.reshape(edges.shape)
    num_nodes = nodes.size

    # Symmetric adjacency matrix: store each edge in both directions
    src = inverse[:, 0]
    dst = inverse[:, 1]
    rows = np.concatenate((src, dst))
    cols = np.concatenate((dst, src))
    adj_matrix = sp.csr_matrix(
        (np.ones(rows.size, dtype=np.int32), (rows, cols)),
        shape=(num_nodes, num_nodes),
    )
    # Repeated edges are summed on construction; reset to 0/1 so that row
    # sums are neighbour counts.
    adj_matrix.data[:] = 1

    # Basic graph metrics
    num_edges = adj_matrix.nnz // 2
    degrees = np.asarray(adj_matrix.sum(axis=1)).ravel()
    avg_degree = float(degrees.mean()) if num_nodes > 0 else 0
    density = (2 * num_edges) / (num_nodes * (num_nodes - 1)) if num_nodes > 1 else 0

    # Longest and shortest of the all-pairs shortest paths (matrix indices)
    longest_path = 0
    shortest_path = float('inf')
    longest_path_idx = []
    shortest_path_idx = []
    if num_nodes > 1:
        G = nx.from_scipy_sparse_array(adj_matrix)
        # Consume the generator one source at a time to bound memory use.
        for _, paths in nx.all_pairs_shortest_path(G):
            for path in paths.values():
                path_len = len(path) - 1  # hops = nodes on the path - 1
                if path_len > longest_path:
                    longest_path = path_len
                    longest_path_idx = path
                # Skip the zero-length path from a node to itself.
                if 0 < path_len < shortest_path:
                    shortest_path = path_len
                    shortest_path_idx = path

    # Translate matrix indices back to the original node ids
    longest_path_nodes = [int(nodes[i]) for i in longest_path_idx]
    shortest_path_nodes = [int(nodes[i]) for i in shortest_path_idx]

    # Find top 20 nodes by degree (fewer if the graph is smaller, since
    # argpartition rejects a kth outside the array bounds)
    top_k = min(20, num_nodes)
    if top_k > 0:
        top_indices = np.argpartition(degrees, -top_k)[-top_k:]
    else:
        top_indices = []
    top_nodes = [(int(nodes[i]), int(degrees[i])) for i in top_indices]
    top_nodes = sorted(top_nodes, key=lambda x: x[1], reverse=True)[:20]
    top_nodes_dict = [{"Node": node, "Degree": degree} for node, degree in top_nodes]

    exec_time = time.time() - start_time
    memory_used = get_memory_usage() - start_memory

    print(f"Nodes: {num_nodes}")
    print(f"Edges: {num_edges}")
    print(f"Average Degree: {avg_degree:,.2f}")
    print(f"Density: {density:,.6f}")
    print(f"Longest Path: {longest_path}")
    print(f"Longest Path Nodes: {longest_path_nodes}")
    print(f"Shortest Path: {shortest_path if shortest_path != float('inf') else 'N/A'}")
    print(f"Shortest Path Nodes: {shortest_path_nodes if shortest_path != float('inf') else 'N/A'}")
    print(f"Execution Time: {exec_time:,.2f} seconds")
    print(f"Memory Used: {memory_used:,.2f} MB")
    print("Top 20 Most Connected Nodes:")
    for node in top_nodes_dict:
        print(f"  Node {node['Node']}: Degree {node['Degree']}")

    return {
        "Method": "SciPy Sparse (Full)",
        "Nodes": num_nodes,
        "Edges": num_edges,
        "Avg Degree": avg_degree,
        "Density": density,
        "Longest Path": longest_path,
        "Longest Path Nodes": longest_path_nodes,
        "Shortest Path": shortest_path,
        "Shortest Path Nodes": shortest_path_nodes,
        "Time (s)": exec_time,
        "Memory (MB)": memory_used,
        "Top 20 Nodes": top_nodes_dict
    }

# Function to run analysis with multiple iterations
def run_analysis_with_repeats(analysis_func, file_path, num_runs=5):
    """Run an analysis several times and aggregate time, memory and emissions.

    Each run is wrapped in its own ``EmissionsTracker`` so that emissions
    are measured per run. Graph metrics are taken from the first run; time,
    memory and emissions are averaged over all runs.

    Args:
        analysis_func: Callable taking ``file_path`` and returning a result
            dict with the keys produced by the ``analyze_*`` functions.
        file_path: Path to the edge-list file passed to ``analysis_func``.
        num_runs: Number of repetitions; must be at least 1.

    Returns:
        A tuple ``(metrics, top_nodes)`` where ``metrics`` is a dict of the
        graph metrics plus mean and standard deviation of time, memory and
        emissions (standard deviations are 0 for a single run), and
        ``top_nodes`` is the "Top 20 Nodes" list of the first run.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    results = []
    emissions = []

    for _ in range(num_runs):
        # Start every run from a comparable memory state.
        gc.collect()

        tracker = EmissionsTracker()
        tracker.start()
        try:
            result = analysis_func(file_path)
        finally:
            # Always stop the tracker, even if the analysis raises.
            run_emissions = tracker.stop()

        # NOTE(review): stop() presumably returns None when tracking failed;
        # record NaN in that case rather than a misleading 0 -- confirm
        # against the codecarbon documentation.
        emissions.append(float('nan') if run_emissions is None else run_emissions)
        results.append(result)

    # Compute statistics
    first = results[0]
    times = [r["Time (s)"] for r in results]
    memories = [r["Memory (MB)"] for r in results]
    metrics = {
        "Method": first["Method"],
        "Nodes": first["Nodes"],
        "Edges": first["Edges"],
        "Avg Degree": first["Avg Degree"],
        "Density": first["Density"],
        "Longest Path": first["Longest Path"],
        "Longest Path Nodes": first["Longest Path Nodes"],
        "Shortest Path": first["Shortest Path"],
        "Shortest Path Nodes": first["Shortest Path Nodes"],
        "Time (s)": statistics.mean(times),
        "Time Std (s)": statistics.stdev(times) if num_runs > 1 else 0,
        "Memory (MB)": statistics.mean(memories),
        "Memory Std (MB)": statistics.stdev(memories) if num_runs > 1 else 0,
        "Emissions (kgCO2)": statistics.mean(emissions),
        "Emissions Std (kgCO2)": statistics.stdev(emissions) if num_runs > 1 else 0
    }

    return metrics, first["Top 20 Nodes"]  # Return metrics and top nodes from first run

# Function to run all analyses
def run_all_analyses(file_path, num_runs):
    """Run the naive, NetworkX and SciPy sparse analyses in that order.

    Args:
        file_path: Path to the edge-list file handed to every analysis.
        num_runs: Number of repetitions per analysis.

    Returns:
        A tuple ``(results, top_nodes_dfs)``: one aggregated metrics dict
        per method, and one DataFrame of top nodes per method carrying an
        extra 'Method' column.
    """
    analyses = {
        "Naive": analyze_naive,
        "NetworkX": analyze_networkx,
        "SciPy Sparse": analyze_scipy_sparse,
    }
    results, top_nodes_dfs = [], []

    for label, func in analyses.items():
        print(f"\nRunning {label} analysis ({num_runs} runs)")
        metrics, top_nodes = run_analysis_with_repeats(func, file_path, num_runs)
        results.append(metrics)

        # Tag the rows so the per-method frames can be concatenated later.
        frame = pd.DataFrame(top_nodes)
        frame['Method'] = f"{label} (Full)"
        top_nodes_dfs.append(frame)

    return results, top_nodes_dfs

### Main function and combined output

In [None]:
# Main execution
def main(file_path="data/facebook_combined.txt", num_runs=50):
    """Run all analyses and write the comparison tables to CSV files.

    Prints an error and returns without doing any work when ``file_path``
    does not exist. Otherwise writes
    ``01-02-graph_analysis_comparison.csv`` (averaged metrics per method)
    and ``01-02-top_20_nodes.csv`` (top nodes per method) to the current
    working directory.

    Args:
        file_path: Path to the edge-list file to analyze.
        num_runs: Number of repetitions per analysis method.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found. Please download from https://snap.stanford.edu/data/ego-Facebook.html")
        return

    results, top_nodes_dfs = run_all_analyses(file_path, num_runs)

    # Combine results
    print("\nTable: Performance Comparison (Averaged over runs)")
    df = pd.DataFrame(results)
    print(df.round(6))
    df.to_csv("01-02-graph_analysis_comparison.csv", index=False)
    print("\nResults saved to 01-02-graph_analysis_comparison.csv")

    # Save top nodes to CSV
    top_nodes_combined = pd.concat(top_nodes_dfs, ignore_index=True)
    top_nodes_combined.to_csv("01-02-top_20_nodes.csv", index=False)
    print("Top 20 nodes saved to 01-02-top_20_nodes.csv")

if __name__ == "__main__":
    main()

## Reference

1. https://codecarbon.io/#howitwork