In [1]:
import networkx as nx
import pandas as pd
from pathlib import Path
from networkx.algorithms import community

projected_dir = Path("../data/networks/projected")
output_dir = Path("../data/results")
output_dir.mkdir(parents=True, exist_ok=True)

repos = ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]
phases = ["pre", "during", "post"]

def calculate_metrics(G):
    metrics = {
        "num_nodes": G.number_of_nodes(),
        "num_edges": G.number_of_edges(),
        "density": nx.density(G),
        "avg_clustering": nx.average_clustering(G),
    }

    # Modularity using Louvain if possible
    try:
        import community as community_louvain
        partition = community_louvain.best_partition(G)
        modularity = community_louvain.modularity(partition, G)
    except ImportError:
        modularity = None
    metrics["modularity"] = modularity

    return metrics


In [2]:
global_results = []

for repo in repos:
    for phase in phases:
        graph_path = projected_dir / f"{repo}_{phase}_projected.gml"
        if not graph_path.exists():
            print(f"Missing: {graph_path}")
            continue

        G = nx.read_gml(graph_path)
        print(f"Loaded graph: {repo} ({phase}) with {G.number_of_nodes()} nodes")

        # Global metrics
        gm = calculate_metrics(G)
        gm["repo"] = repo
        gm["phase"] = phase

        global_results.append(gm)

global_df = pd.DataFrame(global_results)
global_df.to_csv(output_dir / "global_metrics.csv", index=False)
global_df


Loaded graph: scikit-learn (pre) with 95 nodes
Loaded graph: scikit-learn (during) with 131 nodes
Loaded graph: scikit-learn (post) with 35 nodes
Loaded graph: pytorch (pre) with 359 nodes
Loaded graph: pytorch (during) with 803 nodes
Loaded graph: pytorch (post) with 365 nodes
Loaded graph: kubernetes (pre) with 1160 nodes
Loaded graph: kubernetes (during) with 1430 nodes
Loaded graph: kubernetes (post) with 423 nodes
Loaded graph: apache-spark (pre) with 172 nodes
Loaded graph: apache-spark (during) with 202 nodes
Loaded graph: apache-spark (post) with 76 nodes


Unnamed: 0,num_nodes,num_edges,density,avg_clustering,modularity,repo,phase
0,95,138,0.030907,0.259662,0.475848,scikit-learn,pre
1,131,189,0.022196,0.200871,0.503733,scikit-learn,during
2,35,52,0.087395,0.146604,0.363343,scikit-learn,post
3,359,2498,0.038873,0.696226,0.241106,pytorch,pre
4,803,8090,0.025124,0.572704,0.408739,pytorch,during
5,365,4539,0.068328,0.661742,0.381195,pytorch,post
6,1160,18491,0.027507,0.672398,0.33766,kubernetes,pre
7,1430,18550,0.018155,0.661403,0.361503,kubernetes,during
8,423,4072,0.045623,0.652579,0.381148,kubernetes,post
9,172,388,0.026384,0.302493,0.609793,apache-spark,pre


In [5]:
# Node-level metrics
node_results = []

for repo in repos:
    for phase in phases:
        graph_path = projected_dir / f"{repo}_{phase}_projected.gml"
        if not graph_path.exists():
            continue

        G = nx.read_gml(graph_path)
        strength_dict = dict(G.degree(weight="weight"))
        degree_dict = dict(G.degree())
        betweenness_dict = nx.betweenness_centrality(G, weight="weight", normalized=True)

        for node in G.nodes():
            node_results.append({
                "repo": repo,
                "phase": phase,
                "node": node,
                "degree": degree_dict.get(node, 0),
                "strength": strength_dict.get(node, 0),
                "betweenness": betweenness_dict.get(node, 0)
            })

node_df = pd.DataFrame(node_results)
node_df.to_csv(output_dir / "node_metrics.csv", index=False)
node_df.head()


Unnamed: 0,repo,phase,node,degree,strength,betweenness
0,scikit-learn,pre,glemaitre,19,49,0.366125
1,scikit-learn,pre,vachanda,1,2,0.0
2,scikit-learn,pre,reshamas,11,39,0.151224
3,scikit-learn,pre,mghah,1,2,0.0
4,scikit-learn,pre,venkyyuvy,2,5,0.0
