In [1]:
import networkx as nx
import pandas as pd
from scipy.stats import spearmanr
from pathlib import Path

# Directories
graph_dir = Path("../data/networks/projected")
results_dir = Path("../data/results")
results_dir.mkdir(parents=True, exist_ok=True)

repos = ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]
phases = ["pre", "during", "post"]
k_samples = 100 


In [2]:
# --- Load all graphs ---
repo_graphs = {}
for repo in repos:
    repo_graphs[repo] = {}
    for phase in phases:
        file = graph_dir / f"{repo}_{phase}_projected.gml"
        try:
            G = nx.read_gml(file)
            repo_graphs[repo][phase] = G
        except Exception as e:
            print(f"Failed to load {file}: {e}")


In [3]:
# Analyze node retention and edge overlap
node_retention = []
edge_overlap = []

for repo, graphs in repo_graphs.items():
    for phase1, phase2 in [("pre", "during"), ("during", "post")]:
        if phase1 not in graphs or phase2 not in graphs:
            continue
        G1 = graphs[phase1]
        G2 = graphs[phase2]

        # Node retention
        nodes1 = set(G1.nodes())
        nodes2 = set(G2.nodes())
        if nodes1:
            retention = len(nodes1.intersection(nodes2)) / len(nodes1)
            node_retention.append({
                "repo": repo,
                "from_phase": phase1,
                "to_phase": phase2,
                "retention_rate": retention
            })

        # Edge overlap
        edges1 = set(map(frozenset, G1.edges()))
        edges2 = set(map(frozenset, G2.edges()))
        if edges1:
            overlap = len(edges1.intersection(edges2)) / len(edges1)
            edge_overlap.append({
                "repo": repo,
                "from_phase": phase1,
                "to_phase": phase2,
                "edge_overlap": overlap
            })

# Convert to DataFrames and export
df_retention = pd.DataFrame(node_retention)
df_overlap = pd.DataFrame(edge_overlap)

df_retention.to_csv(results_dir / "node_retention.csv", index=False)
df_overlap.to_csv(results_dir / "edge_overlap.csv", index=False)

df_retention
df_overlap


Unnamed: 0,repo,from_phase,to_phase,edge_overlap
0,scikit-learn,pre,during,0.108696
1,scikit-learn,during,post,0.063492
2,pytorch,pre,during,0.326661
3,pytorch,during,post,0.286279
4,kubernetes,pre,during,0.259153
5,kubernetes,during,post,0.139784
6,apache-spark,pre,during,0.064433
7,apache-spark,during,post,0.047337
