In [1]:
import pandas as pd
import networkx as nx
from pathlib import Path
from scipy.stats import spearmanr

# Directories
projected_dir = Path("../data/networks/projected")
results_dir = Path("../data/results")
results_dir.mkdir(parents=True, exist_ok=True)

repos = ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]
phases = ["pre", "during", "post"]


In [2]:
# Load node-level metrics
node_df = pd.read_csv(results_dir / "node_metrics.csv")

# Function to compute Spearman rank correlation
def spearman_rank_between_periods(repo, metric):
    t1 = node_df[(node_df["repo"] == repo) & (node_df["phase"] == "pre")][["node", metric]].set_index("node")
    t2 = node_df[(node_df["repo"] == repo) & (node_df["phase"] == "during")][["node", metric]].set_index("node")
    t3 = node_df[(node_df["repo"] == repo) & (node_df["phase"] == "post")][["node", metric]].set_index("node")

    joined_pre_during = t1.join(t2, lsuffix="_pre", rsuffix="_during", how="inner").dropna()
    joined_during_post = t2.join(t3, lsuffix="_during", rsuffix="_post", how="inner").dropna()

    pre_during_corr = spearmanr(joined_pre_during[f"{metric}_pre"], joined_pre_during[f"{metric}_during"]).correlation
    during_post_corr = spearmanr(joined_during_post[f"{metric}_during"], joined_during_post[f"{metric}_post"]).correlation

    return pre_during_corr, during_post_corr

# Compute correlations for all repos and key metrics
metrics = ["degree", "betweenness"]
corr_results = []

for repo in repos:
    for metric in metrics:
        pd_corr, dp_corr = spearman_rank_between_periods(repo, metric)
        corr_results.append({
            "repo": repo,
            "metric": metric,
            "pre_during_corr": pd_corr,
            "during_post_corr": dp_corr
        })

corr_df = pd.DataFrame(corr_results)
corr_df.to_csv(results_dir / "temporal_spearman_correlations.csv", index=False)
corr_df


Unnamed: 0,repo,metric,pre_during_corr,during_post_corr
0,scikit-learn,degree,0.323894,0.515726
1,scikit-learn,betweenness,0.428669,0.314815
2,pytorch,degree,0.44388,0.741406
3,pytorch,betweenness,0.315999,0.526207
4,kubernetes,degree,0.552296,0.708744
5,kubernetes,betweenness,0.51843,0.628898
6,apache-spark,degree,0.312879,0.326294
7,apache-spark,betweenness,0.344047,0.387621


In [3]:
# Node and edge survival rates
def load_graph(repo, phase):
    path = projected_dir / f"{repo}_{phase}_projected.gml"
    return nx.read_gml(path) if path.exists() else None

turnover_stats = []

for repo in repos:
    for i in range(len(phases) - 1):
        phase_t = phases[i]
        phase_tp1 = phases[i + 1]
        G_t = load_graph(repo, phase_t)
        G_tp1 = load_graph(repo, phase_tp1)

        if G_t is None or G_tp1 is None:
            continue

        nodes_t = set(G_t.nodes())
        nodes_tp1 = set(G_tp1.nodes())
        edges_t = set(G_t.edges())
        edges_tp1 = set(G_tp1.edges())

        shared_nodes = nodes_t & nodes_tp1
        shared_edges = edges_t & edges_tp1

        node_retention = len(shared_nodes) / len(nodes_t) if nodes_t else 0
        edge_overlap = len(shared_edges) / len(edges_t) if edges_t else 0

        turnover_stats.append({
            "repo": repo,
            "phase_pair": f"{phase_t}-{phase_tp1}",
            "node_retention": node_retention,
            "edge_overlap": edge_overlap
        })

turnover_df = pd.DataFrame(turnover_stats)
turnover_df.to_csv(results_dir / "node_edge_turnover.csv", index=False)
turnover_df


Unnamed: 0,repo,phase_pair,node_retention,edge_overlap
0,scikit-learn,pre-during,0.231579,0.065217
1,scikit-learn,during-post,0.114504,0.031746
2,pytorch,pre-during,0.465181,0.228183
3,pytorch,during-post,0.265255,0.172559
4,kubernetes,pre-during,0.44569,0.185279
5,kubernetes,during-post,0.220979,0.096658
6,apache-spark,pre-during,0.313953,0.043814
7,apache-spark,during-post,0.252475,0.029586
