Section 1: Loading necessary libraries

In [1]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt
import os
from networkx.algorithms import bipartite

Section 2: Load the Cleaned PR Data per Repo

In [2]:
REPOS = ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]

# Load per-repo data
repo_data = {}

for repo in REPOS:
    df = pd.read_csv(f"../data/processed/{repo}_cleaned_prs.csv", parse_dates=["created_at", "closed_at", "merged_at"])
    df["all_contributors"] = df["all_contributors"].apply(eval)  # Convert stringified lists back to lists
    repo_data[repo] = df


Section 3: Create the Bipartite Graphs per Phase

In [3]:
bipartite_graphs = defaultdict(dict)
projected_graphs = defaultdict(dict)

for repo, df in repo_data.items():
    for phase in ["pre", "during", "post"]:
        df_phase = df[df["covid_phase"] == phase]

        B = nx.Graph()
        for _, row in df_phase.iterrows():
            pr_id = f"{repo}_PR_{row['pr_number']}"
            contributors = row["all_contributors"] or []

            B.add_node(pr_id, bipartite=0)
            for user in contributors:
                if pd.notnull(user):
                    B.add_node(user, bipartite=1)
                    B.add_edge(user, pr_id)

        # Store bipartite graph
        bipartite_graphs[repo][phase] = B

        # --- Project to contributor-contributor graph ---
        contributors = {n for n, d in B.nodes(data=True) if d["bipartite"] == 1}
        G = bipartite.weighted_projected_graph(B, contributors)

        # --- Filter edges: only retain those with weight > 1 ---
        edges_to_remove = [(u, v) for u, v, d in G.edges(data=True) if d["weight"] <= 1]
        G.remove_edges_from(edges_to_remove)

        # Remove isolated nodes
        G.remove_nodes_from(list(nx.isolates(G)))

        # Store projected graph
        projected_graphs[repo][phase] = G

        # --- Save graph ---
        nx.write_gml(G, f"../data/networks/{repo}_{phase}_contributor_graph.gml")
        print(f"Saved: {repo}_{phase}_contributor_graph.gml with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


Saved: scikit-learn_pre_contributor_graph.gml with 438 nodes and 565 edges
Saved: scikit-learn_during_contributor_graph.gml with 644 nodes and 817 edges
Saved: scikit-learn_post_contributor_graph.gml with 88 nodes and 138 edges
Saved: pytorch_pre_contributor_graph.gml with 1124 nodes and 1909 edges
Saved: pytorch_during_contributor_graph.gml with 2919 nodes and 5419 edges
Saved: pytorch_post_contributor_graph.gml with 941 nodes and 1604 edges
Saved: kubernetes_pre_contributor_graph.gml with 2460 nodes and 7816 edges
Saved: kubernetes_during_contributor_graph.gml with 2911 nodes and 7732 edges
Saved: kubernetes_post_contributor_graph.gml with 609 nodes and 1219 edges
Saved: apache-spark_pre_contributor_graph.gml with 470 nodes and 837 edges
Saved: apache-spark_during_contributor_graph.gml with 665 nodes and 945 edges
Saved: apache-spark_post_contributor_graph.gml with 211 nodes and 292 edges


Section 4: Save Bipartite Graphs

In [4]:
os.makedirs("../data/networks/bipartite", exist_ok=True)

for repo, phases in bipartite_graphs.items():
    for phase, graph in phases.items():
        path = f"../data/networks/bipartite/{repo}_{phase}_bipartite.gml"
        nx.write_gml(graph, path)


Section 5: Calculate and Save Netwrok Metrics

In [5]:
metrics = []

for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        density = nx.density(G)
        avg_deg = sum(dict(G.degree()).values()) / num_nodes if num_nodes > 0 else 0
        clustering = nx.average_clustering(G) if num_nodes > 1 else 0
        largest_cc = len(max(nx.connected_components(G), key=len)) if num_nodes > 0 else 0

        metrics.append({
            "repository": repo,
            "phase": phase,
            "nodes": num_nodes,
            "edges": num_edges,
            "avg_degree": round(avg_deg, 2),
            "density": round(density, 4),
            "clustering": round(clustering, 4),
            "largest_cc": largest_cc
        })

metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../data/processed/network_metrics.csv", index=False)
display(metrics_df)


Unnamed: 0,repository,phase,nodes,edges,avg_degree,density,clustering,largest_cc
0,scikit-learn,pre,438,565,2.58,0.0059,0.1871,438
1,scikit-learn,during,644,817,2.54,0.0039,0.1742,644
2,scikit-learn,post,88,138,3.14,0.0361,0.3237,88
3,pytorch,pre,1124,1909,3.4,0.003,0.2782,1122
4,pytorch,during,2919,5419,3.71,0.0013,0.3243,2917
5,pytorch,post,941,1604,3.41,0.0036,0.3606,939
6,kubernetes,pre,2460,7816,6.35,0.0026,0.372,2432
7,kubernetes,during,2911,7732,5.31,0.0018,0.3612,2856
8,kubernetes,post,609,1219,4.0,0.0066,0.3773,587
9,apache-spark,pre,470,837,3.56,0.0076,0.2941,470


Section 7: Visualize Networks

In [6]:
import os
import matplotlib.pyplot as plt
import networkx as nx

os.makedirs("../figures/networks", exist_ok=True)

for repo in ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]:
    for phase in ["pre", "during", "post"]:
        G = projected_graphs[repo][phase]
        if G.number_of_nodes() == 0 or G.number_of_nodes() > 500:
            continue  # skip too small or too large to draw clearly
        else:
            print(f"Skipped {repo} – {phase}, too large to visualize.")


        plt.figure(figsize=(10, 8))
        pos = nx.spring_layout(G, k=0.1)
        nx.draw(G, pos, node_size=30, alpha=0.7, with_labels=False)
        plt.title(f"{repo.upper()} – {phase.capitalize()} Contributor Network", fontsize=14, weight='bold')
        plt.tight_layout()
        plt.savefig(f"../figures/networks/{repo}_{phase}_network.png")
        plt.close()



Skipped scikit-learn – pre, too large to visualize.


  plt.tight_layout()


Skipped scikit-learn – post, too large to visualize.
Skipped apache-spark – pre, too large to visualize.
Skipped apache-spark – post, too large to visualize.


### Section 7.5: Summary of Constructed Networks

Each projected contributor graph (per repository and COVID phase) includes only developers who collaborated on more than one PR. Isolated developers were removed. The resulting networks range in size, with the largest being PyTorch during COVID (2919 nodes, 5419 edges), and the smallest being scikit-learn post-COVID (89 nodes, 139 edges).


Section 8: Centrality Visualization

In [7]:
from matplotlib import pyplot as plt

# Degree centrality for projected graphs
for repo in ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]:
    for phase in ["pre", "during", "post"]:
        G = projected_graphs[repo][phase]
        if G.number_of_nodes() == 0:
            continue

        # Centrality metric (e.g. degree centrality)
        centrality = nx.degree_centrality(G)
        node_sizes = [500 * centrality[n] for n in G.nodes()]

        plt.figure(figsize=(8, 6))
        nx.draw_spring(G, node_size=node_sizes, alpha=0.7, with_labels=False)
        plt.title(f"{repo.upper()} – {phase.capitalize()} (Degree Centrality)", fontsize=14)
        plt.tight_layout()
        plt.savefig(f"../figures/networks/{repo}_{phase}_centrality.png")
        plt.close()


  plt.tight_layout()
  plt.tight_layout()


Section 9: Centrality Metrics per Phase

9.1: Node-Level

In [8]:
centrality_data = []

for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        if len(G) == 0:
            continue
        deg = dict(G.degree())
        strength = dict(G.degree(weight='weight'))
        betweenness = nx.betweenness_centrality(G)
        eigenvector = nx.eigenvector_centrality(G, max_iter=1000)

        for node in G.nodes():
            centrality_data.append({
                "repository": repo,
                "phase": phase,
                "node": node,
                "degree": deg.get(node, 0),
                "strength": strength.get(node, 0),
                "betweenness": betweenness.get(node, 0),
                "eigenvector": eigenvector.get(node, 0)
            })


9.2: Modularity and Community Detection  
Using Louvain

In [9]:
import community as community_louvain  # pip install python-louvain

for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        if len(G) < 2:
            continue
        partition = community_louvain.best_partition(G)
        modularity = community_louvain.modularity(partition, G)

        print(f"{repo}-{phase}: modularity = {modularity:.4f}, communities = {len(set(partition.values()))}")


scikit-learn-pre: modularity = 0.0797, communities = 7
scikit-learn-during: modularity = 0.0525, communities = 12
scikit-learn-post: modularity = 0.0283, communities = 3
pytorch-pre: modularity = 0.0725, communities = 19
pytorch-during: modularity = 0.1323, communities = 31
pytorch-post: modularity = 0.1328, communities = 19
kubernetes-pre: modularity = 0.5667, communities = 31
kubernetes-during: modularity = 0.5735, communities = 43
kubernetes-post: modularity = 0.5937, communities = 22
apache-spark-pre: modularity = 0.1493, communities = 6
apache-spark-during: modularity = 0.0580, communities = 11
apache-spark-post: modularity = 0.1165, communities = 8


9.3: Assortativity

In [10]:
for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        if len(G) > 1:
            assort = nx.degree_assortativity_coefficient(G)
            print(f"{repo}-{phase} assortativity: {assort:.4f}")


scikit-learn-pre assortativity: -0.6535
scikit-learn-during assortativity: -0.6665
scikit-learn-post assortativity: -0.5660
pytorch-pre assortativity: -0.4355
pytorch-during assortativity: -0.3722
pytorch-post assortativity: -0.4207
kubernetes-pre assortativity: -0.1138
kubernetes-during assortativity: -0.1312
kubernetes-post assortativity: -0.2025
apache-spark-pre assortativity: -0.4324
apache-spark-during assortativity: -0.5539
apache-spark-post assortativity: -0.5858


9.4 Turnover Metrics  
  Based on edge/node overlaps

In [11]:
def calculate_overlap(G1, G2):
    nodes_1, nodes_2 = set(G1.nodes()), set(G2.nodes())
    edges_1, edges_2 = set(G1.edges()), set(G2.edges())

    node_overlap = len(nodes_1 & nodes_2) / len(nodes_1 | nodes_2)
    edge_overlap = len(edges_1 & edges_2) / len(edges_1 | edges_2)
    return node_overlap, edge_overlap

for repo in REPOS:
    try:
        G_pre = projected_graphs[repo]["pre"]
        G_during = projected_graphs[repo]["during"]
        G_post = projected_graphs[repo]["post"]

        nd, ed = calculate_overlap(G_pre, G_during)
        dp, ep = calculate_overlap(G_during, G_post)
        print(f"{repo} - Pre↔During Node: {nd:.2f}, Edge: {ed:.2f}")
        print(f"{repo} - During↔Post Node: {dp:.2f}, Edge: {ep:.2f}")
    except KeyError:
        continue


scikit-learn - Pre↔During Node: 0.07, Edge: 0.04
scikit-learn - During↔Post Node: 0.05, Edge: 0.03
pytorch - Pre↔During Node: 0.09, Edge: 0.05
pytorch - During↔Post Node: 0.13, Edge: 0.07
kubernetes - Pre↔During Node: 0.17, Edge: 0.05
kubernetes - During↔Post Node: 0.11, Edge: 0.03
apache-spark - Pre↔During Node: 0.15, Edge: 0.08
apache-spark - During↔Post Node: 0.17, Edge: 0.09


Section 10: Save Centrality Metrics Results

In [12]:
# Save centrality metrics
centrality_df = pd.DataFrame(centrality_data)
centrality_df.to_csv("../data/processed/centrality_metrics.csv", index=False)

# Save modularity + community count
modularity_records = []
for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        if len(G) < 2:
            continue
        partition = community_louvain.best_partition(G)
        modularity = community_louvain.modularity(partition, G)
        n_communities = len(set(partition.values()))
        modularity_records.append({
            "repository": repo,
            "phase": phase,
            "modularity": round(modularity, 4),
            "num_communities": n_communities
        })

modularity_df = pd.DataFrame(modularity_records)
modularity_df.to_csv("../data/processed/modularity_metrics.csv", index=False)

# Save assortativity metrics
assortativity_records = []
for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        if G.number_of_nodes() > 1:
            assort = nx.degree_assortativity_coefficient(G)
            assortativity_records.append({
                "repository": repo,
                "phase": phase,
                "assortativity": round(assort, 4)
            })

assortativity_df = pd.DataFrame(assortativity_records)
assortativity_df.to_csv("../data/processed/assortativity_metrics.csv", index=False)
