Section 1: Loading necessary libraries

In [8]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt
import os
from networkx.algorithms import bipartite

Section 2: Load the Cleaned PR Data per Repo

In [9]:
REPOS = ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]

# Load per-repo data
repo_data = {}

for repo in REPOS:
    df = pd.read_csv(f"../data/processed/{repo}_cleaned_prs.csv", parse_dates=["created_at", "closed_at", "merged_at"])
    df["all_contributors"] = df["all_contributors"].apply(eval)  # Convert stringified lists back to lists
    repo_data[repo] = df


Section 3: Create the Bipartite Graphs per Phase

In [10]:
bipartite_graphs = defaultdict(dict)
projected_graphs = defaultdict(dict)

for repo, df in repo_data.items():
    for phase in ["pre", "during", "post"]:
        df_phase = df[df["covid_phase"] == phase]

        B = nx.Graph()
        for _, row in df_phase.iterrows():
            pr_id = f"{repo}_PR_{row['pr_number']}"
            contributors = row["all_contributors"] or []

            B.add_node(pr_id, bipartite=0)
            for user in contributors:
                if pd.notnull(user):
                    B.add_node(user, bipartite=1)
                    B.add_edge(user, pr_id)

        # Store bipartite graph
        bipartite_graphs[repo][phase] = B

        # --- Project to contributor-contributor graph ---
        contributors = {n for n, d in B.nodes(data=True) if d["bipartite"] == 1}
        G = bipartite.weighted_projected_graph(B, contributors)

        # --- Filter edges: only retain those with weight > 1 ---
        edges_to_remove = [(u, v) for u, v, d in G.edges(data=True) if d["weight"] <= 1]
        G.remove_edges_from(edges_to_remove)

        # Remove isolated nodes
        G.remove_nodes_from(list(nx.isolates(G)))

        # Store projected graph
        projected_graphs[repo][phase] = G

        # --- Save graph ---
        nx.write_gml(G, f"../data/networks/{repo}_{phase}_contributor_graph.gml")
        print(f"Saved: {repo}_{phase}_contributor_graph.gml with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


Saved: scikit-learn_pre_contributor_graph.gml with 438 nodes and 565 edges
Saved: scikit-learn_during_contributor_graph.gml with 644 nodes and 817 edges
Saved: scikit-learn_post_contributor_graph.gml with 89 nodes and 139 edges
Saved: pytorch_pre_contributor_graph.gml with 1124 nodes and 1909 edges
Saved: pytorch_during_contributor_graph.gml with 2919 nodes and 5419 edges
Saved: pytorch_post_contributor_graph.gml with 942 nodes and 1607 edges
Saved: kubernetes_pre_contributor_graph.gml with 2460 nodes and 7816 edges
Saved: kubernetes_during_contributor_graph.gml with 2911 nodes and 7732 edges
Saved: kubernetes_post_contributor_graph.gml with 609 nodes and 1219 edges
Saved: apache-spark_pre_contributor_graph.gml with 470 nodes and 837 edges
Saved: apache-spark_during_contributor_graph.gml with 665 nodes and 945 edges
Saved: apache-spark_post_contributor_graph.gml with 211 nodes and 292 edges


Section 4: Save Bipartite Graphs

In [11]:
os.makedirs("../data/networks/bipartite", exist_ok=True)

for repo, phases in bipartite_graphs.items():
    for phase, graph in phases.items():
        path = f"../data/networks/bipartite/{repo}_{phase}_bipartite.gml"
        nx.write_gml(graph, path)


Section 5: Calculate and Save Netwrok Metrics

In [12]:
metrics = []

for repo, phases in projected_graphs.items():
    for phase, G in phases.items():
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        density = nx.density(G)
        avg_deg = sum(dict(G.degree()).values()) / num_nodes if num_nodes > 0 else 0
        clustering = nx.average_clustering(G) if num_nodes > 1 else 0
        largest_cc = len(max(nx.connected_components(G), key=len)) if num_nodes > 0 else 0

        metrics.append({
            "repository": repo,
            "phase": phase,
            "nodes": num_nodes,
            "edges": num_edges,
            "avg_degree": round(avg_deg, 2),
            "density": round(density, 4),
            "clustering": round(clustering, 4),
            "largest_cc": largest_cc
        })

metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../data/processed/network_metrics.csv", index=False)
display(metrics_df)


Unnamed: 0,repository,phase,nodes,edges,avg_degree,density,clustering,largest_cc
0,scikit-learn,pre,438,565,2.58,0.0059,0.1871,438
1,scikit-learn,during,644,817,2.54,0.0039,0.1742,644
2,scikit-learn,post,89,139,3.12,0.0355,0.32,89
3,pytorch,pre,1124,1909,3.4,0.003,0.2782,1122
4,pytorch,during,2919,5419,3.71,0.0013,0.3243,2917
5,pytorch,post,942,1607,3.41,0.0036,0.3619,940
6,kubernetes,pre,2460,7816,6.35,0.0026,0.372,2432
7,kubernetes,during,2911,7732,5.31,0.0018,0.3612,2856
8,kubernetes,post,609,1219,4.0,0.0066,0.3773,587
9,apache-spark,pre,470,837,3.56,0.0076,0.2941,470


Section 7: Visualize Networks

In [13]:
import os
import matplotlib.pyplot as plt
import networkx as nx

os.makedirs("../figures/networks", exist_ok=True)

for repo in ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]:
    for phase in ["pre", "during", "post"]:
        G = projected_graphs[repo][phase]
        if G.number_of_nodes() == 0 or G.number_of_nodes() > 500:
            continue  # skip too small or too large to draw clearly
        else:
            print(f"Skipped {repo} – {phase}, too large to visualize.")


        plt.figure(figsize=(10, 8))
        pos = nx.spring_layout(G, k=0.1)
        nx.draw(G, pos, node_size=30, alpha=0.7, with_labels=False)
        plt.title(f"{repo.upper()} – {phase.capitalize()} Contributor Network", fontsize=14, weight='bold')
        plt.tight_layout()
        plt.savefig(f"../figures/networks/{repo}_{phase}_network.png")
        plt.close()



Skipped scikit-learn – pre, too large to visualize.


  plt.tight_layout()


Skipped scikit-learn – post, too large to visualize.
Skipped apache-spark – pre, too large to visualize.
Skipped apache-spark – post, too large to visualize.


Section 8: Centrality Visualization

In [14]:
from matplotlib import pyplot as plt

# Degree centrality for projected graphs
for repo in ["scikit-learn", "pytorch", "kubernetes", "apache-spark"]:
    for phase in ["pre", "during", "post"]:
        G = projected_graphs[repo][phase]
        if G.number_of_nodes() == 0:
            continue

        # Centrality metric (e.g. degree centrality)
        centrality = nx.degree_centrality(G)
        node_sizes = [500 * centrality[n] for n in G.nodes()]

        plt.figure(figsize=(8, 6))
        nx.draw_spring(G, node_size=node_sizes, alpha=0.7, with_labels=False)
        plt.title(f"{repo.upper()} – {phase.capitalize()} (Degree Centrality)", fontsize=14)
        plt.tight_layout()
        plt.savefig(f"../figures/networks/{repo}_{phase}_centrality.png")
        plt.close()


  plt.tight_layout()
