# Information Diffusion Simulator in Clustered / Multiplex Networks

This notebook implements a **Python simulator** for information diffusion
through **clusters (communities)** in a (possibly) **multiplex network**.

It supports:

- Synthetic generation of a **two-layer multiplex network**.
- Saving / loading the edge list as a **`.csv` file**.
- Detection of **clusters/communities** in each layer.
- Simulation of **SIR-like diffusion** with:
    - Higher transmission inside clusters (`p_intra`)
    - Lower transmission across clusters (`p_inter`)
    - Optional **triangle-based reinforcement** (`p_tri`) inside clusters.

You can plug in your own **citation network** or **Higgs multiplex network**
by replacing the CSV file with real data.


In [None]:
# 0. Imports and basic setup

import random
from collections import defaultdict

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Fix the seeds of Python's `random` module (used by the simulation code
# below) and of NumPy's global RNG so that re-running the notebook from the
# top reproduces the same results.
random.seed(42)
np.random.seed(42)

# Record the NetworkX version in the notebook output.
print("NetworkX:", nx.__version__)


## 1. Generate a synthetic multiplex network and export as `.csv`

We create a **two-layer multiplex network**:

- Layer 0: relatively sparse
- Layer 1: relatively dense

The edge list is stored as a CSV with columns:

- `source`
- `target`
- `layer`


In [None]:
# 1.1 Generate multiplex network

def generate_multiplex(n=200, p1=0.05, p2=0.08, seed1=1, seed2=2):
    """Build a synthetic two-layer multiplex network.

    Each layer is generated independently with
    ``nx.fast_gnp_random_graph`` using the same node count ``n``.

    Args:
        n: number of nodes in every layer.
        p1: edge probability of layer 0.
        p2: edge probability of layer 1.
        seed1: generator seed for layer 0.
        seed2: generator seed for layer 1.

    Returns:
        dict mapping the layer id (0 or 1) to the generated graph.
    """
    layer_params = ((p1, seed1), (p2, seed2))
    return {
        layer_id: nx.fast_gnp_random_graph(n, prob, seed=rng_seed)
        for layer_id, (prob, rng_seed) in enumerate(layer_params)
    }


def edges_to_df(G, layer_id):
    """Turn the edges of one layer into an edge-list table.

    Args:
        G: graph whose ``edges()`` yields ``(u, v)`` pairs.
        layer_id: layer identifier written into every row.

    Returns:
        DataFrame with the columns ``source``, ``target`` and ``layer``,
        one row per edge, with all values converted via ``int``.
    """
    column_names = ["source", "target", "layer"]
    records = [
        (int(src), int(dst), int(layer_id))
        for src, dst in G.edges()
    ]
    return pd.DataFrame(records, columns=column_names)


# Generate the layers, stack their edge tables in layer order and write the
# combined edge list to disk.
layers = generate_multiplex()
df_edges = pd.concat(
    [
        edges_to_df(layer_graph, layer_id)
        for layer_id, layer_graph in layers.items()
    ],
    ignore_index=True,
)

csv_path = "multiplex_edges.csv"
df_edges.to_csv(csv_path, index=False)
print("Saved multiplex edge list to:", csv_path)
# Last expression of the cell: shows the first rows in the notebook output.
df_edges.head()


## 2. Loading a multiplex network from `.csv`

This function lets you:

- Load the **synthetic** dataset we just saved, or
- Replace `csv_path` with a path to your **Citation** or **Higgs** network edge list,
  formatted as `source,target,layer`.


In [None]:
def load_multiplex_from_csv(path: str):
    """Load a multiplex network from an edge-list CSV.

    The file is read with ``pd.read_csv`` and must provide the columns
    ``source``, ``target`` and ``layer``; other columns are ignored.  One
    undirected ``nx.Graph`` is built per distinct ``layer`` value.

    Args:
        path: location of the CSV file.

    Returns:
        dict mapping ``int(layer)`` to the graph built from that layer's
        rows.  Node ids are converted with ``int``.  A node is only present
        if it is an endpoint of at least one edge in that layer.
    """
    df = pd.read_csv(path)
    layers = {}
    for layer_id, df_layer in df.groupby("layer"):
        G = nx.Graph()
        # Walk the two endpoint columns directly rather than using
        # ``iterrows``: row-wise iteration coerces each row to one common
        # dtype, so an additional float column (e.g. an edge weight) would
        # turn the ids into floats and corrupt ids too large for float64.
        G.add_edges_from(
            (int(u), int(v))
            for u, v in zip(df_layer["source"], df_layer["target"])
        )
        layers[int(layer_id)] = G
    return layers


# Reload the network from the CSV written above and print, for every layer,
# its node count, edge count and average clustering coefficient.
layers_loaded = load_multiplex_from_csv(csv_path)
for lid, G in layers_loaded.items():
    print(f"Layer {lid}: nodes={G.number_of_nodes()}, edges={G.number_of_edges()}, "
          f"C={nx.average_clustering(G):.4f}")


## 3. Community / cluster detection

We detect **clusters** (communities) separately in each layer using
NetworkX's **greedy modularity** algorithm.

Each node gets a `cluster_id` per layer.


In [None]:
from networkx.algorithms.community import greedy_modularity_communities

def detect_clusters(G: nx.Graph):
    """Detect communities in ``G`` with ``greedy_modularity_communities``.

    Returns:
        A pair ``(cluster_map, communities)``:
          - cluster_map: dict node -> cluster_id, where the id is the
            position of the node's community in ``communities``.
          - communities: list of the detected communities, in the order
            returned by the algorithm.
    """
    communities = list(greedy_modularity_communities(G))
    cluster_map = {
        node: cluster_id
        for cluster_id, members in enumerate(communities)
        for node in members
    }
    return cluster_map, communities


# Per-layer clustering results, keyed by layer id:
#   cluster_maps[lid] -> dict node -> cluster_id
#   communities[lid]  -> list of communities as returned by detect_clusters
cluster_maps = {}
communities = {}

for lid, G in layers_loaded.items():
    cmap, comms = detect_clusters(G)
    cluster_maps[lid] = cmap
    communities[lid] = comms
    print(f"Layer {lid}: found {len(comms)} clusters.")


## 4. Triangle enumeration (for reinforcement inside clusters)

We enumerate **triangles** in each layer and build:

- A list of triangles (triples of nodes)
- An index `node -> triangles containing it`

This will be used to apply **higher transmission** when
two infected neighbors form a triangle with a susceptible node.


In [None]:
def enumerate_triangles(G: nx.Graph):
    """List every triangle of ``G`` exactly once.

    Nodes are ranked by their position in ``G.nodes()``.  Each triangle is
    reported as a tuple ``(a, b, c)`` ordered by that rank, and is discovered
    only from its lowest-ranked corner ``a``.  This makes de-duplication
    unnecessary and avoids examining each triangle three times.

    Self-loops never contribute: a node is not considered its own
    higher-ranked neighbour, so degenerate triples such as ``(v, v, w)``
    cannot be produced.

    Args:
        G: undirected graph providing ``nodes()``, ``neighbors(v)`` and
            ``has_edge(u, w)``.

    Returns:
        list of 3-tuples of nodes; the list is ordered by the rank of the
        first corner, then of the second, then of the third.
    """
    nodes = list(G.nodes())
    rank = {v: i for i, v in enumerate(nodes)}
    triangles = []
    for v in nodes:
        rank_v = rank[v]
        # Only neighbours ranked after v can complete a triangle whose
        # lowest corner is v; anything else is found from another corner.
        higher = sorted(
            (u for u in G.neighbors(v) if rank[u] > rank_v),
            key=rank.__getitem__,
        )
        for i, u in enumerate(higher):
            for w in higher[i + 1:]:
                if G.has_edge(u, w):
                    triangles.append((v, u, w))
    return triangles


def build_triangle_index(triangles):
    """Invert a triangle list into a per-node lookup.

    Args:
        triangles: sequence of node tuples; the position of a tuple in the
            sequence is used as its triangle id.

    Returns:
        ``defaultdict(list)`` mapping each node to the ids of the triangles
        that contain it, in increasing id order.
    """
    lookup = defaultdict(list)
    memberships = (
        (node, position)
        for position, members in enumerate(triangles)
        for node in members
    )
    for node, position in memberships:
        lookup[node].append(position)
    return lookup


# Per-layer triangle data, keyed by layer id:
#   triangles[lid] -> list of triangles found in that layer
#   tri_index[lid] -> node -> positions in triangles[lid] that contain it
triangles = {}
tri_index = {}

for lid, G in layers_loaded.items():
    tris = enumerate_triangles(G)
    triangles[lid] = tris
    tri_index[lid] = build_triangle_index(tris)
    print(f"Layer {lid}: triangles={len(tris)}")


## 5. SIR-like diffusion through clusters

We simulate a **discrete-time SIR process**:

- States:  
  - 0 = Susceptible (S)  
  - 1 = Infected (I)  
  - 2 = Recovered (R)

- Parameters:
  - `p_intra`: transmission probability along an edge **within the same cluster**.
  - `p_inter`: transmission probability along an edge **between different clusters**.
  - `p_tri`: probability of an **additional, independent** infection attempt on a
    susceptible node, made once for each triangle whose other two members are both
    infected and in which all three nodes belong to the **same cluster** of the layer.
  - `p_recover`: per-step probability that an infected node becomes recovered.

We simulate on **one chosen layer** (e.g., 0 or 1) but you can extend to
multiplex coupling if needed.


In [None]:
def simulate_clustered_SIR(
    G: nx.Graph,
    clusters: dict,
    triangles_layer,
    tri_index_layer,
    seeds,
    p_intra=0.4,
    p_inter=0.1,
    p_tri=0.6,
    p_recover=0.2,
    max_steps=50,
):
    """Simulate SIR with cluster-dependent and triangle-reinforced transmission.

    Updates are synchronous: every decision in a step is based on the state
    at the start of that step.  Within a step the order is

    1. edge transmission from each infected node to each susceptible
       neighbour (``p_intra`` if both have the same cluster id, otherwise
       ``p_inter``),
    2. triangle reinforcement: for a susceptible node, each triangle whose
       other two members are infected and whose three members share a
       cluster id gives one extra attempt with probability ``p_tri``,
    3. recovery of each node that was infected at the start of the step,
       with probability ``p_recover``.

    Randomness comes from the global ``random`` module.

    Args:
        G: underlying graph for diffusion.
        clusters: dict node -> cluster_id.  Nodes missing from the dict are
            all given the placeholder id -1, so two unmapped nodes count as
            being in the same cluster.
        triangles_layer: list of triangles (u, v, w)
        tri_index_layer: dict node -> list of triangle indices
        seeds: iterable of initially infected nodes; entries that are not
            nodes of ``G`` are ignored.
        p_intra: transmission probability along an intra-cluster edge.
        p_inter: transmission probability along an inter-cluster edge.
        p_tri: probability of each triangle-reinforcement attempt.
        p_recover: per-step recovery probability of an infected node.
        max_steps: number of steps to simulate.
    Returns:
        history: dict with keys 'S', 'I', 'R' (lists of counts per step).
        Each list has ``max_steps`` entries; entry ``t`` is the count at the
        start of step ``t``, so the outcome of the last update is not
        recorded.
    """
    nodes = list(G.nodes())
    state = dict.fromkeys(nodes, 0)  # 0=S, 1=I, 2=R
    for s in seeds:
        if s in state:
            state[s] = 1

    S_hist, I_hist, R_hist = [], [], []

    for step in range(max_steps):
        # Tally the three compartments in a single pass.
        counts = [0, 0, 0]
        for v in nodes:
            counts[state[v]] += 1
        n_sus, n_inf, n_rec = counts
        S_hist.append(n_sus)
        I_hist.append(n_inf)
        R_hist.append(n_rec)

        if n_inf == 0:
            # With nobody infected no transition can fire and no random
            # number would be drawn, so the remaining steps are identical:
            # fill them in directly instead of rescanning the graph.
            remaining = max_steps - step - 1
            S_hist.extend([n_sus] * remaining)
            I_hist.extend([n_inf] * remaining)
            R_hist.extend([n_rec] * remaining)
            break

        new_state = state.copy()

        # Infection via edges
        for u in nodes:
            if state[u] != 1:
                continue
            cu = clusters.get(u, -1)
            for v in G.neighbors(u):
                if state[v] != 0:
                    continue
                p = p_intra if clusters.get(v, -1) == cu else p_inter
                if random.random() < p:
                    new_state[v] = 1

        # Triangle-based reinforcement
        for v in nodes:
            if state[v] != 0:
                continue
            cv = clusters.get(v, -1)
            for tid in tri_index_layer.get(v, []):
                others = [x for x in triangles_layer[tid] if x != v]
                if len(others) != 2:
                    # Not a proper triangle around v (repeated node).
                    continue
                u, w = others
                if state[u] != 1 or state[w] != 1:
                    continue
                # Reinforce only when all three nodes share a cluster.
                if clusters.get(u, -1) == clusters.get(w, -1) == cv:
                    if random.random() < p_tri:
                        new_state[v] = 1
                        break

        # Recovery (only nodes infected at the start of the step)
        for v in nodes:
            if state[v] == 1 and random.random() < p_recover:
                new_state[v] = 2

        state = new_state

    return {
        "S": S_hist,
        "I": I_hist,
        "R": R_hist,
    }


## 6. Run example simulations on different clustering regimes

We compare diffusion on:

- A **randomly rewired** (low-clustering) version of the layer
- The original (more clustered) version

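This illustrates how clustering and triangle reinforcement affect
diffusion speed and final adoption. Note that the synthetic layers from
Section 1 are G(n, p) random graphs, which have little clustering to begin
with, so the two curves may differ only slightly; the contrast becomes
meaningful once you load a genuinely clustered network.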


In [None]:
def randomize_graph_preserve_degrees(G: nx.Graph, nswap=2000, seed=None):
    """Return a degree-preserving randomised copy of ``G``.

    The copy is rewired with ``nx.double_edge_swap``, which replaces edge
    pairs (u-v, x-y) by (u-x, v-y) and therefore keeps every node's degree.
    ``G`` itself is not modified.

    Args:
        G: undirected graph to randomise.
        nswap: number of successful swaps to perform; up to ``5 * nswap``
            attempts are allowed.
        seed: optional seed / random state forwarded to
            ``nx.double_edge_swap``.  The default ``None`` keeps the
            previous behaviour of using the global random state.

    Returns:
        The rewired copy.  Graphs that are too small to swap (fewer than
        four nodes or fewer than two edges) are returned as plain copies.

    Raises:
        Whatever ``nx.double_edge_swap`` raises, e.g. when the allowed
        number of attempts is exhausted before ``nswap`` swaps succeed.
    """
    G_rand = G.copy()
    # double_edge_swap rejects graphs with fewer than four nodes, so check
    # the node count as well as the edge count before calling it.
    swappable = G_rand.number_of_nodes() >= 4 and G_rand.number_of_edges() > 1
    if swappable:
        nx.double_edge_swap(G_rand, nswap=nswap, max_tries=5 * nswap, seed=seed)
    return G_rand


# Choose a layer for diffusion and pull out the graph, cluster map and
# triangle data computed for it in the earlier cells.
layer_for_diffusion = 0
G_orig = layers_loaded[layer_for_diffusion]
clusters_orig = cluster_maps[layer_for_diffusion]
tris_orig = triangles[layer_for_diffusion]
tri_idx_orig = tri_index[layer_for_diffusion]

# Degree-preserving rewiring of a copy of the chosen layer.
G_rand = randomize_graph_preserve_degrees(G_orig, nswap=2000)
# Recompute clusters and triangles for randomized graph: the rewiring changes
# the edges, so the data computed for the original layer no longer applies.
clusters_rand, comms_rand = detect_clusters(G_rand)
tris_rand = enumerate_triangles(G_rand)
tri_idx_rand = build_triangle_index(tris_rand)

# Compare average clustering and triangle counts before and after rewiring.
print("Original layer:")
print("  C =", nx.average_clustering(G_orig), "triangles =", len(tris_orig))
print("Randomized layer:")
print("  C =", nx.average_clustering(G_rand), "triangles =", len(tris_rand))


In [None]:
# 6.1 Run diffusion on original vs randomized graph

# Draw five distinct seed nodes once and reuse them for both runs so the two
# simulations start from the same initial infections.  G_rand was built from
# a copy of G_orig, so the seeds exist in both graphs.
nodes = list(G_orig.nodes())
seed_set = random.sample(nodes, k=5)

# Run on the original layer.
hist_orig = simulate_clustered_SIR(
    G_orig, clusters_orig, tris_orig, tri_idx_orig,
    seeds=seed_set,
    p_intra=0.4,
    p_inter=0.1,
    p_tri=0.7,
    p_recover=0.2,
    max_steps=40,
)

# Run on the randomized layer with identical parameters.
hist_rand = simulate_clustered_SIR(
    G_rand, clusters_rand, tris_rand, tri_idx_rand,
    seeds=seed_set,
    p_intra=0.4,
    p_inter=0.1,
    p_tri=0.7,
    p_recover=0.2,
    max_steps=40,
)

# One x-axis value per recorded step.
t = range(len(hist_orig["I"]))

# Plot infected counts divided by the node count, i.e. as fractions.
plt.figure()
plt.plot(t, np.array(hist_orig["I"]) / len(nodes), label="Infected (clustered)")
plt.plot(t, np.array(hist_rand["I"]) / len(nodes), label="Infected (randomized)", linestyle="--")
plt.xlabel("Time step")
plt.ylabel("Fraction infected")
plt.title("Information diffusion through clusters vs randomized topology")
plt.legend()
plt.grid(True)
plt.show()


## 7. Parameter sweep: diffusion vs transmissibility

Finally, we sweep over different intra-cluster transmission probabilities
`p_intra` and measure final adoption (the fraction of nodes recovered at the
last recorded time step, averaged over several repetitions) to see how
clustering changes the effective **diffusion threshold**.


In [None]:
def run_threshold_sweep(
    G, clusters, tris, tri_idx,
    p_intra_values,
    p_inter=0.1,
    p_tri=0.7,
    p_recover=0.2,
    num_seeds=5,
    num_reps=5,
    max_steps=40,
):
    """Average final recovered fraction for each intra-cluster probability.

    For every value in ``p_intra_values`` the simulation is repeated
    ``num_reps`` times, each time with ``num_seeds`` freshly sampled seed
    nodes, and the last recorded recovered count divided by the number of
    nodes is averaged over the repetitions.

    Args:
        G, clusters, tris, tri_idx: graph, cluster map, triangle list and
            triangle index passed through to ``simulate_clustered_SIR``.
        p_intra_values: iterable of ``p_intra`` values to evaluate.
        p_inter, p_tri, p_recover, max_steps: passed through unchanged.
        num_seeds: number of seed nodes sampled per repetition.
        num_reps: repetitions per ``p_intra`` value.

    Returns:
        NumPy array with one mean recovered fraction per ``p_intra`` value.
    """
    nodes = list(G.nodes())
    n = len(nodes)

    def recovered_fraction(pin):
        # One repetition: sample seeds first, then simulate.
        chosen = random.sample(nodes, k=num_seeds)
        outcome = simulate_clustered_SIR(
            G, clusters, tris, tri_idx,
            seeds=chosen,
            p_intra=pin,
            p_inter=p_inter,
            p_tri=p_tri,
            p_recover=p_recover,
            max_steps=max_steps,
        )
        return outcome["R"][-1] / n

    final_R = [
        np.mean([recovered_fraction(pin) for _ in range(num_reps)])
        for pin in p_intra_values
    ]
    return np.array(final_R)


# 21 evenly spaced p_intra values covering [0, 1].
p_values = np.linspace(0.0, 1.0, 21)

# Sweep on the original layer; all other parameters use the defaults of
# run_threshold_sweep.
final_R_orig = run_threshold_sweep(
    G_orig, clusters_orig, tris_orig, tri_idx_orig,
    p_intra_values=p_values,
)

# Same sweep on the randomized layer.
final_R_rand = run_threshold_sweep(
    G_rand, clusters_rand, tris_rand, tri_idx_rand,
    p_intra_values=p_values,
)

# Plot mean final recovered fraction against p_intra for both graphs.
plt.figure()
plt.plot(p_values, final_R_orig, label="Clustered")
plt.plot(p_values, final_R_rand, label="Randomized", linestyle="--")
plt.xlabel("Intra-cluster transmission $p_{intra}$")
plt.ylabel("Final adoption (R/N)")
plt.title("Effect of clustering on diffusion threshold")
plt.legend()
plt.grid(True)
plt.show()
