In [129]:
import json
import numpy as np
import networkx as nx
import gurobipy as gp
from gurobipy import *
import random
import copy
np.random.seed(0)
from tqdm import tqdm
import time

In [5]:
print(gp.gurobi.version())

(10, 0, 1)


## Cascade Functions

In [112]:
def simulate_cascade(G, p_lb=0, p_ub=1):
    """
    Simulate a cascade in graph by flipping coins for all edges in G to construct a subgraph G'
    
    Parameters:
    G (networkx.Graph): The graph on which to run the model.
    
    Returns:
    G_prime (networkx.Graph): A subgraph of G
    """
    
    G_prime = G.copy()
    
    removed_edges = [e for e in G.edges if np.random.uniform() < G[e[0]][e[1]]['weight']]
    
    G_prime.remove_edges_from(removed_edges)
    
    return G_prime

In [113]:
def simulate_fixed_y_cascade(G, ys):
    '''
    Simulate a cascade with fixed initial set of nodes
    '''
    # Set all nodes as not activated
    activated = set()
    # Add the seed nodes to the activated set
    activated.update(set(ys))
    # Initialize the newly activated nodes list with the seed nodes
    newly_activated = set(ys)
    
    # Run the model until there are no more newly activated nodes
    while len(newly_activated) != 0:
        temp_activated = set()
        for v in newly_activated:
            # Check for each successor if it gets activated
            for w in G.successors(v):
                if w not in activated:
                    u = np.random.uniform()
                    if u < G[v][w]['weight']:
                        temp_activated.add(w)
        # Add newly activated nodes to the activated set
        newly_activated = temp_activated
        activated.update(newly_activated)
    
    return len(activated)

In [114]:
def simulate_fixed_y_fixed_cascade(sub_G, ys):
    '''
    Simulate a cascade with fixed initial set of nodes
    '''
    # Set all nodes as not activated
    activated = set()
    # Add the seed nodes to the activated set
    activated.update(set(ys))
    # Initialize the newly activated nodes list with the seed nodes
    newly_activated = set(ys)
    
    # Run the model until there are no more newly activated nodes
    while len(newly_activated) != 0:
        temp_activated = set()
        for v in newly_activated:
            # Check for each successor if it gets activated
            for w in sub_G.successors(v):
                temp_activated.add(w)
        # Add newly activated nodes to the activated set
        newly_activated = temp_activated
        activated.update(newly_activated)
    
    return len(activated)

## MIP Model

In [8]:
# def solve_max_spread(graphs, B, C):
#     """
#     Solves the MIP model for maximizing the spread of cascades using network design as presented in the paper
#     "Maximizing the Spread of Cascades Using Network Design" by Sheldon et al. for the given graph and budget.
    
#     Parameters:
#     graphs (list [networkx.Graph]): List of training cascades graphs, which are subgraphs of the original network
#     budget (int): The budget of edges that can be added to the graph.
#     costs (list): Cost of each action
#     """
#     # Initialize the MIP model
#     model = gp.Model('maximize_spread_cascades')
    
#     # Number of training cascades
#     N = len(graphs)

#     # Initialize the decision variables
#     x = {}
#     y = {}
    
#     nodes = np.sort(list(graphs[0].nodes))
    
#     for k in range(N):
#         for v in nodes:
#             x[k, v] = model.addVar(vtype=GRB.CONTINUOUS, lb = 0, ub = 1, name=f'x_{k}_{v}')
#     for v in nodes:
#         y[v] = model.addVar(vtype=GRB.BINARY, name=f'y_{v}') # Asssuming each action corresponds to purchasing one node.

#     # Initialize the objective function to maximize the expected spread
#     obj = 1/N * gp.quicksum(x[k, v] for k in range(N) for v in nodes)
#     model.setObjective(obj, GRB.MAXIMIZE)
    
#     # Add the budget constraint
#     model.addConstr(gp.quicksum(C[v]*y[v] for v in nodes) <= B, name='budget_constraint')
    
#     # Add the edge constraints
#     for k in range(N):
#         for v in nodes:
#             preds = set(graphs[k].predecessors(v))
#             if len(preds) != 0:
#                 model.addConstr((x[k, v] <= gp.quicksum(x[k, u] for u in preds)), name=f'edge_constraint_{k}_{v}')

#     # Add the coverage constraint
#     model.addConstrs((x[k, v] <= y[v] for v in nodes for k in range(N)), name=f'coverage_constraint_{k}_{v}')
    
#     model.update()

#     # Optimize the model
#     model.optimize()
    
#     # Get Objective value
#     obj = model.getObjective().getValue()
#     ys = [v for v in nodes if y[v].x == 1]
#     return model, obj, ys

In [142]:
def solve_max_spread(pres, B, C):
    """
    Parameters:
    pres (dict): Set of all predecessor nodes (including the node itself) pres[k, i] who can activate node i in scenario k.
    B (int): The budget of edges that can be added to the graph.
    C (list): Cost of each action
    """
    # Initialize the MIP model
    model = gp.Model('maximize_spread_cascades')
    
    # Number of training cascades
    N = len(graphs)

    # Initialize the decision variables
    x = {}
    y = {}
    
    nodes = np.sort(list(graphs[0].nodes))
    
    for k in range(N):
        for v in nodes:
            x[k, v] = model.addVar(vtype=GRB.CONTINUOUS, lb = 0, ub = 1, name=f'x_{k}_{v}')
    for v in nodes:
        y[v] = model.addVar(vtype=GRB.BINARY, name=f'y_{v}') # Asssuming each action corresponds to purchasing one node.

    # Initialize the objective function to maximize the expected spread
    obj = 1/N * gp.quicksum(x[k, v] for k in range(N) for v in nodes)
    model.setObjective(obj, GRB.MAXIMIZE)
    
    # Add the budget constraint
    model.addConstr(gp.quicksum(C[v]*y[v] for v in nodes) <= B, name='budget_constraint')

    # Add the coverage constraint
    model.addConstrs((x[k, v] <= gp.quicksum(y[j] for j in pres[k, v]) for v in nodes for k in range(N)), 
                     name=f'coverage_constraint_{k}_{v}')
    
    model.update()

    # Optimize the model
    model.optimize()
    
    # Get Objective value
    obj = model.getObjective().getValue()
    ys = [v for v in nodes if y[v].x == 1]
    return model, obj, ys

In [66]:
def find_pres(graphs):
    '''
    Return set of all predecessor nodes who can activate node i in scenario k
    '''
    pres = {}
    for k in range(len(graphs)):
        G_reversed = graphs[k].reverse()
        for i in graphs[k].nodes:
            successors = {i: [i]}
            pres[k, i] = set()
            while successors:
                new_successors = {}
                for node, successors in successors.items():
                    for successor in successors:
                        if successor not in pres[k, i]:
                            pres[k, i].add(successor)
                            new_successors[successor] = G_reversed.successors(successor)
                successors = new_successors
    return pres

## SAA Algo

In [134]:
def SAA_phase1(input_graph, B, C, num_sample = 50, num_train = 10):
    sol_lst = []
    obj_ub_lst = []
    for i in range(num_sample):
        np.random.seed(i)
        # For each sample, generate num_train of training cascades.
        graphs = [simulate_cascade(input_graph) for _ in range(num_train)]
        # Find predecessors of each node in each realization.
        pres = find_pres(graphs)
        model, obj, ys = solve_max_spread(pres, B, C)
        sol_lst.append(ys)
        obj_ub_lst.append(obj)
        
    return sol_lst, obj_ub_lst

In [135]:
def SAA_phase2(input_graph, sol_lst, num_valid = 500, num_test = 500):
    # Choose the best solution by re-estimating using num_valid training cascades.
    obj_avg_lst = [np.mean([simulate_fixed_y_cascade(input_graph, sol)
                            for _ in range(num_valid)]) for sol in tqdm(sol_lst)]

    y_opt = sol_lst[np.argmax(obj_avg_lst)]
    # Generate num_test replications for testing the best solution.
    obj_lb_lst = [simulate_fixed_y_cascade(input_graph, y_opt) for _ in range(num_test)]
    
    return y_opt, obj_lb_lst

In [153]:
def main_algo(G, B, C, num_sample, num_train, num_valid, num_test):
    start_time = time.perf_counter()
    
    sol_lst, obj_ub_lst = SAA_phase1(G, B, C, num_sample = num_sample, num_train = num_train)
    y_opt, obj_lb_lst = SAA_phase2(G, sol_lst, num_valid = num_valid, num_test = num_test)
    
    end_time = time.perf_counter()
    cpu_time = end_time - start_time
    print("CPU time:", cpu_time)

    t_critical = t.ppf(0.975, num_sample-1)
    ub_CI_lower = np.mean(obj_ub_lst)-t_critical*(np.std(obj_ub_lst)/np.sqrt(num_sample))
    ub_CI_upper = np.mean(obj_ub_lst)+t_critical*(np.std(obj_ub_lst)/np.sqrt(num_sample))
    print("95% Confidence Interval of upper bound estimate: [{:.2f}, {:.2f}]".format(ub_CI_lower, ub_CI_upper))

    
    lb_CI_lower = np.mean(obj_lb_lst)-1.96*(np.std(obj_lb_lst)/np.sqrt(num_test))
    lb_CI_upper = np.mean(obj_lb_lst)+1.96*(np.std(obj_lb_lst)/np.sqrt(num_test))
    print("95% Confidence Interval of lower bound estimate: [{:.2f}, {:.2f}]".format(lb_CI_lower, lb_CI_upper))
    

    return y_opt, ub_CI_lower, ub_CI_upper, lb_CI_lower, lb_CI_upper, cpu_time

## Experiment

### 1. Random DAG

In [154]:
# Create a random Gnp graph.
G = nx.gnp_random_graph(500, 0.2, seed=0, directed=True)

# Create a directed acyclic graph by only keeping edges that point from lower (newer) indices to higher (older).
DAG = nx.DiGraph()
DAG.add_nodes_from(G.nodes)
DAG.add_edges_from([(u,v) for (u,v) in G.edges() if u<v])
print('Is the graph acyclic:', nx.is_directed_acyclic_graph(DAG))
print('Num edges: ', len(DAG.edges))

Is the graph acyclic: True
Num edges:  25077


In [155]:
# Add random activation probabilities for each edge
np.random.seed(0)
for e in DAG.edges():
    DAG[e[0]][e[1]]['weight'] = np.random.uniform(0, 1)

In [156]:
# Input parameters - budget and action cost
B = 300
np.random.seed(0)
# Cost of each node is uniformly distribution from (0, 100)
C = [np.random.uniform(0, 100) for _ in range(len(DAG))]
# C = 100 * np.ones(len(DAG))

Experiment 1.1: 

### 2. Reddit Interaction Network

Refer to http://snap.stanford.edu/data/web-RedditNetworks.html

In [144]:
# read in networks for subreddit /r/politics
with open("politics.json") as fp:
    month_nets = json.load(fp)

# see who the user sn00gan replied to in the first month
print(month_nets[0]["sn00gan"])

['TedTheGreek_Atheos', 'ptwonline', 'sn00gan', 'TiiziiO', 'caged_raptor']


In [148]:
# Create graph
G2 = nx.DiGraph()
for i in range(len(month_nets)):
    for k,v in month_nets[i].items():
        for vs in v:
            G2.add_edge(k, vs)

In [149]:
len(G2.edges)

1688750

In [150]:
len(G2.nodes)

119780

In [152]:
# Choose a random subgraph with the specified number of edges
num_edges = 500
subgraph_edges = random.sample(G2.edges(), num_edges)
sub_G2 = G2.edge_subgraph(subgraph_edges)
print('num nodes:', len(sub_G2))
print('num edges:', len(sub_G2.edges))

since Python 3.9 and will be removed in a subsequent version.
  subgraph_edges = random.sample(G2.edges(), num_edges)


num nodes: 926
num edges: 500
