In [1]:
import networkx as nx
import pandas as pd
import math
import random
from scipy.stats import kendalltau
import time
from scipy.sparse.linalg import eigs

In [2]:
# Import the three datasets
# Dataset 1
df = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students_twosets/student_export_1/swisstools_asset_relationships.csv",sep=",")
relationships = df[["source_asset_id","target_asset_id"]]
types = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students_twosets/student_export_1/swisstools_asset_types.csv",sep=",")
assets = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students_twosets/student_export_1/swisstools_assets.csv",sep=",")
# Dataset 2
df2 = pd.read_csv("C://Users/jesse/Downloads/output_ml_students_twosets/student_export_2/haarlem_asset_relationships.csv",sep=",")
relationships2 = df2[["source_asset_id","target_asset_id"]]
types2 = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students_twosets/student_export_2/haarlem_asset_types.csv",sep=",")
assets2 = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students_twosets/student_export_2/haarlem_assets.csv",sep=",")
# Dataset 3
df3 = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students/output_ml_students/backend_assetrelationship.csv",sep=",")
relationships3 = df3[["source_asset_id","target_asset_id"]]
types3 = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students/output_ml_students/backend_assettype.csv",sep=",")
assets3 = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students/output_ml_students/backend_asset.csv",sep=",")

  assets = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students_twosets/student_export_1/swisstools_assets.csv",sep=",")
  assets3 = pd.read_csv("C:/Users/jesse/Downloads/output_ml_students/output_ml_students/backend_asset.csv",sep=",")


In [3]:
# Create the asset_and_type dataframe for each dataset
# Dataset 1
asset_and_type = assets.merge(types[["id","category"]],left_on="asset_type_id",right_on="id",how="left")
asset_and_type = asset_and_type.drop(columns="id_y").rename(columns={"id_x":"id"})
asset_and_type["category"] = asset_and_type["category"].fillna("Unspecified")
assettypes_1 = asset_and_type[["id","category"]]
# Dataset 2
asset_and_type_2 = assets2.merge(types2[["id","category"]],left_on="asset_type_id",right_on="id",how="left")
asset_and_type_2 = asset_and_type_2.drop(columns="id_y").rename(columns={"id_x":"id"})
asset_and_type_2["category"] = asset_and_type_2["category"].fillna("Unspecified")
assettypes_2 = asset_and_type_2[["id","category"]]
# Dataset 3
asset_and_type_3 = assets3.merge(types3[["id","category"]],left_on="asset_type_id",right_on="id",how="left")
asset_and_type_3 = asset_and_type_3.drop(columns="id_y").rename(columns={"id_x":"id"})
asset_and_type_3["category"] = asset_and_type_3["category"].fillna("Unspecified")
assettypes_3 = asset_and_type_3[["id","category"]]

In [4]:
# DiGraph is directed, Graph is undirected
G1 = nx.DiGraph()
edges = list(zip(relationships["source_asset_id"],relationships["target_asset_id"]))
G1.add_edges_from(edges)
# Second dataset
G2 = nx.DiGraph()
edges2 = list(zip(relationships2["source_asset_id"],relationships2["target_asset_id"]))
G2.add_edges_from(edges2)
# Third dataset
G3 = nx.DiGraph()
edges3 = list(zip(relationships3["source_asset_id"],relationships3["target_asset_id"]))
G3.add_edges_from(edges3)

In [5]:
# Function to get the second order neighbors counts of each node
def second_order_neighbor_counts(G):
    adj = G.adj
    result = {}
    for node in G:
        direct = set(adj[node])
        second_order = set()

        for neighbor in direct:
            for nn in adj[neighbor]:
                if nn != node and nn not in direct:
                    second_order.add(nn)
        result[node] = len(second_order)
    return result
# Function to get the PE values
def get_PE(G,second_order_counts):
    # Clustering coefficient and cn
    c = {}
    for i in G.nodes:
        if G.degree(i) <= 1:
            c[i] = 0
        else:
            c[i] = (sum(1 for j in list(G.neighbors(i)) for k in list(G.neighbors(i)) if j != k and G.has_edge(j,k))) / (G.degree(i) * (G.degree(i) - 1))
    cn = {}
    sumcn = 0
    for i in G.nodes:
        sumneigh = len(list(G.neighbors(i))) + second_order_counts[i]
        cn[i] = sumneigh/(1+c[i])
        sumcn += cn[i]
    # I
    I = {}
    for i in G.nodes:
        I[i] = cn[i]/sumcn
    # PE
    sum_I = {}
    PE = {}
    for i in G.nodes:
        sum_I[i] = 0.0
        for j in G.neighbors(i):
            if I[j] == 0:
                continue
            else:
                sum_I[i] += -I[j] * math.log(I[j])
        PE[i] = sum_I[i]
    return PE
# Function to get all other measures, then put all together in a dictionary
def get_measures(G,second_order_counts):
    # Node Propagation Entropy
    PE = get_PE(G=G,second_order_counts=second_order_counts)
    # PageRank
    pagerank = nx.pagerank(G)
    # Degree Centrality
    degree_centrality = nx.degree_centrality(G)
    # H-index
    h_index = {}
    for node in G:
        neighbor_degrees = sorted([G.degree(nbr) for nbr in G.neighbors(node)], reverse=True)
        h = 0
        for i, d in enumerate(neighbor_degrees, 1):
            if d >= i:
                h = i
            else:
                break
        h_index[node] = h
    # K-shell
    G.remove_edges_from(nx.selfloop_edges(G))
    k_shell = nx.core_number(G)
    # Put them together in a dictionary
    measures = {"PE":PE,"PageRank":pagerank,"Degree Centrality":degree_centrality,"H-index":h_index,"K-shell":k_shell}
    return measures

In [7]:
# Get the second order neighbor counts for each graph
second_order_counts_1 = second_order_neighbor_counts(G1)
second_order_counts_2 = second_order_neighbor_counts(G2)
second_order_counts_3 = second_order_neighbor_counts(G3)

In [21]:
# Call the function for each graph
measures_1 = get_measures(G=G1,second_order_counts=second_order_counts_1)
measures_2 = get_measures(G=G2,second_order_counts=second_order_counts_2)
measures_3 = get_measures(G=G3,second_order_counts=second_order_counts_3)

In [22]:
# Function to calculate prevalence_threshold
def get_prevalence_threshold(G):
    A_sparse = nx.to_scipy_sparse_array(G, format='csr', dtype='float64')
    lambda1 = eigs(A_sparse, k=1, which='LR', return_eigenvectors=False, maxiter=100000, tol=1e-3)[0].real
    prevalence_threshold = 1 / lambda1
    return prevalence_threshold
# Function to run the SIR model
def run_sir(G, seed, beta, gamma, max_steps=100):
    infected = set([seed])
    recovered = set()
    susceptible = set(G.nodes) - infected

    for _ in range(max_steps):
        new_infected = set()
        for node in infected:
            for neighbor in G.neighbors(node):
                if neighbor in susceptible and random.random() < beta:
                    new_infected.add(neighbor)
            if random.random() < gamma:
                recovered.add(node)
        infected = (infected - recovered).union(new_infected)
        susceptible -= new_infected
        if not infected:
            break

    return len(recovered)
# Function to get the spread results
def get_sir_results(measures, G, top_k, num_simulations, beta, gamma):
    spread_results = {}
    for name, vector in measures.items():
        top_nodes = sorted(vector.items(), key=lambda x: x[1], reverse=True)[:top_k]
        top_nodes = [node for node, _ in top_nodes]
        influence_scores = []
        for node in top_nodes:
            total_spread = 0
            for _ in range(num_simulations):
                spread = run_sir(G=G, seed=node, beta=beta, gamma=gamma)
                total_spread += spread
            avg_spread = total_spread / num_simulations
            influence_scores.append(avg_spread)
        spread_results[name] = influence_scores
    return spread_results
# Function to get kendall coefficients
def kendall_scores(spread_results):
    kendall_scores = {}
    for name, influence_scores in spread_results.items(): 
        centrality_ranks = list(range(len(influence_scores)))
        sir_ranks = sorted(range(len(influence_scores)), key=lambda i: influence_scores[i], reverse=True)
        tau,_ = kendalltau(centrality_ranks, sir_ranks)
        kendall_scores[name] = tau
    return kendall_scores
# Function to run the sis model
def run_sis(G, seed, beta, gamma, max_steps=100):
    infected = set([seed])
    susceptible = set(G.nodes)
    susceptible.remove(seed)

    for _ in range(max_steps):
        new_infected = set()
        for node in infected:
            for neighbor in G.neighbors(node):
                if neighbor in susceptible and random.random() < beta:
                    new_infected.add(neighbor)
        # Infected nodes may recover (become susceptible again)
        still_infected = {node for node in infected if random.random() >= gamma}
        infected = still_infected.union(new_infected)
        susceptible -= new_infected
        susceptible |= (infected - still_infected)

        if not infected:
            break

    return len(set(G.nodes)) - len(susceptible)
# Function to get the sis results
def get_sis_results(measures, G, top_k, num_simulations, beta, gamma):
    spread_results = {}
    for name, vector in measures.items():
        top_nodes = sorted(vector.items(), key=lambda x: x[1], reverse=True)[:top_k]
        top_nodes = [node for node, _ in top_nodes]
        influence_scores = []
        for node in top_nodes:
            total_spread = 0
            for _ in range(num_simulations):
                spread = run_sis(G=G, seed=node, beta=beta, gamma=gamma)
                total_spread += spread
            avg_spread = total_spread / num_simulations
            influence_scores.append(avg_spread)
        spread_results[name] = influence_scores
    return spread_results

In [23]:
# Calculate the prevalence_threshold for each graph
prevalence_threshold_1 = get_prevalence_threshold(G1)
prevalence_threshold_2 = get_prevalence_threshold(G2)
prevalence_threshold_3 = get_prevalence_threshold(G3)

In [10]:
# Run the SIR model and get the kendall coefficients for each graph
spread_results_1 = get_sir_results(measures=measures_1, G=G1, top_k=50, num_simulations=1000, beta=prevalence_threshold_1, gamma=1)
kendall_coefficients_1 = kendall_scores(spread_results=spread_results_1)
spread_results_2 = get_sir_results(measures=measures_2, G=G2, top_k=50, num_simulations=1000, beta=prevalence_threshold_2, gamma=1)
kendall_coefficients_2 = kendall_scores(spread_results=spread_results_2)
spread_results_3 = get_sir_results(measures=measures_3, G=G3, top_k=50, num_simulations=1000, beta=prevalence_threshold_3, gamma=1)
kendall_coefficients_3 = kendall_scores(spread_results=spread_results_3)

In [11]:
kendall_coefficients_1, kendall_coefficients_2, kendall_coefficients_3

({'PE': 0.746938775510204,
  'PageRank': 0.4873469387755102,
  'Degree Centrality': 0.6587755102040816,
  'H-index': 0.6146938775510205,
  'K-shell': 0.17387755102040817},
 {'PE': 0.8808163265306123,
  'PageRank': 0.6,
  'Degree Centrality': 0.7893877551020408,
  'H-index': 0.42857142857142855,
  'K-shell': 0.29959183673469386},
 {'PE': 0.7959183673469388,
  'PageRank': 0.4971428571428571,
  'Degree Centrality': 0.6457142857142858,
  'H-index': 0.6555102040816326,
  'K-shell': 0.25387755102040815})

In [24]:
# Run the SIS model and get the kendall coefficients for each graph
SIS_results_1 = get_sis_results(measures=measures_1,G=G1,top_k=50,num_simulations=1000,beta=prevalence_threshold_1,gamma=0.8)
kendall_coefficients_1 = kendall_scores(spread_results=SIS_results_1)
SIS_results_2 = get_sis_results(measures=measures_2,G=G2,top_k=50,num_simulations=1000,beta=prevalence_threshold_2,gamma=0.8)
kendall_coefficients_2 = kendall_scores(spread_results=SIS_results_2)
SIS_results_3 = get_sis_results(measures=measures_3,G=G3,top_k=50,num_simulations=1000,beta=prevalence_threshold_3,gamma=0.8)
kendall_coefficients_3 = kendall_scores(spread_results=SIS_results_3)
kendall_coefficients_3

{'PE': 0.5510204081632653,
 'PageRank': 0.30775510204081635,
 'Degree Centrality': 0.41714285714285715,
 'H-index': 0.6326530612244898,
 'K-shell': 0.21959183673469387}

In [None]:
SIS_results_1 = get_sis_results(measures=measures_1,G=G1,top_k=50,num_simulations=1000,beta=prevalence_threshold_1,gamma=0.8)
kendall_coefficients_1 = kendall_scores(spread_results=SIS_results_1)
kendall_coefficients_1

In [84]:
kendall_coefficients_1, kendall_coefficients_2, kendall_coefficients_3

({'PE': nan,
  'PageRank': nan,
  'Degree Centrality': nan,
  'H-index': nan,
  'K-shell': nan},
 {'PE': nan,
  'PageRank': nan,
  'Degree Centrality': nan,
  'H-index': nan,
  'K-shell': nan},
 {'PE': nan,
  'PageRank': nan,
  'Degree Centrality': nan,
  'H-index': nan,
  'K-shell': nan})

In [19]:
def SLE_valuation(target_ratios,NPE,assettypes):
    NPE_df = pd.DataFrame(NPE.items(),columns=["id","value"])
    NPE_and_type = pd.merge(NPE_df,assettypes,on="id",how="left")
    total_value = NPE_and_type["value"].sum()
    group_totals = NPE_and_type.groupby("category")["value"].sum()
    scaling_factors = {t: (target_ratios[t] * total_value) / group_totals[t] for t in group_totals.index}
    NPE_and_type["adjusted_value"] = NPE_and_type.apply(lambda row: row["value"] * scaling_factors.get(row["category"],1), axis=1)
    NPE_values = NPE_and_type[["id","category","adjusted_value"]]
    return NPE_values

In [20]:
# Get the NPE final valuations
target_ratios = {"OT":0.40,"IoT":0.20,"IT":0.15,"Other":0.15,"Network":0.10,"Unspecified":0.0}
valuations_1 = NPE_valuation(target_ratios=target_ratios,NPE=measures_1["PE"],assettypes=assettypes_1)
valuations_2 = NPE_valuation(target_ratios=target_ratios,NPE=measures_2["PE"],assettypes=assettypes_2)
valuations_3 = NPE_valuation(target_ratios=target_ratios,NPE=measures_3["PE"],assettypes=assettypes_3)

In [21]:
# Check the percentages
# Dataset 1
total_values_1 = dict(valuations_1.groupby("category")["adjusted_value"].sum())
percentages_1 = {}
for i in total_values_1:
    percentages_1[i] = total_values_1[i]/sum(total_values_1.values())
# Dataset 2
total_values_2 = dict(valuations_2.groupby("category")["adjusted_value"].sum())
percentages_2 = {}
for i in total_values_2:
    percentages_2[i] = total_values_2[i]/sum(total_values_2.values())
# Dataset 3
total_values_3 = dict(valuations_3.groupby("category")["adjusted_value"].sum())
percentages_3 = {}
for i in total_values_3:
    percentages_3[i] = total_values_3[i]/sum(total_values_3.values())
percentages_1, percentages_2, percentages_3

({'IT': 0.15,
  'IoT': 0.19999999999999996,
  'Network': 0.1,
  'OT': 0.4,
  'Other': 0.15,
  'Unspecified': 0.0},
 {'IT': 0.25,
  'IoT': 0.33333333333333337,
  'Network': 0.16666666666666666,
  'Other': 0.25,
  'Unspecified': 0.0},
 {'IT': 0.15,
  'IoT': 0.2,
  'Network': 0.1,
  'OT': 0.4,
  'Other': 0.15,
  'Unspecified': 0.0})

In [44]:
# Test on the extra datasets found in Yu et al.
file_path_crime = "C:/Users/jesse/Downloads/download.tsv.moreno_crime/moreno_crime/out.moreno_crime_crime"
G_crime = nx.DiGraph()
file_path_netscience = "C:/Users/jesse/Downloads/download.tsv.dimacs10-netscience/dimacs10-netscience/out.dimacs10-netscience"
G_netscience = nx.DiGraph()
file_path_polbooks = "C:/Users/jesse/Downloads/download.tsv.dimacs10-polbooks/dimacs10-polbooks/out.dimacs10-polbooks"
G_polbooks = nx.DiGraph()
file_path_train = "C:/Users/jesse/Downloads/download.tsv.moreno_train/moreno_train/out.moreno_train_train"
G_train = nx.DiGraph()
file_path_dolphins = "C:/Users/jesse/Downloads/download.tsv.dolphins/dolphins/out.dolphins"
G_dolphins = nx.DiGraph()
file_path_uspowergrid = "C:/Users/jesse/Downloads/download.tsv.opsahl-powergrid/opsahl-powergrid/out.opsahl-powergrid"
G_uspowergrid = nx.DiGraph()
file_path_yeast = "C:/Users/jesse/Downloads/download.tsv.moreno_propro/moreno_propro/out.moreno_propro_propro"
G_yeast = nx.DiGraph()
data = {"Crime":(G_crime,file_path_crime),
          "Netscience":(G_netscience,file_path_netscience),
          "Polbooks":(G_polbooks,file_path_polbooks),
          "Train":(G_train,file_path_train),
          "Dolphins":(G_dolphins,file_path_dolphins),
          "Uspowergrip":(G_uspowergrid,file_path_uspowergrid),
          "Yeast":(G_yeast,file_path_yeast)}

for i in data:  
    graph = data[i][0]
    file_path = data[i][1]
    with open(file_path, "r") as f:
        for i, line in enumerate(f, 1):
            parts = line.strip().split()
            try:
                if len(parts) >= 2:
                    from_node = int(parts[0])
                    to_node = int(parts[1])
                    edge_data = {}

                    if len(parts) >= 3:
                        edge_data['weight'] = float(parts[2])
                    if len(parts) == 4:
                        edge_data['timestamp'] = int(parts[3])
                    graph.add_edge(from_node, to_node, **edge_data)
            except ValueError as e:
                print(f"Skipping line {i}: {parts} -> {e}")


Skipping line 1: ['%', 'bip', 'unweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 2: ['%', '1476', '829', '551'] -> invalid literal for int() with base 10: '%'
Skipping line 1: ['%', 'sym', 'unweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 1: ['%', 'sym', 'unweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 1: ['%', 'sym', 'posweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 2: ['%', '243', '64', '64'] -> invalid literal for int() with base 10: '%'
Skipping line 1: ['%', 'sym', 'unweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 1: ['%', 'sym', 'unweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 2: ['%', '6594', '4941', '4941'] -> invalid literal for int() with base 10: '%'
Skipping line 1: ['%', 'sym', 'unweighted'] -> invalid literal for int() with base 10: '%'
Skipping line 2: ['%', '2277', '1870', '1870'] -> invalid literal for int() with base 1

In [79]:
graphs = {}
for i in data:
    graphs[i] = data[i][0]
scores = {}
for i in graphs:   
    graph = graphs[i]
    second_order_counts = second_order_neighbor_counts(graph)
    measures = get_measures(G=graph,second_order_counts=second_order_counts)
    prevalence_threshold = get_prevalence_threshold(graph)
    spread_results = get_sir_results(measures=measures,G=graph,top_k=50,num_simulations=10000,beta=prevalence_threshold,gamma=1)
    kendall_coefficients = kendall_scores(spread_results=spread_results)
    scores[i] = kendall_coefficients
scores

KeyboardInterrupt: 

In [81]:
graphs = {"Crime":G_crime,"Netscience":G_netscience,"Polbooks":G_polbooks,"Train":G_train,"Dolphins":G_dolphins,"Yeast":G_yeast}
scores = {}
for i in graphs:   
    graph = graphs[i]
    second_order_counts = second_order_neighbor_counts(graph)
    measures = get_measures(G=graph,second_order_counts=second_order_counts)
    prevalence_threshold = get_prevalence_threshold(graph)
    spread_results = get_sis_results(measures=measures,G=graph,top_k=50,num_simulations=1000,beta=prevalence_threshold,gamma=0.8)
    kendall_coefficients = kendall_scores(spread_results=spread_results)
    scores[i] = kendall_coefficients
scores

{'Crime': {'PE': 0.516734693877551,
  'PageRank': -0.022040816326530613,
  'Degree Centrality': 0.04816326530612245,
  'H-index': 0.28,
  'K-shell': 0.08081632653061226},
 'Netscience': {'PE': 0.2457142857142857,
  'PageRank': 0.7877551020408163,
  'Degree Centrality': 0.26857142857142857,
  'H-index': 0.18040816326530612,
  'K-shell': 0.5151020408163265},
 'Polbooks': {'PE': 0.43183673469387757,
  'PageRank': -0.19020408163265307,
  'Degree Centrality': 0.15918367346938775,
  'H-index': 0.3142857142857143,
  'K-shell': 0.4416326530612245},
 'Train': {'PE': 0.9004081632653061,
  'PageRank': -0.16408163265306122,
  'Degree Centrality': 0.4759183673469388,
  'H-index': 0.7093877551020409,
  'K-shell': 0.6293877551020408},
 'Dolphins': {'PE': 0.7485714285714286,
  'PageRank': -0.2653061224489796,
  'Degree Centrality': 0.005714285714285714,
  'H-index': 0.2440816326530612,
  'K-shell': -0.40244897959183673},
 'Yeast': {'PE': 0.47265306122448986,
  'PageRank': 0.1689795918367347,
  'Degree