In [3]:
# LIBRARIES
import math
import random
import math
import statistics
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as plt
import scipy as sp
import scipy.sparse as sparse
import seaborn as sns
from tqdm import tqdm

In [37]:
def deanon_ratio(G, H):
    """Run the secGraph de-anonymization attack on ``H`` and return its ratio.

    Both graphs are written as edge lists (``orig.edges`` and ``new.edges``)
    in the current working directory, 5% of the nodes of ``G`` are drawn at
    random as seeds and written to ``seeds.txt``, and ``secGraph.jar`` is run
    on those three files.

    Every seed line maps a node label to itself, so ``G`` and ``H`` must use
    the same node labels for the seeds to be meaningful.

    Parameters
    ----------
    G : networkx graph
        The original graph.
    H : networkx graph
        The anonymized / forged graph.

    Returns
    -------
    float
        ``a / b`` where ``"a/b"`` is the third space-separated token printed
        by secGraph.
        NOTE(review): presumably ``a`` is the number of correctly
        re-identified nodes and ``b`` the total; confirm against the
        secGraph documentation.

    Raises
    ------
    subprocess.CalledProcessError
        If the java process exits with a non-zero status (for example when
        ``secGraph.jar`` is not in the working directory).
    FileNotFoundError
        If the ``java`` executable itself cannot be found.
    """
    # Imported locally: the notebook's import cell never defined the `sproc`
    # alias the original code relied on, so the call raised NameError.
    import subprocess

    nx.write_weighted_edgelist(G, "orig.edges")
    nx.write_weighted_edgelist(H, "new.edges")

    n = len(G.nodes())
    seed_nodes = random.sample(sorted(G.nodes()), int(round(0.05 * n)))
    with open("seeds.txt", "w") as seed_file:
        for sn in seed_nodes:
            seed_file.write(str(sn) + " " + str(sn) + "\n")

    # Argument list instead of a shell string: no quoting pitfalls and no
    # dependency on the platform shell.
    cmd = [
        "java", "-jar", "secGraph.jar",
        "-m", "d",
        "-a", "DV",
        "-gA", "orig.edges",
        "-gB", "new.edges",
        "-seed", "seeds.txt",
        "-bSize", str(n),
        "-nKeep", str(n),
        "-gO", "out.txt",
        "-am", "stack",
    ]
    # check_output returns bytes on Python 3; decode before splitting on " ".
    res = subprocess.check_output(cmd).decode()
    sval = res.split(" ")[2].split("/")
    return float(sval[0]) / float(sval[1])

In [4]:
# SGF

_SGF_TRANSFORMATIONS = (
    "identity",
    "modularity",
    "laplacian",
    "signless laplacian",
    "General Zagreb",
    "Seidel",
    "Sum connectivity",
    "Transition RW",
    "Bethe-Hessian",
)
_SGF_NORMALIZATIONS = ("logistic", "truncate", "scale")


def _non_backtracking_radius(G, node_index):
    """Return the spectral radius of the non-backtracking matrix of ``G``."""
    arcs = list(G.to_directed().edges)
    if not arcs:
        return 0.0
    src = np.array([node_index[u] for u, _ in arcs])
    dst = np.array([node_index[v] for _, v in arcs])
    # B[e, f] = 1 when arc f starts where arc e ends and f is not e reversed.
    B = (dst[:, None] == src[None, :]) & (src[:, None] != dst[None, :])
    return float(np.max(np.abs(np.linalg.eigvals(B.astype(float)))))


def _edge_probabilities(A_tilde, normalization_type, k):
    """Map the approximated adjacency matrix to per-entry edge probabilities."""
    if normalization_type == "logistic":
        # Overflow in exp() just means the probability underflows to 0.
        with np.errstate(over="ignore"):
            return 1.0 / (1.0 + np.exp((0.5 - A_tilde) * k))
    if normalization_type == "truncate":
        return np.clip(A_tilde, 0.0, 1.0)
    # "scale": min-max rescaling, with the extrema computed once.
    lowest, highest = A_tilde.min(), A_tilde.max()
    if highest == lowest:
        # Constant matrix: the original 0/0 produced NaNs, which never pass
        # the Bernoulli test, i.e. no edges. Return that result explicitly.
        return np.zeros_like(A_tilde)
    return (A_tilde - lowest) / (highest - lowest)


def SGF(G, transformation = "modularity", alpha = 0.8, normalization_type = "truncate", k = 6):
    """Generate a random graph from a spectral approximation of ``G``.

    Steps: (1) turn the unweighted adjacency matrix ``A`` of ``G`` into a
    symmetric matrix ``M`` chosen by ``transformation``; (2) rebuild ``M``
    from ``ceil(alpha * n)`` of its eigenpairs; (3) invert the transformation
    to get an approximate adjacency matrix; (4) normalize its entries into
    probabilities; (5) draw every undirected edge as a Bernoulli trial.

    Parameters
    ----------
    G : networkx graph
        Input graph. Edge weights are ignored so that ``A`` agrees with the
        (unweighted) node degrees used by the transformations.
    transformation : str
        One of "identity", "modularity", "laplacian", "signless laplacian",
        "General Zagreb", "Seidel", "Sum connectivity", "Transition RW",
        "Bethe-Hessian".
    alpha : float in [0, 1]
        Fraction of eigenpairs kept.
    normalization_type : str
        "truncate" (clip to [0, 1]), "logistic" (sigmoid centred at 0.5 with
        steepness ``k``) or "scale" (min-max rescaling).
    k : float
        Steepness of the logistic normalization; unused otherwise.

    Returns
    -------
    networkx.Graph
        Undirected graph without self-loops on the same node labels as ``G``
        (row ``i`` of the matrices corresponds to the ``i``-th node of ``G``).
        An empty graph is returned when ``G`` has no nodes.

    Raises
    ------
    ValueError
        For an unknown ``transformation`` / ``normalization_type``, an
        ``alpha`` outside [0, 1], or a Bethe-Hessian request on a graph whose
        non-backtracking spectral radius is zero (e.g. no edges).
    numpy.linalg.LinAlgError
        For "Transition RW" on a graph with isolated nodes (the degree matrix
        is singular).

    Notes
    -----
    Sampling uses the ``random`` module; call ``random.seed`` beforehand for
    reproducible output.
    """
    if transformation not in _SGF_TRANSFORMATIONS:
        raise ValueError(
            f"unknown transformation {transformation!r}; "
            f"expected one of {_SGF_TRANSFORMATIONS}"
        )
    if normalization_type not in _SGF_NORMALIZATIONS:
        raise ValueError(
            f"unknown normalization_type {normalization_type!r}; "
            f"expected one of {_SGF_NORMALIZATIONS}"
        )
    if not 0 <= alpha <= 1:
        raise ValueError(f"alpha must lie in [0, 1], got {alpha!r}")

    nodes = list(G.nodes())
    n = len(nodes)
    if n == 0:
        return nx.Graph()

    #____________________________________________
    # INPUT MATRIX

    # weight=None: degrees below are unweighted, so A must be 0/1 as well
    # (some generators, e.g. recent karate_club_graph, carry edge weights).
    A = nx.adjacency_matrix(G, nodelist=nodes, weight=None).toarray().astype(float)
    degrees = np.array([G.degree[node] for node in nodes], dtype=float)
    D = np.diag(degrees)

    # Each branch defines the forward matrix M and `invert`, which maps the
    # approximated M back to an approximated adjacency matrix.
    if transformation == "identity":
        M = A
        invert = lambda M_tilde: M_tilde

    elif transformation == "modularity":
        two_m = degrees.sum()
        # With no edges the null model is identically zero (avoids 0/0).
        null_model = np.outer(degrees, degrees) / two_m if two_m > 0 else np.zeros((n, n))
        M = A - null_model
        invert = lambda M_tilde: M_tilde + null_model

    elif transformation == "laplacian":
        M = D - A
        invert = lambda M_tilde: D - M_tilde

    elif transformation == "signless laplacian":
        M = D + A
        invert = lambda M_tilde: M_tilde - D

    elif transformation == "General Zagreb":
        D_cubed = D ** 3  # element-wise; equals the matrix cube for a diagonal matrix
        M = D_cubed + A
        invert = lambda M_tilde: M_tilde - D_cubed

    elif transformation == "Seidel":
        A_complement = np.ones((n, n)) - A - np.eye(n)
        M = A - A_complement
        invert = lambda M_tilde: M_tilde + A_complement

    elif transformation == "Sum connectivity":
        # NOTE(review): entries are 1 / sqrt(d_i * d_j) (degree product), not
        # 1 / sqrt(d_i + d_j); confirm which matrix was intended.
        edge_mask = A == 1
        degree_scale = np.sqrt(np.outer(degrees, degrees))
        M = np.zeros((n, n))
        M[edge_mask] = A[edge_mask] / degree_scale[edge_mask]

        def invert(M_tilde):
            # Only entries that are edges of G are mapped back; all other
            # entries stay 0.
            A_back = np.zeros((n, n))
            A_back[edge_mask] = M_tilde[edge_mask] * degree_scale[edge_mask]
            return A_back

    elif transformation == "Transition RW":
        if np.any(degrees == 0):
            raise np.linalg.LinAlgError(
                "Transition RW needs every node to have non-zero degree"
            )
        # P = D^-1 A is not symmetric, so it must not be passed to eigh().
        # N = D^-1/2 A D^-1/2 is symmetric and similar to P
        # (P = D^-1/2 N D^1/2): truncating the spectrum of N truncates the
        # spectrum of P, and D @ P_tilde = D^1/2 N_tilde D^1/2.
        sqrt_outer = np.outer(np.sqrt(degrees), np.sqrt(degrees))
        M = A / sqrt_outer
        invert = lambda M_tilde: M_tilde * sqrt_outer

    else:  # "Bethe-Hessian"
        node_index = {node: i for i, node in enumerate(nodes)}
        rho = _non_backtracking_radius(G, node_index)
        if rho <= 0:
            raise ValueError(
                "Bethe-Hessian is undefined when the non-backtracking "
                "spectral radius is zero"
            )
        r = math.sqrt(rho)
        I = np.eye(n)
        M = (r ** 2 - 1) * I - r * A + D
        invert = lambda M_tilde: ((r ** 2 - 1) * I - M_tilde + D) / r

    #____________________________________________
    # LOW RANK ALPHA APPROXIMATION

    # eigh returns eigenvalues in ascending order with eigenvectors as columns.
    # NOTE(review): this keeps the algebraically smallest eigenvalues, as the
    # original code did; confirm this is the intended selection.
    eigenvalues, eigenvectors = np.linalg.eigh(M)
    rank = math.ceil(alpha * n)
    kept = eigenvectors[:, :rank]
    M_tilde = (kept * eigenvalues[:rank]) @ kept.T

    #____________________________________________
    # BACK TRANSFORMATION + NORMALIZATION

    A_tilde = invert(M_tilde)
    A_dots = _edge_probabilities(A_tilde, normalization_type, k)

    #____________________________________________
    # ADJACENCY MATRIX GENERATION

    # One Bernoulli draw per unordered pair, using the lower-triangle
    # probability (the entry that decided the outcome in the original double
    # loop, which drew every pair twice and kept only the second draw).
    rows, cols = np.tril_indices(n, k=-1)
    draws = np.fromiter(
        (random.random() for _ in range(rows.size)), dtype=float, count=rows.size
    )
    linked = draws < A_dots[rows, cols]
    A_prime = np.zeros((n, n))
    A_prime[rows[linked], cols[linked]] = 1
    A_prime[cols[linked], rows[linked]] = 1

    # from_numpy_array labels nodes 0..n-1; restore the labels of G so the
    # forged graph can be compared node-by-node with the original.
    W = nx.from_numpy_array(A_prime)
    return nx.relabel_nodes(W, dict(enumerate(nodes)))

In [38]:
# TEST - MOVIE GRAPH

# Build an unweighted graph from the movie edge list: only the "source" and
# "target" columns are used, the remaining columns are ignored.
movies_edges = pd.read_csv("movies_edges.csv", delimiter=",")
movies_edges.columns = ["source", "target","label", "movie_id", "weight"]
movies = nx.from_pandas_edgelist(movies_edges, "source", "target")

print(movies)
#nx.draw_kamada_kawai(movies)
#plt.pyplot.show()

W = SGF(movies, transformation = "modularity", alpha = 0.9, normalization_type = "truncate", k = 6)

print(W)
#nx.draw_kamada_kawai(W)

# Store the result under its own name: assigning it to `deanon_ratio` would
# overwrite the function and break every later call to it.
movies_deanon_ratio = deanon_ratio(movies, W)
print(movies_deanon_ratio)

Graph with 99 nodes and 317 edges
Graph with 99 nodes and 389 edges


Error: Unable to access jarfile secGraph.jar


CalledProcessError: Command 'java -jar secGraph.jar -m d -a DV -gA orig.edges         -gB new.edges -seed seeds.txt -bSize 99 -nKeep         99 -gO out.txt -am stack' returned non-zero exit status 1.

In [1]:
# STATISTICS COMPUTATION (LONG TO RUN)

def statistics(graphs, sample_size = 100, alphas = (0.5, 0.7, 0.9),
               transformations = ("identity", "modularity", "laplacian", "Bethe-Hessian")):
    """Compare SGF-generated graphs with their originals.

    For every ``alpha``, graph and transformation, ``sample_size`` graphs are
    generated with ``SGF`` and compared with the original graph.

    NOTE(review): this function shadows the standard-library ``statistics``
    module imported at the top of the notebook; the name is kept because
    other cells call it.

    Parameters
    ----------
    graphs : iterable of (str, networkx graph)
        ``(name, graph)`` pairs; ``name`` ends up in the ``g_name`` column.
    sample_size : int
        Number of graphs generated per (alpha, graph, transformation).
    alphas : iterable of float
        ``alpha`` values passed to ``SGF``.
    transformations : iterable of str
        ``transformation`` values passed to ``SGF``.

    Returns
    -------
    dict
        Maps each alpha to a ``pandas.DataFrame`` with one row per generated
        graph and the columns ``transformation``, ``g_name``,
        ``avg_clustering_ratio_obs`` (average clustering of the sample over
        that of the original; NaN when the original's is 0),
        ``modularity_communities_ratio_obs`` (number of greedy-modularity
        communities of the sample over that of the original) and
        ``De-anonymization ratio`` (result of ``deanon_ratio``).
    """
    columns = [
        "transformation",
        "g_name",
        "avg_clustering_ratio_obs",
        "modularity_communities_ratio_obs",
        "De-anonymization ratio",
    ]
    transformations = list(transformations)

    #____________________________________________
    # BASELINES
    # Computed once per graph instead of once per alpha. Materializing the
    # input also lets `graphs` be a one-shot iterable.
    baselines = [
        (
            name,
            g,
            nx.average_clustering(g),
            len(nx.community.greedy_modularity_communities(g)),
        )
        for name, g in graphs
    ]

    #____________________________________________
    # GRAPH STATISTICS
    alpha_dict = {}
    for alpha in tqdm(list(alphas)):
        records = []
        for name, g, base_clustering, base_communities in baselines:
            for transformation in tqdm(transformations):
                for _ in range(sample_size):
                    w = SGF(g, transformation, alpha)
                    communities = len(nx.community.greedy_modularity_communities(w))
                    if base_clustering:
                        clustering_ratio = nx.average_clustering(w) / base_clustering
                    else:
                        # The ratio is undefined for an original without triangles.
                        clustering_ratio = float("nan")
                    records.append({
                        "transformation": transformation,
                        "g_name": name,
                        "avg_clustering_ratio_obs": clustering_ratio,
                        "modularity_communities_ratio_obs": communities / base_communities,
                        "De-anonymization ratio": deanon_ratio(g, w),
                    })
        alpha_dict[alpha] = pd.DataFrame(records, columns=columns)
    return alpha_dict


In [18]:
# CALL STATISTICS FUNCTION

facebook_edges = pd.read_csv("facebookedgelist.csv", delimiter=";")
facebook_edges.columns = ["source", "target"]
facebook = nx.from_pandas_edgelist(facebook_edges, "source", "target")

# creating graphs list
# NOTE(review): LFR_benchmark_graph is random and unseeded here, so the "LFR"
# graph differs between runs; pass `seed=` once a value known to converge has
# been picked.
graphs = [
    ("karate", nx.karate_club_graph()),
    ("LFR", nx.LFR_benchmark_graph(300, 3, 1.5, 1, min_degree = 2)),
    ("windmill", nx.windmill_graph(300, 6)),
    ("facebook", facebook),
]
data = statistics(graphs)


100%|██████████| 4/4 [00:03<00:00,  1.05it/s]
100%|██████████| 4/4 [04:30<00:00, 67.71s/it]
 50%|█████     | 2/4 [15:06:52<15:06:52, 27206.16s/it]
  0%|          | 0/3 [15:11:27<?, ?it/s]


KeyboardInterrupt: 

In [36]:
sns.set_theme(style="ticks", palette="pastel")

# For every alpha, draw one box plot per metric: graphs on the x axis, one box
# per transformation. Each plot gets its own figure and is shown before the
# next one is drawn, so consecutive plots never end up on the same axes.
for alpha in data:
    alpha_df = data[alpha]
    for metric in ("avg_clustering_ratio_obs", "modularity_communities_ratio_obs"):
        fig, ax = plt.pyplot.subplots()
        sns.boxplot(x="g_name", y=metric,
                    hue="transformation", palette=["m", "g", "b", "r"],
                    data=alpha_df, ax=ax)
        ax.set_title(f"{metric} (alpha = {alpha})")
        sns.despine(ax=ax, offset=10, trim=True)
        plt.pyplot.show()

NameError: name 'data' is not defined

In [None]:
# SEABORN NESTED BOXPLOT EXAMPLE (toy "tips" data, unrelated to the SGF statistics)

sns.set_theme(style="ticks", palette="pastel")
tips = {
    'total_bill': [16.99, 10.34, 21.01, 23.68, 24.59],
    'tip': [1.01, 1.66, 3.50, 3.31, 3.61],
    'sex': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'smoker': ['No', 'Yes', 'No', 'No', 'Yes'],
    'day': ['Sun', 'Sun', 'Mon', 'Mon', 'Sun'],
    'time': ['Lunch', 'Dinner', 'Dinner', 'Dinner', 'Dinner'],
    'size': [2, 3, 3, 2, 4]
}

# Create a DataFrame from the hand-written sample rows
tips_df = pd.DataFrame(tips)

# Draw a nested boxplot of the bill per day, split by smoker status.
# The DataFrame built above is what gets plotted (the original built it but
# then passed the raw dict).
sns.boxplot(x="day", y="total_bill",
            hue="smoker", palette=["m", "g"],
            data=tips_df)
sns.despine(offset=10, trim=True)
