In [1]:
import sys
import os
import numpy as np
import pygod
from pygod.utils import load_data
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
import torch_sparse
from torch_sparse import SparseTensor
from typing import List
from sklearn.metrics import roc_auc_score
from gad_adversarial_robustness.utils.graph_utils import prepare_graph, get_n_anomaly_indexes, load_anomaly_detection_dataset
from gad_adversarial_robustness.poison.greedy import multiple_AS, poison_attack

import argparse
import scipy.sparse as sp
from scipy.sparse import coo_matrix
from torch_geometric.utils.convert import from_scipy_sparse_matrix

# --------- related to dataset import
from pygod.generator import gen_contextual_outlier, gen_structural_outlier
from torch_geometric.datasets import AttributedGraphDataset

# --------- jaccard
from gad_adversarial_robustness.gad.dominant.dominant_cuda_Jaccard_similarity import Dominant

In [2]:
NUM_CLASSES = 2
PRELOADED_EDGE_INDEX = False
EDGE_INDEX_PT = "300_budget_greedy_edge_index.pt"

#### Import Data

In [3]:
# ------- datasets: 
data_name = ["Cora", "Wiki", "Facebook", "Blogcatalog"] # blogcataog not an option yet
string = data_name[2]

dataset = AttributedGraphDataset(root = "data/"+string, name = string)
data = dataset[0]
clean_data = data # save for later use

# ------- inject dataset through pygod:
seed = None
num_nodes_to_inject = 20
num_nodes_per_clique = 10
num_cliques = (num_nodes_to_inject // 2) // num_nodes_per_clique
num_contextual_outliers = num_nodes_to_inject - num_cliques * num_nodes_per_clique

data, ya = gen_contextual_outlier(data, n = num_contextual_outliers, k = 50, seed = seed) 
#n (int) – Number of nodes converting to outliers.
#k (int) – Number of candidate nodes for each outlier node.

data, ys = gen_structural_outlier(data, m = num_nodes_per_clique, n = num_cliques, seed = seed)
#m (int) - Number nodes in the outlier cliques.
#n (int) - Number of outlier clique

data.y = torch.logical_or(ys, ya).long() # where ys = y structural and ya = y attribute

y_binary: List[int] = data.y.bool()
anomaly_list = np.where(y_binary == True)[0]  # Used for list for which nodes to hide

In [4]:
data

Data(x=[4039, 1283], edge_index=[2, 88324], y=[4039])

#### Modify data

In [5]:
# -----> to obtain the dense_adj matrix from edge_index tensor 
edge_weight = torch.ones(data.edge_index.size(1))
edge_weight = edge_weight.cpu()
print(edge_weight.shape) # 11060

adj = sp.csr_matrix((edge_weight, data.edge_index), (data.num_nodes, data.num_nodes))
print(adj.size) # 11054

adj = torch_sparse.SparseTensor.from_scipy(adj).coalesce().to("cpu")

# adj matrix based on edge_index
data.adj = adj.to_dense()

torch.Size([88324])
88324


In [6]:
print(data.adj)
print(data.adj.shape)

tensor([[0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([4039, 4039])


### Posion

In [7]:
# ----- compute new or load in poisoned data
if PRELOADED_EDGE_INDEX is False :

    # truth, of type int list, is instantiated to the T/F labels indicating whether a node is an anomalous node
    truth: List[int] = data.y.bool()

    print("Create poison compatible adjacency matrix...") # based on code from: https://github.com/zhuyulin-tony/BinarizedAttack/blob/main/src/Greedy.py
    triple = []
    for i in range(data.num_nodes): # Cora has 2708 nodes
        for j in range(i + 1, data.num_nodes):
            triple.append([i, j, data.adj[i,j]])  #Fill with 0, then insert actual after

    # convert tripple to numpy array
    triple = np.array(triple)

    # These are the nodes we try reduce the "active subnetwork score" for (i.e. disguising anonomalous nodes)
    target_node_lst = get_n_anomaly_indexes(truth, 999) # the indexes of the anomalies

    # print(type(target_node_lst)), print(f'target node list: {target_node_lst}'), print(target_node_lst)

    print("Making model...")
    model = multiple_AS(target_lst = target_node_lst, n_node = data.num_nodes, device = 'cpu')
    budget = 100  # The amount of edges to change


    print("Starting attack...")
    adj_adversary, _, _ = poison_attack(model, triple, budget)


    print("Converting to compatible tensor...")

    # Create Edge Index'
    edge_index = torch.tensor([[],[]])

    # Transpose it to make shape compatible
    transposed_adj_adversary = torch.transpose(adj_adversary, 0, 1)

    for i in range(len(adj_adversary)):
        if(adj_adversary[i][2] != 0):   #If edge value is not 0 (no edge)
            #Add edge to edge index, choosing first 2 elements (edges), and then the ith edge
            edge_index = torch.cat((edge_index, transposed_adj_adversary[:2, i:i+1]), -1)
            # Dataset uses edges both ways so add reverse edge as well
            edge_index = torch.cat((edge_index, torch.flip(transposed_adj_adversary[:2, i:i+1], dims=[0])), -1)


    edge_index = edge_index.type(torch.int64)
    data.edge_index = edge_index # assign to dataset obj

else : 
    data.edge_index = torch.load(EDGE_INDEX_PT)

Create poison compatible adjacency matrix...
Anomalies indexes: [  11  217  318  668  934 1670 1685 1710 2084 2334 2360 2661 2778 2961
 2963 3326 3476 3536 3631 3702]
Making model...
Starting attack...


  return torch.sparse.mm(torch.sparse.mm(A_sp, A_sp), A_sp).to_dense()
  E = torch.sum(A, 1) + 0.5 * torch.diag(self.sparse_matrix_power(A, 3)).T


initial anomaly score: 106.60519312429084
Iteration: 1 --- Anomaly score: 106.57597807740956
Iteration: 2 --- Anomaly score: 106.45028160148823
Iteration: 3 --- Anomaly score: 106.4160821603449
Iteration: 4 --- Anomaly score: 106.29799505068871
Iteration: 5 --- Anomaly score: 106.16275784404255
Iteration: 6 --- Anomaly score: 106.04575996098119
Iteration: 7 --- Anomaly score: 105.92977189159757
Iteration: 8 --- Anomaly score: 105.7995460430152
Iteration: 9 --- Anomaly score: 105.68213118733848
Iteration: 10 --- Anomaly score: 105.55238190689516
Iteration: 11 --- Anomaly score: 105.43609091554195
Iteration: 12 --- Anomaly score: 105.31848432329144
Iteration: 13 --- Anomaly score: 105.19952063037036
Iteration: 14 --- Anomaly score: 105.0776321666385
Iteration: 15 --- Anomaly score: 104.95407621874465
Iteration: 16 --- Anomaly score: 104.83611051273868
Iteration: 17 --- Anomaly score: 104.55144451090808
Iteration: 18 --- Anomaly score: 104.37101774261379
Iteration: 19 --- Anomaly score: 1

In [8]:
print(data)
print(clean_data)

Data(x=[4039, 1283], edge_index=[2, 176626], y=[4039], adj=[4039, 4039])
Data(x=[4039, 1283], edge_index=[2, 176626], y=[4039], adj=[4039, 4039])


# GAD: DOMINANT JACCARD SIMILARITY

In [9]:
# num_nodes = data.y.shape[0] # note: data already has attribute .num_nodes

# # ---------- make adj matrix from data in coo format
# edge_weight = torch.ones(data.edge_index.size(1))
# edge_weight = edge_weight.cpu()

# adj = sp.csr_matrix((edge_weight, data.edge_index), (data.num_nodes, data.num_nodes))

In [10]:
# # make a train/validation/test split #
# labels = data.y

# # ----------- create new masks with specified split 
# split = [0.6, 0.2, 0.2]
# train_ratio = split[0]
# val_ratio = split[1]
# test_ratio = split[2]

# # Create a random permutation of node indices
# node_indices = torch.randperm(num_nodes)
# print(node_indices)

# # Calculate the split indices
# train_size = int(num_nodes * train_ratio)
# val_size = int(num_nodes * val_ratio)
# test_size = num_nodes - train_size - val_size

#     # Create new masks based on the split indices
# new_train_mask = torch.zeros(num_nodes, dtype=torch.bool)
# new_train_mask[node_indices[:train_size]] = True

# new_val_mask = torch.zeros(num_nodes, dtype=torch.bool)
# new_val_mask[node_indices[train_size:train_size+val_size]] = True

# new_test_mask = torch.zeros(num_nodes, dtype=torch.bool)
# new_test_mask[node_indices[train_size+val_size:]] = True

# # Assign the new masks to the data object
# data.train_mask = new_train_mask
# data.val_mask = new_val_mask
# data.test_mask = new_test_mask

# # Extract the new indices for the training, validation, and test sets
# idx_train = np.where(data.train_mask == True)[0]
# idx_val = np.where(data.val_mask == True)[0]
# idx_test = np.where(data.test_mask == True)[0]

# # from the RTGNN github ----------------------------------------------
# train_labels = labels[idx_train]
# val_labels = labels[idx_val]

# # Concatenating training and validation labels
# train_val_labels = np.concatenate([train_labels, val_labels],axis=0)

# # Concatenating training and validation masks
# idx = np.concatenate([idx_train, idx_val],axis=0)


# # # convert the T/F labels to integers --> used in the following call of "noisyfy_with_P"
# train_val_labels_int = train_val_labels.astype(int)
# # idx_int = idx.astype(int)