In [35]:
import sys
import os
import numpy as np
import pygod
from pygod.utils import load_data
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
import torch_sparse
from torch_sparse import SparseTensor
from typing import List
from sklearn.metrics import roc_auc_score
from gad_adversarial_robustness.utils.graph_utils import prepare_graph, get_n_anomaly_indexes, load_anomaly_detection_dataset
from gad_adversarial_robustness.poison.greedy import multiple_AS, poison_attack

import argparse
import scipy.sparse as sp
from scipy.sparse import coo_matrix
from torch_geometric.utils.convert import from_scipy_sparse_matrix

# --------- related to dataset import
from pygod.generator import gen_contextual_outlier, gen_structural_outlier
from torch_geometric.datasets import AttributedGraphDataset

# --------- jaccard
from gad_adversarial_robustness.gad.dominant.dominant_cuda import Dominant
from gad_adversarial_robustness.gad.dominant.dominant_cuda_Jaccard_similarity import Dominant as DominantJaccard

# --------- setup
import yaml

In [36]:
# Notebook-wide switches, grouped by what they control.

# Path of the cached poisoned edge index; read back when PRELOADED_EDGE_INDEX is True.
EDGE_INDEX_PT = "100_budget_facebook_greedy_edge_index.pt"
# False: run the greedy attack and rebuild the edge index; True: load it from EDGE_INDEX_PT.
PRELOADED_EDGE_INDEX = False

# presumably "normal" vs. "anomalous" -- TODO confirm; no cell in this notebook reads it
NUM_CLASSES = 2

#### Import Data

In [37]:
# ------- datasets:
data_name = ["Cora", "Wiki", "Facebook", "Blogcatalog"]  # Blogcatalog is not an option yet
string = data_name[2]

dataset = AttributedGraphDataset(root="data/" + string, name=string)
data = dataset[0]
clean_data = data.clone()  # untouched copy, taken before any outlier is injected

# ------- inject outliers through pygod:
# A fixed seed makes the injected outliers (and therefore data.y) identical on every run.
# With seed=None each run labels a different set of nodes, so the labels could not match
# an edge index that was cached by an earlier run and reloaded via PRELOADED_EDGE_INDEX.
seed = 42
num_nodes_to_inject = 20
num_nodes_per_clique = 10
# Half of the budget goes to structural outliers (whole cliques only) ...
num_cliques = (num_nodes_to_inject // 2) // num_nodes_per_clique
# ... and whatever is left over becomes contextual (attribute) outliers.
num_contextual_outliers = num_nodes_to_inject - num_cliques * num_nodes_per_clique

# n: number of nodes converted to outliers; k: number of candidate nodes per outlier node
data, ya = gen_contextual_outlier(data, n=num_contextual_outliers, k=50, seed=seed)

# m: number of nodes in each outlier clique; n: number of outlier cliques
data, ys = gen_structural_outlier(data, m=num_nodes_per_clique, n=num_cliques, seed=seed)

# A node is anomalous if it is a structural (ys) or an attribute (ya) outlier.
data.y = torch.logical_or(ys, ya).long()

y_binary: torch.Tensor = data.y.bool()
anomaly_list = np.flatnonzero(y_binary.numpy())  # indexes of the anomalous nodes (nodes to hide)

In [38]:
# Summarise the graph after outlier injection next to the untouched copy,
# then report how many edge_index columns the injection added.
edges_after_injection = data.edge_index.size(1)
edges_in_clean_graph = clean_data.edge_index.size(1)

print('poisoned data:   ', data)
print('clean data:      ', clean_data)
print('edge_index diff: ', edges_after_injection - edges_in_clean_graph)

poisoned data:    Data(x=[4039, 1283], edge_index=[2, 88324], y=[4039])
clean data:       Data(x=[4039, 1283], edge_index=[2, 88234], y=[4039, 193])
edge_index diff:  90


#### adj matrix from edge_index

In [39]:
# Build a dense adjacency matrix from the edge_index tensor.
num_edges = data.edge_index.shape[1]
edge_weight = torch.ones(num_edges).cpu()  # one unit weight per edge_index column

# edge list -> scipy CSR matrix -> coalesced torch_sparse tensor kept on the CPU
node_count = data.num_nodes
adj = sp.csr_matrix((edge_weight, data.edge_index), shape=(node_count, node_count))
adj = SparseTensor.from_scipy(adj).coalesce().to("cpu")

# the dense form is stored on the data object for the poisoning cells
data.adj = adj.to_dense()

### Poison (compute OR load in poisoned data)

In [40]:
# truth is a boolean tensor with one entry per node; True marks an anomalous node.
# It is only needed when the attack is actually run, hence the guard.
if PRELOADED_EDGE_INDEX is False:
    truth: torch.Tensor = data.y.bool()
    print('truth: ', truth)

truth:  tensor([False, False, False,  ..., False, False, False])


In [41]:
if PRELOADED_EDGE_INDEX is False:
    print("Create poison compatible adjacency matrix...")  # based on code from: https://github.com/zhuyulin-tony/BinarizedAttack/blob/main/src/Greedy.py

    # One row [i, j, adj[i, j]] for every node pair with i < j.
    # np.triu_indices(n, k=1) lists those pairs in row-major order, i.e. the same order
    # as `for i in range(n): for j in range(i + 1, n)`, but without performing one
    # Python-level tensor lookup per pair.
    # NOTE(review): only the upper triangle of data.adj is read, so an edge stored
    # solely as (j, i) with j > i would not appear in triple -- confirm that is intended.
    upper_rows, upper_cols = np.triu_indices(data.num_nodes, k=1)
    dense_adj = data.adj.cpu().numpy()
    triple = np.column_stack(
        (upper_rows, upper_cols, dense_adj[upper_rows, upper_cols])
    ).astype(np.float64)
    print('tripple: ', triple)

Create poison compatible adjacency matrix...


tripple:  [[0.000e+00 1.000e+00 1.000e+00]
 [0.000e+00 2.000e+00 1.000e+00]
 [0.000e+00 3.000e+00 1.000e+00]
 ...
 [4.036e+03 4.037e+03 0.000e+00]
 [4.036e+03 4.038e+03 0.000e+00]
 [4.037e+03 4.038e+03 0.000e+00]]


In [42]:
if PRELOADED_EDGE_INDEX is False:
    # ------- Attack budget
    budget = 1  # The amount of edges to change

    # ------- Identify Anomalous Nodes
    # These are the nodes whose "Active Subnetwork" score the attack tries to reduce
    # (i.e. the anomalous nodes to disguise).
    # 999 is a flag value: with any other n the helper returns anomaly indexes 0 to n instead.
    all_anomalies_flag = 999
    target_node_lst = get_n_anomaly_indexes(truth, all_anomalies_flag)

    # ------- Making Model
    model = multiple_AS(target_lst=target_node_lst, n_node=data.num_nodes, device='cpu')

Anomalies indexes: [ 148  426  523  836  899 1153 1212 1365 1490 1521 1792 1906 1973 2192
 2403 2590 2888 3807 3811 4007]


In [44]:
# ------- Attack (GradMaxSearch, i.e. the greedy attack)
if PRELOADED_EDGE_INDEX is False:
    # poison_attack returns three values; only the first one, the modified/poisoned
    # adjacency data, is used by the following cells.
    attack_outputs = poison_attack(model, triple, budget)
    adj_adversary, _, _ = attack_outputs

    

initial anomaly score: 93.62191966375084
Iteration: 1 --- Anomaly score: 93.52029468209719


In [45]:
# -------- post-attack processing --> i.e. "Converting to compatible tensor"
if PRELOADED_EDGE_INDEX is False:
    # Swap the two axes so that single pairs can be sliced out as columns
    transposed_adj_adversary = adj_adversary.transpose(0, 1)

    # Start the new edge index as an empty 2 x 0 tensor
    edge_index = torch.empty((2, 0))

In [46]:
if PRELOADED_EDGE_INDEX is False:
    # adj_adversary holds one row [i, j, value] per node pair; value 0 means "no edge".
    # Select every surviving pair with a single mask instead of extending edge_index
    # with torch.cat once per edge, which re-copies the whole tensor on every iteration.
    kept_pairs = adj_adversary[adj_adversary[:, 2] != 0]
    forward_edges = kept_pairs[:, :2].t().to(torch.int64)  # 2 x E, rows = (source, target)

    # The dataset stores edges in both directions, so each (i, j) is followed by (j, i).
    # Stacking on a trailing axis and flattening it interleaves the two directions,
    # which reproduces the column order of the per-edge construction.
    reverse_edges = forward_edges.flip(0)
    edge_index = torch.stack((forward_edges, reverse_edges), dim=2).reshape(
        2, 2 * forward_edges.size(1)
    )
    data.edge_index = edge_index  # assign to dataset obj

    # ---- SAVE the edge index data under the same path the cached-load cell reads:
    torch.save(data.edge_index, EDGE_INDEX_PT)

In [47]:
# Reuse a previously saved poisoned edge index instead of running the attack.
if PRELOADED_EDGE_INDEX is True:
    cached_edge_index = torch.load(EDGE_INDEX_PT)
    data.edge_index = cached_edge_index

In [49]:
# Summarise the graph after the poisoning step next to the untouched copy,
# then report the difference in the number of edge_index columns.
poisoned_edge_count = data.edge_index.size(1)
clean_edge_count = clean_data.edge_index.size(1)

print('poisoned data:   ', data)
print('clean data:      ', clean_data)
print('edge_index diff: ', poisoned_edge_count - clean_edge_count)

poisoned data:    Data(x=[4039, 1283], edge_index=[2, 176556], y=[4039], adj=[4039, 4039])
clean data:       Data(x=[4039, 1283], edge_index=[2, 88234], y=[4039, 193])
edge_index diff:  88322


# GAD: DOMINANT JACCARD SIMILARITY

In [50]:
# Locate ../configs/dominant_config.yaml relative to the current working directory
# and parse it into `config`.
script_dir = os.getcwd()
config_parts = ('..', 'configs', 'dominant_config.yaml')
yaml_path = os.path.join(script_dir, *config_parts)

with open(yaml_path) as file:
    config = yaml.safe_load(file)

In [51]:
# Fit the unmodified DOMINANT detector twice: first on the untouched graph, then on the
# poisoned one. Both runs follow exactly the same steps, so one loop drives them; the
# module-level names (adj, attrs, label, adj_label, edge_index, model, dataset) end up
# bound to the values of the last (poisoned) run.
# NOTE(review): clean_data is cloned before outlier injection and its y has shape
# [4039, 193] in the recorded output, i.e. it is not the binary anomaly labelling --
# confirm that this is the intended baseline.
dominant_runs = (
    ("DOMINANT on CLEAN data:", clean_data, False),
    ("DOMINANT on POISONED data:", data, True),
)

for banner, dataset, fit_verbose in dominant_runs:
    print(banner)
    adj, attrs, label, adj_label = load_anomaly_detection_dataset(dataset)
    # edge index rebuilt from the non-zero entries of the returned adjacency matrix
    edge_index = torch.LongTensor(np.array(sp.coo_matrix(adj).nonzero()))
    adj_label = torch.FloatTensor(adj_label)
    attrs = torch.FloatTensor(attrs)

    model = Dominant(feat_size=attrs.size(1), hidden_size=config['model']['hidden_dim'],
                     dropout=config['model']['dropout'], device='cpu', edge_index=edge_index,
                     adj_label=adj_label, attrs=attrs, label=label)

    model.fit(config, verbose=fit_verbose)

DOMINANT on CLEAN data:


KeyboardInterrupt: 

In [None]:
# Jaccard-similarity variant of DOMINANT (DominantJaccard), fitted on the clean graph
# and then on the poisoned graph.
jaccard_threshold = 0.01
model_device = config['model']['device']

# Both runs follow the same steps, so one loop drives them; the module-level names
# (adj, adj_label, edge_index, label, attrs, model, dataset) end up bound to the values
# of the last (poisoned) run.
jaccard_runs = (
    ("DOMINANT modified w/ Jaccard on CLEAN data:", clean_data),
    ("DOMINANT modified w/ Jaccard on POISONED data:", data),
)

for banner, dataset in jaccard_runs:
    print(banner)
    adj, _, _, adj_label = load_anomaly_detection_dataset(dataset, model_device)
    adj_label = torch.FloatTensor(adj_label).to(model_device)

    # Edge index and attributes are taken straight from the graph object rather than
    # rebuilt from the returned adjacency matrix.
    edge_index = dataset.edge_index.to(model_device)
    # .float() converts the boolean labels explicitly; the legacy torch.Tensor(...)
    # constructor is the float tensor type and is not meant for dtype conversion.
    label = dataset.y.bool().float().to(model_device)
    attrs = dataset.x.to(model_device)

    model = DominantJaccard(feat_size=attrs.size(1), hidden_size=config['model']['hidden_dim'],
                            dropout=config['model']['dropout'], device=model_device,
                            edge_index=edge_index, adj_label=adj_label, attrs=attrs, label=label)
    model.to(model_device)
    model.fit(config, threshold=jaccard_threshold, verbose=False)

In [48]:
# # ----- compute new or load in poisoned data
# if PRELOADED_EDGE_INDEX is False :

#     # truth, of type int list, is instantiated to the T/F labels indicating whether a node is an anomalous node
#     truth: List[int] = data.y.bool()
#     print('truth: ', truth)


#     print("Create poison compatible adjacency matrix...") # based on code from: https://github.com/zhuyulin-tony/BinarizedAttack/blob/main/src/Greedy.py
#     triple = []
#     for i in range(data.num_nodes): # for all nodes...
#         for j in range(i + 1, data.num_nodes):
#             triple.append([i, j, data.adj[i,j]])  # Fill with 0, then insert actual after

#     triple = np.array(triple) # convert to numpy array
#     print('tripple: ', triple)

#     # These are the nodes we try reduce the "active subnetwork score" for (i.e. disguising anonomalous nodes)
#     target_node_lst = get_n_anomaly_indexes(truth, 999) # the indexes of the anomalies (999 is just a flag, if not 999 then it gets from 0 to n anomaly indexes)

#     print("Making model...")
#     model = multiple_AS(target_lst = target_node_lst, n_node = data.num_nodes, device = 'cpu')
#     budget = 1  # The amount of edges to change

#     print("Starting attack...")
#     adj_adversary, _, _ = poison_attack(model, triple, budget) # returns modified/ poisoned adj matrix as 

#     print("Converting to compatible tensor...")
#     edge_index = torch.tensor([[],[]]) # Create new Edge Index

#     # Transpose it to make shape compatible
#     transposed_adj_adversary = torch.transpose(adj_adversary, 0, 1)

#     for i in range(len(adj_adversary)):
#         if(adj_adversary[i][2] != 0):   # If edge value is NOT 0 (0 meaning no edge)
#             #Add edge to edge index, choosing first 2 elements (edges), and then the ith edge
#             edge_index = torch.cat((edge_index, transposed_adj_adversary[:2, i:i+1]), -1)
#             # Dataset uses edges both ways so add reverse edge as well
#             edge_index = torch.cat((edge_index, torch.flip(transposed_adj_adversary[:2, i:i+1], dims=[0])), -1)


#     edge_index = edge_index.type(torch.int64)
#     data.edge_index = edge_index # assign to dataset obj

#     # ---- SAVE the edge index data: 
#     torch.save(data.edge_index, '100_budget_facebook_greedy_edge_index.pt')

# else : 
#     data.edge_index = torch.load(EDGE_INDEX_PT)

In [None]:
# num_nodes = data.y.shape[0] # note: data already has attribute .num_nodes

# # ---------- make adj matrix from data in coo format
# edge_weight = torch.ones(data.edge_index.size(1))
# edge_weight = edge_weight.cpu()

# adj = sp.csr_matrix((edge_weight, data.edge_index), (data.num_nodes, data.num_nodes))

In [None]:
# # make a train/validation/test split #
# labels = data.y

# # ----------- create new masks with specified split 
# split = [0.6, 0.2, 0.2]
# train_ratio = split[0]
# val_ratio = split[1]
# test_ratio = split[2]

# # Create a random permutation of node indices
# node_indices = torch.randperm(num_nodes)
# print(node_indices)

# # Calculate the split indices
# train_size = int(num_nodes * train_ratio)
# val_size = int(num_nodes * val_ratio)
# test_size = num_nodes - train_size - val_size

#     # Create new masks based on the split indices
# new_train_mask = torch.zeros(num_nodes, dtype=torch.bool)
# new_train_mask[node_indices[:train_size]] = True

# new_val_mask = torch.zeros(num_nodes, dtype=torch.bool)
# new_val_mask[node_indices[train_size:train_size+val_size]] = True

# new_test_mask = torch.zeros(num_nodes, dtype=torch.bool)
# new_test_mask[node_indices[train_size+val_size:]] = True

# # Assign the new masks to the data object
# data.train_mask = new_train_mask
# data.val_mask = new_val_mask
# data.test_mask = new_test_mask

# # Extract the new indices for the training, validation, and test sets
# idx_train = np.where(data.train_mask == True)[0]
# idx_val = np.where(data.val_mask == True)[0]
# idx_test = np.where(data.test_mask == True)[0]

# # from the RTGNN github ----------------------------------------------
# train_labels = labels[idx_train]
# val_labels = labels[idx_val]

# # Concatenating training and validation labels
# train_val_labels = np.concatenate([train_labels, val_labels],axis=0)

# # Concatenating training and validation masks
# idx = np.concatenate([idx_train, idx_val],axis=0)


# # # convert the T/F labels to integers --> used in the following call of "noisyfy_with_P"
# train_val_labels_int = train_val_labels.astype(int)
# # idx_int = idx.astype(int)