RTGNN vs. greedy binarized attack model

- This notebook applies the greedy BinarizedAttack (a white-box poisoning attack) to a dataset such as Cora in order to hide (potential) anomalies or poison in the dataset.
-   This should produce a dataset which contains hidden anomalies
- After obtaining a dataset with disguised poison (hidden anomalies), train RTGNN on it to check whether the resulting model is unaffected by / robust to these anomalies.
-   Like DOMINANT, RTGNN also makes use of an Autoencoder (encoder - decoder)?
-   

Goal: See if RTGNN is able to withstand or counteract a binarized attack on the input data

In other words, we are trying to figure out whether RTGNN is useful against a dataset with disguised poison (hidden anomalies).

In [261]:
import sys
import os
import numpy as np
import pygod
from pygod.utils import load_data
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
import torch_sparse
from torch_sparse import SparseTensor
from typing import List
from sklearn.metrics import roc_auc_score
# from gad_adversarial_robustness.gad.dominant import dominant
from gad_adversarial_robustness.utils.graph_utils import prepare_graph, get_n_anomaly_indexes, load_anomaly_detection_dataset
from gad_adversarial_robustness.poison.greedy import multiple_AS, poison_attack

import argparse
import scipy.sparse as sp


In [309]:
# Number of label classes once the node labels are binarised (normal vs. anomalous).
NUM_CLASSES = 2

In [268]:
from gad_adversarial_robustness.gad.RTGNN.utils import noisify_with_P
from gad_adversarial_robustness.gad.RTGNN.dataset import Dataset
from gad_adversarial_robustness.gad.RTGNN.model.RTGNN import RTGNN

In [248]:
# Resolve the dataset cache directory relative to the current working directory
# (three levels up, then into `data`).
script_dir = os.path.abspath('')
dataset_caching_path = os.path.join(script_dir, *(['..'] * 3), 'data')

# Load "inj_cora" from pygod twice: one copy stays untouched as the clean
# reference, the other is the one the attack below modifies.
clean_data: Data = load_data("inj_cora", dataset_caching_path)
poisoned_data: Data = load_data("inj_cora", dataset_caching_path)

In [249]:
print(poisoned_data)

# x = [2708, 1433] : our node feature matrix of the shape [number of nodes, number of features]
# edge_index = [2, 11060] : our graph connectivity matrix of the shape [2, number of edges]
# y = [2708] : the node ground truth labels
# train_mask = [2708] : an optional attribute that says which node should be used for training, with a list of True or False statements
# etc... in this case, the train_mask, val_mask and test_mask have the same size (one entry per node)

Data(x=[2708, 1433], edge_index=[2, 11060], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [250]:
# Report basic structural properties of the graph, one per line.
print(
    f'Edges are directed: {poisoned_data.is_directed()}',
    f'Graph has isolated nodes: {poisoned_data.has_isolated_nodes()}',
    f'Graph has loops: {poisoned_data.has_self_loops()}',
    sep='\n',
)

Edges are directed: True
Graph has isolated nodes: False
Graph has loops: False


In [289]:
# Summarise the graph object.
print(poisoned_data)
print('------------')
# len() of a PyG `Data` object counts its stored attributes (x, edge_index, y,
# the masks, adj, ...), not graphs -- this object holds a single graph, so the
# previous "Number of graphs" label was misleading.
print(f'Number of stored attributes: {len(poisoned_data)}')
print(f'Number of features: {poisoned_data.num_features}')
# print(f'Number of classes: {poisoned_data.num_classes}') ### does not have an attribute for the number of classes

Data(x=[2708, 1433], edge_index=[2, 10862], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], adj=[2708, 2708])
------------
Number of graphs: 7
Number of features: 1433


In [251]:
# Show the shape of the connectivity matrix followed by its contents.
print(
    f'edge_index = {poisoned_data.edge_index.shape}',
    poisoned_data.edge_index,
    sep='\n',
)

# the graph's connections are stored in two lists (11060 directed edges, which equate to 5530 bidirectional edges).

edge_index = torch.Size([2, 11060])
tensor([[   0,    0,    0,  ...,  869,  127, 1674],
        [ 633, 1862, 2582,  ..., 1732,  214,  438]])


In [252]:
##### --> build a dense adjacency matrix from edge_index #####
# Every edge listed in edge_index gets weight 1.
edge_weight = torch.ones(poisoned_data.edge_index.size(1))
edge_weight = edge_weight.cpu()
# Assemble a (num_nodes x num_nodes) SciPy CSR matrix with the weights at the (row, col) positions given by edge_index ...
adj = sp.csr_matrix((edge_weight, poisoned_data.edge_index), (poisoned_data.num_nodes, poisoned_data.num_nodes))
# ... and convert it to a coalesced torch_sparse SparseTensor on the CPU.
adj = torch_sparse.SparseTensor.from_scipy(adj).coalesce().to("cpu")

# adj matrix based on edge_index, stored densely on the data object
poisoned_data.adj = adj.to_dense() 

In [253]:
# Show the dense adjacency matrix followed by its shape.
print(poisoned_data.adj, poisoned_data.adj.shape, sep='\n')

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 1., 0.]])
torch.Size([2708, 2708])


In [254]:
# truth is a boolean tensor with one entry per node: True where the node's label y is non-zero (i.e. the node is anomalous)
truth: torch.Tensor = poisoned_data.y.bool()

In [255]:
print("Create poison compatible adjacency matrix...") # based on code from: https://github.com/zhuyulin-tony/BinarizedAttack/blob/main/src/Greedy.py
# Build one [i, j, adj[i, j]] row for every unordered node pair i < j.
# np.triu_indices(n, k=1) enumerates the strict upper triangle in row-major
# order -- (0,1), (0,2), ..., (1,2), ... -- which is exactly the order the
# former nested Python loops produced, without millions of per-element tensor
# lookups.
pair_rows, pair_cols = np.triu_indices(poisoned_data.num_nodes, k=1)
pair_weights = poisoned_data.adj.cpu().numpy()[pair_rows, pair_cols]
# `triple` is already a NumPy array here; a later np.array(triple) just copies it.
triple = np.column_stack((pair_rows, pair_cols, pair_weights))

Create poison compatible adjacency matrix...


In [256]:
# convert triple (one [i, j, weight] row per node pair) to a numpy array
triple = np.array(triple)

In [257]:
# Target nodes: the anomalous nodes whose "active subnetwork score" the attack
# tries to reduce, i.e. the nodes to be disguised.
# NOTE(review): 999 presumably caps how many anomaly indexes are returned -- confirm
# against get_n_anomaly_indexes.
target_node_lst = get_n_anomaly_indexes(truth, 999)

print("Making model...")
# The amount of edges the attack is allowed to change.
budget = 100
model = multiple_AS(
    target_lst=target_node_lst,
    n_node=poisoned_data.num_nodes,
    device='cpu',
)

Anomalies indexes: [  10   50   70   76  104  124  127  143  151  170  179  181  196  214
  217  224  227  287  289  294  311  333  425  438  451  454  459  539
  565  572  578  581  615  619  641  652  654  660  670  674  692  711
  722  738  781  833  843  869  874  878  882  891  895  915  923  938
  980  982  996 1002 1014 1035 1053 1079 1090 1096 1133 1135 1206 1211
 1224 1229 1235 1287 1293 1310 1362 1391 1414 1426 1533 1540 1543 1547
 1570 1573 1575 1606 1623 1633 1656 1674 1728 1730 1732 1783 1808 1818
 1833 1854 1881 1885 1901 1918 1946 1999 2004 2041 2052 2055 2056 2078
 2089 2121 2126 2198 2215 2234 2263 2265 2294 2307 2336 2340 2375 2382
 2386 2397 2475 2479 2506 2518 2551 2600 2624 2654 2658 2693]
Making model...


In [258]:
# Run the poisoning attack on the pair list `triple` with the edge budget `budget`.
print("Starting attack...")
# Only the first of the three returned values is kept; it is later read as one
# [i, j, value] row per node pair.
adj_adversary, _, _ = poison_attack(model, triple, budget)

Starting attack...
triple copy type: <class 'numpy.ndarray'>


In [259]:
print("Converting to compatible tensor...")

# `adj_adversary` holds one [i, j, value] row per node pair; keep the (i, j)
# columns of every pair whose value is non-zero, i.e. the edges that exist
# after the attack. One boolean mask replaces the former loop that grew the
# tensor with torch.cat once per edge.
surviving_pairs = adj_adversary[adj_adversary[:, 2] != 0][:, :2].to(torch.int64)

# The dataset stores each undirected edge in both directions. Emit (i, j)
# immediately followed by (j, i) so the column order matches what the loop
# produced. The explicit column count (instead of -1) keeps reshape valid when
# no edge survives.
forward_edges = surviving_pairs.t()
edge_index = torch.stack((forward_edges, forward_edges.flip(0)), dim=2).reshape(
    2, 2 * forward_edges.size(1)
)
poisoned_data.edge_index = edge_index # assign to dataset obj

# Keep the dense adjacency in sync with the attacked edge list. It was built
# before the attack, and later cells read `data.adj`, so without this refresh
# they would silently use the clean graph.
poisoned_adj = torch.zeros(
    (poisoned_data.num_nodes, poisoned_data.num_nodes), dtype=torch.float32
)
poisoned_adj[edge_index[0], edge_index[1]] = 1.0
poisoned_data.adj = poisoned_adj

Converting to compatible tensor...


In [260]:
# Show the attacked graph and the untouched reference graph, one per line.
print(poisoned_data, clean_data, sep='\n')

Data(x=[2708, 1433], edge_index=[2, 10862], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], adj=[2708, 2708])
Data(x=[2708, 1433], edge_index=[2, 11060], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


## RTGNN

In [269]:
# define the args for the model
# Every option has a default, so the parser can be used without any command-line input.

parser = argparse.ArgumentParser()

######## the following are the parameters defined in the RTGNN main file ########

# --- reproducibility / optimisation ---
parser.add_argument('--seed', type=int, default=12, help='Random seed.')
parser.add_argument('--weight_decay', type=float, default=5e-4,
                    help='Weight decay (L2 loss on parameters).')
# --- network sizes and regularisation ---
parser.add_argument('--hidden', type=int, default=128,
                    help='Number of hidden units.')
parser.add_argument('--edge_hidden', type=int, default=64,
                    help='Number of hidden units of MLP graph constructor')
parser.add_argument('--dropout', type=float, default=0.5,
                    help='Dropout rate (1 - keep probability).')
# The dataset option is disabled: this notebook supplies its own (attacked) data object.
# parser.add_argument('--dataset', type=str, default="cora",
#                     choices=['cora', 'citeseer','blogcatalog'], help='dataset')
# --- label noise and training schedule ---
parser.add_argument('--ptb_rate', type=float, default=0.3,
                    help="noise ptb_rate")
parser.add_argument('--epochs', type=int,  default=200,
                    help='Number of epochs to train.')
parser.add_argument('--lr', type=float, default=0.001,
                    help='Initial learning rate.')
# --- RTGNN-specific weights and thresholds (meanings as given in the help strings) ---
parser.add_argument('--alpha', type=float, default=1,
                    help='loss weight of graph reconstruction')
parser.add_argument('--tau',type=float, default=0.05,
                    help='threshold of filtering noisy edges')
parser.add_argument('--th',type=float, default=0.95,
                    help='threshold of adding pseudo labels')
parser.add_argument("--K", type=int, default=100,
                    help='number of KNN search for each node')
parser.add_argument("--n_neg", type=int, default=100,
                    help='number of negitive sampling for each node')
# Only the two listed noise types are accepted.
parser.add_argument('--noise', type=str, default='uniform', choices=['uniform', 'pair'],
                    help='type of noises')
parser.add_argument('--decay_w', type=float, default=0.1,
                    help='down-weighted factor')
parser.add_argument('--co_lambda',type=float,default=0.1,
                     help='weight for consistency regularization term')

_StoreAction(option_strings=['--co_lambda'], dest='co_lambda', nargs=None, const=None, default=0.1, type=<class 'float'>, choices=None, required=False, help='weight for consistency regularization term', metavar=None)

In [270]:
# parse_known_args() returns (namespace, leftover_argv); taking element 0 keeps the
# namespace and ignores any arguments the parser does not recognise
# (presumably the ones the Jupyter kernel itself was launched with -- confirm).
args = parser.parse_known_args()[0]
print(args)

Namespace(seed=12, weight_decay=0.0005, hidden=128, edge_hidden=64, dropout=0.5, ptb_rate=0.3, epochs=200, lr=0.001, alpha=1, tau=0.05, th=0.95, K=100, n_neg=100, noise='uniform', decay_w=0.1, co_lambda=0.1)


In [272]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# data = Dataset(root='./data', name=args.dataset)

### use the data that was attacked by greedy BinarizedAttack instead of the stock dataset
data = poisoned_data

In [387]:
# initialize adj matrix, node features and labels
adj = data.adj
features = data.x
# Binarise the labels (non-zero -> 1) but keep them as integer class ids rather
# than booleans.
# NOTE(review): boolean labels are the likely cause of the
# "ValueError: object too deep for desired array" raised by the label-noise
# step (a NumPy bool used as an index adds an axis instead of selecting a
# row) -- confirm against noisify_with_P.
labels = data.y.bool().long()

# noise perturbation rate --> it controls the amount of noise added to the training and validation labels
ptb = args.ptb_rate

# Number of classes. With binarised labels in {0, 1} this is NUM_CLASSES itself;
# the previous `NUM_CLASSES + 1` declared a third class that no node belongs to.
nclass = NUM_CLASSES
assert int(labels.max()) < nclass, "found a label id outside [0, nclass)"
args.class_num = nclass

##################################################################
# Train / validation / test split
#
# The masks shipped with the dataset are replaced by a fresh random
# 60 / 20 / 20 split over all nodes.
##################################################################

# Assuming data.y.shape == (2708,)
num_nodes = data.y.shape[0]

# Split ratios; the test share is whatever remains after train and validation.
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2

# Random permutation of node indices, drawn from a generator seeded with
# args.seed so the split is reproducible and the global RNG is left untouched.
split_rng = torch.Generator().manual_seed(args.seed)
node_indices = torch.randperm(num_nodes, generator=split_rng)

# Calculate the split sizes
train_size = int(num_nodes * train_ratio)
val_size = int(num_nodes * val_ratio)
test_size = num_nodes - train_size - val_size

# Create new masks based on the split indices
new_train_mask = torch.zeros(num_nodes, dtype=torch.bool)
new_train_mask[node_indices[:train_size]] = True

new_val_mask = torch.zeros(num_nodes, dtype=torch.bool)
new_val_mask[node_indices[train_size:train_size+val_size]] = True

new_test_mask = torch.zeros(num_nodes, dtype=torch.bool)
new_test_mask[node_indices[train_size+val_size:]] = True

# Assign the new masks to the data object
data.train_mask = new_train_mask
data.val_mask = new_val_mask
data.test_mask = new_test_mask

# -----------------------------------------------------------

# Extracting labels for training, validation, and test data using masks
# (boolean-mask indexing returns entries in ascending node order).
train_labels = labels[data.train_mask]
val_labels = labels[data.val_mask]
test_labels = labels[data.test_mask]

# Concatenating training and validation labels
train_val_labels = torch.cat([train_labels, val_labels], dim=0)

# Node positions of the train / val / test nodes, in the same ascending order
# as the label vectors above.
idx_train = torch.nonzero(data.train_mask, as_tuple=True)[0]
idx_val = torch.nonzero(data.val_mask, as_tuple=True)[0]
idx_test = torch.nonzero(data.test_mask, as_tuple=True)[0]

# Positions of the train+val nodes, aligned element-for-element with
# train_val_labels. Concatenating the two boolean masks (as before) yields a
# length-2*num_nodes vector that cannot index a per-node array.
idx = torch.cat([idx_train, idx_val], dim=0)

In [388]:
# Move both tensors to host memory, detach them from autograd and expose them as numpy arrays.
train_val_labels, idx = (
    tensor.detach().cpu().numpy() for tensor in (train_val_labels, idx)
)

In [393]:
# Adding noise to the concatenated train+val labels and getting the noise indices and clean indices

# Size of the training split. The previous `data.train_mask.shape[0]` is the
# mask length, i.e. the total number of nodes.
# NOTE(review): the second argument of noisify_with_P presumably expects the
# number of training samples -- confirm against its definition.
num_train = int(data.train_mask.sum())

# Integer class ids are required: the ValueError recorded for this cell is
# consistent with boolean labels being used as row indices.
clean_train_val_labels = np.asarray(train_val_labels).astype(np.int64)

noise_y, P, noise_idx, clean_idx = noisify_with_P(clean_train_val_labels, num_train, nclass, ptb, 10, args.noise)
args.noise_idx, args.clean_idx = noise_idx, clean_idx

# Positions of the train nodes followed by the val nodes, in ascending node
# order within each split -- the same order in which train_val_labels was
# assembled from the masks. Deriving them here keeps the assignment below
# independent of how `idx` was built.
train_val_positions = np.concatenate([
    np.flatnonzero(data.train_mask.cpu().numpy()),
    np.flatnonzero(data.val_mask.cpu().numpy()),
])

# torch tensors have no .copy(); astype() returns a fresh NumPy array, so the
# clean `labels` tensor is left untouched.
noise_labels = labels.detach().cpu().numpy().astype(np.int64)
noise_labels[train_val_positions] = noise_y # set the noisy labels

Uniform noise


ValueError: object too deep for desired array

In [None]:
# instantiate the model


In [None]:
# model fit


In [None]:
# model test
