# Inject Anomalies

We use the datasets provided by PyGOD.

See [data repository](https://github.com/pygod-team/data) for more details.


In [30]:
import torch
from pygod.generator import gen_contextual_outlier, gen_structural_outlier
from pygod.utils import load_data
from pygod.models import MLPAE,SCAN,Radar,ANOMALOUS,GCNAE,DOMINANT,DONE,AdONE,AnomalyDAE,GAAN,GUIDE,CONAD
from pygod.metric import eval_roc_auc
import numpy as np
from torch_geometric.data import Data

from pygod.utils.utility import check_parameter



def gen_joint_outliers(data, m, n, random_state=None):
    """
    Inject outliers by wiring randomly chosen nodes to random targets.

    ``n`` nodes are drawn at random to become outliers. For each of them,
    ``m`` distinct nodes are drawn from the remaining (non-outlier) nodes
    and a directed edge ``outlier -> target`` is appended to
    ``data.edge_index``.

    Parameters
    ----------
    data : PyTorch Geometric Data instance (torch_geometric.data.Data)
        The input data. Its ``edge_index`` is modified in place.
    m : int
        Number of nodes each outlier is connected to.
    n : int
        Number of outlier nodes.
    random_state : int, optional
        The seed to control the randomness, Default: ``None``.

    Returns
    -------
    data : PyTorch Geometric Data instance (torch_geometric.data.Data)
        The same ``data`` instance, with the injected edges appended to
        ``edge_index``.
    y_outlier : torch.Tensor
        The outlier label tensor where 1 represents outliers and 0 represents
        regular nodes.

    Raises
    ------
    TypeError
        If ``data`` is not a ``torch_geometric.data.Data`` instance.
    ValueError
        If ``m`` or ``n`` is not an ``int``. ``check_parameter`` is also
        applied to ``m``, ``n`` and ``m * n`` with bounds
        ``(0, data.num_nodes)``.
    """

    if not isinstance(data, Data):
        raise TypeError("data should be torch_geometric.data.Data")

    if isinstance(m, int):
        check_parameter(m, low=0, high=data.num_nodes, param_name='m')
    else:
        raise ValueError("m should be int, got %s" % m)

    if isinstance(n, int):
        check_parameter(n, low=0, high=data.num_nodes, param_name='n')
    else:
        raise ValueError("n should be int, got %s" % n)

    check_parameter(m * n, low=0, high=data.num_nodes, param_name='m*n')

    # ``is not None`` so that a seed of 0 is honoured as well.
    if random_state is not None:
        np.random.seed(random_state)

    outlier_idx = np.random.choice(data.num_nodes, size=n, replace=False)

    # Targets are drawn from non-outlier nodes only, which rules out
    # self-loops and outlier-to-outlier edges.
    candidates = np.setdiff1d(np.arange(data.num_nodes), outlier_idx)

    # Each outlier id is repeated m times so that it lines up with its m
    # sampled targets; the edge source must be the outlier's node id.
    sources = np.repeat(outlier_idx, m)
    targets = np.concatenate([
        np.random.choice(candidates, size=m, replace=False)
        for _ in range(n)
    ])

    # Shape (2, n * m), matching the layout of ``data.edge_index``.
    new_edges = torch.tensor(
        np.stack([sources, targets]),
        dtype=torch.long,
        device=data.edge_index.device,
    )

    y_outlier = torch.zeros(data.num_nodes, dtype=torch.long)
    y_outlier[outlier_idx] = 1

    data.edge_index = torch.cat([data.edge_index, new_edges], dim=1)

    return data, y_outlier


# Load the base dataset and inject three kinds of outliers into it.
# `path` is only used by the commented-out torch.save calls below.
path = "injected_dataset/"
dataset_name = "inj_cora"
data = load_data(dataset_name)


# Arguments forwarded to the pygod generators (n / k for contextual,
# n / m for structural). The joint injection reuses the structural values.
contextual_n = 70
contextual_k = 10
structural_n = 70
structural_m = 10

# Each generator returns the graph together with a label tensor for the
# outliers it injected. The calls are chained on the same `data`, so every
# step operates on the output of the previous one.
data, yc = gen_contextual_outlier(data, n=contextual_n, k=contextual_k)
data, ys = gen_structural_outlier(data, n=structural_n, m=structural_m)
data, yj = gen_joint_outliers(data, n=structural_n, m=structural_m)
# Union of the joint and structural labels: 1 where either marks an outlier.
yjs = torch.logical_or(yj, ys).int()
# Uncomment to persist the injected graph and labels under `path`.
# NOTE(review): `yjs` is not saved here; confirm whether consumers recompute it.
# torch.save(data, path + dataset_name + ".pt" )
# torch.save(yc, path + dataset_name + "_yc.pt" )
# torch.save(ys, path + dataset_name + "_ys.pt" )
# torch.save(yj, path + dataset_name + "_yj.pt" )

## Training and Testing
Train a model and evaluate it separately on each type of outlier.



In [None]:
# Load the injected graph and its per-type outlier labels, fit one detector
# several times, and report the ROC-AUC (mean ± std, in percent) for each
# label set.
path = "injected_dataset/"
dataset_name = "inj_cora"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): these files hold pickled objects; recent PyTorch releases
# default torch.load to weights_only=True, which may reject them — confirm
# against the installed version.
data = torch.load(path + dataset_name + ".pt" )
yc = torch.load(path + dataset_name + "_yc.pt" )
ys = torch.load(path + dataset_name + "_ys.pt" )
yj = torch.load(path + dataset_name + "_yj.pt" )
# The joint-or-structural labels are not stored on disk, so derive them here
# instead of relying on a variable left over from another cell.
yjs = torch.logical_or(yj, ys).int()

# Pick exactly one detector; the alternatives are kept for quick switching.
model = MLPAE(epoch=100,hid_dim=16)
# model = SCAN()
# model = Radar()
# model = ANOMALOUS(epoch=100)
# model = GCNAE(epoch=100,hid_dim=16)
# model = DOMINANT(epoch=100,hid_dim=16)
# model = DONE(epoch=100,hid_dim=16)
# model = AdONE(epoch=100,hid_dim=16)
# model = AnomalyDAE(epoch=100)
# model = GAAN(epoch=100,hid_dim=16)
# model = GUIDE(epoch=100)
# model = CONAD(epoch=100,hid_dim=16)


# Labels stored on the graph itself; loop-invariant, so converted once.
y_all = data.y.bool().numpy()

roc_auc = []
roc_auc_c = []
roc_auc_s = []
roc_auc_j = []
roc_auc_js = []
for i in range(5):
    model.fit(data)
    outlier_scores = model.decision_function(data)
    prob = model.predict_proba(data)
    labels, confidence = model.predict(data, return_confidence=True)

    # Scores are scaled by 100 so they are reported as percentages.
    roc_auc.append(eval_roc_auc(y_all, outlier_scores) * 100)
    roc_auc_c.append(eval_roc_auc(yc, outlier_scores) * 100)
    roc_auc_s.append(eval_roc_auc(ys, outlier_scores) * 100)
    roc_auc_j.append(eval_roc_auc(yj, outlier_scores) * 100)
    roc_auc_js.append(eval_roc_auc(yjs, outlier_scores) * 100)


# Aggregate the runs: mean and standard deviation per label set.
roc_auc = np.array(roc_auc)
mean = np.mean(roc_auc)
std = np.std(roc_auc)

roc_auc_c = np.array(roc_auc_c)
mean_c = np.mean(roc_auc_c)
std_c = np.std(roc_auc_c)

roc_auc_s = np.array(roc_auc_s)
mean_s = np.mean(roc_auc_s)
std_s = np.std(roc_auc_s)

roc_auc_j = np.array(roc_auc_j)
mean_j = np.mean(roc_auc_j)
std_j = np.std(roc_auc_j)

roc_auc_js = np.array(roc_auc_js)
mean_js = np.mean(roc_auc_js)
std_js = np.std(roc_auc_js)

# Output order: data.y labels, contextual, structural, joint,
# joint-or-structural.
print(model)
print('Dataset: ', dataset_name)
print('%.2f ± %.2f' %(mean, std))
print('%.2f ± %.2f' %(mean_c, std_c))
print('%.2f ± %.2f' %(mean_s, std_s))
print('%.2f ± %.2f' %(mean_j, std_j))
print('%.2f ± %.2f' %(mean_js, std_js))