In [212]:
import math
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset

from datasets.TimeDataset import TimeDataset
from util.env import *
from util.time import *

# Seed the global RNGs of Python, NumPy and PyTorch so repeated runs draw the
# same random numbers.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x19ffd3ed910>

In [None]:
def get_batch_edge_index(org_edge_index, batch_num, n_nodes):
    """Tile a single-graph edge index across a batch of disjoint graph copies.

    Copy ``b`` (0-based) of the edge list has ``b * n_nodes`` added to every
    node id, so each copy addresses its own block of a flattened
    ``batch_num * n_nodes`` node axis.

    Args:
        org_edge_index: 2-D tensor whose second dimension enumerates edges.
        batch_num: number of copies to lay side by side.
        n_nodes: node count of one graph, used as the per-copy id offset.

    Returns:
        A new int64 tensor with ``batch_num`` times as many columns as the
        input. The input tensor is left untouched.
    """
    base = org_edge_index.clone().detach()
    edges_per_graph = org_edge_index.shape[1]

    # One offset per output column: 0 for the first copy, n_nodes for the
    # second, 2 * n_nodes for the third, and so on.
    copy_ids = torch.arange(batch_num, device=base.device)
    column_offsets = copy_ids.repeat_interleave(edges_per_graph) * n_nodes

    tiled = base.repeat(1, batch_num).contiguous()
    return (tiled + column_offsets).long()


class OutLayer(nn.Module):
    """MLP head that reduces the last dimension of its input to size 1.

    The stack holds ``layer_num`` linear layers. Every layer except the last
    is followed by ``BatchNorm1d`` and ``ReLU`` and has ``inter_num`` outputs;
    the last layer has a single output.

    Args:
        in_num: size of the last input dimension.
        layer_num: number of linear layers in the stack.
        inter_num: width of the hidden layers.
    """

    def __init__(self, in_num, layer_num, inter_num=512):
        super().__init__()

        stack = []
        final_depth = layer_num - 1

        for depth in range(layer_num):
            width_in = in_num if depth == 0 else inter_num

            if depth == final_depth:
                # Final projection down to one value.
                stack.append(nn.Linear(width_in, 1))
                continue

            stack.extend(
                [
                    nn.Linear(width_in, inter_num),
                    nn.BatchNorm1d(inter_num),
                    nn.ReLU(),
                ]
            )

        self.mlp = nn.ModuleList(stack)

    def forward(self, x):
        """Run ``x`` through the stack and return the result."""
        hidden = x

        for layer in self.mlp:
            if not isinstance(layer, nn.BatchNorm1d):
                hidden = layer(hidden)
                continue

            # BatchNorm1d normalises over dim 1, so swap the last two dims
            # around the call and swap them back afterwards.
            hidden = layer(hidden.permute(0, 2, 1)).permute(0, 2, 1)

        return hidden


class GNNLayer(nn.Module):
    """One ``GraphLayer`` followed by batch normalisation and ReLU.

    NOTE(review): ``GraphLayer`` is neither defined nor explicitly imported in
    the visible cells; presumably it comes from a project module or one of the
    wildcard imports -- confirm before running on a fresh kernel.

    Args:
        in_channel: forwarded to ``GraphLayer`` as its first argument.
        out_channel: forwarded to ``GraphLayer`` and used as the width of the
            batch-norm layer.
        inter_dim: forwarded to ``GraphLayer``.
        heads: forwarded to ``GraphLayer``.
    """

    def __init__(self, in_channel, out_channel, inter_dim=0, heads=1):
        super().__init__()

        self.gnn = GraphLayer(
            in_channel,
            out_channel,
            inter_dim=inter_dim,
            heads=heads,
            concat=False,
        )
        self.bn = nn.BatchNorm1d(out_channel)
        self.relu = nn.ReLU()
        # Registered on the module but not applied in forward().
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x, edge_index, embedding=None):
        """Apply the graph layer, then batch norm, then ReLU.

        The edge index and attention weights returned by the graph layer are
        stored on ``self.edge_index_1`` / ``self.att_weight_1`` on every call.
        """
        conv_out, graph_info = self.gnn(
            x, edge_index, embedding, return_attention_weights=True
        )

        # Keep what the most recent call returned so it can be inspected later.
        self.edge_index_1, self.att_weight_1 = graph_info

        normalised = self.bn(conv_out)
        return self.relu(normalised)


class GDN(nn.Module):
    """Graph-attention forecaster with a learned top-k similarity graph.

    One embedding is learned per node. On every forward pass the embeddings
    are compared by cosine similarity, each node keeps its ``topk`` most
    similar nodes as neighbours, and a graph layer followed by an output MLP
    produces one value per node.

    Construction is two-step: ``__init__`` only records hyper-parameters, and
    ``_initialise_layers()`` must be called before ``forward`` to create the
    embedding, the graph layer and the output head.

    Args:
        fc_edge_idx: edge index tensor of the fully connected graph.
        n_nodes: number of nodes (rows of the embedding table).
        embed_dim: size of each node embedding and of the graph layer output.
        out_layer_inter_dim: hidden width of the output MLP.
        input_dim: number of input features per node.
        out_layer_num: number of linear layers in the output MLP.
        topk: neighbours kept per node in the learned graph. Values larger
            than the number of nodes seen in ``forward`` are clamped to it.
    """

    def __init__(
        self,
        fc_edge_idx,
        n_nodes,
        embed_dim=64,
        out_layer_inter_dim=256,
        input_dim=10,
        out_layer_num=1,
        topk=20,
    ):

        super().__init__()

        self.fc_edge_idx = fc_edge_idx
        self.n_nodes = n_nodes
        self.embed_dim = embed_dim
        self.out_layer_inter_dim = out_layer_inter_dim
        self.input_dim = input_dim
        self.out_layer_num = out_layer_num
        self.topk = topk

    def _initialise_layers(self):
        """Create all sub-modules and reset the cached graph state."""

        self.embedding = nn.Embedding(self.n_nodes, self.embed_dim)
        nn.init.kaiming_uniform_(self.embedding.weight, a=math.sqrt(5))

        self.bn_outlayer_in = nn.BatchNorm1d(self.embed_dim)

        self.gnn_layers = nn.ModuleList(
            [
                GNNLayer(
                    self.input_dim,
                    self.embed_dim,
                    inter_dim=2 * self.embed_dim,
                    heads=1,
                )
            ]
        )

        self.node_embedding = None
        self.learned_graph = None

        self.out_layer = OutLayer(
            self.embed_dim, self.out_layer_num, inter_num=self.out_layer_inter_dim
        )

        self.cache_fc_edge_idx = None
        self.cache_embed_index = None

        self.dp = nn.Dropout(0.2)

    def forward(self, data):
        """Predict one value per node for every sample in the batch.

        Args:
            data: tensor of shape ``(batch, n_nodes, features)``.

        Returns:
            Tensor of shape ``(batch, n_nodes)``.

        Side effects:
            ``self.learned_graph`` is overwritten with the ``(n_nodes, k)``
            neighbour indices chosen in this call, and ``self.cache_fc_edge_idx``
            is filled on the first call.
        """

        x = data.clone().detach()
        device = data.device
        batch_num, n_nodes, all_feature = x.shape
        # reshape (not view): the clone keeps the input's strides, which may
        # be non-contiguous.
        x = x.reshape(-1, all_feature)

        # Batched fully connected edge index, built once with the batch size
        # of the first call. It is not consumed anywhere below.
        if self.cache_fc_edge_idx is None:
            self.cache_fc_edge_idx = get_batch_edge_index(
                self.fc_edge_idx, batch_num, n_nodes
            ).to(device)

        node_idx = torch.arange(n_nodes, device=device)
        all_embeddings = self.embedding(node_idx)  # v_i's

        # The graph is chosen from a detached copy, so the top-k selection
        # itself carries no gradient.
        weights = all_embeddings.detach().clone().view(n_nodes, -1)
        all_embeddings = all_embeddings.repeat(batch_num, 1)

        cos_ji_mat = torch.matmul(weights, weights.T)  # e_{ji} in eqn (2)
        norms = weights.norm(dim=-1)
        normed_mat = torch.matmul(norms.view(-1, 1), norms.view(1, -1))
        cos_ji_mat = cos_ji_mat / normed_mat

        # torch.topk raises when k exceeds the dimension size, so cap it.
        k = min(self.topk, n_nodes)
        topk_indices_ji = torch.topk(cos_ji_mat, k, dim=-1)[1]  # A_{ji} in eqn (3)

        self.learned_graph = topk_indices_ji

        # Row 0 holds the selected neighbours j, row 1 the node i they belong
        # to: [0]*k, [1]*k, ...
        gated_i = node_idx.repeat_interleave(k).unsqueeze(0)
        gated_j = topk_indices_ji.flatten().unsqueeze(0)
        gated_edge_index = torch.cat((gated_j, gated_i), dim=0)

        batch_gated_edge_index = get_batch_edge_index(
            gated_edge_index, batch_num, n_nodes
        ).to(device)

        # gnn_layers is an nn.ModuleList, which cannot be called directly;
        # _initialise_layers registers exactly one layer, so use it.
        gcn_out = self.gnn_layers[0](
            x,
            batch_gated_edge_index,
            embedding=all_embeddings,
        )
        gcn_out = gcn_out.view(batch_num, n_nodes, -1)

        out = torch.mul(gcn_out, self.embedding(node_idx))
        out = out.permute(0, 2, 1)  # BatchNorm1d normalises over dim 1
        out = F.relu(self.bn_outlayer_in(out))  # eqn (5)
        out = out.permute(0, 2, 1)
        out = self.dp(out)
        out = self.out_layer(out)
        out = out.view(-1, n_nodes)

        return out

In [218]:
class GNNAD:
    """
    Graph Neural Network-based Anomaly Detection in Multivariate Timeseries.

    Holds the run configuration, loads ``./data/<dataset>/train.csv`` and
    ``test.csv`` into datasets and data loaders, and builds the ``GDN`` model.

    Args:
        batch: batch size used by all data loaders.
        epoch: stored only; not used by the code in this class yet.
        slide_win: window length handed to ``TimeDataset`` and used as the
            model's ``input_dim``.
        dim: embedding size passed to ``GDN`` as ``embed_dim``.
        slide_stride: window stride handed to ``TimeDataset``.
        comment: stored only.
        random_seed: stored only; the train/validation split draws from the
            global ``random`` module.
        out_layer_num: number of linear layers in the model's output MLP.
        out_layer_inter_dim: hidden width of the model's output MLP.
        decay: stored only.
        validate_ratio: fraction of the training windows held out for
            validation.
        topk: neighbours per node in the model's learned graph.
        save_path_pattern: stored only.
        dataset: name of the sub-directory of ``./data`` to read from.
        device: device the model is moved to.
        report: stored only.
        load_model_path: stored only.
    """

    def __init__(
        self,
        batch: int = 128,
        epoch: int = 100,
        slide_win: int = 15,
        dim: int = 64,
        slide_stride: int = 5,
        comment: str = "",
        random_seed: int = 0,
        out_layer_num: int = 1,
        out_layer_inter_dim: int = 256,
        decay: float = 0,
        validate_ratio: float = 0.1,
        topk: int = 20,
        save_path_pattern: str = "msl",
        dataset: str = "msl",
        device: str = "cpu",
        report: str = "best",
        load_model_path: str = "",
    ):

        self.batch = batch
        self.epoch = epoch
        self.slide_win = slide_win
        self.dim = dim
        self.slide_stride = slide_stride
        self.comment = comment
        self.random_seed = random_seed
        self.out_layer_num = out_layer_num
        self.out_layer_inter_dim = out_layer_inter_dim
        self.decay = decay
        self.validate_ratio = validate_ratio
        self.topk = topk
        self.save_path_pattern = save_path_pattern
        self.dataset = dataset
        self.device = device
        self.report = report
        self.load_model_path = load_model_path

    def _split_train_validation(self, data):
        """Hold out one contiguous, randomly placed slice of ``data``.

        The slice has ``int(len(data) * validate_ratio)`` items and its start
        is drawn from the global ``random`` module.

        Returns:
            ``(train_subset, validate_subset)`` as ``torch.utils.data.Subset``.

        Raises:
            ValueError: if no valid start position exists (empty data, or a
                ratio that leaves nothing for training).
        """

        dataset_len = len(data)
        validate_use_len = int(dataset_len * self.validate_ratio)

        n_start_positions = dataset_len - validate_use_len
        if n_start_positions <= 0:
            # random.randrange would raise the same exception type with an
            # opaque "empty range" message.
            raise ValueError(
                f"cannot hold out {validate_use_len} of {dataset_len} samples "
                f"for validation (validate_ratio={self.validate_ratio})"
            )

        validate_start_idx = random.randrange(n_start_positions)
        validate_end_idx = validate_start_idx + validate_use_len
        idx = torch.arange(dataset_len)

        train_sub_idx = torch.cat([idx[:validate_start_idx], idx[validate_end_idx:]])
        train_subset = Subset(data, train_sub_idx)

        validate_sub_idx = idx[validate_start_idx:validate_end_idx]
        validate_subset = Subset(data, validate_sub_idx)

        return train_subset, validate_subset

    def _load_data(self):
        """Read the CSVs and build edge index, datasets and data loaders.

        Results are stored on the instance: ``fc_edge_idx``, ``feature_list``,
        ``train_dataset``, ``test_dataset`` and the three ``*_dataloader``
        attributes.
        """

        train = pd.read_csv(f"./data/{self.dataset}/train.csv", sep=",", index_col=0)
        test = pd.read_csv(f"./data/{self.dataset}/test.csv", sep=",", index_col=0)

        train = train.drop(columns=["attack"]) if "attack" in train.columns else train

        feature_list = train.columns[
            train.columns.str[0] != "_"
        ].to_list()  # convention is to pass non-features as '_'
        assert len(feature_list) == len(set(feature_list))

        # Fully connected structure without self-loops. Pairs are grouped by
        # node, and row 0 of the edge index lists every *other* node. Working
        # with positions directly avoids repeated list.index() lookups.
        n_features = len(feature_list)
        edge_idx_tuples = [
            (other, node)
            for node in range(n_features)
            for other in range(n_features)
            if other != node
        ]

        fc_edge_idx = torch.tensor(
            [
                [pair[0] for pair in edge_idx_tuples],
                [pair[1] for pair in edge_idx_tuples],
            ],
            dtype=torch.long,
        )

        # An unlabelled test file falls back to the all-zero labels that
        # _parse_data generates, instead of raising AttributeError.
        test_labels = test["attack"].tolist() if "attack" in test.columns else None

        train_input = _parse_data(train, feature_list)
        test_input = _parse_data(test, feature_list, labels=test_labels)

        cfg = {
            "slide_win": self.slide_win,
            "slide_stride": self.slide_stride,
        }

        train_dataset = TimeDataset(train_input, fc_edge_idx, mode="train", config=cfg)
        test_dataset = TimeDataset(test_input, fc_edge_idx, mode="test", config=cfg)

        train_subset, validate_subset = self._split_train_validation(train_dataset)

        # get data loaders
        train_dataloader = DataLoader(train_subset, batch_size=self.batch, shuffle=True)

        validate_dataloader = DataLoader(
            validate_subset, batch_size=self.batch, shuffle=False
        )

        test_dataloader = DataLoader(
            test_dataset, batch_size=self.batch, shuffle=False, num_workers=0
        )

        # save to self
        self.fc_edge_idx = fc_edge_idx
        self.feature_list = feature_list
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.train_dataloader = train_dataloader
        self.validate_dataloader = validate_dataloader
        self.test_dataloader = test_dataloader

    def _load_model(self):
        """Instantiate ``GDN`` from the loaded data and store it on ``self.model``.

        Requires ``_load_data()`` to have run, because it reads
        ``self.fc_edge_idx`` and ``self.feature_list``.
        """
        # instantiate model
        model = GDN(
            self.fc_edge_idx,
            n_nodes=len(self.feature_list),
            embed_dim=self.dim,  # GDN has no `dim` parameter
            input_dim=self.slide_win,
            out_layer_num=self.out_layer_num,
            out_layer_inter_dim=self.out_layer_inter_dim,
            topk=self.topk,
        )

        # Layers must exist before the move, otherwise .to() has nothing to
        # transfer and the parameters stay on the default device.
        model._initialise_layers()

        self.model = model.to(self.device)

    def fit(self):
        """Load the data and return ``self``.

        Model construction and training are not wired in here yet.
        """
        self._load_data()

        return self


def _parse_data(data, feature_list, labels=None):

    labels = [0] * data.shape[0] if labels == None else labels
    res = data[feature_list].T.values.tolist()
    res.append(labels)
    return res

In [230]:
# Scratch check: wrap the fully connected edge index in a one-element list.
# Needs `fitted_model`, which is created by the `GNNAD().fit()` cell that
# appears further down in this notebook.
edge_index_sets = [fitted_model.fc_edge_idx]
edge_index_sets

[tensor([[ 1,  2,  3,  ..., 23, 24, 25],
         [ 0,  0,  0,  ..., 26, 26, 26]])]

In [232]:
# Walk the edge-index sets; only the position of each set is printed.
for i, edge_index in enumerate(edge_index_sets):
    print(i)

0


In [219]:
# Build the detector with its default settings. fit() loads the CSVs, builds
# the datasets and data loaders, and returns the same object.
model = GNNAD()
fitted_model = model.fit()

In [223]:
# Number of feature columns kept by _load_data (names not starting with "_").
len(fitted_model.feature_list)

27

In [221]:
# Fully connected edge index stored on the instance by _load_data.
fitted_model.fc_edge_idx

[tensor([[ 1,  2,  3,  ..., 23, 24, 25],
         [ 0,  0,  0,  ..., 26, 26, 26]])]

In [105]:
# refactor to module
#    - init: dataloaders, GDN
#    - run: train, test, check output
# write unit tests
# --------------------

# get interactive validation screen
# plots like paper
# error handling for real data!

# --------------------
# ideas for research (graph metrics, input node-related anomaly)