In [212]:
import pandas as pd
import numpy as np
import torch
import random

from torch.utils.data import DataLoader, Subset
from datasets.TimeDataset import TimeDataset

# Seed the global RNGs of Python's `random`, NumPy and PyTorch with 0.
# `random` is the generator GNNAD._split_train_validation draws from when it
# picks where the validation window starts.
random.seed(0)
np.random.seed(0)
# Kept as the last expression of the cell: torch.manual_seed returns a
# torch Generator, which is what the notebook echoes as this cell's output.
torch.manual_seed(0)

<torch._C.Generator at 0x19ffd3ed910>

In [213]:
class GNNAD():
    """
    Graph Neural Network-based Anomaly Detection in Multivariate Timeseries.

    Stores the run configuration and, once ``fit`` has been called, the
    datasets, data loaders and ``GDN`` model built from the CSV files under
    ``./data/<dataset>/``.

    NOTE(review): ``GDN`` and ``_parse_data`` are looked up at call time by
    ``_load_data``. ``GDN`` is not imported in the visible part of this
    notebook, so it must be brought into scope before ``fit()`` is called.

    Parameters
    ----------
    batch : int
        Batch size passed to every ``DataLoader``.
    slide_win : int
        Passed to ``TimeDataset`` as ``config['slide_win']`` and to ``GDN``
        as ``input_dim``.
    slide_stride : int
        Passed to ``TimeDataset`` as ``config['slide_stride']``.
    dim, out_layer_num, out_layer_inter_dim, topk : int
        Forwarded unchanged to the ``GDN`` constructor.
    validate_ratio : float
        Fraction (between 0 and 1) of the training windows held out as one
        contiguous validation block.
    dataset : str
        Name of the sub-directory of ``./data`` holding ``train.csv`` and
        ``test.csv``.
    device : str
        Device the model is moved to with ``.to(device)``.
    epoch, comment, random_seed, decay, save_path_pattern, report, load_model_path
        Stored on the instance; none of the methods shown here reads them.
    """
    def __init__(self,
        batch: int = 128,
        epoch: int = 100,
        slide_win: int = 15,
        dim: int = 64,
        slide_stride: int = 5,
        comment: str = '',
        random_seed: int = 0,
        out_layer_num: int = 1,
        out_layer_inter_dim: int = 256,
        decay: float = 0,
        validate_ratio: float = 0.1,
        topk: int = 20,
        save_path_pattern: str = 'msl',
        dataset: str = 'msl',
        device: str = 'cpu',
        report: str = 'best',
        load_model_path: str = '',
        ):

        self.batch = batch
        self.epoch = epoch
        self.slide_win = slide_win
        self.dim = dim
        self.slide_stride = slide_stride
        self.comment = comment
        self.random_seed = random_seed
        self.out_layer_num = out_layer_num
        self.out_layer_inter_dim = out_layer_inter_dim
        self.decay = decay
        self.validate_ratio = validate_ratio
        self.topk = topk
        self.save_path_pattern = save_path_pattern
        self.dataset = dataset
        self.device = device
        self.report = report
        self.load_model_path = load_model_path


    def _split_train_validation(self, data):
        """
        Split ``data`` into a training subset and one contiguous validation block.

        The block holds ``int(len(data) * validate_ratio)`` consecutive items and
        starts at a position drawn from the global ``random`` generator; all
        remaining items form the training subset.

        Parameters
        ----------
        data : sized, indexable dataset

        Returns
        -------
        (train_subset, validate_subset) : tuple of ``torch.utils.data.Subset``

        Raises
        ------
        ValueError
            If ``validate_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= self.validate_ratio <= 1:
            raise ValueError(
                f'validate_ratio must be between 0 and 1, got {self.validate_ratio!r}'
            )

        dataset_len = len(data)
        validate_use_len = int(dataset_len * self.validate_ratio)

        # The valid start positions are 0 .. dataset_len - validate_use_len
        # inclusive, hence the +1: without it the block could never reach the
        # last item, and randrange(0) raised when the block spans all of data.
        validate_start_idx = random.randrange(dataset_len - validate_use_len + 1)
        validate_end_idx = validate_start_idx + validate_use_len
        idx = torch.arange(dataset_len)

        train_sub_idx = torch.cat([idx[:validate_start_idx], idx[validate_end_idx:]])
        train_subset = Subset(data, train_sub_idx)

        validate_sub_idx = idx[validate_start_idx:validate_end_idx]
        validate_subset = Subset(data, validate_sub_idx)

        return train_subset, validate_subset


    @staticmethod
    def _fully_connected_edge_index(feature_list):
        """Return the 2 x E long tensor of directed edges linking every feature to every other one."""
        n_features = len(feature_list)
        sources, targets = [], []
        # Edges are grouped by target node, sources ascending. Working on
        # positions avoids the repeated list.index() scans over feature_list.
        for target in range(n_features):
            for source in range(n_features):
                if source != target:
                    sources.append(source)
                    targets.append(target)
        return torch.tensor([sources, targets], dtype=torch.long)


    def _load_data(self):
        """
        Read the train/test CSVs and build datasets, data loaders and the model.

        Reads ``./data/<dataset>/train.csv`` and ``test.csv`` (first column as
        index). An ``attack`` column is dropped from the training frame if
        present and is used as the label vector of the test frame. Every
        training column whose name does not start with ``'_'`` is a feature.

        The results are stored on the instance as ``feature_list``,
        ``train_dataset``, ``test_dataset``, ``train_dataloader``,
        ``validate_dataloader``, ``test_dataloader`` and ``model``.
        """
        train = pd.read_csv(f'./data/{self.dataset}/train.csv', sep=',', index_col=0)
        test = pd.read_csv(f'./data/{self.dataset}/test.csv', sep=',', index_col=0)

        train = train.drop(columns=['attack']) if 'attack' in train.columns else train

        feature_list = train.columns[train.columns.str[0] != '_'].to_list() # convention is to pass non-features as '_'
        assert len(feature_list) == len(set(feature_list))

        # fully connected structure: each feature receives an edge from every other feature
        fc_edge_idx = self._fully_connected_edge_index(feature_list)

        train_input = _parse_data(train, feature_list)
        test_input = _parse_data(test, feature_list, labels=test.attack.tolist())

        cfg = {
            'slide_win': self.slide_win,
            'slide_stride': self.slide_stride,
        }

        train_dataset = TimeDataset(train_input, fc_edge_idx, mode='train', config=cfg)
        test_dataset = TimeDataset(test_input, fc_edge_idx, mode='test', config=cfg)

        train_subset, validate_subset = self._split_train_validation(train_dataset)

        # get data loaders; only the training loader is shuffled
        train_dataloader = DataLoader(train_subset, batch_size=self.batch,
                                shuffle=True)

        validate_dataloader = DataLoader(validate_subset, batch_size=self.batch,
                                shuffle=False)

        test_dataloader = DataLoader(test_dataset, batch_size=self.batch,
                                shuffle=False, num_workers=0)

        # instantiate model
        model = GDN([fc_edge_idx],
            len(feature_list),
            dim=self.dim,
            input_dim=self.slide_win,
            out_layer_num=self.out_layer_num,
            out_layer_inter_dim=self.out_layer_inter_dim,
            topk=self.topk
        ).to(self.device)

        # save to self
        self.feature_list = feature_list
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.train_dataloader = train_dataloader
        self.validate_dataloader = validate_dataloader
        self.test_dataloader = test_dataloader
        self.model = model

    def fit(self):
        """Load the data and build the loaders and model; no training step is run here yet."""
        self._load_data()





def _parse_data(data, feature_list, labels=None):

    labels = [0]*data.shape[0] if labels == None else labels
    res = data[feature_list].T.values.tolist()
    res.append(labels)
    return res

In [None]:
# Build a detector with every setting left at its GNNAD.__init__ default.
model = GNNAD()
# fit() only calls _load_data(): it reads the CSVs under ./data/<dataset>/ and
# builds the data loaders and the GDN model; no training loop runs here.
model.fit()


In [105]:
# TODO: refactor into a module
#    - init: build the dataloaders and the GDN model
#    - run: train, test, check the output
# TODO: write unit tests
#--------------------

# TODO: build an interactive validation screen
# TODO: reproduce the plots from the paper
# TODO: add error handling for real data!

#--------------------
# Research ideas (graph metrics, input-node-related anomalies)