In [1]:
# Extend the module search path with the parent directory (relative to the
# kernel's working directory) before the project imports in the next cell run.
# NOTE(review): presumably util/, datasets/, models/, train.py and evaluate.py
# live one level up -- confirm against the repository layout.
import sys 
sys.path.append('../')


In [None]:
import argparse
import json
import os
import random
import sys
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset, random_split, Subset

from util.env import get_device, set_device
from util.preprocess import build_loc_net, construct_data
from util.net_struct import get_feature_map, get_fc_graph_struc
from util.iostream import printsep

from datasets.TimeDataset import TimeDataset

from models.GDN import GDN

from train import train
# from test  import test
from evaluate import get_err_scores, get_best_performance_data, get_val_performance_data, get_full_err_scores


: 

In [3]:
# Command-line options, declared as (flag, help text, type, default) rows so the
# whole interface can be read as a table.
cli_options = (
    ('-batch', 'batch size', int, 128),
    ('-epoch', 'train epoch', int, 100),
    ('-slide_win', 'slide_win', int, 15),
    ('-dim', 'dimension', int, 64),
    ('-slide_stride', 'slide_stride', int, 5),
    ('-save_path_pattern', 'save path pattern', str, ''),
    ('-dataset', 'wadi / swat', str, 'wadi'),
    ('-device', 'cuda / cpu', str, 'cuda'),
    ('-random_seed', 'random seed', int, 0),
    ('-comment', 'experiment comment', str, ''),
    ('-out_layer_num', 'outlayer num', int, 1),
    ('-out_layer_inter_dim', 'out_layer_inter_dim', int, 256),
    ('-decay', 'decay', float, 0),
    ('-val_ratio', 'val ratio', float, 0.1),
    ('-topk', 'topk num', int, 20),
    ('-report', 'best / val', str, 'best'),
    ('-load_model_path', 'trained model path', str, ''),
)

parser = argparse.ArgumentParser()
for flag, help_text, value_type, default_value in cli_options:
    parser.add_argument(flag, help=help_text, type=value_type, default=default_value)

# An explicit empty argv makes the parser return the defaults instead of
# reading the notebook kernel's own command line.
args = parser.parse_args([])

In [11]:

# Training hyper-parameters copied from the parsed arguments.  Every key has the
# same name as its argument except 'seed', which comes from `-random_seed`.
train_config = dict(
    batch=args.batch,
    epoch=args.epoch,
    slide_win=args.slide_win,
    dim=args.dim,
    slide_stride=args.slide_stride,
    comment=args.comment,
    seed=args.random_seed,
    out_layer_num=args.out_layer_num,
    out_layer_inter_dim=args.out_layer_inter_dim,
    decay=args.decay,
    val_ratio=args.val_ratio,
    topk=args.topk,
)

In [4]:
# Environment / run settings taken from the parsed arguments.
# 'device' now follows the `-device` argument (default 'cuda') instead of being
# hard-coded, so switching to CPU only requires changing the argument.
env_config = {
    'save_path': args.save_path_pattern,
    'dataset': args.dataset,
    'report': args.report,
    'device': args.device,
    'load_model_path': args.load_model_path,
}

In [12]:
# Windowing options handed to TimeDataset: only the two sliding-window keys.
cfg = {key: train_config[key] for key in ('slide_win', 'slide_stride')}

In [6]:
# Dataset folder name under ./data/ (set here directly, not taken from env_config).
dataset = 'msl'

# Read both splits; the first CSV column becomes the index.
train_orig = pd.read_csv(f'./data/{dataset}/train.csv', sep=',', index_col=0)
test_orig = pd.read_csv(f'./data/{dataset}/test.csv', sep=',', index_col=0)

# The test frame is used as-is.
test = test_orig

# The training frame loses its 'attack' column when it has one.
# NOTE(review): binding the name `train` here replaces the `train` function
# imported from the `train` module in the import cell.
train = train_orig.drop(columns=['attack']) if 'attack' in train_orig.columns else train_orig

In [8]:
# Feature names for the chosen dataset.  The get_feature_map source pasted later
# in this notebook reads them line by line from ./data/<dataset>/list.txt;
# presumably the imported util.net_struct version does the same -- TODO confirm.
feature_map = get_feature_map(dataset)
# Graph structure for the dataset, consumed by build_loc_net in the next cell.
# NOTE(review): the exact structure returned is defined in util.net_struct and
# is not visible here.
fc_struc = get_fc_graph_struc(dataset)

# Register the configured device with util.env, then read it back.
# NOTE(review): `device` is not used again in the cells visible here.
set_device(env_config['device'])
device = get_device()

In [9]:
# Build the edge list from the graph structure and the training columns, then
# store it as an integer (long) tensor.
train_columns = list(train.columns)
edge_pairs = build_loc_net(fc_struc, train_columns, feature_map=feature_map)
fc_edge_index = torch.tensor(edge_pairs, dtype=torch.long)

# Row-wise inputs for TimeDataset: training data gets the constant label 0,
# test data gets its per-timestep 'attack' column as labels.
test_labels = test.attack.tolist()
train_dataset_indata = construct_data(train, feature_map, labels=0)
test_dataset_indata = construct_data(test, feature_map, labels=test_labels)



In [16]:
def get_loaders(train_dataset, seed, batch, val_ratio=0.1):
    """Split a dataset into a training loader and a validation loader.

    The validation set is one contiguous run of ``int(len * val_ratio)`` indices
    whose start position is drawn at random; every other index goes to the
    training set.

    Args:
        train_dataset: Indexable dataset with a length.
        seed: Seed for choosing where the validation window starts.  The same
            seed always yields the same split; ``None`` seeds from system
            entropy.
        batch: Batch size used by both loaders.
        val_ratio: Fraction of the dataset reserved for validation.

    Returns:
        ``(train_dataloader, val_dataloader)``.  The training loader shuffles
        its samples; the validation loader keeps them in index order.

    Raises:
        ValueError: If the split leaves no training samples (empty dataset or
            ``val_ratio`` too large).
    """
    dataset_len = len(train_dataset)
    train_use_len = int(dataset_len * (1 - val_ratio))
    val_use_len = int(dataset_len * val_ratio)

    if train_use_len <= 0:
        raise ValueError(
            f'no training samples left: dataset_len={dataset_len}, val_ratio={val_ratio}'
        )

    # A private generator makes the split depend only on `seed`, without
    # touching or depending on the global `random` state.
    rng = random.Random(seed)
    val_start_index = rng.randrange(train_use_len)
    val_end_index = val_start_index + val_use_len

    indices = torch.arange(dataset_len)

    # Training indices are everything before and after the validation window.
    train_sub_indices = torch.cat([indices[:val_start_index], indices[val_end_index:]])
    train_subset = Subset(train_dataset, train_sub_indices)

    val_sub_indices = indices[val_start_index:val_end_index]
    val_subset = Subset(train_dataset, val_sub_indices)

    train_dataloader = DataLoader(train_subset, batch_size=batch, shuffle=True)
    val_dataloader = DataLoader(val_subset, batch_size=batch, shuffle=False)

    return train_dataloader, val_dataloader

In [17]:
# Wrap the row-wise inputs as windowed datasets, training split first.
train_dataset, test_dataset = (
    TimeDataset(indata, fc_edge_index, mode=split_mode, config=cfg)
    for indata, split_mode in (
        (train_dataset_indata, 'train'),
        (test_dataset_indata, 'test'),
    )
)

# Carve the validation window out of the training dataset.
train_dataloader, val_dataloader = get_loaders(
    train_dataset,
    seed=train_config['seed'],
    batch=train_config['batch'],
    val_ratio=train_config['val_ratio'],
)


In [18]:
# Take only the first batch for inspection: pairing the loader with a
# one-element range ends the loop after a single iteration, leaving `i` and
# `batch` bound for the following cells.
for i, batch in zip(range(1), train_dataloader):
    print("sdf")

sdf


In [19]:
# Display the batch captured by the one-iteration loop over train_dataloader.
batch

[tensor([[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [ 0.7604,  0.7588,  0.7573,  ...,  1.0000,  1.0000,  1.0000],
          [-0.9948, -0.9894, -0.9839,  ..., -0.7487, -0.7487, -0.7487],
          ...,
          [-1.0000, -1.0000,  0.9839,  ...,  0.9248,  0.9248,  0.9248],
          [-1.0012, -1.0012, -1.0012,  ..., -1.0012, -1.0012, -1.0012],
          [-1.0000, -1.0000, -1.0000,  ...,  0.8261,  0.8261,  0.6522]],
 
         [[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [ 1.0000,  1.0000,  0.5458,  ..., -0.0097, -0.0105, -0.0105],
          [-0.7487, -0.9498, -1.1900,  ..., -1.1411, -1.1411, -1.1303],
          ...,
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-0.9991, -0.9991, -0.9991,  ..., -0.9991, -0.9991, -0.9991],
          [-1.0000, -0.9565, -1.0000,  ..., -0.9130, -0.9130, -0.8261]],
 
         [[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-0.0648, -0.0667,

In [21]:
# Number of items in the batch.  The TimeDataset source pasted later in this
# notebook returns (feature, y, label, edge_index) from __getitem__.
len(batch)

4

In [25]:
# Shape of the fourth batch item.
# NOTE(review): presumably the batched edge_index, going by the return order in
# the TimeDataset source pasted later in this notebook -- confirm.
batch[3].shape

torch.Size([128, 2, 702])

# How is the dataset constructed?

In [None]:

def construct_data(data, feature_map, labels=0):
    """Convert a DataFrame into row-wise lists with a trailing label row.

    Args:
        data: DataFrame with one column per feature.
        feature_map: Iterable of feature names.  Names that are not columns of
            ``data`` are reported on stdout and skipped.
        labels: Either a single int, repeated for every sample, or a sequence
            holding exactly one label per row of ``data``.

    Returns:
        A list holding one list of values per matched feature (in
        ``feature_map`` order), followed by the list of labels as the last
        element.

    Raises:
        ValueError: If ``labels`` is a sequence whose length differs from the
            number of rows.  Previously the label row was silently left out,
            which made consumers treat the last feature row as labels.
    """
    res = []

    for feature in feature_map:
        if feature in data.columns:
            res.append(data.loc[:, feature].values.tolist())
        else:
            print(feature, 'not exist in data')

    # Take the sample count from the frame itself so that it is defined even
    # when none of the requested features is present.
    sample_n = len(data)

    # Labels always go last.
    if isinstance(labels, int):
        res.append([labels] * sample_n)
    elif len(labels) == sample_n:
        # Copy so the result does not alias the caller's sequence.
        res.append(list(labels))
    else:
        raise ValueError(
            f'labels has {len(labels)} entries but data has {sample_n} rows'
        )

    return res

In [None]:
def get_feature_map(dataset):
    """Read the feature names of a dataset.

    Args:
        dataset: Name of the dataset folder under ``./data/``.

    Returns:
        A list with one entry per line of ``./data/<dataset>/list.txt``, each
        stripped of surrounding whitespace.

    Raises:
        OSError: If the list file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(f'./data/{dataset}/list.txt', 'r') as feature_file:
        feature_list = [ft.strip() for ft in feature_file]

    return feature_list

In [26]:
# Read the feature names inline, one per line, stripped of whitespace.
# The context manager closes the file once the loop is done.
feature_list = []
with open(f'./data/{dataset}/list.txt', 'r') as feature_file:
    for ft in feature_file:
        feature_list.append(ft.strip())

In [27]:
# Display the feature names read from list.txt.
feature_list

['M-6',
 'M-1',
 'M-2',
 'S-2',
 'P-10',
 'T-4',
 'T-5',
 'F-7',
 'M-3',
 'M-4',
 'M-5',
 'P-15',
 'C-1',
 'C-2',
 'T-12',
 'T-13',
 'F-4',
 'F-5',
 'D-14',
 'T-9',
 'P-14',
 'T-8',
 'P-11',
 'D-15',
 'D-16',
 'M-7',
 'F-8']

In [28]:
# Display the training frame (its 'attack' column was dropped earlier if present).
train

Unnamed: 0_level_0,M-6,M-1,M-2,S-2,P-10,T-4,T-5,F-7,M-3,M-4,...,F-5,D-14,T-9,P-14,T-8,P-11,D-15,D-16,M-7,F-8
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.0,0.999976,-0.748738,-1.0,0.994353,0.0,-1.0,-0.642857,1.000046,1.000000,...,-0.059837,-1.0,-0.333329,0.999426,-1.0,0.941907,-1.000000,-1.000000,-1.001157,-0.826087
1,-1.0,0.999976,-0.748738,-1.0,0.993788,0.0,-1.0,-0.964286,1.000046,1.000000,...,-0.059238,-1.0,-0.333329,0.999296,-1.0,0.944196,-1.000000,-1.000000,-1.001157,-0.869565
2,-1.0,0.999976,-0.748738,-1.0,0.994353,0.0,-1.0,-0.785714,1.000046,1.000000,...,-0.059163,-1.0,-0.333329,0.999611,-1.0,0.943751,0.952800,0.983735,-1.001157,-0.869565
3,-1.0,0.999976,-0.748738,-1.0,0.993506,0.0,-1.0,-0.892857,1.000046,1.000000,...,-0.058563,-1.0,-0.333329,0.999500,-1.0,0.941081,0.951874,-1.000000,-1.001157,-0.782609
4,-1.0,0.999976,-0.748738,-1.0,0.994353,0.0,-1.0,-0.964286,1.000046,1.000000,...,-0.056016,-1.0,-1.000000,0.999519,-1.0,0.941653,-1.000000,-1.000000,-1.001157,-0.869565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1560,-1.0,-0.621155,-1.036327,0.0,0.994070,0.0,-1.0,-0.928571,0.591862,-1.216307,...,0.013526,-1.0,0.000000,0.999500,0.0,0.598373,-1.000000,0.000000,-0.999096,-1.000000
1561,-1.0,-0.622580,-1.031021,0.0,0.992094,0.0,-1.0,-0.928571,0.824871,-1.283452,...,0.015624,-1.0,0.000000,0.999352,0.0,0.659135,1.024063,0.000000,-0.999096,-1.000000
1562,-1.0,-0.624111,-1.025695,0.0,0.991529,0.0,-1.0,0.964286,1.000022,-1.237844,...,-0.835887,-1.0,0.000000,0.999463,0.0,0.762036,1.023137,0.000000,-0.999096,-1.000000
1563,-1.0,-0.625548,-1.020368,0.0,0.991529,0.0,-1.0,-0.964286,0.855654,-1.223329,...,-0.826820,-1.0,0.000000,0.999444,0.0,0.763498,-1.000000,0.000000,-0.999096,-1.000000


In [30]:
# Number of rows produced by construct_data.  Per the construct_data source
# pasted above: one row per matched feature plus the trailing label row.
len(train_dataset_indata)

28

In [32]:
# First row of the constructed data: the full series of values for the first
# feature in feature_map that is present in the training frame.
train_dataset_indata[0]

[-1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,

In [36]:
# Length of one row, i.e. the number of samples (rows of the training frame).
len(train_dataset_indata[0])

1565

In [None]:
class TimeDataset(Dataset):
    """Sliding-window dataset over a multivariate time series.

    ``raw_data`` is a list of equally long rows: one row of values per node,
    followed by a final row of per-timestep labels.  Each sample pairs the
    ``slide_win`` time steps before position ``i`` (the input window) with the
    values at position ``i`` (the target) and the label at position ``i``.

    Args:
        raw_data: Rows as described above.
        edge_index: Tensor describing the graph edges; returned unchanged
            (cast to long) with every sample.
        mode: ``'train'`` advances the window by ``slide_stride`` steps; any
            other value advances it one step at a time.
        config: Mapping with the integer keys ``'slide_win'`` and
            ``'slide_stride'``.

    Raises:
        TypeError: If ``config`` is ``None``.
        ValueError: If the series is too short to produce a single window.
    """

    def __init__(self, raw_data, edge_index, mode='train', config = None):
        self.raw_data = raw_data

        self.config = config
        self.edge_index = edge_index
        self.mode = mode

        # The last row holds the labels; everything before it is node data.
        x_data = raw_data[:-1]
        labels = raw_data[-1]

        # Convert to double-precision tensors.
        data = torch.tensor(x_data).double()
        labels = torch.tensor(labels).double()

        self.x, self.y, self.labels = self.process(data, labels)

    def __len__(self):
        """Return the number of windows."""
        return len(self.x)

    def process(self, data, labels):
        """Cut ``data`` of shape (node, time) into windows, targets and labels.

        Returns:
            ``(x, y, labels)`` where ``x`` stacks the input windows, ``y``
            stacks the targets and ``labels`` holds one label per window.
        """
        if self.config is None:
            raise TypeError("config must provide 'slide_win' and 'slide_stride'")

        x_arr, y_arr = [], []
        labels_arr = []

        slide_win, slide_stride = [self.config[k] for k
            in ['slide_win', 'slide_stride']
        ]
        is_train = self.mode == 'train'

        # Unpacking also enforces that `data` is two-dimensional.
        _, total_time_len = data.shape

        # Training strides over the series; other modes visit every position.
        rang = range(slide_win, total_time_len, slide_stride) if is_train else range(slide_win, total_time_len)

        if len(rang) == 0:
            raise ValueError(
                f'time series of length {total_time_len} is too short for slide_win={slide_win}'
            )

        for i in rang:
            # Window: the slide_win steps before i.  Target: the values at i.
            ft = data[:, i-slide_win:i]
            tar = data[:, i]

            x_arr.append(ft)
            y_arr.append(tar)

            labels_arr.append(labels[i])

        x = torch.stack(x_arr).contiguous()
        y = torch.stack(y_arr).contiguous()

        labels = torch.Tensor(labels_arr).contiguous()

        return x, y, labels

    def __getitem__(self, idx):
        """Return ``(feature, y, label, edge_index)`` for window ``idx``."""
        feature = self.x[idx].double()
        y = self.y[idx].double()

        edge_index = self.edge_index.long()

        label = self.labels[idx].double()

        return feature, y, label, edge_index