In [1]:
%load_ext autoreload
%autoreload 2
import utils.dataset as dataset
import utils.preprocessing as preprocessing
from utils.logger import Logger
import lstm.model as model
import utils.postprocessing as postprocessing
import pandas as pd
from sklearn import metrics
from numpy import linalg as LA
import numpy as np
import logging
import argparse
from sklearn import svm

from astropy.convolution import Gaussian1DKernel, convolve

import torch
import torch.nn as nn
from pathlib import Path
from torch.autograd import Variable

import time

  from ._conv import register_converters as _register_converters


# SVM

In [2]:
# Experiment configuration.  argparse is used purely as a typed namespace of
# defaults: parse_args([]) is given an empty argv, so the notebook kernel's own
# command-line arguments never end up in `args`.
parser = argparse.ArgumentParser()

# --cuda keeps its original spelling and default (True).  A store_true flag
# whose default is already True can never become False, so --no_cuda is added
# as the explicit way to switch it off.
parser.add_argument("--cuda", default=True, action="store_true")
parser.add_argument("--no_cuda", dest="cuda", action="store_false")
parser.add_argument("--tf_log", default=False, action="store_true")
parser.add_argument("--model_name", type=str, default="enc_dec")
parser.add_argument("--batch_size", type=int, default=256)
# Gradient-clipping norm: parsed as float so fractional thresholds such as
# "--clip 0.5" are accepted (type=int rejected them); the default is unchanged.
parser.add_argument("--clip", type=float, default=1)

# Data locations (relative to the notebook's working directory).
parser.add_argument("--train_path", type=str, default="../../DATA/SWaT/SWaT_Physical/SWaT_Dataset_Normal_v0.csv")
parser.add_argument("--test_path", type=str, default="../../DATA/SWaT/SWaT_Physical/SWaT_Dataset_Attack_v0.csv")
parser.add_argument("--attack_list_path", type=str, default='../../DATA/SWaT/SWaT_Physical/attack_list.csv')

# Network / optimiser hyper-parameters.
parser.add_argument("--dropout", type=float, default=0.5)
parser.add_argument("--hidden_size", type=int, default=4)
parser.add_argument("--nlayers", type=int, default=2)
parser.add_argument("--lr", type=float, default=0.0001)

parser.add_argument("--cell_type", type=str, default="LSTM")
parser.add_argument("--epoch", type=int, default=10)
parser.add_argument("--seq_length", type=int, default=2)
# Indices of the data columns fed to the models.
parser.add_argument('--selected_dim', nargs='+', type=int, default=[36, 38, 28, 40])
args = parser.parse_args([])

In [3]:
class SVM_solver():
    """One-class SVM anomaly-detection baseline on a subset of data columns.

    The constructor loads the train/test arrays through ``dataset.svm_dataset``
    and keeps only the columns listed in ``args.selected_dim``.  ``fit`` trains
    an RBF ``OneClassSVM`` on the training array and prints the F1 score of the
    ``-1`` class on the test array.
    """

    def __init__(self, args):
        """Seed the RNGs, then load the attack list and the train/test arrays.

        Args:
            args: namespace providing ``attack_list_path``, ``train_path``,
                ``test_path`` and ``selected_dim`` (column indices to keep).
        """
        torch.manual_seed(777)
        torch.cuda.manual_seed_all(777)
        np.random.seed(777)

        self.attack_list = self._read_attack_list(args.attack_list_path)
        train_x, test_x, self.test_y = dataset.svm_dataset(train_path=args.train_path, test_path=args.test_path)
        # Column selection on axis 1; both arrays are indexed the same way.
        self.train_x = train_x[:, args.selected_dim]
        self.test_x = test_x[:, args.selected_dim]

    @staticmethod
    def _read_attack_list(path):
        """Read the tab-separated attack list, warning on and skipping malformed rows.

        ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        ``on_bad_lines='warn'`` is its replacement.  The new keyword is tried
        first and the legacy one is used only when pandas does not know it.
        """
        try:
            return pd.read_csv(path, on_bad_lines='warn', sep='\t')
        except TypeError:
            # pandas < 1.3: `on_bad_lines` is an unexpected keyword argument.
            return pd.read_csv(path, error_bad_lines=False, sep='\t')

    def fit(self, load=False, nu=0.001, gamma=0.001):
        """Train the one-class SVM and print its F1 score on the test array.

        Args:
            load: accepted for call compatibility; not used.
            nu: ``OneClassSVM`` ``nu`` parameter (previously hard-coded to 0.001).
            gamma: RBF kernel coefficient (previously hard-coded to 0.001).
        """
        clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma, verbose=False)
        clf.fit(self.train_x)
        preds = clf.predict(self.test_x)

        # The -1 label is scored as the positive class.
        f1 = metrics.f1_score(self.test_y, preds, pos_label=-1)
        print(f1)
        

In [4]:
# Experiment configuration.  argparse is used purely as a typed namespace of
# defaults: parse_args([]) is given an empty argv, so the notebook kernel's own
# command-line arguments never end up in `args`.
parser = argparse.ArgumentParser()

# --cuda keeps its original spelling and default (True).  A store_true flag
# whose default is already True can never become False, so --no_cuda is added
# as the explicit way to switch it off.
parser.add_argument("--cuda", default=True, action="store_true")
parser.add_argument("--no_cuda", dest="cuda", action="store_false")
parser.add_argument("--tf_log", default=False, action="store_true")
parser.add_argument("--model_name", type=str, default="enc_dec")
parser.add_argument("--batch_size", type=int, default=256)
# Gradient-clipping norm: parsed as float so fractional thresholds such as
# "--clip 0.5" are accepted (type=int rejected them); the default is unchanged.
parser.add_argument("--clip", type=float, default=1)

# Data locations (relative to the notebook's working directory).
parser.add_argument("--train_path", type=str, default="../../DATA/SWaT/SWaT_Physical/SWaT_Dataset_Normal_v0.csv")
parser.add_argument("--test_path", type=str, default="../../DATA/SWaT/SWaT_Physical/SWaT_Dataset_Attack_v0.csv")
parser.add_argument("--attack_list_path", type=str, default='../../DATA/SWaT/SWaT_Physical/attack_list.csv')

# Network / optimiser hyper-parameters.
parser.add_argument("--dropout", type=float, default=0.5)
parser.add_argument("--hidden_size", type=int, default=4)
parser.add_argument("--nlayers", type=int, default=2)
parser.add_argument("--lr", type=float, default=0.0001)

parser.add_argument("--cell_type", type=str, default="LSTM")
parser.add_argument("--epoch", type=int, default=1)
parser.add_argument("--seq_length", type=int, default=2)
# Indices of the data columns fed to the models.
parser.add_argument('--selected_dim', nargs='+', type=int, default=[36, 38, 28, 40])
args = parser.parse_args([])

In [66]:
# One-class SVM baseline restricted to data columns 35 and 44.
args.selected_dim, args.hidden_size = [35, 44], 4
solver = SVM_solver(args=args)
solver.fit(load=False)

  
  pca.fit(X_train)


(496800, 2) (449919, 2)
(449919,) (449919,)
0.035280447871553815


In [76]:
# One-class SVM baseline restricted to data columns 35 and 44.
args.selected_dim, args.hidden_size = [35, 44], 4
solver = SVM_solver(args=args)
solver.fit(load=False)

w1


  pca.fit(X_train)
  X_train = pca.transform(X_train)


(496800, 2) (449919, 2)
(449919,) (449919,)
0.8077569323301833


In [5]:
# One-class SVM baseline restricted to data columns 35 and 44.
args.selected_dim, args.hidden_size = [35, 44], 4
solver = SVM_solver(args=args)
solver.fit(load=False)

w1


  y_train = y_train.rolling(10, min_periods=1).apply(check)
  y_test = y_test.rolling(10, min_periods=1).apply(check)


0.8077569323301833


In [6]:
# One-class SVM baseline restricted to data columns 0 and 39.
args.selected_dim, args.hidden_size = [0, 39], 4
solver = SVM_solver(args=args)
solver.fit(load=False)

w1


  y_train = y_train.rolling(10, min_periods=1).apply(check)
  y_test = y_test.rolling(10, min_periods=1).apply(check)


0.3524330154683402


In [None]:
# One-class SVM baseline restricted to data columns 0 and 2.
args.selected_dim, args.hidden_size = [0, 2], 4
solver = SVM_solver(args=args)
solver.fit(load=False)

# LSTM

## model

In [5]:
class ENCODER(nn.Module):
    """Recurrent encoder: a stacked LSTM/GRU followed by a linear read-out.

    Args:
        args: mapping read with ``args[key]``; the keys used are ``dropout``,
            ``hidden_size``, ``data_dim``, ``rnn_inp_size``, ``nlayers`` and
            ``cell_type`` (``'LSTM'`` or ``'GRU'``).

    Raises:
        ValueError: if ``cell_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, args):
        super(ENCODER, self).__init__()
        self.args = args
        # Dropout layer is created here but forward() never applies it.
        self.drop = nn.Dropout(args['dropout'])
        self.linear = nn.Linear(args['hidden_size'], args['data_dim'])

        # Fail at construction time instead of leaving `self.rnn` undefined and
        # hitting an AttributeError on the first forward pass.
        if args['cell_type'] not in ('LSTM', 'GRU'):
            raise ValueError("cell_type must be 'LSTM' or 'GRU', got {!r}".format(args['cell_type']))
        self.rnn = getattr(nn, args['cell_type'])(args['rnn_inp_size'], args['hidden_size'],
                                                  args['nlayers'], dropout=args['dropout'])

    def init_weights(self):
        """Re-initialise the linear layer: uniform(-0.1, 0.1) weights, zero bias."""
        initrange = 0.1
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.fill_(0)

    def forward(self, input, hidden, return_hiddens=False, noise=False):
        """Run the RNN over ``input`` and project every step with the linear layer.

        Args:
            input: tensor whose first dimension is the sequence length (the RNN
                is built without ``batch_first``) and whose last dimension is
                ``rnn_inp_size``.
            hidden: initial recurrent state, as produced by ``init_hidden``.
            return_hiddens: accepted for call compatibility; not used.
            noise: accepted for call compatibility; not used.

        Returns:
            ``(output, hidden)`` where ``output`` has ``input.size(0)`` as its
            first dimension and ``data_dim`` as its last.
        """
        output, hidden = self.rnn(input, hidden)
        output = self.linear(output.contiguous().view(-1, self.args['hidden_size']))
        # The linear layer emits `data_dim` features, so reshape with that size
        # (identical to the former `rnn_inp_size` reshape whenever the two agree).
        output = output.contiguous().view(input.size()[0], -1, self.args['data_dim'])
        return output, hidden

    def init_hidden(self, bsz):
        """Return an all-zero initial state for a batch of ``bsz`` sequences.

        LSTM cells get a ``(h0, c0)`` tuple, GRU cells a single tensor; each
        tensor has shape ``(nlayers, bsz, hidden_size)``.
        """
        # Borrow dtype/device from an existing parameter so the state lives
        # wherever the module currently does.
        weight = next(self.parameters())
        shape = (self.args['nlayers'], bsz, self.args['hidden_size'])
        if self.args['cell_type'] == 'LSTM':
            return (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
                    torch.zeros(shape, dtype=weight.dtype, device=weight.device))
        return torch.zeros(shape, dtype=weight.dtype, device=weight.device)

    def repackage_hidden(self, h):
        """Detach hidden state(s) from their autograd history (recursing into tuples)."""
        if isinstance(h, tuple):
            return tuple(self.repackage_hidden(v) for v in h)
        return h.detach()

    def extract_hidden(self, hidden):
        """Return the last layer's hidden state as a CPU tensor."""
        if self.args['cell_type'] == 'LSTM':
            return hidden[0][-1].data.cpu()  # hidden state last layer (hidden[1] is cell state)
        else:
            return hidden[-1].data.cpu()  # last layer

        
class DECODER(nn.Module):
    """Recurrent decoder: a stacked LSTM/GRU followed by a linear read-out.

    Args:
        args: mapping read with ``args[key]``; the keys used are ``dropout``,
            ``hidden_size``, ``data_dim``, ``rnn_inp_size``, ``nlayers`` and
            ``cell_type`` (``'LSTM'`` or ``'GRU'``).

    Raises:
        ValueError: if ``cell_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, args):
        super(DECODER, self).__init__()
        self.args = args
        # Dropout layer is created here but forward() never applies it.
        self.drop = nn.Dropout(args['dropout'])
        self.linear = nn.Linear(args['hidden_size'], args['data_dim'])

        # Fail at construction time instead of leaving `self.rnn` undefined and
        # hitting an AttributeError on the first forward pass.
        if args['cell_type'] not in ('LSTM', 'GRU'):
            raise ValueError("cell_type must be 'LSTM' or 'GRU', got {!r}".format(args['cell_type']))
        self.rnn = getattr(nn, args['cell_type'])(args['rnn_inp_size'], args['hidden_size'],
                                                  args['nlayers'], dropout=args['dropout'])

    def init_weights(self):
        """Re-initialise the linear layer: uniform(-0.1, 0.1) weights, zero bias."""
        initrange = 0.1
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.fill_(0)

    def forward(self, input, hidden, return_hiddens=False, noise=False):
        """Run the RNN over ``input`` and project every step with the linear layer.

        Args:
            input: tensor whose first dimension is the sequence length (the RNN
                is built without ``batch_first``) and whose last dimension is
                ``rnn_inp_size``.
            hidden: initial recurrent state, as produced by ``init_hidden``.
            return_hiddens: accepted for call compatibility; not used.
            noise: accepted for call compatibility; not used.

        Returns:
            ``(output, hidden)`` where ``output`` has ``input.size(0)`` as its
            first dimension and ``data_dim`` as its last.
        """
        output, hidden = self.rnn(input, hidden)
        output = self.linear(output.contiguous().view(-1, self.args['hidden_size']))
        # The linear layer emits `data_dim` features, so reshape with that size
        # (identical to the former `rnn_inp_size` reshape whenever the two agree).
        output = output.contiguous().view(input.size()[0], -1, self.args['data_dim'])
        return output, hidden

    def init_hidden(self, bsz):
        """Return an all-zero initial state for a batch of ``bsz`` sequences.

        LSTM cells get a ``(h0, c0)`` tuple, GRU cells a single tensor; each
        tensor has shape ``(nlayers, bsz, hidden_size)``.
        """
        # Borrow dtype/device from an existing parameter so the state lives
        # wherever the module currently does.
        weight = next(self.parameters())
        shape = (self.args['nlayers'], bsz, self.args['hidden_size'])
        if self.args['cell_type'] == 'LSTM':
            return (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
                    torch.zeros(shape, dtype=weight.dtype, device=weight.device))
        return torch.zeros(shape, dtype=weight.dtype, device=weight.device)

    def repackage_hidden(self, h):
        """Detach hidden state(s) from their autograd history (recursing into tuples)."""
        if isinstance(h, tuple):
            return tuple(self.repackage_hidden(v) for v in h)
        return h.detach()

    def extract_hidden(self, hidden):
        """Return the last layer's hidden state as a CPU tensor."""
        if self.args['cell_type'] == 'LSTM':
            return hidden[0][-1].data.cpu()  # hidden state last layer (hidden[1] is cell state)
        else:
            return hidden[-1].data.cpu()  # last layer

## solver

In [5]:
class Solver():
    """Training/evaluation driver for the encoder-decoder anomaly detector.

    The constructor loads the data through ``dataset.lstm_dataset``, batchifies
    it with ``preprocessing.batchify``, keeps only the columns listed in
    ``args.selected_dim`` and builds ``model.ENCODER`` / ``model.DECODER`` with
    one Adam optimiser each and an MSE loss.  ``fit`` trains both networks and
    evaluates once per epoch.
    """

    def __init__(self, args):
        """Seed the RNGs, load and batchify the data, and build models/optimisers.

        Args:
            args: namespace providing ``attack_list_path``, ``train_path``,
                ``test_path``, ``tf_log``, ``batch_size``, ``selected_dim`` and
                ``lr``; it is also forwarded unchanged to the preprocessing
                helpers and to the model constructors.
        """
        torch.manual_seed(777)
        torch.cuda.manual_seed_all(777)
        np.random.seed(777)

        self.attack_list = self._read_attack_list(args.attack_list_path)

        self.tf_log = args.tf_log

        # Alternative loader that was used here before:
        #   dataset.dataset(train_path=args.train_path, test_path=args.test_path)
        train_x, test_x, test_y = dataset.lstm_dataset(train_path=args.train_path, test_path=args.test_path)

        train_x_batchfy = preprocessing.batchify(args, train_x, args.batch_size)
        test_x_batchfy = preprocessing.batchify(args, test_x, args.batch_size)
        # Batch size 1 variants are the ones handed to the anomaly-score helpers.
        generate_batchfy = preprocessing.batchify(args, test_x, 1)
        train_generate_batchfy = preprocessing.batchify(args, train_x, 1)

        # Keep only the selected columns (last axis of the batchified tensors).
        self.train_x_batchfy = train_x_batchfy[:, :, args.selected_dim]
        self.test_x_batchfy = test_x_batchfy[:, :, args.selected_dim]
        self.generate_batchfy = generate_batchfy[:, :, args.selected_dim]
        self.train_generate_batchfy = train_generate_batchfy[:, :, args.selected_dim]
        self.test_y = test_y

        self.args = args
        # NOTE(review): the models are moved to CUDA unconditionally; `args.cuda`
        # is not consulted here.
        self.encoder = model.ENCODER(self.args)
        self.encoder.cuda()

        self.decoder = model.DECODER(self.args)
        self.decoder.cuda()

        self.optim_enc = torch.optim.Adam(self.encoder.parameters(), self.args.lr)
        self.optim_dec = torch.optim.Adam(self.decoder.parameters(), self.args.lr)

        self.loss_fn = nn.MSELoss()

        self.logger = Logger('./tf_logs')

        self.base_dir = Path('model_save')
        self.base_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _read_attack_list(path):
        """Read the tab-separated attack list, warning on and skipping malformed rows.

        ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        ``on_bad_lines='warn'`` is its replacement.  The new keyword is tried
        first and the legacy one is used only when pandas does not know it.
        """
        try:
            return pd.read_csv(path, on_bad_lines='warn', sep='\t')
        except TypeError:
            # pandas < 1.3: `on_bad_lines` is an unexpected keyword argument.
            return pd.read_csv(path, error_bad_lines=False, sep='\t')

    def load(self, path):
        """Placeholder for checkpoint loading.

        NOTE(review): ``path`` is never read and no state is restored; the
        method only prints a message.
        """
        try:
            print("=> loaded checkpoint")
        except Exception:
            print("=> Not exist checkpoint")

    def fit(self, load=False):
        """Train encoder and decoder, evaluating after every epoch.

        For each window of ``seq_length`` steps the encoder is trained on its
        last output against the first target step, then the decoder is trained
        on the whole target window.  After each epoch the anomaly score is
        computed and evaluated; a summary line is printed whenever the F1 score
        exceeds the best value seen so far in this call.

        Args:
            load: accepted for call compatibility; not used.
        """
        max_f1 = 0
        total_length = self.train_x_batchfy.size(0) - 1
        start_time = time.time()

        for epoch in range(0, self.args.epoch):

            self.encoder.train()
            self.decoder.train()

            hidden_enc = self.encoder.init_hidden(self.args.batch_size)

            for batch, i in enumerate(range(0, self.train_x_batchfy.size(0) - 1, self.args.seq_length)):
                inputSeq, targetSeq = preprocessing.get_batch(self.args, self.train_x_batchfy, i)

                # Skip the trailing window when it is shorter than seq_length.
                if self.args.seq_length != targetSeq.size()[0]:
                    continue
                hidden_enc = self.encoder.repackage_hidden(hidden_enc)
                self.optim_enc.zero_grad()
                self.optim_dec.zero_grad()

                Outputseq_enc, hidden_enc = self.encoder.forward(inputSeq, hidden_enc, return_hiddens=True)
                # Decoder input: last encoder output first, then the targets
                # shifted by one step.
                deccoder_input = torch.zeros_like(Outputseq_enc)

                deccoder_input[0, :, :] = Outputseq_enc[-1, :, :]  # inputSeq[-1,:,:]
                deccoder_input[1:, :, :] = targetSeq[:-1, :, :]

                loss_enc = self.loss_fn(Outputseq_enc[-1, :, :].view(self.args.batch_size, -1),
                                        targetSeq[0, :, :].contiguous().view(self.args.batch_size, -1))
                # The graph is retained because the decoder pass below still
                # depends on the encoder outputs.
                loss_enc.backward(retain_graph=True)

                # Sum of absolute gradient values, taken before clipping (logging only).
                encoder_norm = sum(p.grad.data.abs().sum() for p in self.encoder.parameters())

                torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), self.args.clip)

                self.optim_enc.step()

                # NOTE(review): the decoder loss back-propagates into the encoder
                # graph after optim_enc.step() has already updated the encoder
                # weights in place; recent PyTorch versions may reject this --
                # confirm against the installed release.
                # The decoder's final state becomes next iteration's `hidden_enc`.
                Outputseq_dec, hidden_enc = self.decoder.forward(deccoder_input, hidden_enc, return_hiddens=True)
                loss_dec = self.loss_fn(Outputseq_dec.view(self.args.batch_size, -1),
                                        targetSeq.contiguous().view(self.args.batch_size, -1))
                loss_dec.backward()

                decoder_norm = sum(p.grad.data.abs().sum() for p in self.decoder.parameters())

                torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), self.args.clip)
                self.optim_dec.step()

                if batch % 30 == 0 and self.tf_log:
                    print(encoder_norm)
                    print(decoder_norm)
                    # 1. Log scalar values (scalar summary)
                    info = {'enc_loss': loss_enc.item(), 'dec_loss': loss_dec.item()}

                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, epoch*total_length + i + 1)

                    # 2. Log values of the parameters (histogram summary)
                    for tag, value in self.encoder.named_parameters():
                        tag = tag.replace('.', '/')
                        self.logger.histo_summary(tag, value.data.cpu().numpy(), epoch*total_length + i + 1)

                    for tag, value in self.decoder.named_parameters():
                        tag = tag.replace('.', '/')
                        self.logger.histo_summary(tag, value.data.cpu().numpy(), epoch*total_length + i + 1)

            # NOTE(review): length=449916 is hard-coded; presumably tied to the
            # size of the test split -- TODO confirm.
            if len(self.args.selected_dim) == 1:
                self.anomal_score = postprocessing.get_anomalscore_encdec_1dim(
                    base_model=self, generate_batchfy=self.generate_batchfy, length=449916, args=self.args)
            else:
                self.anomal_score = postprocessing.get_anomalscore_encdec(
                    base_model=self, generate_batchfy=self.generate_batchfy, length=449916, args=self.args)

            # Collapse the per-dimension scores into one norm per row.
            self.anomal_score = LA.norm(self.anomal_score, axis=1)
            # TODO: make the conv setting (the literal 400 below) configurable.
            max_conv, max_pre, max_recall, max_f1_tp, max_zerolist, find_attack_list = postprocessing.evaluate_conv(
                self.anomal_score, self.test_y, self.attack_list, 400)

            if max_f1_tp > max_f1:
                # Remember the best score so later epochs report only real improvements.
                max_f1 = max_f1_tp
                end_time = time.time()

                # NOTE(review): 36 is presumably the total number of attacks -- TODO confirm.
                print("epoch[{}]\t conv[{}]\t precision[{}]\t recall[{}]\t f1[{}]\t findnum[{}]\t time[{}]".format(epoch, max_conv, max_pre, max_recall, max_f1_tp,36 - max_zerolist, end_time - start_time))

    def save_checkpoint(self, args, state):
        """Save ``state`` under ``base_dir``, named after ``args.selected_dim`` with a ``.pth`` suffix."""
        checkpoint = Path(self.base_dir, str(args.selected_dim))
        checkpoint = checkpoint.with_suffix('.pth')
        torch.save(state, checkpoint)
        

In [3]:
# Experiment configuration.  argparse is used purely as a typed namespace of
# defaults: parse_args([]) is given an empty argv, so the notebook kernel's own
# command-line arguments never end up in `args`.
parser = argparse.ArgumentParser()

# --cuda keeps its original spelling and default (True).  A store_true flag
# whose default is already True can never become False, so --no_cuda is added
# as the explicit way to switch it off.
parser.add_argument("--cuda", default=True, action="store_true")
parser.add_argument("--no_cuda", dest="cuda", action="store_false")
parser.add_argument("--tf_log", default=False, action="store_true")
parser.add_argument("--model_name", type=str, default="enc_dec")
parser.add_argument("--batch_size", type=int, default=256)
# Gradient-clipping norm: parsed as float so fractional thresholds such as
# "--clip 0.5" are accepted (type=int rejected them); the default is unchanged.
parser.add_argument("--clip", type=float, default=1)

# Data locations (relative to the notebook's working directory).
parser.add_argument("--train_path", type=str, default="../../DATA/SWaT/SWaT_Physical/SWaT_Dataset_Normal_v0.csv")
parser.add_argument("--test_path", type=str, default="../../DATA/SWaT/SWaT_Physical/SWaT_Dataset_Attack_v0.csv")
parser.add_argument("--attack_list_path", type=str, default='../../DATA/SWaT/SWaT_Physical/attack_list.csv')

# Network / optimiser hyper-parameters.
parser.add_argument("--dropout", type=float, default=0.5)
parser.add_argument("--hidden_size", type=int, default=4)
parser.add_argument("--nlayers", type=int, default=2)
parser.add_argument("--lr", type=float, default=0.0001)

parser.add_argument("--cell_type", type=str, default="LSTM")
parser.add_argument("--epoch", type=int, default=1)
parser.add_argument("--seq_length", type=int, default=2)
# Indices of the data columns fed to the models.
parser.add_argument('--selected_dim', nargs='+', type=int, default=[36, 38, 28, 40])
args = parser.parse_args([])

In [4]:
# Run with the general-purpose dataset function.
# Encoder-decoder model restricted to data columns 0 and 40.
args.selected_dim, args.hidden_size = [0, 40], 4
solver = Solver(args=args)
solver.fit(load=False)

In train : {'Normal': 496800}
In test : {'Attack': 54621, 'Normal': 395298}
data length is 51
epoch[0]	 conv[20]	 precision[0.97355307480304]	 recall[0.6402482561652112]	 f1[0.7724812794062424]	 findnum[11]	 time[287.5399081707001]


In [8]:
# Run with the LSTM dataset function.
# Encoder-decoder model restricted to data columns 0 and 40.
args.selected_dim, args.hidden_size = [0, 40], 4
solver = Solver(args=args)
solver.fit(load=False)

epoch[0]	 conv[20]	 precision[0.97355307480304]	 recall[0.6402482561652112]	 f1[0.7724812794062424]	 findnum[11]	 time[288.06569743156433]
