In [2]:
# 2022-11-23
# add sentences emb and time emb
# add full model
# add params of val module for candidate and rate, multi-classification task

import sys
sys.path.append('../../')

import numpy as np
import random
import pandas as pd
import time
import argparse
import os
import collections
from tqdm import tqdm
import nni
import json
import math

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as f
from torch.utils.data import TensorDataset, DataLoader
import dateutil.parser

from utils import pytorchtools
from collections import Counter
from torch.autograd import Variable

from sklearn.metrics import accuracy_score,precision_recall_fscore_support, top_k_accuracy_score

from torch_geometric.nn import GCNConv

import warnings
warnings.filterwarnings("ignore")

def seed_torch(seed=42):
    """Seed every random number generator used here and pin cuDNN flags.

    Args:
        seed: Value coerced with ``int()`` and applied to Python's ``random``,
            NumPy, the ``PYTHONHASHSEED`` environment variable and the
            PyTorch CPU / CUDA generators.
    """
    seed = int(seed)

    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, all CUDA devices.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # cuDNN stays enabled, but deterministic and without benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = True

# Seed all RNGs once, before any model or data object is created below.
seed_torch(42)

# Module-level device used by the evaluation helpers: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class MLPLayer(nn.Module):
    """Two linear layers with ReLU activations and dropout in between.

    Args:
        dmodel: Input feature width.
        hid_size: Width of both linear layers' outputs.
        drop: Dropout probability applied after the first activation.
        training: Accepted for call compatibility; it is not stored. Dropout
            follows the module's own ``self.training`` flag
            (``train()`` / ``eval()``).
    """

    def __init__(self, dmodel, hid_size, drop, training):
        super().__init__()
        self.dmodel, self.hid_size, self.drop = dmodel, hid_size, drop

        self.fc0 = nn.Linear(dmodel, hid_size)
        self.fc1 = nn.Linear(hid_size, hid_size)

    def forward(self, x):
        """Return ``relu(fc1(dropout(relu(fc0(x)))))``."""
        hidden = f.dropout(f.relu(self.fc0(x)), p=self.drop, training=self.training)
        return f.relu(self.fc1(hidden))

class CNNLayer(nn.Module):
    """Two unpadded, stride-1 ``Conv1d`` layers with ReLU and dropout in between.

    Args:
        dmodel: Number of input channels.
        hid_size: Number of output channels of both convolutions.
        ksize: Kernel size shared by both convolutions.
        drop: Dropout probability applied after the first activation.
        training: Accepted for call compatibility; it is not stored. Dropout
            follows the module's own ``self.training`` flag.
    """

    def __init__(self, dmodel, hid_size, ksize, drop, training):
        super().__init__()
        self.dmodel, self.hid_size = dmodel, hid_size
        self.ksize, self.drop = ksize, drop

        conv_kwargs = dict(kernel_size=ksize, stride=1, padding=0)
        self.cnn0 = nn.Conv1d(dmodel, hid_size, **conv_kwargs)
        self.cnn1 = nn.Conv1d(hid_size, hid_size, **conv_kwargs)

    def forward(self, x):
        """Return ``relu(cnn1(dropout(relu(cnn0(x)))))``."""
        hidden = f.dropout(f.relu(self.cnn0(x)), p=self.drop, training=self.training)
        return f.relu(self.cnn1(hidden))
    
class FTEncoder(nn.Module):
    """Fuse a sentence embedding with a scalar time feature.

    ``pattern`` selects the fusion strategy:
        * 0 - concatenate the inputs first, then one linear layer.
        * 1 - project each input separately, then concatenate; ``alpha`` is
          the share of ``hidden_size`` given to the sentence projection.
        * 2 - project each input to ``hidden_size`` and add the results.

    Args:
        sen_size: Width of the sentence embedding.
        hidden_size: Width of the fused output.
        alpha: Split ratio in (0, 1); only used when ``pattern == 1``.
        pattern: Fusion strategy, one of 0, 1, 2.
    """

    def __init__(self, sen_size, hidden_size, alpha=0.5, pattern=0):
        super().__init__()
        self.pattern = pattern
        assert self.pattern in [0, 1, 2], "pattern just in cat_first 0 or nn_first 1 or add_first 2"

        if pattern == 0:
            # The time scalar is appended as one extra input column.
            self.cat_fc = nn.Linear(sen_size + 1, hidden_size)
        elif pattern == 2:
            self.sen_fc = nn.Linear(sen_size, hidden_size)
            self.time_fc = nn.Linear(1, hidden_size)
        elif pattern == 1:
            self.alpha = alpha
            assert 0 < alpha < 1, "alpha is rate just in (0,1)"
            sen_width = int(hidden_size * alpha)
            time_width = hidden_size - sen_width
            print('x:{}, sen_x:{}, time_x:{}'.format(hidden_size, sen_width, time_width))
            self.sen_fc = nn.Linear(sen_size, sen_width)
            self.time_fc = nn.Linear(1, time_width)

    def forward(self, x): # x(sen_x, time_x)
        """Fuse ``x[0]`` (sentence features) with ``x[1]`` (time values)."""
        sen_in = x[0]
        # The time input gets a trailing feature axis of size 1.
        time_in = x[1].unsqueeze(-1)

        if self.pattern == 0:
            return self.cat_fc(torch.cat((sen_in, time_in), -1))
        if self.pattern == 1:
            return torch.cat((self.sen_fc(sen_in), self.time_fc(time_in)), -1)
        if self.pattern == 2:
            return self.sen_fc(sen_in) + self.time_fc(time_in)
        return cat_x
    
class LSTMEncoder(nn.Module):
    """Batch-first LSTM that returns the output at the last time step.

    Args:
        input_size: Feature width of each time step.
        hidden_size: LSTM hidden width (also the output width).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTMEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # self.fc = nn.Linear(hidden_size, hidden_size)
        # Kept only for backward compatibility with code that reads it;
        # forward() no longer relies on it.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x):
        """Encode ``x`` and return ``out[:, -1, :]``.

        The zero initial states are created with ``x.new_zeros`` so they
        always share the input's device and dtype. Previously they were put
        on a hard-coded "cuda if available" device with the default dtype,
        which failed when the module ran on CPU on a CUDA host or in double
        precision.
        """
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # out = self.fc(out[:, -1, :])
        return out[:, -1, :]

In [4]:
class IREncoder(nn.Module):
    """Relation encoder over a fully connected graph of the present components.

    For every unordered pair of present components an MLP scores the
    concatenated node features; the scores are normalised with a softmax and
    passed as ``edge_weight`` to two ``GCNConv`` layers.

    Args:
        dmodel: Node feature width (input and output).
        mlp_hid_size: Hidden width of the edge-scoring MLP.
        gcn_hid_size: Hidden width between the two GCN layers.
        drop: Dropout probability (edge MLP and between the GCN layers).
        com_num: Total number of components; row count of the padded node matrix.
        training: Forwarded to ``MLPLayer``. Dropout here follows the
            module's own ``self.training`` flag.
    """

    def __init__(self, dmodel, mlp_hid_size, gcn_hid_size, drop, com_num, training):
        super(IREncoder, self).__init__()
        self.dmodel = dmodel
        self.mlp_hid_size = mlp_hid_size
        self.gcn_hid_size = gcn_hid_size
        # self.ksize = ksize
        self.drop = drop
        self.com_num = com_num
        # NOTE: attribute name is misspelled ("taining"); kept as-is so code
        # that reads it keeps working. It is not used by this class.
        self.taining = training
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.edge_mlp = MLPLayer(2*dmodel, mlp_hid_size, drop, training)
        self.mlp_out = nn.Linear(mlp_hid_size, 1)

        # self.edge_cnn = CNNLayer(hid_size, hid_size, ksize, drop, training)
        # Not used by forward(); kept so parameter set and init order are unchanged.
        self.cnn_out = nn.Conv1d(mlp_hid_size, 1, kernel_size=1, stride=1, padding=0)

        self.GCN0 = GCNConv(dmodel, gcn_hid_size)
        self.GCN1 = GCNConv(gcn_hid_size, dmodel)

    def rel_pyg(self, node):
        """Return a ``[2, E]`` long tensor of all pairs ``(node[i], node[j])`` with ``i < j``.

        The result is placed on ``self.device``.
        """
        rec2edge_index = []
        send2edge_index = []
        for i in range(len(node)):
            for j in range(i + 1, len(node)):
                rec2edge_index.append(node[i])
                send2edge_index.append(node[j])
        rec2edge_index = torch.as_tensor(rec2edge_index, dtype=torch.long).to(self.device)
        send2edge_index = torch.as_tensor(send2edge_index, dtype=torch.long).to(self.device)
        return torch.stack([rec2edge_index, send2edge_index])

    def gumbel_softmax(self, x, axis=1):
        """Softmax over ``x`` after swapping ``axis`` with dimension 0.

        Despite the name, no Gumbel noise is added. The softmax dimension is
        now explicit and reproduces what the former implicit-``dim`` call
        chose (dim 0 for 0-, 1- and 3-D input, dim 1 otherwise). For the
        ``[E, 1]`` scores produced in ``forward`` this normalises across all
        edges.
        """
        trans_input = x.transpose(axis, 0).contiguous()
        legacy_dim = 0 if trans_input.dim() in (0, 1, 3) else 1
        soft_max_1d = f.softmax(trans_input, dim=legacy_dim)
        return soft_max_1d.transpose(axis, 0)

    def node2edge(self, x, index):
        """Build pairwise edge features by concatenating both endpoint rows of ``x``.

        Returns:
            ``(edge_x, edge_index)`` with ``edge_index`` on ``x``'s device.
        """
        # Follow the node features' device instead of the hard-coded one.
        edge_index = self.rel_pyg(index).to(x.device)
        edge_x = torch.cat([x[edge_index[0]], x[edge_index[1]]], -1)
        return edge_x, edge_index

    def forward(self, x, index):
        """Refine node features ``x`` of the components listed in ``index``.

        Args:
            x: Node features, one row per entry of ``index``.
            index: Component ids (row positions in the padded matrix).

        Returns:
            The refined rows ``out[index]``.
        """
        # Scatter present components into a zero matrix of all components.
        # The buffer is a plain (non-leaf-with-grad) tensor on x's device and
        # dtype: the old `torch.zeros(..., requires_grad=True).to(device)`
        # stayed a leaf on CPU, so the in-place write below raised
        # "a leaf Variable that requires grad is being used in an in-place
        # operation" during training. Gradients still reach `x` through the
        # indexed assignment.
        padding_x = x.new_zeros(self.com_num, self.dmodel)
        padding_x[index] = x
        edge_x, edge_index = self.node2edge(padding_x, index)

        # (c):MLP+CNN
        edge_x = self.edge_mlp(edge_x)
        # edge_x = edge_x.unsqueeze(0).permute(0, 2, 1)
        # edge_x = self.cnn_out(edge_x)
        # edge_x = edge_x.permute(0, 2, 1).squeeze(0)
        edge_x = self.mlp_out(edge_x)

        edge_weight = self.gumbel_softmax(edge_x)

        out = f.relu(self.GCN0(padding_x, edge_index, edge_weight))
        out = f.dropout(out, self.drop, training=self.training)
        out = self.GCN1(out, edge_index, edge_weight)

        return out[index]

In [5]:
class Model(nn.Module):
    """Full model: feature fusion, sequence and per-component LSTMs, relation encoder, attention pooling, classifier.

    Args:
        input_size: Width of one event embedding (``FTEncoder``'s ``sen_size``).
        com_num: Number of components, forwarded to ``IREncoder``.
        hidden_size: Five values unpacked as
            ``(ft_hid, lstm_hid, mlp_hid, gcn_hid, out_hid)``.
        alpha: Forwarded to ``FTEncoder``.
        pattern: Forwarded to ``FTEncoder``.
        num_layers: Depth of the LSTM encoders.
        num_keys: Number of output classes.
        drop: Dropout probability forwarded to ``IREncoder``.
        training: Forwarded to ``IREncoder``.

    Note:
        ``u_att`` is now an ``nn.Parameter``. It used to be a plain tensor, so
        it was never returned by ``parameters()`` (never optimised), never
        stored in ``state_dict()`` and not moved by ``.to()``. Checkpoints
        saved before this change lack the ``u_att`` key; load them with
        ``strict=False``.
    """

    def __init__(self, input_size, com_num, hidden_size, alpha, pattern,
                 num_layers, num_keys, drop = 0.1, training = True):
        super(Model, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        ft_hid_size, lstm_hid_size, mlp_hid_size, gcn_hid_size, out_hid_size = hidden_size # unzip
        self.lstm_hid_size = lstm_hid_size
        self.com_num = com_num
        # (a)
        self.ftencoder = FTEncoder(input_size, ft_hid_size, alpha, pattern)
        # (b)
        self.lstm0 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        # self.lstm1 = LSTMEncoder(input_size, hidden_size, num_layers)
        self.lstm2 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        # (c)
        self.irencoder = IREncoder(lstm_hid_size, mlp_hid_size, gcn_hid_size, drop, com_num, training)
        # (d)
        self.att_fc = nn.Linear(lstm_hid_size, lstm_hid_size)
        self.fc1 = nn.Linear(2*lstm_hid_size, out_hid_size)
        self.fc2 = nn.Linear(out_hid_size, num_keys)

        # Trainable attention query vector (registered so the optimiser sees it).
        self.u_att = nn.Parameter(torch.zeros(1, lstm_hid_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the attention query vector."""
        nn.init.xavier_uniform_(self.u_att, gain=nn.init.calculate_gain('relu'))
        print('Variabel inited.')

    def resolve(self, per_x, per_index):
        """Group the rows of ``per_x`` by their component id.

        Args:
            per_x: Per-event features of one sample; iterated along dim 0.
            per_index: Tensor of component ids, one per row of ``per_x``.

        Returns:
            ``OrderedDict`` mapping component id (Python int) to the list of
            its rows, in order of first appearance.
        """
        res = collections.OrderedDict()
        # One .tolist() instead of one .item() per row. Rows are already on
        # the same device as the model output they are sliced from.
        for row, key in zip(per_x, per_index.tolist()):
            res.setdefault(key, []).append(row)
        return res

    def attention_net(self, x): # [16, 8, 512] [1, 512]
        """Attention-pool ``x`` over dim 1 with ``u_att``, then ``relu(att_fc(.))``."""
        sequence_len = x.shape[1]
        re_x = x.reshape(-1, self.lstm_hid_size)
        re_x = torch.mm(re_x, self.u_att.T).reshape(-1, sequence_len)
        re_x = f.softmax(re_x, dim=1).unsqueeze(-1)

        x = torch.sum(x * re_x, 1)
        x = f.relu(self.att_fc(x))
        return x

    def forward(self, x, index, q_x, t_x):
        """Return class logits for a batch of event windows.

        Args:
            x: Event embeddings of the batch.
            index: Component id of every event; ``index[i]`` belongs to ``x[i]``.
            q_x: Unused; kept for interface compatibility.
            t_x: Time feature of every event.
        """
        # (a): feature extra
        x = self.ftencoder((x, t_x))
        # (b): lstm layer
        batch_as_x = self.lstm0(x)
        # out1 = self.lstm1(q_x.unsqueeze(1))

        batch_ac_x = []
        for idx in range(x.shape[0]):
            res = self.resolve(x[idx], index[idx])
            # One encoding per component present in this sample.
            ac_x = torch.stack([
                self.lstm2(torch.stack(rows).unsqueeze(0)).squeeze(0)
                for rows in res.values()
            ])

            # (c): relation encoding needs at least two components.
            if ac_x.shape[0] != 1:
                ac_x = self.irencoder(ac_x, list(res.keys()))

            # (d): attention pooling
            ac_x = self.attention_net(ac_x.unsqueeze(0)) # add batch
            batch_ac_x.append(ac_x)

        batch_ac_x = torch.stack(batch_ac_x).squeeze(1)
        multi_out = torch.cat((batch_as_x, batch_ac_x), -1)
        out = f.relu(self.fc1(multi_out))
        out = self.fc2(out)
        return out

In [5]:
class Model_wo_ic(nn.Module):
    """Ablation of the full model that skips the relation encoder in ``forward``.

    The ``IREncoder`` is still constructed (so the parameter set is
    unchanged), but per-component encodings go straight to attention pooling.

    Args:
        input_size: Width of one event embedding (``FTEncoder``'s ``sen_size``).
        com_num: Number of components, forwarded to ``IREncoder``.
        hidden_size: Five values unpacked as
            ``(ft_hid, lstm_hid, mlp_hid, gcn_hid, out_hid)``.
        alpha: Forwarded to ``FTEncoder``.
        pattern: Forwarded to ``FTEncoder``.
        num_layers: Depth of the LSTM encoders.
        num_keys: Number of output classes.
        drop: Dropout probability forwarded to ``IREncoder``.
        training: Forwarded to ``IREncoder``.

    Note:
        ``u_att`` is now an ``nn.Parameter``. It used to be a plain tensor, so
        it was never optimised, never stored in ``state_dict()`` and not moved
        by ``.to()``. Checkpoints saved before this change lack the ``u_att``
        key; load them with ``strict=False``.
    """

    def __init__(self, input_size, com_num, hidden_size, alpha, pattern,
                 num_layers, num_keys, drop = 0.1, training = True):
        super(Model_wo_ic, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        ft_hid_size, lstm_hid_size, mlp_hid_size, gcn_hid_size, out_hid_size = hidden_size # unzip
        self.lstm_hid_size = lstm_hid_size
        self.com_num = com_num
        # (a)
        self.ftencoder = FTEncoder(input_size, ft_hid_size, alpha, pattern)
        # (b)
        self.lstm0 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        # self.lstm1 = LSTMEncoder(input_size, hidden_size, num_layers)
        self.lstm2 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        # (c) built but not used by forward() in this ablation
        self.irencoder = IREncoder(lstm_hid_size, mlp_hid_size, gcn_hid_size, drop, com_num, training)
        # (d)
        self.att_fc = nn.Linear(lstm_hid_size, lstm_hid_size)
        self.fc1 = nn.Linear(2*lstm_hid_size, out_hid_size)
        self.fc2 = nn.Linear(out_hid_size, num_keys)

        # Trainable attention query vector (registered so the optimiser sees it).
        self.u_att = nn.Parameter(torch.zeros(1, lstm_hid_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the attention query vector."""
        nn.init.xavier_uniform_(self.u_att, gain=nn.init.calculate_gain('relu'))
        print('Variabel inited.')

    def resolve(self, per_x, per_index):
        """Group the rows of ``per_x`` by their component id.

        Returns:
            ``OrderedDict`` mapping component id (Python int) to the list of
            its rows, in order of first appearance.
        """
        res = collections.OrderedDict()
        # One .tolist() instead of one .item() per row.
        for row, key in zip(per_x, per_index.tolist()):
            res.setdefault(key, []).append(row)
        return res

    def attention_net(self, x): # [16, 8, 512] [1, 512]
        """Attention-pool ``x`` over dim 1 with ``u_att``, then ``relu(att_fc(.))``."""
        sequence_len = x.shape[1]
        re_x = x.reshape(-1, self.lstm_hid_size)
        re_x = torch.mm(re_x, self.u_att.T).reshape(-1, sequence_len)
        re_x = f.softmax(re_x, dim=1).unsqueeze(-1)

        x = torch.sum(x * re_x, 1)
        x = f.relu(self.att_fc(x))
        return x

    def forward(self, x, index, q_x, t_x):
        """Return class logits for a batch of event windows.

        Args:
            x: Event embeddings of the batch.
            index: Component id of every event; ``index[i]`` belongs to ``x[i]``.
            q_x: Unused; kept for interface compatibility.
            t_x: Time feature of every event.
        """
        # (a): feature extra
        x = self.ftencoder((x, t_x))
        # (b): lstm layer
        batch_as_x = self.lstm0(x)
        # out1 = self.lstm1(q_x.unsqueeze(1))

        batch_ac_x = []
        for idx in range(x.shape[0]):
            res = self.resolve(x[idx], index[idx])
            # One encoding per component present in this sample.
            ac_x = torch.stack([
                self.lstm2(torch.stack(rows).unsqueeze(0)).squeeze(0)
                for rows in res.values()
            ])

            # (c): relation encoder intentionally skipped in this ablation.

            # (d): attention pooling
            ac_x = self.attention_net(ac_x.unsqueeze(0)) # add batch
            batch_ac_x.append(ac_x)

        batch_ac_x = torch.stack(batch_ac_x).squeeze(1)
        multi_out = torch.cat((batch_as_x, batch_ac_x), -1)
        out = f.relu(self.fc1(multi_out))
        out = self.fc2(out)
        return out

In [6]:
class Model_noS(nn.Module):
    """Variant of the full model with one separate LSTM encoder per component id.

    Args:
        input_size: Width of one event embedding (``FTEncoder``'s ``sen_size``).
        com_num: Number of components; one LSTM encoder is created for each
            id in ``range(com_num)``.
        hidden_size: Five values unpacked as
            ``(ft_hid, lstm_hid, mlp_hid, gcn_hid, out_hid)``.
        alpha: Forwarded to ``FTEncoder``.
        pattern: Forwarded to ``FTEncoder``.
        num_layers: Depth of the LSTM encoders.
        num_keys: Number of output classes.
        drop: Dropout probability forwarded to ``IREncoder``.
        training: Forwarded to ``IREncoder``.

    Note:
        Two registration bugs are fixed here. The per-component encoders used
        to live in a plain ``dict`` and ``u_att`` was a plain tensor, so none
        of them appeared in ``parameters()`` / ``state_dict()``, were trained,
        or followed ``.to()`` / ``train()`` / ``eval()``. They are now an
        ``nn.ModuleList`` (still indexed by integer id) and an
        ``nn.Parameter``. Checkpoints saved before this change lack these
        keys; load them with ``strict=False``.
    """

    def __init__(self, input_size, com_num, hidden_size, alpha, pattern,
                 num_layers, num_keys, drop = 0.1, training = True):
        super(Model_noS, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        ft_hid_size, lstm_hid_size, mlp_hid_size, gcn_hid_size, out_hid_size = hidden_size # unzip
        self.lstm_hid_size = lstm_hid_size
        self.com_num = com_num
        # (a)
        self.ftencoder = FTEncoder(input_size, ft_hid_size, alpha, pattern)
        # (b)
        self.lstm0 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        # self.lstm1 = LSTMEncoder(input_size, hidden_size, num_layers)
        # Registered container: position cn holds the encoder of component cn.
        self.lstm2 = nn.ModuleList(
            [LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers) for _ in range(com_num)]
        )
        # (c)
        self.irencoder = IREncoder(lstm_hid_size, mlp_hid_size, gcn_hid_size, drop, com_num, training)
        # (d)
        self.att_fc = nn.Linear(lstm_hid_size, lstm_hid_size)
        self.fc1 = nn.Linear(2*lstm_hid_size, out_hid_size)
        self.fc2 = nn.Linear(out_hid_size, num_keys)

        # Trainable attention query vector (registered so the optimiser sees it).
        self.u_att = nn.Parameter(torch.zeros(1, lstm_hid_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the attention query vector."""
        nn.init.xavier_uniform_(self.u_att, gain=nn.init.calculate_gain('relu'))
        print('Variabel inited.')

    def resolve(self, per_x, per_index):
        """Group the rows of ``per_x`` by their component id.

        Returns:
            ``OrderedDict`` mapping component id (Python int) to the list of
            its rows, in order of first appearance.
        """
        res = collections.OrderedDict()
        # One .tolist() instead of one .item() per row.
        for row, key in zip(per_x, per_index.tolist()):
            res.setdefault(key, []).append(row)
        return res

    def attention_net(self, x): # [16, 8, 512] [1, 512]
        """Attention-pool ``x`` over dim 1 with ``u_att``, then ``relu(att_fc(.))``."""
        sequence_len = x.shape[1]
        re_x = x.reshape(-1, self.lstm_hid_size)
        re_x = torch.mm(re_x, self.u_att.T).reshape(-1, sequence_len)
        re_x = f.softmax(re_x, dim=1).unsqueeze(-1)

        x = torch.sum(x * re_x, 1)
        x = f.relu(self.att_fc(x))
        return x

    def forward(self, x, index, q_x, t_x):
        """Return class logits for a batch of event windows.

        Args:
            x: Event embeddings of the batch.
            index: Component id of every event; ``index[i]`` belongs to ``x[i]``.
                Ids must lie in ``range(com_num)``.
            q_x: Unused; kept for interface compatibility.
            t_x: Time feature of every event.
        """
        # (a): feature extra
        x = self.ftencoder((x, t_x))
        # (b): lstm layer
        batch_as_x = self.lstm0(x)
        # out1 = self.lstm1(q_x.unsqueeze(1))

        batch_ac_x = []
        for idx in range(x.shape[0]):
            res = self.resolve(x[idx], index[idx])
            # Each component is encoded by its own LSTM.
            ac_x = torch.stack([
                self.lstm2[com_id](torch.stack(rows).unsqueeze(0)).squeeze(0)
                for com_id, rows in res.items()
            ])

            # (c): relation encoding needs at least two components.
            if ac_x.shape[0] != 1:
                ac_x = self.irencoder(ac_x, list(res.keys()))

            # (d): attention pooling
            ac_x = self.attention_net(ac_x.unsqueeze(0)) # add batch
            batch_ac_x.append(ac_x)

        batch_ac_x = torch.stack(batch_ac_x).squeeze(1)
        multi_out = torch.cat((batch_as_x, batch_ac_x), -1)
        out = f.relu(self.fc1(multi_out))
        out = self.fc2(out)
        return out

In [7]:
class Model_wo_LSTM(nn.Module):
    """Ablation that replaces both LSTM encoders with mean pooling plus one shared linear layer.

    Args:
        input_size: Width of one event embedding (``FTEncoder``'s ``sen_size``).
        com_num: Number of components, forwarded to ``IREncoder``.
        hidden_size: Five values unpacked as
            ``(ft_hid, lstm_hid, mlp_hid, gcn_hid, out_hid)``.
        alpha: Forwarded to ``FTEncoder``.
        pattern: Forwarded to ``FTEncoder``.
        num_layers: Unused in this ablation; kept for interface compatibility.
        num_keys: Number of output classes.
        drop: Dropout probability forwarded to ``IREncoder``.
        training: Forwarded to ``IREncoder``.

    Note:
        ``u_att`` is now an ``nn.Parameter``. It used to be a plain tensor, so
        it was never optimised, never stored in ``state_dict()`` and not moved
        by ``.to()``. Checkpoints saved before this change lack the ``u_att``
        key; load them with ``strict=False``.
    """

    def __init__(self, input_size, com_num, hidden_size, alpha, pattern,
                 num_layers, num_keys, drop = 0.1, training = True):
        super(Model_wo_LSTM, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        ft_hid_size, lstm_hid_size, mlp_hid_size, gcn_hid_size, out_hid_size = hidden_size # unzip
        self.lstm_hid_size = lstm_hid_size
        self.com_num = com_num
        # (a)
        self.ftencoder = FTEncoder(input_size, ft_hid_size, alpha, pattern)
        # (b)
        # self.lstm0 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        # # self.lstm1 = LSTMEncoder(input_size, hidden_size, num_layers)
        # self.lstm2 = LSTMEncoder(ft_hid_size, lstm_hid_size, num_layers)
        self.lstm2nn = nn.Linear(ft_hid_size, lstm_hid_size)
        # (c)
        self.irencoder = IREncoder(lstm_hid_size, mlp_hid_size, gcn_hid_size, drop, com_num, training)
        # (d)
        self.att_fc = nn.Linear(lstm_hid_size, lstm_hid_size)
        self.fc1 = nn.Linear(2*lstm_hid_size, out_hid_size)
        self.fc2 = nn.Linear(out_hid_size, num_keys)

        # Trainable attention query vector (registered so the optimiser sees it).
        self.u_att = nn.Parameter(torch.zeros(1, lstm_hid_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the attention query vector."""
        nn.init.xavier_uniform_(self.u_att, gain=nn.init.calculate_gain('relu'))
        print('Variabel inited.')

    def resolve(self, per_x, per_index):
        """Group the rows of ``per_x`` by their component id.

        Returns:
            ``OrderedDict`` mapping component id (Python int) to the list of
            its rows, in order of first appearance.
        """
        res = collections.OrderedDict()
        # One .tolist() instead of one .item() per row.
        for row, key in zip(per_x, per_index.tolist()):
            res.setdefault(key, []).append(row)
        return res

    def attention_net(self, x): # [16, 8, 512] [1, 512]
        """Attention-pool ``x`` over dim 1 with ``u_att``, then ``relu(att_fc(.))``."""
        sequence_len = x.shape[1]
        re_x = x.reshape(-1, self.lstm_hid_size)
        re_x = torch.mm(re_x, self.u_att.T).reshape(-1, sequence_len)
        re_x = f.softmax(re_x, dim=1).unsqueeze(-1)

        x = torch.sum(x * re_x, 1)
        x = f.relu(self.att_fc(x))
        return x

    def forward(self, x, index, q_x, t_x):
        """Return class logits for a batch of event windows.

        Args:
            x: Event embeddings of the batch.
            index: Component id of every event; ``index[i]`` belongs to ``x[i]``.
            q_x: Unused; kept for interface compatibility.
            t_x: Time feature of every event.
        """
        # (a): feature extra
        x = self.ftencoder((x, t_x))
        # (b): mean over dim 1 followed by the shared linear layer
        batch_as_x = self.lstm2nn(torch.mean(x, 1))
        # out1 = self.lstm1(q_x.unsqueeze(1))

        batch_ac_x = []
        for idx in range(x.shape[0]):
            res = self.resolve(x[idx], index[idx])
            # One mean-pooled, projected vector per component in this sample.
            ac_x = torch.stack([
                self.lstm2nn(torch.mean(torch.stack(rows).unsqueeze(0), 1)).squeeze(0)
                for rows in res.values()
            ])

            # (c): relation encoding needs at least two components.
            if ac_x.shape[0] != 1:
                ac_x = self.irencoder(ac_x, list(res.keys()))

            # (d): attention pooling
            ac_x = self.attention_net(ac_x.unsqueeze(0)) # add batch
            batch_ac_x.append(ac_x)

        batch_ac_x = torch.stack(batch_ac_x).squeeze(1)
        multi_out = torch.cat((batch_as_x, batch_ac_x), -1)
        out = f.relu(self.fc1(multi_out))
        out = self.fc2(out)
        return out

In [6]:
def getPath(name, train_name, emb_pattern = 'sen'):
    """Build the file paths used by the data-generation helpers.

    Args:
        name: Prefix for the template, embedding and component files.
        train_name: Directory prefix for the train / test CSV splits.
        emb_pattern: ``'sen'`` selects ``_sentences_emb.json``, ``'one'``
            selects ``_one_hot.json``.

    Returns:
        Tuple ``(train, val_normal, val_anomaly, templates, embedding,
        components, test_normal, test_anomaly)``. The validation paths point
        at the same files as the test paths.
    """
    assert emb_pattern in ['one', 'sen'], 'emb_pattern just in one_hot or sentences'
    print('Load name: {} and used_name: {}'.format(name, train_name))

    def in_split_dir(filename):
        return train_name + '/' + filename

    normal_csv = in_split_dir('test_normal.csv')
    anomaly_csv = in_split_dir('test_anomaly.csv')
    emb_suffix = '_sentences_emb.json' if emb_pattern == 'sen' else '_one_hot.json'

    return (
        in_split_dir('train_normal.csv'),
        normal_csv,                      # validation (normal) reuses the test split
        anomaly_csv,                     # validation (anomaly) reuses the test split
        name + '.log_templates.csv',
        name + emb_suffix,
        name + '_component.json',
        normal_csv,
        anomaly_csv,
    )

def getDateTimeFromISO8601String(s):
    """Parse timestamp string ``s`` with ``dateutil`` (year-first) and return the result."""
    return dateutil.parser.parse(s, yearfirst=True)

def generate_train(name, train_path, logTemp_path, encoder_path, com_path, window_size):
    """Build the sliding-window training dataset.

    Every row of the training CSV holds an ``EventSequence`` string that
    evaluates to a sequence of ``(event_id, component, timestamp)`` triples.
    Each run of ``window_size`` consecutive events becomes one sample whose
    label is the template index of the event that follows it.

    Args:
        name: Label used only in the printed summary.
        train_path: CSV with an ``EventSequence`` column.
        logTemp_path: Template CSV indexed by ``EventId``.
        encoder_path: JSON file mapping event id to its embedding vector.
        com_path: JSON file mapping component name to its integer id.
        window_size: Number of events per input window.

    Returns:
        ``(dataset, emb_width, num_templates, num_components)`` where
        ``dataset`` is a ``TensorDataset`` of
        ``(embeddings, component ids, template counts, time offsets, label)``.

    Raises:
        ValueError: If no sequence is longer than ``window_size`` (previously
            this surfaced as an opaque ``IndexError``).
    """
    train_datas = pd.read_csv(train_path, engine='c', na_filter=False, memory_map=True)
    logTemp = pd.read_csv(logTemp_path, index_col='EventId', engine='c', na_filter=False, memory_map=True)
    event_ids = logTemp.index.unique()
    mapping = {index: i for i, index in enumerate(event_ids)}
    num_keys = len(event_ids)
    # Context managers so the JSON file handles are closed deterministically.
    with open(encoder_path, 'r') as emb_file:
        emb = json.load(emb_file)
    with open(com_path, 'r') as com_file:
        cop = json.load(com_file)

    # slide sample
    inputs, outputs = [], []
    for raw_seq in train_datas['EventSequence']:
        # SECURITY: eval() executes arbitrary code found in the CSV. Only use
        # trusted files; ast.literal_eval would be the safe choice if the
        # column holds plain literals.
        seqs = eval(raw_seq)
        for i in range(len(seqs) - window_size):
            inputs.append(seqs[i:i + window_size])
            outputs.append(mapping[seqs[i + window_size][0]])

    if not inputs:
        raise ValueError(
            'no training window could be built from {}: every sequence has at most '
            'window_size={} events'.format(train_path, window_size)
        )

    # encoder
    inputs_encoded, coms_encoded, quans_encoded, time_encoded = [], [], [], []
    for events in inputs:
        # quan encoder: occurrence count of every template inside the window
        quan_pattern = [0] * num_keys
        for key, count in Counter(mapping[event] for event, _, _ in events).items():
            quan_pattern[key] = count
        quans_encoded.append(quan_pattern)

        inp, com, tm = [], [], []
        start_time = getDateTimeFromISO8601String(events[0][2])
        for event, component, stamp in events:
            cur_time = getDateTimeFromISO8601String(stamp)
            inp.append(emb[event])
            com.append(cop[component])
            # total_seconds() covers whole days and negative deltas;
            # timedelta.seconds is only the 0..86399 seconds component.
            tm.append(int((cur_time - start_time).total_seconds()))

        inputs_encoded.append(inp)
        coms_encoded.append(com)
        time_encoded.append(tm)

    dataset = TensorDataset(
        torch.as_tensor(inputs_encoded, dtype=torch.float),
        torch.as_tensor(coms_encoded),
        torch.as_tensor(quans_encoded, dtype=torch.float),
        torch.as_tensor(time_encoded, dtype=torch.float),
        torch.as_tensor(outputs),
    )

    print('Number of {}_seqs: {}, components: {}'.format(name, len(dataset), len(cop)))

    return dataset, len(inputs_encoded[0][0]), num_keys, len(cop)

def generate_pre(name, log_path, logTemp_path, encoder_path, com_path, window_size):
    """Build per-session sliding-window samples for validation / testing.

    Every CSV row (one session) holds an ``EventSequence`` string that
    evaluates to a sequence of ``(event_id, component, timestamp)`` triples.
    All windows of a session are kept together so a session can be judged as
    a whole.

    Args:
        name: Label used only in the printed summary.
        log_path: CSV with an ``EventSequence`` column.
        logTemp_path: Template CSV indexed by ``EventId``.
        encoder_path: JSON file mapping event id to its embedding vector.
        com_path: JSON file mapping component name to its integer id.
        window_size: Number of events per input window.

    Returns:
        ``(inputs, num_sessions, num_components)``. ``inputs`` has one
        ``(embeddings, component ids, template counts, time offsets, labels)``
        tuple of plain lists per session that yields at least one window.
    """
    pre_data = pd.read_csv(log_path, engine='c', na_filter=False, memory_map=True)
    logTemp = pd.read_csv(logTemp_path, index_col='EventId', engine='c', na_filter=False, memory_map=True)
    event_ids = logTemp.index.unique()
    mapping = {index: i for i, index in enumerate(event_ids)}
    num_keys = len(event_ids)
    # Context managers so the JSON file handles are closed deterministically.
    with open(encoder_path, 'r') as emb_file:
        emb = json.load(emb_file)
    emb_len = len(list(emb.items())[0][1])
    with open(com_path, 'r') as com_file:
        cop = json.load(com_file)

    # session sample
    inputs = []
    for raw_seq in pre_data['EventSequence']:
        # SECURITY: eval() executes arbitrary code found in the CSV. Only use
        # trusted files; ast.literal_eval would be the safe choice if the
        # column holds plain literals.
        seqs = eval(raw_seq)
        # seqs = seqs + [-1]*(window_size + 1 - len(seqs)) # is padding?
        inp, comp, quanp, timep, lab = [], [], [], [], []
        for i in range(len(seqs) - window_size):
            window = seqs[i:i + window_size]
            target = seqs[i + window_size]

            # quan encoder: occurrence count of every template inside the window
            quan_pattern = [0] * num_keys
            for key, count in Counter(mapping[event] for event, _, _ in window).items():
                quan_pattern[key] = count
            quanp.append(quan_pattern)

            seq, com, tm = [], [], []
            start_time = getDateTimeFromISO8601String(window[0][2])
            for event in window:
                if event[0] == -1:
                    # Placeholder entry (only relevant if padding is re-enabled).
                    seq.append([-1]*emb_len)
                    com.append(-1)
                    tm.append(-1)
                else:
                    cur_time = getDateTimeFromISO8601String(event[2])
                    seq.append(emb[event[0]])
                    com.append(cop[event[1]])
                    # total_seconds() covers whole days and negative deltas;
                    # timedelta.seconds is only the 0..86399 seconds component.
                    tm.append(int((cur_time - start_time).total_seconds()))

            inp.append(seq)
            comp.append(com)
            timep.append(tm)
            lab.append(mapping[target[0]] if target != -1 else -1)
        if inp:
            inputs.append((inp, comp, quanp, timep, lab))

    print('Number of {}_seqs(session): {}'.format(name, len(inputs)))
    return inputs, len(inputs), len(cop)

In [7]:
def evaluation(output, label, valid_loss, pattern='macro'):
    """Score predictions against labels.

    Args:
        output: Predicted labels.
        label: Ground-truth labels.
        valid_loss: Loss value(s); averaged with ``np.average``.
        pattern: ``average`` mode for ``precision_recall_fscore_support``.

    Returns:
        ``(accuracy, precision, recall, F1, mean_loss)``.
    """
    acc = accuracy_score(label, output)
    prf = precision_recall_fscore_support(label, output, average=pattern)
    # prf is (precision, recall, fscore, support); support is dropped.
    return (acc, prf[0], prf[1], prf[2], np.average(valid_loss))

def eval_handle(nordl, anodl, model, window_size, num_candidates, anomaly_rate = 1):
    """Session-level anomaly detection with top-``num_candidates`` next-event prediction.

    A window is a miss when its true next event is not among the
    ``num_candidates`` highest-scored classes predicted for that window. A
    session is flagged anomalous (1) when it has at least ``anomaly_rate``
    misses.

    Args:
        nordl: Iterable of normal sessions, each
            ``(seq, com, quan, timp, label)``.
        anodl: Iterable of anomalous sessions in the same format.
        model: Module called as ``model(seq, com, quan, timp)``; expected to
            return one score row per window.
        window_size: Unused; kept for interface compatibility.
        num_candidates: Size of the top-k candidate set.
        anomaly_rate: Minimum number of misses that flags a session.

    Returns:
        Result of ``evaluation`` on the session flags, with normal sessions
        labelled 0 and anomalous ones 1. The loss slot is NaN because no loss
        is computed here.
    """
    nor_hit, ano_hit = [], []
    total_loss = []  # never filled: loss computation is disabled below
    with torch.no_grad():
        model.eval()
        # Same procedure for both loaders: normal sessions first, then anomalous.
        for loader, hits in ((nordl, nor_hit), (anodl, ano_hit)):
            for seq, com, quan, timp, label in loader:
                assert len(seq) == len(label), 'seqs len not equal labels len, please check the generate'
                seq = torch.as_tensor(seq, dtype=torch.float).to(device)
                quan = torch.as_tensor(quan, dtype=torch.float).to(device)
                com = torch.as_tensor(com).to(device)
                timp = torch.as_tensor(timp, dtype=torch.float).to(device)
                label = torch.as_tensor(label).to(device)

                output = model(seq, com, quan, timp).to(device)
                # loss = criterion(output, label)
                # total_loss.append(loss.item())

                indice = torch.argsort(output, 1, descending=True)[:, 0:num_candidates]
                # Row-wise membership: each label is compared with the top-k
                # of its OWN window. torch.isin(label, indice) tested against
                # the candidates of every window in the session, which hid
                # misses.
                in_topk = (indice == label.reshape(-1, 1)).any(dim=1)
                fcnt = (~in_topk).sum().item()
                hits.append(1 if fcnt >= anomaly_rate else 0)

    nor_label = [0]*len(nor_hit)
    ano_label = [1]*len(ano_hit)

    # np.average([]) would also give NaN, but with a RuntimeWarning.
    mean_loss = np.average(total_loss) if total_loss else float('nan')
    return evaluation(nor_hit + ano_hit, nor_label + ano_label, mean_loss)

def eval_handle_topK(nordl, anodl, model, window_size, num_candidates, anomaly_rate = 1):
    """Session-level anomaly detection evaluated for several top-k sizes at once.

    For every ``k`` in ``num_candidates`` a window is a miss when its true
    next event is not among the ``k`` highest-scored classes predicted for
    that window. A session is flagged anomalous (1) when it has at least
    ``anomaly_rate`` misses.

    Args:
        nordl: Iterable of normal sessions, each
            ``(seq, com, quan, timp, label)``.
        anodl: Iterable of anomalous sessions in the same format.
        model: Module called as ``model(seq, com, quan, timp)``; expected to
            return one score row per window.
        window_size: Unused; kept for interface compatibility.
        num_candidates: Sequence of top-k sizes to evaluate.
        anomaly_rate: Minimum number of misses that flags a session.

    Returns:
        Dict mapping each ``k`` to a one-element list holding the tuple
        returned by ``evaluation`` (the loss slot is NaN).
    """
    # Pre-seed every k so an empty loader no longer causes a KeyError below.
    nor_hit = {num_can: [] for num_can in num_candidates}
    ano_hit = {num_can: [] for num_can in num_candidates}
    total_loss = []  # never filled: loss computation is disabled below
    with torch.no_grad():
        model.eval()
        # Same procedure for both loaders: normal sessions first, then anomalous.
        for loader, hits in ((nordl, nor_hit), (anodl, ano_hit)):
            for seq, com, quan, timp, label in loader:
                assert len(seq) == len(label), 'seqs len not equal labels len, please check the generate'
                seq = torch.as_tensor(seq, dtype=torch.float).to(device)
                quan = torch.as_tensor(quan, dtype=torch.float).to(device)
                com = torch.as_tensor(com).to(device)
                timp = torch.as_tensor(timp, dtype=torch.float).to(device)
                label = torch.as_tensor(label).to(device)

                output = model(seq, com, quan, timp).to(device)
                # loss = criterion(output, label)
                # total_loss.append(loss.item())

                # Rank once per session; every k is a prefix of this ranking.
                ranked = torch.argsort(output, 1, descending=True)
                label_col = label.reshape(-1, 1)
                for num_can in num_candidates:
                    indice = ranked[:, 0:num_can]
                    # Row-wise membership: each label is compared with the
                    # top-k of its OWN window. torch.isin(label, indice)
                    # tested against the candidates of every window in the
                    # session, which hid misses.
                    in_topk = (indice == label_col).any(dim=1)
                    fcnt = (~in_topk).sum().item()
                    hits[num_can].append(1 if fcnt >= anomaly_rate else 0)

    # np.average([]) would also give NaN, but with a RuntimeWarning.
    mean_loss = np.average(total_loss) if total_loss else float('nan')

    res = dict()
    for num_can in num_candidates:
        nor_label = [0]*len(nor_hit[num_can])
        ano_label = [1]*len(ano_hit[num_can])
        res.setdefault(num_can, []).append(
            evaluation(nor_hit[num_can] + ano_hit[num_can], nor_label + ano_label, mean_loss)
        )
    return res

def eval_mulClass(nordl, anodl, model, window_size, num_candidates, anomaly_rate = 1):
    """Gather top-k class predictions from both loaders and hand them to ``evaluation``.

    Args:
        nordl: iterable of (seq, com, quan, timp, label) batches; processed first.
        anodl: iterable of batches with the same layout; processed second.
        model: callable invoked as ``model(seq, com, quan, timp)``; its output is
            sorted along dim 1.
        window_size: not used by this function.
        num_candidates: upper bound of the column slice taken from the sorted
            indices, i.e. how many top-ranked indices are kept per row.
        anomaly_rate: not used by this function.

    Returns:
        The value returned by ``evaluation(predictions, labels, mean_loss)``.
        No loss is ever recorded here, so ``mean_loss`` is the average of an
        empty list (NaN).
    """
    ranked_batches = []
    label_batches = []
    losses = []  # intentionally left empty: the loss computation is disabled

    model.eval()
    with torch.no_grad():
        # Both loaders go through the exact same pipeline, nordl first.
        for loader in (nordl, anodl):
            for seq, com, quan, timp, label in loader:
                assert len(seq) == len(label), 'seqs len not equal labels len, please check the generate'
                seq = torch.as_tensor(seq, dtype=torch.float).to(device)
                com = torch.as_tensor(com).to(device)
                quan = torch.as_tensor(quan, dtype=torch.float).to(device)
                timp = torch.as_tensor(timp, dtype=torch.float).to(device)
                label = torch.as_tensor(label).to(device)

                scores = model(seq, com, quan, timp).to(device)

                # Indices ordered from highest to lowest score, truncated to the top k.
                ranking = torch.argsort(scores, 1, descending=True)
                ranked_batches.append(ranking[:, 0:num_candidates])
                label_batches.append(label)

    all_ranked = torch.cat(ranked_batches).cpu()
    all_labels = torch.cat(label_batches).cpu()
    return evaluation(all_ranked, all_labels, np.average(losses))

In [8]:
# ---- optimisation ----
lr = 0.001
batch_size = 16
num_epochs = 2
patience = 20

# ---- model ----
# order: ft_hid_size, lstm_hid_size, mlp_hid_size, gcn_hid_size, out_hid_size
hidden_size = [64, 64, 64, 64, 64]
num_layers = 2
drop = 0.1
alpha = 0.8
pattern = 1

# ---- data windowing ----
# values listed by the author: 6,7,8,9,10,11,12,13,14
window_sizes = [9]

# ---- evaluation ----
num_candidates = [1]
anomaly_rate = 1

# ---- data path ----
name = '..'
used_name = '..'

In [1]:
# Load the training set and the two validation sets for every configured window size.
# Each pass rebinds the same names, so after the loop they hold the data of the
# last entry in window_sizes.
for window_size in window_sizes:
    (
        train_path,
        val_normal_path,
        val_anomaly_path,
        temp_path,
        emb_path,
        com_path,
        test_normal_path,
        test_anomaly_path,
    ) = getPath(name, used_name, 'sen')

    train_dataset, attr_num, class_num, com_num = generate_train(
        'HDFS_train', train_path, temp_path, emb_path, com_path, window_size
    )
    dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=False,
    )

    label_normal, normal_len, com_num = generate_pre(
        'HDFS_val_normal', val_normal_path, temp_path, emb_path, com_path, window_size
    )
    label_anomaly, anomal_len, com_num = generate_pre(
        'HDFS_val_anomaly', val_anomaly_path, temp_path, emb_path, com_path, window_size
    )

In [2]:
# Instantiate the network and move it to the selected device.
model = Model(
    attr_num, com_num, hidden_size, alpha, pattern,
    num_layers, class_num, drop, True,
).to(device)

# Count the model parameters.
total_params = sum(param.numel() for param in model.parameters())
print(f'{total_params:,} total parameters.')

# Loss and optimizer for this model instance.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

In [3]:
# Parameter summary: prints the table returned by fvcore's parameter_count_table for `model`.
# NOTE(review): FlopCountAnalysis is imported but never used in this cell, so no FLOPs
# are actually computed here.
from fvcore.nn import FlopCountAnalysis, parameter_count_table

print(parameter_count_table(model))

In [4]:
# Train the model for each configured window size, evaluate on the validation sets
# after every epoch, and keep a snapshot (`state`) of the epoch with the best F1.
import copy  # cell-local import: used to snapshot the best checkpoint below

for window_size in window_sizes:
    train_path, val_normal_path, val_anomaly_path, temp_path, emb_path, com_path, test_normal_path, test_anomaly_path = getPath(name, used_name, 'sen')
    train_dataset, attr_num, class_num, com_num = generate_train('HDFS_train', train_path, temp_path, emb_path, com_path, window_size)
    dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=False)

    label_normal, normal_len, com_num = generate_pre('HDFS_val_normal', val_normal_path, temp_path, emb_path, com_path, window_size)
    label_anomaly, anomal_len, com_num = generate_pre('HDFS_val_anomaly', val_anomaly_path, temp_path, emb_path, com_path, window_size)
    print('data loaded')
    modelLs = []

    modelLs.append(Model(attr_num, com_num, hidden_size, alpha, pattern, \
                              num_layers, class_num, drop, True).to(device))
    # modelLs.append(Model_wo_ic(attr_num, com_num, hidden_size, alpha, pattern, \
    #                           num_layers, class_num, drop, True).to(device))
    # modelLs.append(Model_wo_LSTM(attr_num, com_num, hidden_size, alpha, pattern, \
    #                           num_layers, class_num, drop, True).to(device))
    criterion = nn.CrossEntropyLoss()

    # Only the last model in modelLs is trained below; state this explicitly
    # instead of relying on a loop that merely rebinds `model` and `optimizer`.
    model = modelLs[-1]
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

    best_x, best_epoch = 0, 0

    for epoch in range(num_epochs):  # Loop over the dataset multiple times
        train_loss = []
        model.train()
        for step, (seq, com, quan, timp, label) in enumerate(dataloader):
            optimizer.zero_grad()
            # feature loaded
            seq = seq.clone().detach().to(device)
            com = com.clone().detach().to(device)
            quan = quan.clone().detach().to(device)
            timp = timp.clone().detach().to(device)

            output = model(seq, com, quan, timp).to(device)
            loss = criterion(output, label.to(device))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, np.average(train_loss)))
        # val model
        # accuracy, precision, recall, F1, val_loss = eval_handle(label_normal, label_anomaly, model, window_size, num_candidates[0], anomaly_rate)
        # accuracy, precision, recall, F1, val_loss = eval_mulClass(label_normal, label_anomaly, model, window_size, num_candidates, anomaly_rate)
        # print("Accuracy: {0:.3f}, Precision: {1:.3f}, Recall: {2:.3f}, F1-score: {3:.3f}".format(accuracy, precision, recall, F1))
        res = eval_handle_topK(label_normal, label_anomaly, model, window_size, num_candidates, anomaly_rate)
        for item in res.items():
            accuracy, precision, recall, F1, _ = item[1][0]
            print("TopK={0} | Accuracy: {1:.3f}, Precision: {2:.3f}, Recall: {3:.3f}, F1-score: {4:.3f}".format(item[0], accuracy, precision, recall, F1))
            # `<=` means a later epoch wins ties on F1.
            if best_x <= F1:
                best_x = F1
                best_epoch = epoch
                # Save the model. state_dict() returns references to the live
                # parameter / optimizer tensors, which later epochs update in
                # place; deep-copy so the snapshot really holds this epoch's values.
                state = {
                    'model': copy.deepcopy(model.state_dict()),
                    'optimizer': copy.deepcopy(optimizer.state_dict()),
                    'epoch': best_epoch,
                }
        # if patience != 0:
        #     early_stopping(val_loss, model)
        #     if early_stopping.early_stop:
        #         print("Early stopping")
        #         break

    print("best epoch:{} / best F1:{:.3f}".format(best_epoch + 1, best_x))

In [None]:
# Test: load the weights stored in `state` and evaluate on the test files.
model.load_state_dict(state['model'])
model.eval()

# label_normal / label_anomaly are rebound here to the test data.
label_normal, normal_len, _ = generate_pre(
    'HDFS_test_normal', test_normal_path, temp_path, emb_path, com_path, window_size
)
label_anomaly, anomal_len, _ = generate_pre(
    'HDFS_test_anomaly', test_anomaly_path, temp_path, emb_path, com_path, window_size
)

res = eval_handle_topK(label_normal, label_anomaly, model, window_size, num_candidates, anomaly_rate)

# One result list per candidate count; its first entry holds the metric tuple.
for top_k, runs in res.items():
    accuracy, precision, recall, F1, _ = runs[0]
    print("TopK={0} | Accuracy: {1:.3f}, Precision: {2:.3f}, Recall: {3:.3f}, F1-score: {4:.3f}".format(top_k, accuracy, precision, recall, F1))

In [None]:
# Save the checkpoint dict `state` as <model_dir>/<model_name>.pt.
model_dir = 'model/CSCLog/'
# Kept separate from the global `name`, which the data-loading cells pass to
# getPath(); rebinding it here would break those cells when they are re-run.
model_name = 'CSCLog'
# torch.save does not create missing parent directories, so make sure it exists.
os.makedirs(model_dir, exist_ok=True)
torch.save(state, os.path.join(model_dir, model_name + '.pt'))