In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
import dataloader
from models import DeepLog
from preprocessing import Vectorizer, Iterator


batch_size = 32
hidden_size = 32
num_directions = 2
topk = 5
train_ratio = 0.8
window_size = 10
epoches = 15
num_workers = 2
device = 0 

struct_log = './data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = './data/HDFS/anomaly_label.csv' # The anomaly label file

if __name__ == '__main__':
    (x_train, window_y_train, y_train), (x_test, window_y_test, y_test), test_buffer = dataloader.load_HDFS(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform')
    
    feature_extractor = Vectorizer()
    train_dataset = feature_extractor.fit_transform(x_train, window_y_train, y_train)
    test_dataset = feature_extractor.transform(x_test, window_y_test, y_test)

    train_loader = Iterator(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers).iter
    test_loader = Iterator(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers).iter
    print("test : ", feature_extractor.num_labels)
    model = DeepLog.DeepLog(num_labels=feature_extractor.num_labels, hidden_size=hidden_size, num_directions=num_directions, topk=topk, device=device)
    model.fit(train_loader, epoches)

    print('Train validation:')
    metrics = model.evaluate(train_loader)

    print('Test validation:')
    metrics = model.evaluate(test_loader)

Loading ./data/HDFS/HDFS_100k.log_structured.csv
250 63
Slicing 6351 sessions, with window 10
Slicing done, 27805 windows generated
Slicing 1589 sessions, with window 10
Slicing done, 6408 windows generated
Train: 27805 windows (1053/27805 anomaly), 26752/27805 normal
Test: 6408 windows (253/6408 anomaly), 6155/6408 normal
test :  14
Epoch 1/15, training loss: 1.01550
Epoch 2/15, training loss: 0.33975
Epoch 3/15, training loss: 0.30030
Epoch 4/15, training loss: 0.28703
Epoch 5/15, training loss: 0.28021
Epoch 6/15, training loss: 0.27424
Epoch 7/15, training loss: 0.27087
Epoch 8/15, training loss: 0.26643
Epoch 9/15, training loss: 0.26392
Epoch 10/15, training loss: 0.26136
Epoch 11/15, training loss: 0.26065
Epoch 12/15, training loss: 0.25702
Epoch 13/15, training loss: 0.25555
Epoch 14/15, training loss: 0.25383
Epoch 15/15, training loss: 0.25263
Train validation:
[('window_acc', 0.87833), ('session_acc', 0.96174), ('f1', 0.11636), ('recall', 0.064), ('precision', 0.64)]
Test v

Loading ./data/HDFS/HDFS_100k.log_structured.csv
250 63
Slicing 6351 sessions, with window 10
Slicing done, 27805 windows generated
Slicing 1589 sessions, with window 10
Slicing done, 6408 windows generated
Train: 27805 windows (1053/27805 anomaly), 26752/27805 normal
Test: 6408 windows (253/6408 anomaly), 6155/6408 normal
test :  14
Epoch 1/5, training loss: 1.01353
Epoch 2/5, training loss: 0.38671
Epoch 3/5, training loss: 0.31505
Epoch 4/5, training loss: 0.29871
Epoch 5/5, training loss: 0.28954
Train validation:
[('window_acc', 0.8783), ('session_acc', 0.9619), ('f1', 0.14184), ('recall', 0.08), ('precision', 0.625)]
Test validation:
[('window_acc', 0.93243), ('session_acc', 0.96224), ('f1', 0.25), ('recall', 0.15873), ('precision', 0.58824)]


In [1]:
import sys
sys.path.append('../')
import dataloader
from preprocessing import Vectorizer
import tensorflow as tf
import numpy as np

def make_batch(train_data, batch_size = 16):
    train_dataset = train_data
    num_of_data = len(train_dataset["x"])
    
    index = np.arange(0, num_of_data)
    np.random.shuffle(index)
    index = index[:batch_size]
    
    shuffled_input_data = [train_dataset["x"][i] for i in index]
    shuffled_label_data = [train_dataset["window_y"][i] for i in index]
    
    return np.asarray(shuffled_input_data), np.asarray(shuffled_label_data)

def load_data():
    struct_log = './data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
    label_file = './data/HDFS/anomaly_label.csv' # The anomaly label file
    
    (x_train, window_y_train, y_train), (x_test, window_y_test, y_test), buffer = dataloader.load_HDFS(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform')
    
    feature_extractor = Vectorizer()
    train_dataset = feature_extractor.fit_transform(x_train, window_y_train, y_train)
    test_dataset = feature_extractor.transform(x_test, window_y_test, y_test)
    num_label = feature_extractor.num_labels
    
    return train_dataset, test_dataset, num_label


class DeepLog():
    def __init__(self, sess, seq_length, batch_size, epochs, lr, hidden_size = 100, topk = 9):
        self.sess = sess
        self.batch_size = batch_size
        self.learning_rate = lr
        self.epochs = epochs
        self.seq_length = seq_length
        
        self.hidden_size = hidden_size
        self.num_directions = 2
        self.topk = topk
        self.num_label = 12
        
        #self.rnn = nn.LSTM(input_size=1, hidden_size=self.hidden_size, batch_first=True, bidirectional=(self.num_directions==2))
        
        self.X = tf.placeholder(shape = [None, self.seq_length, 1], dtype = tf.float32)
        self.Y = tf.placeholder(shape = [None, 1], dtype = tf.float32)
        
        self.model()
        self.optimizer()
        
        print("init end")
    
    def model(self):
        cell_fw = tf.contrib.rnn.LSTMBlockCell(num_units=self.hidden_size)
        cell_bw = tf.contrib.rnn.LSTMBlockCell(num_units=self.hidden_size)
        
        
        outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.X, time_major=False, dtype=tf.float32)
        #outputs[:,-1,:]
        print("output's shape : ", np.shape(outputs))
        outputs_fw = tf.transpose(outputs[0], [0,1,2])
        outputs_bw = tf.transpose(outputs[1], [0,1,2])
        print("outputs_fw's shape : ", np.shape(outputs_fw))
        print("outputs_bw's shape : ", np.shape(outputs_bw))
        self.logits = tf.layers.Dense(outputs_bw, (self.num_label+1))
        #self.y_pred = tf.argmax(self.logits, axis=1)
        
        return self.logits
        
        
        
    def optimizer(self):
        self.loss = tf.losses.softmax_cross_entropy(self.Y, self.logits)
        self.optimizer_loss = tf.train.AdamOptimizer(learning_rate=lr).minimize(self.total_loss)
        

        
    
    def train(self):
        data_size = 64#28883#41749
        batch_size = self.batch_size
        total_batch = data_size//batch_size

        loss_data = []
        
        #write = tf.summary.FileWriter('./mygraph', self.sess.graph)
        SAVE_PATH = "C:/Users/jaekyu/Documents/Jupyter Lab/Deeplog/Weight/Weight.ckpt"
        print("session start")
        self.sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        print("training data load start")
        train_dataset, test_dataset, num_label = load_data()
        print("training data load finish")
        try:
            saver.restore(self.sess, SAVE_PATH)
            print("load")
        except:
            print("first training")
            
        train_dataset, test_dataset, num_label = load_data()
            
        for epoch in range(self.epochs):#
            print("epoch",epoch+1, "start")
            for i in range(total_batch):#total_batch
                #data load, batch 생성
                input_loader, label_loader = make_batch(train_dataset, batch_size = self.batch_size)

                #total_loss_opt, Heat_loss, Vector_loss = self.sess.run([self.optimizer_total_loss, self.loss_stage6_branch2, self.loss_stage6_branch1],
                #         feed_dict = {self.X : batch_img, self.confidence_map_label : heatmap, self.vector_map_label : vectormap})

                #h_loss_data.append(Heat_loss)
                #v_loss_data.append(Vector_loss)
            #heatmap_output, vectormap_output = self.sess.run([self.stage6_branch2, self.stage6_branch1], feed_dict = {self.X : batch_img})
            
            
            
            saver.save(self.sess, SAVE_PATH)
            
print("cell end")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


cell end


In [2]:
batch_size = 32
hidden_size = 32
num_directions = 2
topk = 5
train_ratio = 0.2
window_size = 10
epoches = 2
num_workers = 2
device = 0 


with tf.Session() as sess:
    obj = DeepLog(sess=sess, seq_length = 10, batch_size = 32, epochs = 3, lr = 0.001, hidden_size = 100, topk = 9)
    obj.train()
    #batch_Img_resized, predicted_heatmap, label_heatmap, label_Anno_Data, categories = obj.test()
    #batch_Img_resized, batch_Anno_Data, batch_Heatmap, gaussian_Heatmap, categories = obj.data_test()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Colocations handled automatically by placer.
output's shape :  (2,)
outputs_fw's shape :  (?, 10, 100)
outputs_bw's shape :  (?, 10, 100)


TypeError: int() argument must be a string, a bytes-like object or a number, not 'Tensor'