In [1]:
import time
# Show the current UTC time (a struct_time) as the cell output --
# presumably kept as a record of when this run was executed.
time.gmtime()

time.struct_time(tm_year=2021, tm_mon=9, tm_mday=12, tm_hour=14, tm_min=10, tm_sec=50, tm_wday=6, tm_yday=255, tm_isdst=0)

In [2]:
import os,sys,inspect

# Put the parent of the notebook's working directory at the front of sys.path
# (presumably that is where the project-local `_code` package lives).
#
# The location is taken from the process working directory instead of
# inspect.getfile(inspect.currentframe()): inside a notebook the frame's
# "file" is a synthetic cell name (or, with newer kernels, a temporary file
# outside the project), so deriving the directory from it only yields the
# working directory by accident.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

In [3]:
import pandas as pd
import itertools, collections
import time
import numpy as np

from _code import DeepLogModel
from _code import generator
from _code import seed
from _code import trie

In [4]:
### Parameters
# Value handed to seed.seed() before any data is generated.
_seed=7
# Name of the per-run sub-directory (below the inputs directory) that the
# JSON inputs and CSV results are written to.
subdir = "static-7"
# Passed to generator.make_vocabulary() and used as `n` of the DeepLog model.
vocabulary_size = 20
# Number of patterns requested from generator.generate_patterns().
num_patterns = 16

In [5]:
# Seed the run through the project's seed helper (what exactly gets seeded is
# defined in _code.seed).
seed.seed(_seed)
# NOTE(review): self-assignment is a no-op; it only fails if `subdir` is undefined.
subdir = subdir

In [6]:
# NOTE(review): the next two self-assignments are no-ops.
vocabulary_size = vocabulary_size
num_patterns = num_patterns
# Build the vocabulary, then the patterns over it (pattern size bounds 3..7
# are passed to the generator).
vocabulary = generator.make_vocabulary(vocabulary_size=vocabulary_size)
patterns = generator.generate_patterns(num_patterns=num_patterns, vocabulary=vocabulary, min_pattern_size=3, max_pattern_size=7)
# Relative weight of each pattern, hard-coded with 16 entries.
# NOTE(review): presumably needs exactly num_patterns entries -- confirm
# before changing num_patterns.
patterns_freq = np.array([256,256,128,128,64,64,32,32,16,16,16,16,16,16,1,1])
# Normalise the weights so that they sum to 1.
patterns_p = patterns_freq/sum(patterns_freq)
# g and h values computed from the pattern set (semantics defined in
# _code.trie); trie_h is later used to size the model (h=trie_h+1).
trie_g = trie.calc_g_value(patterns)
trie_h = trie.calc_h_value(patterns)
print("trie-g:", trie_g)
print("trie-h:", trie_h)
# Training text generated with anomaly_ratio=0.00, plus its marks.
text_train, marks_train = generator.generate_text(patterns, text_size=50000, anomaly_ratio=0.00, vocabulary=vocabulary, patterns_p = patterns_p) 
# n=10000 test cases generated with anomaly_ratio=0.1.
tests = generator.generate_tests(patterns, vocabulary, n=10000, text_size = 4, anomaly_ratio=0.1,patterns_p = patterns_p)

trie-g: 11
trie-h: 3


In [7]:
import json, os

def write_inputs(subdir, vocabulary_size, vocabulary, patterns, text_train, marks_train, tests):
    """Persist the generated experiment inputs as three JSON files.

    The files are written into the ``subdir`` folder below the hard-coded
    inputs directory; the folder is created when it does not exist yet.

    * ``v-wf.json``  -- vocabulary size, vocabulary and the patterns
      (stored under the key ``"workflows"``)
    * ``train.json`` -- training text and its marks
    * ``tests.json`` -- the test cases

    Args:
        subdir: name of the per-run sub-directory.
        vocabulary_size: stored as-is under ``"vocabulary_size"``.
        vocabulary: stored as-is under ``"vocabulary"``.
        patterns: stored under ``"workflows"``.
        text_train: stored under ``"text_train"``.
        marks_train: stored under ``"marks_train"``.
        tests: stored under ``"tests"``.

    All values must be JSON-serialisable; ``json.dump`` raises otherwise.
    """
    # NOTE(review): the backslash makes this path Windows-specific.
    dir_ = os.path.join("..\\inputs", subdir)
    os.makedirs(dir_, exist_ok=True)

    payloads = {
        "v-wf.json": {
            "vocabulary_size": vocabulary_size,
            "vocabulary": vocabulary,
            "workflows": patterns,
        },
        "train.json": {"text_train": text_train, "marks_train": marks_train},
        "tests.json": {"tests": tests},
    }

    for file_name, payload in payloads.items():
        # The context manager flushes and closes the handle deterministically,
        # even when serialisation fails part-way through.
        with open(os.path.join(dir_, file_name), "w") as fh:
            json.dump(payload, fh)

def read_inputs(subdir):
    """Load the experiment inputs stored as JSON for one run.

    Reads ``v-wf.json``, ``train.json`` and ``tests.json`` from the ``subdir``
    folder below the hard-coded inputs directory and undoes the type changes
    that the JSON format forces on the data.

    Args:
        subdir: name of the per-run sub-directory.

    Returns:
        A tuple ``(vocabulary_size, vocabulary, patterns, text_train,
        marks_train, tests)`` where ``patterns`` is a list of tuples and
        ``tests`` is a dict with ``int`` keys and tuple values.

    Raises:
        OSError: if one of the files cannot be opened.
        KeyError: if an expected key is missing from a file.
    """
    # NOTE(review): the backslash makes this path Windows-specific.
    dir_ = os.path.join("..\\inputs", subdir)

    def _load(file_name):
        # Close the handle as soon as the document has been parsed.
        with open(os.path.join(dir_, file_name)) as fh:
            return json.load(fh)

    j1 = _load("v-wf.json")
    j2 = _load("train.json")
    j3 = _load("tests.json")

    vocabulary_size = j1["vocabulary_size"]
    vocabulary = j1["vocabulary"]
    # JSON has no tuple type, so each pattern comes back as a list.
    patterns = [tuple(p) for p in j1["workflows"]]

    text_train = j2["text_train"]
    marks_train = j2["marks_train"]

    # JSON object keys are always strings: restore the int keys, and turn each
    # test case back into a tuple.
    tests = {int(k): tuple(v) for k, v in j3["tests"].items()}

    return vocabulary_size, vocabulary, patterns, text_train, marks_train, tests

# Persist the freshly generated vocabulary, patterns, training text and tests for this run.
write_inputs(subdir, vocabulary_size, vocabulary, patterns, text_train, marks_train, tests)

In [8]:
# Round-trip check: reload the stored inputs and verify that every item is
# equal to its in-memory counterpart.
(
    ret_vocabulary_size,
    ret_vocabulary,
    ret_patterns,
    ret_text_train,
    ret_marks_train,
    ret_tests,
) = read_inputs(subdir)

for restored, original in (
    (ret_vocabulary_size, vocabulary_size),
    (ret_vocabulary, vocabulary),
    (ret_patterns, patterns),
    (ret_text_train, text_train),
    (ret_marks_train, marks_train),
    (ret_tests, tests),
):
    assert restored == original


In [9]:
# Create the DeepLog model with h = trie_h + 1 and n = vocabulary size.
# NOTE(review): presumably h is the history/window length -- confirm in _code.DeepLogModel.
deep_log_model = DeepLogModel.DeepLogModel(h=trie_h+1, n=vocabulary_size, vocabulary=vocabulary)
# Two LSTM layers of size 64.
deep_log_model.build(num_lstm_layers=2, lstm_size=64)
# Fit on the training text for a single epoch; the returned object is shown as the cell output.
deep_log_model.fit(text_train,epochs=1)

<_code.DeepLogModel.HistoryLoss at 0x1f40fb5b580>

In [10]:
# Run every test case through the model once for each g in
# 0..vocabulary_size and collect one (test id, g, result) record per
# combination.
#
# The elapsed time is measured with perf_counter(): it is monotonic, so the
# figure cannot be distorted by system clock adjustments during this
# long-running cell.
b = time.perf_counter()
entries = list()
# `anomaly` is part of each test tuple but is not needed here.
for k, (text_test, text_marks, anomaly) in tests.items():
    for g in range(0, vocabulary_size+1):
        res = deep_log_model.monitor_session(text_test, text_marks, g=g)
        entries.append((k, g, res))
e = time.perf_counter()
print("time:", round(e-b,3), "seconds")

time: 12300.722 seconds


In [11]:
# Long format: one row per (test id i, g value, status) record.
df = pd.DataFrame(entries, columns = ["i","g","status"])
# Wide format: one row per test id, one column per g value.
# Every (i, g) pair occurs exactly once, so this is a pure reshape. pivot()
# performs it without any aggregation and raises on duplicate pairs, whereas
# "summing" the status strings would silently concatenate them.
ddf = df.pivot(index="i", columns="g", values="status")
def calc(s):
    """Turn a collection of status labels into counts and quality metrics.

    Args:
        s: iterable of labels; occurrences of "TP", "TN", "FP" and "FN" are
            counted, any other value is ignored.

    Returns:
        dict with the keys "prec", "rec", "acc", "f1", "TP", "TN", "FP", "FN"
        (in that order). A small epsilon is added to every denominator, so
        the ratios are 0.0 instead of a division error when a count is zero.
    """
    tally = collections.Counter(s)
    # A Counter yields 0 for labels that never occurred.
    TP, TN, FP, FN = (tally[label] for label in ("TP", "TN", "FP", "FN"))
    eps = 1e-9

    precision = TP / (TP + FP + eps)
    recall = TP / (TP + FN + eps)
    accuracy = (TP + TN) / (TP + TN + FP + FN + eps)
    f1_score = 2*(precision*recall)/(precision+recall+eps)

    return dict(
        prec=precision,
        rec=recall,
        acc=accuracy,
        f1=f1_score,
        TP=TP,
        TN=TN,
        FP=FP,
        FN=FN,
    )
    
# Compute the metrics once per g column, then stack the per-column metric
# dicts into a table (one row per g column, in column order).
e = ddf.apply(calc, axis=0)
e1 = pd.DataFrame(e.tolist())

print("measure results for each g-value")
e1

measure results for each g-value


Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
0,0.2772,1.0,0.2772,0.434075,2772,0,7228,0
1,0.28017,1.0,0.2878,0.437707,2772,106,7122,0
2,0.314659,0.998918,0.3966,0.478569,2769,1197,6031,3
3,0.429017,0.993146,0.6317,0.599195,2753,3564,3664,19
4,0.527606,0.985931,0.7514,0.687374,2733,4781,2447,39
5,0.624387,0.964286,0.8293,0.757975,2673,5620,1608,99
6,0.669487,0.91342,0.851,0.772658,2532,5978,1250,240
7,0.706073,0.863997,0.8626,0.777093,2395,6231,997,377
8,0.767179,0.809524,0.8791,0.787783,2244,6547,681,528
9,0.848497,0.753608,0.8944,0.798242,2089,6855,373,683


In [12]:

def write_outputs(subdir, df, ddf, e1):
    """Write the evaluation results as CSV files into the run's directory.

    The target is the ``subdir`` folder below the hard-coded inputs directory;
    it is created when it does not exist yet.

    Args:
        subdir: name of the per-run sub-directory.
        df: frame written to ``DL-results.csv``.
        ddf: frame written to ``DL-resuls_pivot.csv``.
        e1: frame written to ``DL-resutls_metrics.csv``.
    """
    # The backslash is escaped explicitly: a bare "\i" is an invalid escape
    # sequence that Python warns about and will eventually reject.
    # NOTE(review): the backslash also makes this path Windows-specific.
    out_dir = os.path.join("..\\inputs", subdir)
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): "resuls" / "resutls" look like typos of "results"; the
    # names are kept unchanged because other tooling may read these files.
    df.to_csv(os.path.join(out_dir, "DL-results.csv"))
    ddf.to_csv(os.path.join(out_dir, "DL-resuls_pivot.csv"))
    e1.to_csv(os.path.join(out_dir, "DL-resutls_metrics.csv"))

# Store the raw results, the pivoted table and the metrics table in this run's directory.
write_outputs(subdir, df, ddf, e1)

In [13]:
# Show the row with the highest accuracy, then the row with the highest F1.
# argmax() returns a *position*, so the row is selected positionally with
# .iloc; label-based .loc only gives the same row while the index happens to
# be the default 0..n-1 range.
display(e1.iloc[[e1["acc"].argmax()]])
display(e1.iloc[[e1["f1"].argmax()]])

Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
10,0.966901,0.695527,0.909,0.809064,1928,7162,66,844


Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
10,0.966901,0.695527,0.909,0.809064,1928,7162,66,844
