In [1]:
import time
# Show the current UTC time so the saved output records when this run was made.
time.gmtime()

time.struct_time(tm_year=2021, tm_mon=9, tm_mday=13, tm_hour=17, tm_min=41, tm_sec=18, tm_wday=0, tm_yday=256, tm_isdst=0)

In [2]:
import os,sys,inspect
# Prepend the parent of the current directory to sys.path; presumably this is
# what lets the `from _code import ...` statements in the next cell resolve.
# NOTE(review): in a notebook the current frame has no real source file, so
# abspath() most likely resolves relative to the kernel's working directory --
# confirm the kernel is started from this notebook's folder.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [3]:
import pandas as pd
import itertools, collections
import time
import numpy as np

from _code import DeepLogModel
from _code import generator
from _code import seed
from _code import trie

In [4]:
### Parameters
# Tunable inputs of the experiment; every later cell reads these names.
_seed=2  # passed to seed.seed() before any data is generated
subdir = "dynamic-2"  # sub-folder of the inputs directory where the JSON files are written
vocabulary_size = 10  # given to generator.make_vocabulary and to DeepLogModel as n
num_patterns = 20  # number of initial patterns requested from generator.generate_patterns
num_new_patterns = 3  # number of extra patterns generated for the online phase

In [5]:
# Seed the project's random state before any data is generated
# (presumably this is what makes the generated inputs reproducible).
seed.seed(_seed)

In [6]:
# Build the initial synthetic data set: vocabulary -> patterns -> training text -> test cases.
# The generator calls are kept in their original order: if they draw from the
# random state seeded earlier, reordering them would change the generated data.
vocabulary = generator.make_vocabulary(vocabulary_size=vocabulary_size)
patterns = generator.generate_patterns(
    num_patterns=num_patterns,
    vocabulary=vocabulary,
    min_pattern_size=3,
    max_pattern_size=7,
)

# g/h values computed from the pattern set; trie_h later sizes the model (h=trie_h+1)
# and trie_g is compared against the g value of the extended pattern set.
trie_g = trie.calc_g_value(patterns)
trie_h = trie.calc_h_value(patterns)
print("trie-g:", trie_g)
print("trie-h:", trie_h)

# Training text is generated with anomaly_ratio=0.00; the first test set uses 0.1.
text_train, marks_train = generator.generate_text(patterns, text_size=50000, anomaly_ratio=0.00, vocabulary=vocabulary)
tests1 = generator.generate_tests(patterns, vocabulary, n=700, text_size=4, anomaly_ratio=0.1)

trie-g: 8
trie-h: 4


In [7]:
import json, os

def write_inputs1(subdir, vocabulary_size, vocabulary, patterns, text_train, marks_train, tests):
    """Write the initial experiment inputs as three JSON files.

    The files are created inside the inputs directory for ``subdir`` (the
    directory is created when missing):

    * ``v-wf.json``   -- ``vocabulary_size``, ``vocabulary`` and the patterns
      (stored under the key ``"workflows"``)
    * ``train.json``  -- ``text_train`` and ``marks_train``
    * ``tests1.json`` -- ``tests`` under the key ``"tests"``

    JSON turns tuples into lists and non-string dict keys into strings, so the
    data has to be converted back when it is loaded again.

    Each file is written inside a ``with`` block so its handle is flushed and
    closed before the function returns.
    """
    # NOTE(review): the backslash is a path separator only on Windows; on other
    # systems this names a directory literally called "..\inputs" in the working
    # directory. Left unchanged so the location still matches the reader functions.
    dir_ = os.path.join("..\\inputs", subdir)
    os.makedirs(dir_, exist_ok=True)

    payloads = {
        "v-wf.json": {
            "vocabulary_size": vocabulary_size,
            "vocabulary": vocabulary,
            "workflows": patterns,
        },
        "train.json": {"text_train": text_train, "marks_train": marks_train},
        "tests1.json": {"tests": tests},
    }
    for file_name, payload in payloads.items():
        with open(os.path.join(dir_, file_name), "w") as fh:
            json.dump(payload, fh)

def read_inputs1(subdir):
    """Load the initial experiment inputs from their three JSON files.

    Reads ``v-wf.json``, ``train.json`` and ``tests1.json`` from the inputs
    directory for ``subdir`` and undoes the type changes made by JSON:
    patterns become tuples again, and the test dictionary gets integer keys
    and tuple values.

    Returns:
        ``(vocabulary_size, vocabulary, patterns, text_train, marks_train, tests)``

    Every file is opened in a ``with`` block so its handle is closed right
    after parsing.
    """
    # Same directory literal as the writer; see the note there about the backslash.
    dir_ = os.path.join("..\\inputs", subdir)

    def _load(file_name):
        # Parse one JSON file from the input directory and close it afterwards.
        with open(os.path.join(dir_, file_name)) as fh:
            return json.load(fh)

    j1 = _load("v-wf.json")
    j2 = _load("train.json")
    j3 = _load("tests1.json")

    vocabulary_size = j1["vocabulary_size"]
    vocabulary = j1["vocabulary"]
    # JSON stores tuples as lists; convert each pattern back to a tuple.
    patterns = [tuple(p) for p in j1["workflows"]]

    text_train = j2["text_train"]
    marks_train = j2["marks_train"]

    # JSON object keys are always strings; restore integer keys and tuple values.
    tests = {int(k): tuple(v) for k, v in j3["tests"].items()}

    return vocabulary_size, vocabulary, patterns, text_train, marks_train, tests

# Persist the generated vocabulary, patterns, training text and first test set.
write_inputs1(subdir, vocabulary_size, vocabulary, patterns, text_train, marks_train, tests1)

In [8]:
# Round-trip check: reload what was just written and make sure nothing changed
# on the way through JSON. Each value has its own assert so a failure points
# directly at the item that differs.
ret_vocabulary_size, ret_vocabulary, ret_patterns, ret_text_train, ret_marks_train, ret_tests1 = read_inputs1(subdir)

assert ret_vocabulary_size == vocabulary_size
assert ret_vocabulary == vocabulary
assert ret_patterns == patterns  # read_inputs1 converts each loaded pattern back to a tuple
assert ret_text_train == text_train
assert ret_marks_train == marks_train
assert ret_tests1 == tests1  # read_inputs1 restores int keys and tuple values


In [9]:
# Create, build and train the model on the training text.
# h is set one above the trie h-value and n is the vocabulary size; what the
# model does with them is defined in _code.DeepLogModel (not visible here).
deep_log_model = DeepLogModel.DeepLogModel(h=trie_h+1, n=vocabulary_size, vocabulary=vocabulary)
deep_log_model.build(num_lstm_layers=2, lstm_size=64)
# fit() is the last expression of the cell, so its return value is displayed.
deep_log_model.fit(text_train,epochs=1)

<_code.DeepLogModel.HistoryLoss at 0x21912ffb7c0>

In [10]:
# Score every test case of tests1 with every g in 0..vocabulary_size and keep
# one (case key, g, outcome) record per evaluation; the wall-clock time is printed.
b = time.time()
entries = []
# product() varies g fastest, i.e. all g values are tried for one case before
# moving on to the next case.
for (k, (text_test, text_marks, anomaly)), g in itertools.product(
    tests1.items(), range(0, vocabulary_size+1)
):
    res = deep_log_model.monitor_session(text_test, text_marks, g=g)
    entries.append((k, g, res))
e = time.time()
print("time:", round(e-b,3), "seconds")

time: 364.029 seconds


In [11]:
# Long table: one row per (test case i, g value) with the outcome label in "status".
df1 = pd.DataFrame(entries, columns=["i", "g", "status"])
# Wide table: rows are test cases, columns are g values, cells are outcome labels.
# pivot() only reshapes; unlike pivot_table(aggfunc="sum") it raises on a duplicated
# (i, g) pair instead of silently concatenating the label strings.
ddf1 = df1.pivot(index="i", columns="g", values="status")
def calc(s):
    """Summarise a collection of outcome labels into classification metrics.

    ``s`` is any iterable of labels. Only "TP", "TN", "FP" and "FN" are
    counted; every other value is ignored.

    Returns a dict with precision ("prec"), recall ("rec"), accuracy ("acc"),
    F1 ("f1") and the four raw counts. A small epsilon is added to every
    denominator so that empty classes yield 0.0 instead of a division error.
    """
    tally = collections.Counter(s)
    # A Counter reports 0 for labels that never occurred.
    TP, TN, FP, FN = (tally[label] for label in ("TP", "TN", "FP", "FN"))
    eps = 1e-9

    prec = TP / (TP + FP + eps)
    rec = TP / (TP + FN + eps)
    acc = (TP + TN) / (TP + TN + FP + FN + eps)
    f1 = 2*(prec*rec)/(prec+rec+eps)

    metrics = {"prec": prec, "rec": rec, "acc": acc, "f1": f1}
    metrics.update(TP=TP, TN=TN, FP=FP, FN=FN)
    return metrics
    
# Run calc() on every g column, then stack the resulting metric dicts into a
# table with one row per g column (rows follow the column order of ddf1).
e = ddf1.apply(calc, axis=0)
e1 = pd.DataFrame(e.tolist())

print("measure results for each g-value")
e1

measure results for each g-value


Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
0,0.271429,1.0,0.271429,0.426966,190,0,510,0
1,0.271817,1.0,0.272857,0.427447,190,1,509,0
2,0.275912,0.994737,0.29,0.432,189,14,496,1
3,0.303079,0.984211,0.381429,0.463445,187,80,430,3
4,0.373444,0.947368,0.554286,0.535714,180,208,302,10
5,0.439153,0.873684,0.662857,0.584507,166,298,212,24
6,0.554264,0.752632,0.768571,0.638393,143,395,115,47
7,0.668478,0.647368,0.817143,0.657754,123,449,61,67
8,1.0,0.473684,0.857143,0.642857,90,510,0,100
9,1.0,0.247368,0.795714,0.396624,47,510,0,143


In [12]:
### finding empirical best g
# idxmax() returns the row *label*, which is what .loc expects; argmax() returns
# a position and only matches the label while e1 keeps its default 0..n-1 index.
display(e1.loc[[e1["acc"].idxmax()]])  # row with the highest accuracy
display(e1.loc[[e1["f1"].idxmax()]])  # row with the highest F1

Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
8,1.0,0.473684,0.857143,0.642857,90,510,0,100


Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
7,0.668478,0.647368,0.817143,0.657754,123,449,61,67


In [13]:
# e1 has one row per column of ddf1, in the same order, and has no "g" column.
# Translate the position of the best-F1 row into the g label of that column
# rather than assuming that position and g value coincide.
best_g = int(ddf1.columns[e1["f1"].argmax()])
print("best-g", best_g)

best-g 7


## online_cases

In [14]:
# Draw additional patterns that were not part of the initial pattern set used
# for training.
new_patterns = generator.generate_patterns(
    num_patterns=num_new_patterns,
    vocabulary=vocabulary,
    min_pattern_size=3,
    max_pattern_size=7,
)

# Online cases are generated from old + new patterns with anomaly_ratio=0.0.
online_cases = generator.generate_tests(patterns + new_patterns, vocabulary, n=10000, text_size=4, anomaly_ratio=0.0)

In [15]:
def write_inputs2(subdir, online_cases):
    """Write the online cases to ``online_cases.json`` in the inputs directory.

    The directory for ``subdir`` is created when missing. The data is stored
    under the key ``"online_cases"``; JSON turns the dictionary keys into
    strings and tuples into lists.

    The file is written inside a ``with`` block so the handle is flushed and
    closed before the function returns.
    """
    # NOTE(review): the backslash is a path separator only on Windows; kept
    # unchanged so the location still matches the reader function.
    dir_ = os.path.join("..\\inputs", subdir)
    os.makedirs(dir_, exist_ok=True)

    j4 = {"online_cases": online_cases}
    with open(os.path.join(dir_, "online_cases.json"), "w") as fh:
        json.dump(j4, fh)

def read_inputs2(subdir):
    """Load the online cases from ``online_cases.json`` in the inputs directory.

    Returns a dict whose keys are converted back to ``int`` and whose values
    are converted back to tuples, undoing what JSON did on the way out.

    The file is opened in a ``with`` block so the handle is closed right after
    parsing.
    """
    dir_ = os.path.join("..\\inputs", subdir)

    with open(os.path.join(dir_, "online_cases.json")) as fh:
        j4 = json.load(fh)

    # JSON object keys are always strings; restore integer keys and tuple values.
    return {int(k): tuple(v) for k, v in j4["online_cases"].items()}

In [16]:
# Persist the online cases and verify that they survive the JSON round trip.
write_inputs2(subdir, online_cases)
ret_online_cases = read_inputs2(subdir)
assert ret_online_cases == online_cases

In [17]:
# Recompute the trie g/h values for the extended pattern set and print them
# next to the values obtained for the initial patterns.
trie_g_new = trie.calc_g_value(patterns + new_patterns)
trie_h_new = trie.calc_h_value(patterns + new_patterns)
print("best-g:", best_g)
print("trie-g new: {}. (prev trie-g: {})".format(trie_g_new, trie_g))
print("trie-h new: {}. (prev trie-h: {})".format(trie_h_new, trie_h))
# Larger of the empirical best g and the new trie g. It is only printed here;
# the online feedback loop in the next cell passes best_g, not best_g_new.
best_g_new = max(best_g, trie_g_new)
print("best-g-new:", best_g_new)

best-g: 7
trie-g new: 9. (prev trie-g: 8)
trie-h new: 4. (prev trie-h: 4)
best-g-new: 9


In [18]:
# Counters for the online phase; they are reset every time this cell runs.
num_feedbacks = 0  # all feedback (re-training) events
num_feedbacks_new = 0  # feedback on cases that contain at least one new pattern
num_feedbacks_old = 0  # feedback on cases without any new pattern

def contains_new_pattern(text, new_patterns):
    """Tell whether any of ``new_patterns`` occurs inside ``text``.

    ``text`` and every pattern are sequences of tokens. A pattern counts as
    contained only when all of its tokens appear in ``text`` consecutively and
    as whole tokens. Comparing token windows (instead of searching a joined
    string) avoids false hits across token boundaries: the pattern
    ("1", "2") is not contained in the text ("11", "23").

    Returns True on the first pattern found, otherwise False. An empty
    pattern is considered contained in any text.
    """
    tokens = tuple(text)
    for pattern in new_patterns:
        needle = tuple(pattern)
        width = len(needle)
        # Slide a window of the pattern's length over the text; when the pattern
        # is longer than the text the range is empty and nothing matches.
        last_start = len(tokens) - width
        if any(tokens[start:start + width] == needle for start in range(last_start + 1)):
            return True
    return False
    
# Online phase: monitor each online case with the current model; whenever the
# outcome is "FP", feed the case back through train_feedback() and count it.
# NOTE(review): train_feedback() presumably updates deep_log_model, so the
# outcome for later cases depends on the iteration order and re-running this
# cell continues from the already updated model -- confirm.
for index, case in online_cases.items():
    text, marks, anomaly_exists = case
    res = deep_log_model.monitor_session(text, marks, g=best_g)
    if res == "FP":
        # The old empirical best g is used on purpose. The theoretical g
        # (trie_g_new) might be higher and give worse results in the end.
        deep_log_model.train_feedback(text, marks, g = best_g)
        num_feedbacks += 1
        if contains_new_pattern(text, new_patterns):
            num_feedbacks_new +=1
        else:
            num_feedbacks_old +=1 # count only totally "old" feedback cases without any new workflow
        

In [19]:
# Report how many feedback events occurred, split by whether the case
# contained a new pattern.
print("num_feedbacks_new", num_feedbacks_new)
print("num_feedbacks_old", num_feedbacks_old)
print("num_feedbacks", num_feedbacks)

num_feedbacks_new 1469
num_feedbacks_old 1598
num_feedbacks 3067


In [20]:
# Second test set, generated from old + new patterns with the same size and
# anomaly ratio as tests1.
tests2 = generator.generate_tests(patterns + new_patterns, vocabulary, n=700, text_size = 4, anomaly_ratio=0.1)

In [21]:
def write_inputs3(subdir, tests):
    """Write the second test set to ``tests2.json`` in the inputs directory.

    The directory for ``subdir`` is created when missing. The data is stored
    under the key ``"tests"``; JSON turns the dictionary keys into strings and
    tuples into lists.

    The file is written inside a ``with`` block so the handle is flushed and
    closed before the function returns.
    """
    # NOTE(review): the backslash is a path separator only on Windows; kept
    # unchanged so the location still matches the reader function.
    dir_ = os.path.join("..\\inputs", subdir)
    os.makedirs(dir_, exist_ok=True)

    j5 = {"tests": tests}
    with open(os.path.join(dir_, "tests2.json"), "w") as fh:
        json.dump(j5, fh)

def read_inputs3(subdir):
    """Load the second test set from ``tests2.json`` in the inputs directory.

    Returns a dict whose keys are converted back to ``int`` and whose values
    are converted back to tuples, undoing what JSON did on the way out.

    The file is opened in a ``with`` block so the handle is closed right after
    parsing.
    """
    dir_ = os.path.join("..\\inputs", subdir)
    with open(os.path.join(dir_, "tests2.json")) as fh:
        j5 = json.load(fh)

    # JSON object keys are always strings; restore integer keys and tuple values.
    return {int(k): tuple(v) for k, v in j5["tests"].items()}

In [22]:
# Persist the second test set and verify that it survives the JSON round trip.
write_inputs3(subdir, tests2)
ret_tests2 = read_inputs3(subdir)
assert ret_tests2 == tests2

In [23]:
# Score every test case of tests2 with every g from best_g up to vocabulary_size
# and keep one (case key, g, outcome) record per evaluation; the wall-clock time
# is printed.
b = time.time()
entries = []
# adding WF can't cause lower g, so the search starts at best_g.
# product() varies g fastest: all g values are tried for one case before the next.
for (k, (text_test, text_marks, anomaly)), g in itertools.product(
    tests2.items(), range(best_g, vocabulary_size+1)
):
    res = deep_log_model.monitor_session(text_test, text_marks, g=g)
    entries.append((k, g, res))
e = time.time()
print("time:", round(e-b,3), "seconds")

time: 135.079 seconds


In [24]:
# Long table: one row per (test case i, g value) with the outcome label in "status".
df2 = pd.DataFrame(entries, columns=["i", "g", "status"])
# Wide table: rows are test cases, columns are g values, cells are outcome labels.
# pivot() only reshapes; unlike pivot_table(aggfunc="sum") it raises on a duplicated
# (i, g) pair instead of silently concatenating the label strings.
ddf2 = df2.pivot(index="i", columns="g", values="status")
def calc(s):
    """Summarise one column of outcome labels into classification metrics.

    ``s`` is an iterable of labels that also has a ``name`` attribute (e.g. a
    pandas Series); the name is reported under the key "g". Only "TP", "TN",
    "FP" and "FN" are counted; every other value is ignored.

    Returns a dict with "g", precision ("prec"), recall ("rec"), accuracy
    ("acc"), F1 ("f1") and the four raw counts. A small epsilon is added to
    every denominator so that empty classes yield 0.0 instead of a division
    error.
    """
    tally = collections.Counter(s)
    # A Counter reports 0 for labels that never occurred.
    TP, TN, FP, FN = (tally[label] for label in ("TP", "TN", "FP", "FN"))
    eps = 1e-9

    prec = TP / (TP + FP + eps)
    rec = TP / (TP + FN + eps)
    acc = (TP + TN) / (TP + TN + FP + FN + eps)
    f1 = 2*(prec*rec)/(prec+rec+eps)

    metrics = {"g": s.name, "prec": prec, "rec": rec, "acc": acc, "f1": f1}
    metrics.update(TP=TP, TN=TN, FP=FP, FN=FN)
    return metrics
    
# Run calc() on every g column, then stack the resulting metric dicts into a
# table with one row per g value (each dict carries its g under the key "g").
e = ddf2.apply(calc, axis=0)
e2 = pd.DataFrame(e.tolist())

print("measure results for each g-value")
e2

measure results for each g-value


Unnamed: 0,g,prec,rec,acc,f1,TP,TN,FP,FN
0,7,0.501558,0.777778,0.705714,0.609848,161,333,160,46
1,8,0.621469,0.531401,0.765714,0.572917,110,426,67,97
2,9,0.964286,0.26087,0.778571,0.410646,54,491,2,153
3,10,0.0,0.0,0.704286,0.0,0,493,0,207


In [25]:
# Pick the row with the highest F1. idxmax() returns the row *label*, which is
# what .loc expects; argmax() returns a position and only matches the label
# while e2 keeps its default 0..n-1 index.
best_result2 = e2.loc[[e2["f1"].idxmax()]]
# g value stored in that row.
best_g2 = best_result2["g"].iloc[0]

In [26]:
# Final summary: the metrics row of the best g, extended with the feedback
# counters collected during the online phase. assign() returns a new frame,
# so e2 itself is left untouched.
e3 = e2.loc[e2["g"] == best_g2].assign(
    num_feedbacks_new=num_feedbacks_new,
    num_feedbacks_old=num_feedbacks_old,
    num_feedbacks=num_feedbacks,
)
e3

Unnamed: 0,g,prec,rec,acc,f1,TP,TN,FP,FN,num_feedbacks_new,num_feedbacks_old,num_feedbacks
0,7,0.501558,0.777778,0.705714,0.609848,161,333,160,46,1469,1598,3067
