In [None]:
import time
# Display the current time in UTC as a struct_time; presumably kept in the cell
# output as a record of when the notebook was run.
time.gmtime()

In [None]:
import os,sys,inspect
# Put the parent of the notebook's working directory at the front of sys.path so
# that packages located there can be imported (presumably where the project-local
# `_code` package lives — confirm against the repository layout).
#
# The working directory is used rather than
# inspect.getfile(inspect.currentframe()): inside a kernel that call returns the
# synthetic filename of the cell, which recent ipykernel releases place in a
# temporary directory, so it does not reliably identify the notebook's folder.
# On older kernels the synthetic name was relative and resolved against the
# working directory anyway, so the result is unchanged there.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

In [None]:
import pandas as pd
import itertools
import collections

from _code import DeepLogModel
from _code import generator
from _code import trie
from _code import seed

In [None]:
# Call the project's `seed` helper with the fixed value 0.
# NOTE(review): presumably this seeds the random generators used by the data
# generator and the model so that runs are repeatable — confirm in `_code.seed`.
seed.seed(0)

In [None]:
# Experiment configuration: vocabulary size and number of patterns to generate.
vocabulary_size = 20
num_patterns = 20

vocabulary = generator.make_vocabulary(vocabulary_size=vocabulary_size)
patterns = generator.generate_patterns(
    num_patterns=num_patterns,
    vocabulary=vocabulary,
    min_pattern_size=3,
    max_pattern_size=7,
)

# g and h values computed by the trie helpers from the generated patterns.
trie_g, trie_h = trie.calc_g_value(patterns), trie.calc_h_value(patterns)
print("trie-g:", trie_g)
print("trie-h:", trie_h)

In [None]:
# Training text generated from the patterns with anomaly_ratio set to zero.
text_train, marks_train = generator.generate_text(
    patterns,
    text_size=50000,
    anomaly_ratio=0.0,
    vocabulary=vocabulary,
)

In [None]:
# Create the model with h = trie_h + 1 and n = vocabulary_size, build a
# two-layer LSTM of size 64, and fit it on the training text for one epoch.
deep_log_model = DeepLogModel.DeepLogModel(
    h=trie_h + 1,
    n=vocabulary_size,
    vocabulary=vocabulary,
)
deep_log_model.build(num_lstm_layers=2, lstm_size=64)
deep_log_model.fit(text_train, epochs=1)

## README
1. After training, we evaluate the network on the tests to establish its current best performance.
2. We use feedback to train the network for a new workflow with different contexts
3. We evaluate the updated network again over the previous tests.
4. The decline in performance between the two evaluations measures how much the network has forgotten.

In [None]:
# 1000 generated tests (text_size 4, anomaly_ratio 0.05) used for both evaluations.
tests_legit = generator.generate_tests(
    patterns,
    vocabulary,
    n=1000,
    text_size=4,
    anomaly_ratio=0.05,
)

In [None]:
# Monitor every test session once per candidate g (0 .. vocabulary_size) and
# collect (test id, g, outcome) triples, timing the whole sweep.
# A narrower sweep around trie_g (trie_g-2 .. trie_g+2) can be used instead of
# the full range.
b = time.time()
entries = []
for test_id, (session_text, session_marks, _anomaly) in tests_legit.items():
    entries.extend(
        (test_id, g, deep_log_model.monitor_session(session_text, session_marks, g=g))
        for g in range(vocabulary_size + 1)
    )
e = time.time()
elapsed_seconds = round(e - b, 3)
print("time:", elapsed_seconds, "seconds")

In [None]:
# Long table with one row per (test id, g, outcome), pivoted into a
# tests-by-g table of outcomes. The sweep records each (i, g) pair once, so
# the "sum" aggregation has a single status value to combine per cell.
df1 = pd.DataFrame(entries, columns=["i", "g", "status"])
ddf1 = df1.pivot_table(
    index="i",
    columns="g",
    values="status",
    aggfunc="sum",
)
def calc(s):
    """Summarise a collection of outcome labels into classification metrics.

    Counts how often each of "TP", "TN", "FP" and "FN" occurs in ``s`` (any
    other values are ignored) and derives precision, recall, accuracy and F1.
    A small epsilon is added to every denominator so that empty counts give
    0.0 instead of a division error.

    Returns a dict with the keys prec, rec, acc, f1, TP, TN, FP, FN (in that
    order).
    """
    counts = collections.Counter(s)
    TP, TN, FP, FN = (counts.get(label, 0) for label in ("TP", "TN", "FP", "FN"))
    eps = 1e-9

    prec = TP / (TP + FP + eps)
    rec = TP / (TP + FN + eps)
    acc = (TP + TN) / (TP + TN + FP + FN + eps)
    f1 = 2 * (prec * rec) / (prec + rec + eps)

    return {
        "prec": prec,
        "rec": rec,
        "acc": acc,
        "f1": f1,
        "TP": TP,
        "TN": TN,
        "FP": FP,
        "FN": FN,
    }
    
# Apply calc to every g column and stack the resulting metric dicts into a
# table; rows follow the order of the g columns of ddf1.
e1 = pd.DataFrame(list(ddf1.apply(calc, axis=0).values))

print("measure results for each g-value")
e1

In [None]:
# Row p of e1 holds the metrics of the p-th g column of ddf1, so the position
# of the highest f1 is translated back into the actual g value through
# ddf1.columns. Using the position itself as g is only correct while the swept
# g values are exactly 0, 1, 2, ... with no column missing from the pivot
# (it would be wrong for a narrower sweep such as trie_g-2 .. trie_g+2).
best_pos = e1["f1"].values.argmax()
best_g = ddf1.columns[best_pos]
print("best-g", best_g)

In [None]:
# Generate a single additional pattern to serve as the new workflow and make
# sure it is not one of the existing patterns.
new_workflows = generator.generate_patterns(
    num_patterns=1,
    vocabulary=vocabulary,
    min_pattern_size=3,
    max_pattern_size=7,
)
new_workflow = new_workflows[0]
assert new_workflow not in patterns

# Recompute the trie g value with the new workflow included and raise best_g
# to it when it is larger.
trie_g2 = trie.calc_g_value(patterns + [new_workflow])
print("trie-g2", trie_g2)
best_g = max(best_g, trie_g2)

In [None]:
# Build 20 sessions in which the new workflow is embedded between freshly
# generated, anomaly-free context (text_size 2 before, text_size 1 after).
# Each session is stored as (text, marks) with the marks aligned to the text:
# the generator's own marks for the context and 1 for every token of the new
# workflow.
# NOTE(review): the meaning of a mark value of 1 is defined by the generator /
# model code — confirm it is the intended label for the new workflow.
#
# Fix: the marks used to be assembled from the *text* pieces (a copy-paste of
# the line above), so the generated marks were never used.
workflow_marks = [1]*len(new_workflow)
new = dict()
for i in range(20): # this new workflow will typically appear in many different contexts
    before_text, before_marks = generator.generate_text(patterns, text_size=2, anomaly_ratio=0.00, vocabulary=vocabulary)
    after_text, after_marks = generator.generate_text(patterns, text_size=1, anomaly_ratio=0.00, vocabulary=vocabulary)
    new_text = before_text + list(new_workflow) + after_text
    new_marks = list(before_marks) + workflow_marks + list(after_marks)
    new[i] = (new_text, new_marks)

# Outcome of monitoring each new-workflow session before any feedback training.
B = [
    deep_log_model.monitor_session(session_text, session_marks, g=best_g)
    for session_text, session_marks in new.values()
]
print("before", B)
    
# Feed each session back to the model, then immediately monitor that same
# session again; the order (train first, monitor second) is kept per session.
A1 = []
for session_text, session_marks in new.values():
    deep_log_model.train_feedback(session_text, session_marks, g=best_g)
    outcome = deep_log_model.monitor_session(session_text, session_marks, g=best_g)
    A1.append(outcome)
print("after", A1)

# Monitor every session once more after feedback has been given on all of them.
A = [
    deep_log_model.monitor_session(session_text, session_marks, g=best_g)
    for session_text, session_marks in new.values()
]

print("after all", A)

In [None]:
# Repeat the sweep over the same tests with the updated model: one
# (test id, g, outcome) triple per test session and per g in 0 .. vocabulary_size.
# A narrower sweep around best_g (best_g-2 .. best_g+2) can be used instead.
entries = [
    (test_id, g, deep_log_model.monitor_session(session_text, session_marks, g=g))
    for test_id, (session_text, session_marks, _anomaly) in tests_legit.items()
    for g in range(vocabulary_size + 1)
]

In [None]:
# Same reshaping as for the first evaluation: long (i, g, status) rows pivoted
# into a tests-by-g table of outcomes.
df2 = pd.DataFrame(entries, columns=["i", "g", "status"])
ddf2 = df2.pivot_table(
    index="i",
    columns="g",
    values="status",
    aggfunc="sum",
)
def calc(s):
    """Turn a sequence of "TP"/"TN"/"FP"/"FN" labels into evaluation metrics.

    The labels in ``s`` are tallied (anything else is ignored) and precision,
    recall, accuracy and F1 are computed from the tallies, with an epsilon in
    each denominator to keep the divisions defined when a tally is zero.

    Returns a dict keyed prec, rec, acc, f1, TP, TN, FP, FN (in that order).
    """
    tally = collections.Counter(s)
    TP = tally["TP"]
    TN = tally["TN"]
    FP = tally["FP"]
    FN = tally["FN"]
    eps = 1e-9

    total = TP + TN + FP + FN
    prec = TP / (TP + FP + eps)
    rec = TP / (TP + FN + eps)
    acc = (TP + TN) / (total + eps)
    f1 = 2 * (prec * rec) / (prec + rec + eps)

    return dict(prec=prec, rec=rec, acc=acc, f1=f1, TP=TP, TN=TN, FP=FP, FN=FN)
    
# Score every g column of the second evaluation with calc and collect the
# metric dicts into a table, one row per g column.
e = ddf2.apply(calc, axis=0)
e2 = pd.DataFrame([metrics for metrics in e.values])

print("measure results for each g-value")
e2

In [None]:
# Tag the metric columns with the evaluation they belong to ("1-" for the
# table computed before feedback training, "2-" for the one computed after)
# so that both tables can be placed side by side.
# The prefix is only added when it is missing: the rename happens in place, so
# without the guard re-running this cell would produce "1-1-f1" / "2-2-f1" and
# break the later "1-f1" / "2-f1" lookups.
e1.columns = [c if c.startswith("1-") else "1-" + c for c in e1.columns]
e2.columns = [c if c.startswith("2-") else "2-" + c for c in e2.columns]

In [None]:
# Place both metric tables side by side and add the f1 change
# (second evaluation minus first) as an extra column.
df_1_2 = pd.concat([e1, e2], axis=1)
df_1_2["diff-f1"] = df_1_2["2-f1"].sub(df_1_2["1-f1"])

In [None]:
# Display the combined table: "1-" and "2-" metric columns plus "diff-f1".
df_1_2