In [1]:
import time
# Display the current UTC time (presumably kept as a record of when the notebook was run).
time.gmtime()

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=22, tm_hour=12, tm_min=43, tm_sec=42, tm_wday=6, tm_yday=234, tm_isdst=0)

In [2]:
import os,sys,inspect
# Put the parent of the notebook's directory on sys.path so the `_code` package
# imported by the following cells can be found (presumably it lives there).
#
# The directory is taken from the kernel's working directory rather than from
# inspect.getfile(inspect.currentframe()): inside a notebook cell the frame's
# "file" is a synthetic per-cell name, and on recent ipykernel versions it is a
# path under a temporary directory, which would make `parentdir` point at the
# wrong place.
# NOTE(review): assumes the kernel was started in the notebook's own directory
# (the Jupyter default) — confirm if the notebook is launched differently.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guard so that re-running this cell does not pile up duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [3]:
import pandas as pd
import itertools

from _code import DeepLogModel
from _code import generator
from _code import trie
from _code import seed

In [4]:
# Fix the seed to 0 through the project's `_code.seed` helper.
# NOTE(review): presumably this seeds every RNG used by the generator and the
# model so that the outputs below are repeatable — confirm in `_code.seed`.
seed.seed(0)

In [5]:
# Base vocabulary of 15 generated symbols and 100 patterns of length 3..7 over it.
vocabulary_size = 15
vocabulary = generator.make_vocabulary(vocabulary_size=vocabulary_size)
patterns = generator.generate_patterns(
    num_patterns=100,
    vocabulary=vocabulary,
    min_pattern_size=3,
    max_pattern_size=7,
)

# Two hand-written workflows that share the sub-sequence (c, d, e); the
# experiment below relies on this overlap.
p_add = [("a","b","c","d","e"), ("c","d","e","f","g")]

# Extend the vocabulary with every symbol used by the hand-written workflows
# and recompute its size from the merged set.
vocabulary_add = set(itertools.chain.from_iterable(p_add))
vocabulary = set(vocabulary) | vocabulary_add
vocabulary_size = len(vocabulary)

# The hand-written workflows are appended after the generated patterns.
patterns = patterns + p_add

vocabulary_size

22

In [6]:
# Training text generated from `patterns` with the anomaly ratio set to zero.
text_train, marks_train = generator.generate_text(
    patterns,
    text_size=10000,
    anomaly_ratio=0.00,
    vocabulary=vocabulary,
)


In [7]:
# NOTE(review): `h` looks like the history length and `n` the number of
# distinct symbols — confirm against DeepLogModel.
deep_log_model = DeepLogModel.DeepLogModel(
    h=4,
    n=vocabulary_size,
    vocabulary=vocabulary,
)
# For this example we do not want to over-fit the network.
deep_log_model.build(num_lstm_layers=2, lstm_size=50)
deep_log_model.fit(text_train, epochs=1)

<_code.DeepLogModel.HistoryLoss at 0x23b2045b0a0>

## README
1. We train the network on a large corpus of workflows without over-fitting it.
   (An over-fitted network has a different set of problems: in that state our LSTM is effectively just an n-gram model with n=history_size.)
2. Two of the workflows in the training text are (a,b,c,d,e) and (c,d,e,f,g).
3. We inject the anomalous text (a,b,c,d,e,f,g) inside a legitimate context
4. An LSTM predictor that accepts the top-g predicted letters fails to alert on this anomaly.
5. This is because the LSTM gives a fairly high probability to the letter f, since its closest context (c,d,e) predicts it, even though (a,b) come before that context.

In [8]:
# Injected sequence: the first five symbols are marked 1, the trailing
# "f", "g" are marked 0.
text_inject = ("a", "b", "c", "d", "e", "f", "g")
marks_inject = [1] * 5 + [0] * 2

# Surround the injected sequence with two generated patterns, every symbol of
# which is marked 1.
prefix, suffix = patterns[1], patterns[2]
text_test_anomaly = prefix + text_inject + suffix
print(text_test_anomaly)
marks_test_anomaly = [1] * len(prefix) + marks_inject + [1] * len(suffix)

# A single generated test with the anomaly ratio set to zero.
tests_legit = generator.generate_tests(
    patterns,
    vocabulary,
    n=1,
    text_size=300,
    anomaly_ratio=0.00,
)
text_test_legit, marks_test_legit, anomaly = tests_legit[0]

# Monitor both sessions for every g from 0 through vocabulary_size inclusive
# and collect one (g, legit result, anomaly result) row per value.
entries = []
for g in range(vocabulary_size + 1):
    res_legit = deep_log_model.monitor_session(text_test_legit, marks_test_legit, g=g)
    res_anomaly = deep_log_model.monitor_session(text_test_anomaly, marks_test_anomaly, g=g)
    entries.append((g, res_legit, res_anomaly))

df = pd.DataFrame(entries, columns=["g", "without_anomaly", "with_anomaly"])



('s05', 's02', 's04', 's07', 's06', 's08', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 's12', 's10', 's01')


In [9]:
# Row-wise minimum of the two outcome labels. The labels are strings, so the
# minimum is lexicographic: "FN"/"FP" sort before "TN"/"TP", which means an
# error label in either column becomes the row's total.
outcome_columns = ["without_anomaly", "with_anomaly"]
df["total"] = df[outcome_columns].min(axis=1)
def color(x):
    """Return the CSS background style for a single outcome label.

    Parameters
    ----------
    x : str
        One of "FP", "FN", "TP" or "TN".

    Returns
    -------
    str
        "background-color: red" for "FP"/"FN" and
        "background-color: green" for "TP"/"TN".

    Raises
    ------
    ValueError
        If ``x`` is not one of the four labels; the message includes the
        offending value so a bad cell is easy to track down.
    """
    if x in {"FP", "FN"}:
        return "background-color: red"
    if x in {"TP", "TN"}:
        return "background-color: green"
    raise ValueError(f"unexpected outcome label: {x!r}")
        
# Colour each cell of the "total" column with `color`.
# Styler.applymap was deprecated in favour of Styler.map in pandas 2.1, so use
# whichever method this pandas version provides.
total_styler = df.style
style_cells = total_styler.map if hasattr(total_styler, "map") else total_styler.applymap
style_cells(color, subset=["total"])

Unnamed: 0,g,without_anomaly,with_anomaly,total
0,0,FP,TP,FP
1,1,FP,TP,FP
2,2,FP,TP,FP
3,3,FP,TP,FP
4,4,FP,TP,FP
5,5,FP,TP,FP
6,6,FP,TP,FP
7,7,FP,TP,FP
8,8,FP,TP,FP
9,9,FP,TP,FP
