In [None]:
# Show the current UTC time (a time.struct_time) as this cell's output.
import time
time.gmtime()

In [2]:
import os,sys,inspect
# Make the parent of the notebook's folder importable so that the
# `from _code import ...` lines in the next cell resolve.
#
# The notebook folder is taken from the kernel's working directory instead of
# inspect.getfile(inspect.currentframe()): inside a kernel the "file" of a cell
# is a synthetic name (recent ipykernel versions place it in a temporary
# directory), so paths derived from it can point at the wrong folder.
# Assumes the kernel was started in the notebook's own folder (Jupyter's default).
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

In [3]:
import pandas as pd
import itertools
import collections
import numpy as np

from _code import DeepLogModel
from _code import generator
from _code import trie
from _code import seed

In [4]:
# Seed the project's RNG helper with a fixed value.
# NOTE(review): presumably this makes generation/training repeatable; which RNGs
# it covers is defined in _code/seed.py — confirm.
seed.seed(42)

In [5]:
# Experiment size: a vocabulary of 40 symbols and 20 generated patterns, each
# requested with a size between min_pattern_size=3 and max_pattern_size=7.
# NOTE(review): whether the size bounds are inclusive is decided inside
# generator.generate_patterns — confirm.
vocabulary_size = 40
num_patterns = 20
vocabulary = generator.make_vocabulary(vocabulary_size=vocabulary_size)
patterns = generator.generate_patterns(num_patterns=num_patterns, vocabulary=vocabulary, min_pattern_size=3, max_pattern_size=7)


In [6]:
# Compute the two trie-derived values of the pattern set and print them.
# trie_h sizes the model below (h=trie_h+1); trie_g is later compared against
# the empirically best g.
trie_g = trie.calc_g_value(patterns)
trie_h = trie.calc_h_value(patterns)
print("trie-g:", trie_g)
print("trie-h:", trie_h)


trie-g: 13
trie-h: 3


In [7]:
# Training data: text generated from the patterns with text_size=50000 and no
# anomalies requested (anomaly_ratio=0.00).
text_train, marks_train = generator.generate_text(patterns, text_size=50000, anomaly_ratio=0.00, vocabulary=vocabulary) 

In [8]:
# Create the model with h=trie_h+1 and n=vocabulary_size, build it with 2 LSTM
# layers of size 50, and train it on the training text for a single epoch.
# NOTE(review): h is presumably the history/window length — confirm in
# _code/DeepLogModel.py.
# `fit` is the last expression, so its return value is shown as the cell output.
deep_log_model = DeepLogModel.DeepLogModel(h=trie_h+1, n=vocabulary_size, vocabulary=vocabulary)
deep_log_model.build(num_lstm_layers=2, lstm_size=50)
deep_log_model.fit(text_train,epochs=1)

<_code.DeepLogModel.HistoryLoss at 0x1bcee0f1bb0>

In [9]:
# Generate n=200 test sessions (text_size=4, anomaly_ratio=0.1). The result is a
# mapping whose values are (text, marks, anomaly) triples, as unpacked in the
# evaluation loop.
tests_legit = generator.generate_tests(patterns, vocabulary, n=200, text_size = 4, anomaly_ratio=0.1)

In [10]:
# Score every test session at every g in 0..vocabulary_size (inclusive) and
# collect one (test key, g, outcome label) tuple per combination.
# This is the expensive cell: the recorded run took roughly 1250 seconds.
b = time.time()
entries = list()
for k, (text_test, text_marks, anomaly) in tests_legit.items():
    for g in range(0, vocabulary_size+1):
        res = deep_log_model.monitor_session(text_test, text_marks, g=g)
        entry = (k,g,res)
        entries.append(entry)
e = time.time()
print("time:", round(e-b,3), "seconds")

time: 1247.892 seconds


In [11]:
# Long table (one row per test/g pair) -> wide table with one row per test id
# and one column per g, holding the outcome label.
# NOTE(review): aggfunc="sum" on string labels only reproduces the label because
# each (i, g) pair occurs exactly once in `entries`; duplicated pairs would
# presumably be concatenated — confirm this is acceptable.
df = pd.DataFrame(entries, columns = ["i","g","status"])
ddf = df.pivot_table(index="i", columns = "g", values = "status", aggfunc = "sum")
def calc(s):
    """Summarise a sequence of outcome labels as classification metrics.

    Parameters
    ----------
    s : iterable
        Outcome labels. Only "TP", "TN", "FP" and "FN" are counted; any other
        value is ignored.

    Returns
    -------
    dict
        Keys "prec", "rec", "acc", "f1" (floats) followed by the raw counts
        "TP", "TN", "FP", "FN". A small epsilon is added to every denominator,
        so empty or one-sided inputs yield 0.0 instead of a division error.
    """
    eps = 1e-9
    tally = collections.Counter(s)
    TP, TN, FP, FN = (tally.get(label, 0) for label in ("TP", "TN", "FP", "FN"))

    prec = TP / (TP + FP + eps)
    rec = TP / (TP + FN + eps)
    acc = (TP + TN) / (TP + TN + FP + FN + eps)
    f1 = 2*(prec*rec)/(prec+rec+eps)

    # Ratios first, raw counts after: this key order becomes the column order
    # of any DataFrame built from these dicts.
    metrics = {"prec" : prec, "rec" : rec, "acc" : acc, "f1" : f1}
    metrics.update(TP=TP, TN=TN, FP=FP, FN=FN)
    return metrics
    
# Apply `calc` to every column of `ddf` (one column per g), then turn the
# per-column dicts into a metrics table.
# NOTE: `e` held the end timestamp of the timing cell; it is rebound here.
e=ddf.apply(calc, axis=0)
# Rebuilding from a plain list discards the g labels, so rows get a default
# 0..N-1 index; row i matches g == i only because the g values start at 0 and
# are contiguous.
e1=pd.DataFrame(list(e.values))

print("measure results for each g-value")
e1

measure results for each g-value


Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
0,0.285,1.0,0.285,0.44358,57,0,143,0
1,0.285,1.0,0.285,0.44358,57,0,143,0
2,0.28934,1.0,0.3,0.448819,57,3,140,0
3,0.301587,1.0,0.34,0.463415,57,11,132,0
4,0.307692,0.982456,0.365,0.468619,56,17,126,1
5,0.327381,0.964912,0.425,0.488889,55,30,113,2
6,0.357143,0.964912,0.495,0.521327,55,44,99,2
7,0.381944,0.964912,0.545,0.547264,55,54,89,2
8,0.407407,0.964912,0.59,0.572917,55,63,80,2
9,0.44,0.964912,0.64,0.604396,55,73,70,2


In [12]:
# Pick the g with the highest F1 and show its metrics next to the trie-derived g.
# `argmax` returns a row position; it is used below both as a row label (`.loc`)
# and as the g value itself, which relies on e1 having a default 0..N-1 index
# whose i-th row corresponds to g == i.
best_g = e1["f1"].argmax()
print("empirical best g:", best_g)
print("trie g:", trie_g)
e1.loc[[best_g]]

empirical best g: 13
trie g: 13


Unnamed: 0,prec,rec,acc,f1,TP,TN,FP,FN
13,1.0,0.894737,0.97,0.944444,51,143,0,6


## README
1. First we train the network and determine the best g-value (the one with the highest F1 score).  
2. We inject random letters, each of which is within the model's top-g predictions:  
    2.1. We generate 2 legitimate workflows.  
    2.2. We inject <15> consecutive letters, each of them within top-g.  
    2.3. We try to finish with more legitimate workflows.  
        2.3.1. Until we can inject a legitimate workflow without being detected, we continue to inject random letters within top-g.  
3. The result is a text with at least <15> close-to-random letters inside it, on which the network did not raise an alert.
4. That is an injection attack.

In [13]:
# Starting point of the attack: a short generated text with text_size=2 and no
# anomalies requested (anomaly_ratio=0.00).
text, marks = generator.generate_text(patterns, text_size=2, anomaly_ratio=0.00, vocabulary=vocabulary) 
def choose_next(topg_letters, injection_method):
    """Pick the next letter to inject from a set of candidate letters.

    Parameters
    ----------
    topg_letters : iterable
        Candidate letters to choose from.
    injection_method : str
        Selection strategy. Only "random" is supported: a uniform draw made
        with ``np.random.choice``.

    Returns
    -------
    One element of ``topg_letters``.

    Raises
    ------
    ValueError
        If ``injection_method`` is not a supported strategy.
    """
    if injection_method == "random":
        return np.random.choice(list(topg_letters))
    # The exception name used to be misspelled, so an unsupported method
    # surfaced as a NameError instead of the intended ValueError.
    raise ValueError("unsupported injection_method: %r" % (injection_method,))

        
def inject(text, n, injection_method):
    """Append ``n`` letters to ``text``, each chosen among the model's top-g predictions.

    Uses the notebook-level ``deep_log_model`` and ``best_g``. For every
    injected letter the current text is extended with a placeholder, the model's
    predictions are fetched, and the next letter is chosen (via ``choose_next``)
    from the ``best_g`` highest-scored entries of the last prediction row.

    Parameters
    ----------
    text : list
        Current sequence of letters. It is not modified in place.
    n : int
        Number of letters to append. ``n <= 0`` returns ``text`` unchanged.
    injection_method : str
        Forwarded to ``choose_next``.

    Returns
    -------
    list
        ``text`` itself when nothing is injected, otherwise a new, longer list.

    Raises
    ------
    ValueError
        If letters must be injected but ``best_g`` is not positive.
    """
    # Guard first: no model call (and no text[-1] lookup) when there is nothing
    # to inject, and a negative n can no longer loop forever.
    if n <= 0:
        return text
    # With g == 0 the slice [-0:] would select *every* letter instead of none.
    if best_g <= 0:
        raise ValueError("best_g must be positive to pick top-g letters, got %r" % (best_g,))

    # Iterative rather than recursive, so large n cannot hit the recursion limit.
    for _ in range(n):
        text_ = text + [text[-1]] # placeholder
        marks = [1]*len(text_)
        df_pred, df_marks = deep_log_model.get_df_pred(text_, marks)
        # Last row: presumably the prediction for the placeholder position — TODO confirm.
        # Ascending sort, so the final best_g labels are the highest-scored ones.
        topg_letters = df_pred.iloc[-1].sort_values()[-best_g:].index
        next_letter = choose_next(topg_letters, injection_method = injection_method)
        text = text + [next_letter]
    return text

# Sanity check: before any injection the generated text must be classified "TN"
# at best_g.
before = deep_log_model.monitor_session(text, marks, best_g)
print(len(text), before)
assert before == "TN"
print(text, marks)

# Append 15 letters, each drawn at random from the model's top-g predictions,
# and require that the session is still not flagged ("FN").
text = inject(text, 15, injection_method = "random")
marks = [0]*len(text) # there is an anomaly here
after1 = deep_log_model.monitor_session(text, marks, best_g)
assert after1 == "FN" # no alert for our injection so far
print(len(text), after1)
print(text)

7 TN
['s28', 's14', 's00', 's20', 's32', 's11', 's21'] [1, 2, 3, 1, 2, 3, 4]
22 FN
['s28', 's14', 's00', 's20', 's32', 's11', 's21', 's04', 's10', 's11', 's17', 's28', 's14', 's13', 's07', 's19', 's39', 's28', 's08', 's33', 's00', 's39']


In [14]:

# now we want to go back to safety and complete our injection with normal WFs without triggering an alert

def inject2(text, patterns):
    """Try to continue ``text`` with one legitimate pattern without raising an alert.

    Patterns are tried in order. The first one for which
    ``deep_log_model.monitor_session`` (all-zero marks, ``g=best_g``) returns
    "FN" is appended. If no pattern qualifies, one more top-g letter is
    injected instead so that a later attempt starts from a different context.

    Parameters
    ----------
    text : list
        Current sequence of letters. It is not modified in place.
    patterns : iterable
        Candidate legitimate patterns; each is converted with ``list(p)``.

    Returns
    -------
    tuple
        ``(text, marks, success)`` — the extended text, all-zero marks of the
        same length, and whether a legitimate pattern was appended.
    """
    for p in patterns:
        text_ = text + list(p)
        marks_ = [0]*len(text_)
        res = deep_log_model.monitor_session(text_, marks_, g=best_g)
        if res == "FN":
            return text_, marks_, True

    # No pattern fits yet: add a single top-g letter. The method must be the
    # string "random"; the bare name `random` is undefined in this notebook and
    # made this fallback fail with a NameError.
    text = inject(text, 1, injection_method="random")
    marks = [0]*len(text)
    return text, marks, False
    

def inject2_aux(text, n, patterns):
    """Call ``inject2`` up to ``n`` times until a legitimate pattern is appended.

    Each attempt continues from the text returned by the previous one.

    Returns
    -------
    tuple
        ``(text, success)`` from the first successful attempt, or
        ``(None, False)`` when all ``n`` attempts fail.
    """
    current = text
    for _attempt in range(n):
        current, _marks, appended = inject2(current, patterns)
        if not appended:
            continue
        return current, appended

    # Every attempt failed: the partially extended text is dropped.
    return None, False

def add_workflows(text, patterns, n):
    """Return ``text`` followed by freshly generated text of size ``n``.

    The extra text comes from ``generator.generate_text`` with
    ``anomaly_ratio=0.00`` and the notebook-level ``vocabulary``; the marks it
    returns are discarded. ``text`` itself is not modified.
    """
    generated = generator.generate_text(patterns, text_size=n, anomaly_ratio=0.00, vocabulary=vocabulary)
    extra_text, _extra_marks = generated
    return text + extra_text
    
    
# Allow up to 10 attempts to continue with a legitimate pattern undetected, then
# append generated text of size 3.
# NOTE: `text` is rebound from the previous cell's result, so re-running this
# cell on its own keeps extending the same text.
text, success = inject2_aux(text, 10, patterns)
assert success
text = add_workflows(text, patterns, 3)

# The whole session, injected letters included, must still pass without an alert.
marks = [0]*len(text) # there was an anomaly here
after2 = deep_log_model.monitor_session(text, marks, best_g)
assert after2 == "FN" # no alert for our injection so far
print(len(text), after2)
print(text)

38 FN
['s28', 's14', 's00', 's20', 's32', 's11', 's21', 's04', 's10', 's11', 's17', 's28', 's14', 's13', 's07', 's19', 's39', 's28', 's08', 's33', 's00', 's39', 's28', 's14', 's07', 's20', 's38', 's18', 's07', 's23', 's10', 's03', 's28', 's17', 's25', 's28', 's14', 's00']
