## Prepare Training Dataset

In [1]:
import pandas as pd
import json
import os
from collections import Counter

In [None]:
# === PRE-PROCESS CONFIGURATION ===
# Directory prefix under which the model, vocabulary and training files are written.
OUTPUT_DIR: str = "out/"
# Stored as options["seq_len"] and also passed to Processor.process().
SEQUENCE_LENGTH: int = 20
# Raw input CSV; it must provide the _value and _time columns.
INPUT_CSV: str = "path/to/raw/in.csv"

import argparse
import torch

from bert_pytorch.dataset import WordVocab
from bert_pytorch import Predictor, Trainer, Processor
from bert_pytorch.dataset.utils import seed_everything

# Shared settings dictionary; the same object is handed to Processor, Trainer
# and Predictor further down in the notebook.
options = {}
options["device"] = "cuda" if torch.cuda.is_available() else "cpu"

# --- output locations ---
# os.path.join(path, "") appends a separator only when one is missing, so the
# directory entries always end with a separator no matter how OUTPUT_DIR was
# typed ("out" and "out/" both work).
options["output_dir"] = os.path.join(OUTPUT_DIR, "")
options["model_dir"] = os.path.join(options["output_dir"], "bert", "")
options["model_path"] = os.path.join(options["model_dir"], "best_bert.pth")
options["train_vocab"] = os.path.join(options["output_dir"], "train")
options["vocab_path"] = os.path.join(options["output_dir"], "vocab.pkl")  # pickle file

# NOTE(review): these two paths are hard-coded under "out/" and do not follow
# OUTPUT_DIR — confirm whether that is intended.
options["drain_config"] = "out/drain/drain3.ini"
options["drain_state"] = "out/drain/concept_evaluation.bin"

# --- sequence / windowing ---
options["window_size"] = 128
options["adaptive_window"] = True
options["seq_len"] = SEQUENCE_LENGTH
options["max_len"] = 512  # for position embedding
options["min_len"] = 10
options["mask_ratio"] = 0.85
# sample ratio (the three values add up to 1.0)
options["train_ratio"] = 0.7
options["test_ratio"] = 0.25
options["valid_ratio"] = 0.05

# --- features ---
options["is_logkey"] = True
options["is_time"] = False

options["hypersphere_loss"] = True
options["hypersphere_loss_test"] = False

options["scale"] = None  # MinMaxScaler()
options["scale_path"] = os.path.join(options["model_dir"], "scale.pkl")

# --- model ---
options["hidden"] = 256  # embedding size
options["layers"] = 4
options["attn_heads"] = 4

# --- training ---
options["epochs"] = 100
options["n_epochs_stop"] = 10
options["batch_size"] = 32

options["corpus_lines"] = None
options["on_memory"] = True
options["num_workers"] = 5
options["lr"] = 1e-3
options["adam_beta1"] = 0.9
options["adam_beta2"] = 0.999
options["adam_weight_decay"] = 0.00
# NOTE(review): set unconditionally, unlike options["device"] above — confirm
# the trainer copes with a CPU-only host.
options["with_cuda"] = True
options["cuda_devices"] = None
options["log_freq"] = None

# --- predict ---
options["num_candidates"] = 6
options["gaussian_mean"] = 0
options["gaussian_std"] = 1

seed_everything(seed=1234)

# exist_ok=True already tolerates an existing directory, so no prior
# os.path.exists() check is needed (and none can race with another process).
os.makedirs(options["model_dir"], exist_ok=True)

print("device", options["device"])
print("features logkey:{} time: {}\n".format(options["is_logkey"], options["is_time"]))
print("mask ratio", options["mask_ratio"])

device cuda
features logkey:True time: False

mask ratio 0.65


In [3]:
def process():
    """Preprocess INPUT_CSV, then run sequence processing on it.

    A fresh Processor is built from the shared ``options``; ``preprocess`` is
    called with INPUT_CSV, followed by ``process`` with SEQUENCE_LENGTH.
    """
    pipeline = Processor(options)
    pipeline.preprocess(INPUT_CSV)
    pipeline.process(SEQUENCE_LENGTH)

def train():
    """Build a Trainer from the shared ``options`` and call its ``train()``."""
    trainer = Trainer(options)
    trainer.train()

def predict(mean=0, std=1):
    """Store the Gaussian parameters in ``options`` and run a Predictor.

    Args:
        mean: Value written to ``options["gaussian_mean"]``.
        std: Value written to ``options["gaussian_std"]``.

    The shared ``options`` dictionary is modified in place, so the new values
    remain set after the call. Nothing is returned.
    """
    options.update(gaussian_mean=mean, gaussian_std=std)

    runner = Predictor(options)
    runner.predict()

def vocab(vocab_size=None, encoding="utf-8", min_freq=1):
    """Build a WordVocab from the training file and save it to disk.

    Every line of ``options["train_vocab"]`` is read and passed to WordVocab;
    the result is written to ``options["vocab_path"]``.

    Args:
        vocab_size: Forwarded to WordVocab as ``max_size``.
        encoding: Text encoding used to open the training file.
        min_freq: Forwarded to WordVocab as ``min_freq``.
    """
    with open(options["train_vocab"], "r", encoding=encoding) as f:
        texts = f.readlines()

    # Report only the size: echoing every line of the corpus floods the
    # notebook output on any realistically sized training file.
    print("vocab source lines:", len(texts))

    # Named word_vocab so the local does not shadow this function's own name.
    word_vocab = WordVocab(texts, max_size=vocab_size, min_freq=min_freq)
    word_vocab.save_vocab(options["vocab_path"])

In [52]:
# Long-lived helpers for the interactive cells below; both are built from the
# same shared options dictionary (Processor first, then Predictor).
processor, predictor = Processor(options), Predictor(options)

In [80]:
# Load the extracted CSV and keep the _value column as a plain Python list.
df = pd.read_csv("out/data/extract_ApplicationServer_nlxdsmcv39.csv")

logs = df["_value"].tolist()

In [83]:
# Convert a 100-message slice (indices 10000-10099) to keys and join them with
# single spaces; the bare name on the last line displays the result.
logseq = ' '.join(processor.log_messages_to_keys(logs[10000:10100]))
logseq

'E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E28 E28 E4 E4 E4 E28 E4 E4 E4 E9 E4 E4 E4 E4 E4 E4 E4 E4 E9 E9 E8 E9 E4 E4 E9 E4 E9 E9 E4 E4 E4 E9 E49 E4 E4 E4 E8 E8 E4 E8 E9 E9 E55 E8 E9 E78 E4 E78 E4 E72 E78 E4 E72 E78 E4 E72 E78 E4 E72 E78 E4 E72 E9 E4 E4 E4 E4 E4 E49 E41 E8 E8 E41 E9 E9 E8 E4 E9 E4'

In [98]:
# Run the predictor on the space-separated key sequence built above; the
# returned value is displayed as the cell output.
predictor.predict_single_sequence(logseq)

{'undetected_tokens': 2,
 'masked_tokens': 15,
 'anomaly': False,
 'predictions': [],
 'true_labels': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'hypersphere_dist': 6.0762834548950195,
 'deepSVDD_label': 1}