In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import argparse
import numpy as np
import pandas as pd
import random
from importlib import reload  
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.utils import shuffle

from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [4]:
# Dataset location settings; all three values are passed to dataloader.load_data().
# NOTE: `ouput_dir` keeps its existing spelling because the data-loading cell
# refers to it by exactly this name.
log_file = "BGL.log"
middle_dir = ""
ouput_dir = "../output/bgl/"

<!-- # Produce event templates from train test dataset -->

# Split the data into train and test sets

In [9]:
# Load the train and test splits; each split is unpacked as a (features, labels) pair.
(x_train, y_train), (x_test, y_test) = dataloader.load_data(
    ouput_dir,
    middle_dir,
    log_file,
    is_mapping=True,
)

  train = np.array(train).reshape(-1,1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


Train normal size: 13718
Train abnormal size: 1207
Total logkey(exclude 0:UNK) 1000
Test normal size: 20579
Test abnormal size: 1811
num_unk_event in test data: 0


In [10]:
# Convert the loaded splits into feature matrices with a single FeatureExtractor:
# fit_transform() is called on the training split, and the same instance then
# transform()s the test split.
feature_extractor = preprocessing.FeatureExtractor()
# NOTE: x_train / x_test are overwritten with their transformed versions, so this
# cell is not idempotent -- re-run the data-loading cell before re-running this one.
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

Train data shape: 14925-by-832

Test data shape: 22390-by-832



In [12]:
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 5
Project matrix shape: 832-by-832
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 1193, FP: 11915, TN: 1803, FN: 14
Precision: 9.101%, recall: 98.840%, F1-measure: 16.668%

Test validation:
Confusion Matrix: TP: 1777, FP: 17824, TN: 2755, FN: 34
Precision: 9.066%, recall: 98.123%, F1-measure: 16.598%

CPU times: user 16.9 s, sys: 66.9 ms, total: 17 s
Wall time: 1.73 s


In [13]:
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 174, FP: 0, TN: 13718, FN: 1033
Precision: 100.000, recall: 14.416, F1-measure: 25.199

Test validation:
Confusion Matrix: TP: 258, FP: 0, TN: 20579, FN: 1553
Precision: 100.000, recall: 14.246, F1-measure: 24.940

CPU times: user 17.2 s, sys: 2.96 s, total: 20.2 s
Wall time: 18.4 s


In [14]:
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 152, FP: 13718, TN: 0, FN: 1055
Precision: 1.096, recall: 12.593, F1-measure: 2.016

Test validation:
Confusion Matrix: TP: 227, FP: 20579, TN: 0, FN: 1584
Precision: 1.091, recall: 12.534, F1-measure: 2.007

CPU times: user 6min 39s, sys: 69.4 ms, total: 6min 39s
Wall time: 6min 39s


In [17]:
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0.3  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)


Starting offline clustering...
Processed 1000 instances.
Found 92 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 6000 instances.
Processed 8000 instances.
Processed 10000 instances.
Processed 12000 instances.
Processed 13718 instances.
Found 172 clusters online.

Train validation:
Confusion Matrix: TP: 775, FP: 1, TN: 13717, FN: 432
Precision: 99.871, recall: 64.209, F1-measure: 78.164

Test validation:
Confusion Matrix: TP: 1215, FP: 64, TN: 20515, FN: 596
Precision: 94.996, recall: 67.090, F1-measure: 78.641

CPU times: user 1min 42s, sys: 28.1 ms, total: 1min 42s
Wall time: 1min 42s
