In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import argparse
import numpy as np
import random
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.utils import shuffle

from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [7]:
# Location of the parsed Thunderbird log; the three values are handed to the loader as-is.
# NOTE(review): the misspelled name `ouput_dir` is kept on purpose, since other
# cells may read it from the notebook namespace.
ouput_dir = "../output/tbird/"
middle_dir = ""
log_file = "Thunderbird_20M.log"

# Load both splits, then unpack each one into its features and labels.
train_split, test_split = dataloader.load_data(
    ouput_dir,
    middle_dir,
    log_file,
    is_mapping=True,
)
x_train, y_train = train_split
x_test, y_test = test_split

# Fit the extractor on the training split only and reuse it for the test split.
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

  train = np.array(train).reshape(-1,1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


Train normal size: 6000
Train abnormal size: 3000
Total logkey(exclude 0:UNK) 1086
Test normal size: 42396
Test abnormal size: 22311
num_unk_event in test data: 0
Train data shape: 9000-by-894

Test data shape: 64707-by-894



In [10]:
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 5
Project matrix shape: 894-by-894
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 3000, FP: 5996, TN: 4, FN: 0
Precision: 33.348%, recall: 100.000%, F1-measure: 50.017%

Test validation:
Confusion Matrix: TP: 22311, FP: 42332, TN: 64, FN: 0
Precision: 34.514%, recall: 100.000%, F1-measure: 51.317%

CPU times: user 39.9 s, sys: 404 ms, total: 40.3 s
Wall time: 4.04 s


In [11]:
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 69, FP: 135, TN: 5865, FN: 2931
Precision: 33.824, recall: 2.300, F1-measure: 4.307

Test validation:
Confusion Matrix: TP: 536, FP: 962, TN: 41434, FN: 21775
Precision: 35.781, recall: 2.402, F1-measure: 4.503

CPU times: user 27.9 s, sys: 6.25 s, total: 34.1 s
Wall time: 32.4 s


In [12]:
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 1265, FP: 6000, TN: 0, FN: 1735
Precision: 17.412, recall: 42.167, F1-measure: 24.647

Test validation:
Confusion Matrix: TP: 9313, FP: 42396, TN: 0, FN: 12998
Precision: 18.010, recall: 41.742, F1-measure: 25.163

CPU times: user 5min 30s, sys: 18.6 ms, total: 5min 30s
Wall time: 5min 30s


In [15]:
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0.3  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)


Starting offline clustering...
Processed 1000 instances.
Found 75 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 6000 instances.
Processed 6000 instances.
Found 105 clusters online.

Train validation:
Confusion Matrix: TP: 1005, FP: 1, TN: 5999, FN: 1995
Precision: 99.901, recall: 33.500, F1-measure: 50.175

Test validation:
Confusion Matrix: TP: 7202, FP: 169, TN: 42227, FN: 15109
Precision: 97.707, recall: 32.280, F1-measure: 48.528

CPU times: user 1min 45s, sys: 0 ns, total: 1min 45s
Wall time: 1min 45s
