In [20]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import svm

from loglizer.models import PCA, IsolationForest, LogClustering, OneClassSVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [18]:
# Load the pre-split HDFS data set and convert both splits into feature matrices.
output_dir = "../output/hdfs/"
ouput_dir = output_dir  # legacy misspelled name, kept so any cell still referencing it keeps working
(x_train, y_train), (x_test, y_test) = dataloader.load_data(data_dir=output_dir)
feature_extractor = preprocessing.FeatureExtractor()
# Fit the extractor on the training split only, then reuse the fitted
# extractor for the test split.
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)


  train = np.array(train).reshape(-1,1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


Train normal size: 5000
Train abnormal size: 2500
Test normal size: 553223
Test abnormal size: 14338
Train data shape: 7500-by-40

Test data shape: 567561-by-40



In [12]:
%%time
# PCA baseline: fit on the training features and report precision/recall/F1
# on both splits for every SPE threshold in the sweep.
print("="*20 + " Model: PCA " + "="*20)
# Thresholds to try. The single value 1 reproduces the original run; the
# previous loop iterated over np.arange(1) (i.e. 0) but always passed
# threshold=1 to the model, so the printed value did not match the one used.
spe_thresholds = [1]
for th in spe_thresholds:
    print("threshold", th)
    # NOTE(review): c_alpha is presumably only consulted when no explicit
    # threshold is given -- confirm against the PCA model implementation.
    model = PCA(n_components=0.8, threshold=th, c_alpha=1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    # precision/recall/f1 end up holding the test-split values.
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 2
Project matrix shape: 40-by-40
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 2500, FP: 2098, TN: 2902, FN: 0
Precision: 54.371%, recall: 100.000%, F1-measure: 70.442%

Test validation:
Confusion Matrix: TP: 14338, FP: 230566, TN: 322657, FN: 0
Precision: 5.854%, recall: 100.000%, F1-measure: 11.062%

CPU times: user 3.4 s, sys: 79.5 ms, total: 3.48 s
Wall time: 1.75 s


In [13]:
%%time
# Isolation Forest baseline: fit on the training features (labels are not
# passed to fit), then score the train and test splits in turn.
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination='auto',
    random_state=19,
)
model.fit(x_train)

for split_title, split_x, split_y in (
    ('Train validation:', x_train, y_train),
    ('Test validation:', x_test, y_test),
):
    print(split_title)
    # After the loop, precision/recall/f1 hold the test-split values.
    precision, recall, f1 = model.evaluate(split_x, split_y)

Train validation:
Confusion Matrix: TP: 1718, FP: 26, TN: 4974, FN: 782
Precision: 98.509, recall: 68.720, F1-measure: 80.961

Test validation:
Confusion Matrix: TP: 9786, FP: 2570, TN: 550653, FN: 4552
Precision: 79.200, recall: 68.252, F1-measure: 73.320

CPU times: user 15.5 s, sys: 3.4 s, total: 18.9 s
Wall time: 18.8 s


In [21]:
%%time
# One-class SVM baseline with an RBF kernel: fit, then score both splits.
# NOTE(review): the output recorded for this cell shows TN: 0 on both splits
# (every normal sample flagged as anomalous), which may indicate a label
# mapping problem inside the OneClassSVM wrapper -- verify before trusting
# these numbers.
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

for split_title, split_x, split_y in (
    ('Train validation:', x_train, y_train),
    ('Test validation:', x_test, y_test),
):
    print(split_title)
    # After the loop, precision/recall/f1 hold the test-split values.
    precision, recall, f1 = model.evaluate(split_x, split_y)

Train validation:
Confusion Matrix: TP: 469, FP: 5000, TN: 0, FN: 2031
Precision: 8.576, recall: 18.760, F1-measure: 11.771

Test validation:
Confusion Matrix: TP: 2769, FP: 553223, TN: 0, FN: 11569
Precision: 0.498, recall: 19.312, F1-measure: 0.971

CPU times: user 1min 50s, sys: 51.9 ms, total: 1min 50s
Wall time: 1min 50s


In [14]:
# Disabled experiment (kept commented out): 5-fold grid search over
# scikit-learn's svm.OneClassSVM hyperparameters (kernel, gamma, nu).
# NOTE(review): sklearn's OneClassSVM.predict returns +1/-1, while the labels
# in this notebook appear to be 0/1 (0 = normal, as used when selecting the
# LogClustering training samples), so scoring="f1_micro" would compare
# mismatched label sets -- confirm and remap before re-enabling.
# NOTE(review): the doubly commented lines at the bottom pass labels to
# model.predict and unpack three values; GridSearchCV.predict presumably
# takes only X and returns predictions -- verify before re-enabling.
# %%time
# print("="*20 + " Model: one class SVM " + "="*20)

# nus = [0.001, 0.01, 0.1, 1]
# gammas = [0.001, 0.01, 0.1, 1]
# tuned_parameters = {'kernel' : ['rbf','poly','linear','sigmoid'], 'gamma' : gammas, 'nu': nus}

# ocsvm = svm.OneClassSVM()
# model = GridSearchCV(ocsvm, tuned_parameters, cv=5, scoring="f1_micro")

# model.fit(x_train, y_train.astype(int))

# # print('Train validation:')
# # precision, recall, f1 = model.predict(x_train, y_train.astype(int))
# # print('Test validation:')
# # precision, recall, f1 = model.predict(x_test, y_test.astype(int))

CPU times: user 2min 50s, sys: 3.56 s, total: 2min 54s
Wall time: 2min 54s


GridSearchCV(cv=5, estimator=OneClassSVM(),
             param_grid={'gamma': [0.001, 0.01, 0.1, 1],
                         'kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
                         'nu': [0.001, 0.01, 0.1, 1]},
             scoring='f1_micro')

In [15]:
# Disabled evaluation (kept commented out): predicts on each split with the
# current `model` and scores the predictions with loglizer.utils.metrics,
# printing precision, recall and F1.
# NOTE(review): if re-enabled, this relies on whichever `model` was assigned
# by the most recently executed cell (hidden state) -- make the dependency
# explicit before running.
# print('Train validation:')
# y_eval = model.predict(x_train)
# precision, recall, f1 = metrics(y_eval, y_train)
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
    
# print('Test validation:')
# y_pred = model.predict(x_test)
# precision, recall, f1 = metrics(y_pred, y_test)
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))


Train validation:
Confusion Matrix: TP: 1543, FP: 5000, TN: 0, FN: 957
Precision: 23.582, recall: 61.720, F1-measure: 34.126

Test validation:
Confusion Matrix: TP: 9114, FP: 553223, TN: 0, FN: 5224
Precision: 1.621, recall: 63.565, F1-measure: 3.161



In [16]:
%%time
# LogClustering baseline: cluster the normal training samples only, then
# score the full train and test splits.
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # distance threshold at which the clustering process stops
anomaly_threshold = 0.3  # distance threshold used for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)

# Label 0 marks a normal sample; anomalies are excluded from fitting.
normal_mask = y_train == 0
model.fit(x_train[normal_mask, :])

for split_title, split_x, split_y in (
    ('Train validation:', x_train, y_train),
    ('Test validation:', x_test, y_test),
):
    print(split_title)
    # After the loop, precision/recall/f1 hold the test-split values.
    precision, recall, f1 = model.evaluate(split_x, split_y)




Starting offline clustering...
Processed 1000 instances.
Found 4 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 5000 instances.
Found 4 clusters online.

Train validation:
Confusion Matrix: TP: 960, FP: 0, TN: 5000, FN: 1540
Precision: 100.000, recall: 38.400, F1-measure: 55.491

Test validation:
Confusion Matrix: TP: 5251, FP: 40, TN: 553183, FN: 9087
Precision: 99.244, recall: 36.623, F1-measure: 53.502

CPU times: user 26.9 s, sys: 4.13 ms, total: 27 s
Wall time: 26.9 s
