In [None]:
import sys
import pandas as pd
import numpy as np
import timeit
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import datetime

from loglizer.loglizer.models.DeepLog import DeepLog
from loglizer.loglizer.models.PCA import PCA
from loglizer.loglizer.models.LOF import LocalOutlierFactor as LOF
from loglizer.loglizer.models.KNeighbors import KNeighbors as KNN
from loglizer.loglizer.models.LogClustering import LogClustering
from loglizer.loglizer.models.InvariantsMiner import InvariantsMiner
from loglizer.loglizer.models.IsolationForest import IsolationForest
from loglizer.loglizer.models import DeepLog
from loglizer.loglizer.preprocessing import Vectorizer, Iterator
from loglizer.loglizer import preprocessing
from loglizer.loglizer.dataloader import HDFS, BGL, Thunderbird


In [None]:
def loadDatasets():
    """Load the Thunderbird, HDFS and BGL structured logs as train/test splits.

    Returns:
        dict: maps the dataset name ('Thunderbird', 'HDFS', 'BGL', inserted in
        that order) to a dict with the keys 'x_train', 'y_train', 'x_test' and
        'y_test', holding whatever the corresponding loader returned.
    """
    def as_split_dict(loaded):
        # Flatten ((x_train, y_train), (x_test, y_test)) into a keyed dict.
        (train_x, train_y), (test_x, test_y) = loaded
        return {'x_train': train_x, 'y_train': train_y, 'x_test': test_x, 'y_test': test_y}

    loaded_sets = {}

    # Thunderbird: sliding windows (time_interval=1800, stepping_size=300;
    # presumably seconds - TODO confirm against the loader).
    print("load Thunderbird")
    loaded_sets['Thunderbird'] = as_split_dict(
        Thunderbird.loadDataset(
            'Drain_result/Thunderbird_10m.log_structured.csv',
            window='sliding',
            time_interval=3600*0.5,
            stepping_size=60*5,
            train_ratio=0.7
        )
    )

    # HDFS: session windows, labels read from a separate anomaly label file.
    print("load HDFS")
    struct_log = 'Drain_result/HDFS.log_structured.csv'
    labels = 'logs/HDFS/anomaly_label.csv'
    loaded_sets['HDFS'] = as_split_dict(
        HDFS.loadDataset(
            struct_log,
            label_file=labels,
            window='session',
            train_ratio=0.7,
            split_type='uniform'
        )
    )

    # BGL: sliding windows (time_interval=21600, stepping_size=3600).
    print("load BGL")
    loaded_sets['BGL'] = as_split_dict(
        BGL.loadDataset(
            'Drain_result/BGL.log_structured.csv',
            window='sliding',
            time_interval=3600*6,
            stepping_size=3600,
            train_ratio=0.7
        )
    )

    return loaded_sets

In [None]:
# Load every dataset once, then store the share of anomalous training labels
# (sum of y_train divided by its length) under the 'contamination' key.
datasets = loadDatasets()
for d, splits in datasets.items():
    train_labels = splits['y_train']
    splits['contamination'] = sum(train_labels) / len(train_labels)

In [None]:
def evalAndAddToBenchmark(modelName, model, dataName, data_x, data_y, data_unseen = False, traintime=0):
    """Evaluate a fitted model on one data split and record the outcome.

    Appends one row to the module-level list ``benchmark_results``:
    [modelName, dataName, data_unseen, total, normal, anomaly,
     precision, recall, f1, evaltime, traintime].

    Args:
        modelName: label stored in the row for the algorithm.
        model: object whose ``evaluate(data_x, data_y)`` returns
            ``(precision, recall, f1)``.
        dataName: label stored in the row for the dataset.
        data_x: inputs handed to ``model.evaluate``.
        data_y: labels; ``sum(data_y)`` is recorded as the anomaly count,
            so anomalies are assumed to be encoded as 1 - TODO confirm.
        data_unseen: True when the split was not used for fitting.
        traintime: training duration to store alongside the scores.
    """
    # Label the output according to the split; previously 'Train accuracy:'
    # was printed for the test split as well.
    print('Test accuracy:' if data_unseen else 'Train accuracy:')

    start = timeit.default_timer()
    precision, recall, f1 = model.evaluate(data_x, data_y)
    evaltime = timeit.default_timer() - start

    total = len(data_y)
    anomaly = sum(data_y)
    normal = total - anomaly

    benchmark_results.append([modelName, dataName, data_unseen, total, normal, anomaly, precision, recall, f1, evaltime, traintime])

In [None]:
# Tabulate, per dataset and split, the number of windows in total and how many
# of them are normal / anomalous (anomaly count = sum of the labels).
ds = []
for d, splits in datasets.items():
    for split_name in ('train', 'test'):
        split_labels = splits['y_' + split_name]
        n_total = len(split_labels)
        n_anomaly = sum(split_labels)
        ds.append([d, split_name, n_total, n_total - n_anomaly, n_anomaly])

dsf = pd.DataFrame(ds, columns=['dataset','type','total', 'normal', 'anomaly'])
dsf.to_csv("result_data/data-comperison.csv")

# Both bar plots are issued back to back with the same x/hue layout:
# first the totals, then the anomaly counts in the "dark" palette.
shared_bar_options = dict(data=dsf, x="dataset", hue="type", ci=None, alpha=1)
g = sns.barplot(y="total", **shared_bar_options)
g = sns.barplot(y="anomaly", palette="dark", **shared_bar_options)

g.set_title("Datensätze im Vergleich")


In [None]:
# Mean len() over the entries of the HDFS training split.
np.mean([len(entry) for entry in datasets['HDFS']['x_train']])

In [None]:
from loglizer.utils import metrics

class UnionModel():
    """Ensemble whose prediction is the element-wise ``or`` of its members.

    Every member must offer ``fit(X)`` and ``predict(X)``. A sample ends up
    truthy as soon as one member returns a truthy prediction for it.
    """

    def __init__(self,model_list):
        self.models = model_list

    def fit(self, X, y=None):
        """Fit each member on ``X``; ``y`` is accepted but not forwarded."""
        for detector in self.models:
            detector.fit(X)

    def predict(self, X):
        """Return a list with one combined prediction per entry of ``X``."""
        # Start from all zeros so the first member's votes pass through.
        combined = list(np.zeros(len(X)))
        for detector in self.models:
            votes = detector.predict(X)
            combined = [so_far if so_far else vote for so_far, vote in zip(combined, votes)]
        return combined

    def evaluate(self, X, y_true):
        """Print and return (precision, recall, f1) of the combined prediction."""
        print('====== Evaluation summary ======')
        precision, recall, f1 = metrics(self.predict(X), y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

class IntersectionModel():
    """Ensemble whose prediction is the element-wise ``and`` of its members.

    Every member must offer ``fit(X)`` and ``predict(X)``. A sample stays
    truthy only while every member returns a truthy prediction for it.
    """

    def __init__(self,model_list):
        self.models = model_list

    def fit(self, X, y=None):
        """Fit each member on ``X``; ``y`` is accepted but not forwarded."""
        for detector in self.models:
            detector.fit(X)

    def predict(self, X):
        """Return a list with one combined prediction per entry of ``X``."""
        # Start from all ones so the first member's votes pass through.
        combined = list(np.ones(len(X)))
        for detector in self.models:
            votes = detector.predict(X)
            combined = [vote if so_far else so_far for so_far, vote in zip(combined, votes)]
        return combined

    def evaluate(self, X, y_true):
        """Print and return (precision, recall, f1) of the combined prediction."""
        print('====== Evaluation summary ======')
        precision, recall, f1 = metrics(self.predict(X), y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

In [None]:
# Collects one result row per evalAndAddToBenchmark call; re-running this cell
# discards all rows gathered so far.
benchmark_results = []

# Single extractor instance reused by the benchmark loop, which calls
# fit_transform on each dataset's training split and transform on its test split.
feature_extractor = preprocessing.FeatureExtractor()

In [None]:
# Algorithms to run in the benchmark loop. The loop also recognises
# 'InvariantMiner', 'DeepLog', 'Union' and 'Intersection'; any other name is skipped.
models = ['PCA', 'LOF', 'iForest', 'KNN']

In [None]:
# Benchmark loop: for every loaded dataset, fit each algorithm named in `models`
# on the extracted training features and record its scores on both splits via
# evalAndAddToBenchmark.
for d in datasets:

    print("Ussing dataset: " + d )

    x_train = datasets[d]['x_train']
    y_train = datasets[d]['y_train']
    x_test = datasets[d]['x_test']
    y_test = datasets[d]['y_test']

    contamination = datasets[d]['contamination']

    # The extractor is fitted on the training split; the test split only goes
    # through transform().
    x_train_extracted = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')
    x_test_extracted = feature_extractor.transform(x_test)
    for m in models:

        print("Evaluationg Algorithmn: " + m)
        # The timer starts before the model is constructed, so traintime covers
        # construction plus fit.
        start = timeit.default_timer()

        if m == 'KNN':
            model = KNN(n_neighbors=39, contamination=contamination)
            model.fit(x_train_extracted)
        elif m == 'PCA':
            model = PCA(c_alpha=2.5)
            model.fit(x_train_extracted)
        elif m == 'InvariantMiner':
            model = InvariantsMiner(epsilon=0.5)
            model.fit(x_train_extracted)
        elif m == 'LOF':
            model = LOF(n_neighbors=60, leaf_size=60, contamination=contamination)
            model.fit(x_train_extracted)
            # NOTE(review): novelty is switched on only after fitting - presumably
            # so the fitted model can score the unseen split; confirm against the
            # LOF wrapper.
            model.novelty = True
        elif m == 'iForest':
            model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=contamination)
            model.fit(x_train_extracted)
        elif m == 'DeepLog':
            # TODO: integrate the DeepLog results into the benchmark DataFrame.
            # NOTE(review): window_y_train / window_y_test are not defined in any
            # visible cell, so this branch raises NameError as written - confirm
            # where they should come from.
            # NOTE(review): DeepLog is imported twice at the top of the notebook
            # (from ...models.DeepLog and from ...models); confirm the name
            # resolves to the class rather than the module.
            batch_size = 32
            num_workers = 1

            vectorizer = Vectorizer()
            train_dataset = vectorizer.fit_transform(x_train, window_y_train, y_train)
            test_dataset = vectorizer.transform(x_test, window_y_test, y_test)

            train_loader = Iterator(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers).iter
            test_loader = Iterator(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers).iter

            model = DeepLog(num_labels=vectorizer.num_labels)
            model.fit(train_loader)

            traintime = timeit.default_timer() - start

            # The results get dedicated names: binding them to `metrics` would
            # shadow the imported metrics() function that UnionModel.evaluate and
            # IntersectionModel.evaluate call, breaking every later ensemble run.
            print('Train accuracy:')
            start = timeit.default_timer()
            deeplog_train_metrics = model.evaluate(train_loader)
            train_evaltime = timeit.default_timer() - start

            print('Test accuracy:')
            start = timeit.default_timer()
            deeplog_test_metrics = model.evaluate(test_loader)
            test_evaltime = timeit.default_timer() - start
            # DeepLog is not added to benchmark_results (see TODO above).
            continue
        elif m == 'Union':
            pca = PCA()
            iforest = IsolationForest(random_state=2022, max_samples=0.9999, contamination=contamination)
            lof = LOF(n_neighbors=60, leaf_size=60, contamination='auto')
            model = UnionModel([pca,iforest,lof])
            model.fit(x_train_extracted)
        elif m == 'Intersection':
            pca = PCA()
            iforest = IsolationForest(random_state=2019, max_samples=0.9999, contamination=contamination)
            model = IntersectionModel([pca,iforest])
            model.fit(x_train_extracted)
        else:
            # Unknown algorithm name: skip it.
            continue

        traintime = timeit.default_timer() - start
        # The training time is stored on the training-split row only; the
        # unseen-split row carries 0.
        evalAndAddToBenchmark(m, model, d, x_train_extracted, y_train, data_unseen=False, traintime=traintime)
        evalAndAddToBenchmark(m, model, d, x_test_extracted, y_test, data_unseen=True,traintime=0)

In [None]:
# Column labels for the rows appended by evalAndAddToBenchmark. Note that the
# "accuracy" column receives the precision value returned by model.evaluate;
# the name is kept because the plotting cells select it by that label.
columns = ["algorithm","dataset", "unseen_data","data_total","data_normal","data_anomaly","accuracy","recall","f1", "evaltime", "traintime"]
df = pd.DataFrame(benchmark_results,columns=columns)
df.to_csv('result_data/model_comperison_last.csv')

# UTC timestamp for the archived copy. It is formatted without spaces or colons
# because str(datetime) yields "YYYY-MM-DD HH:MM:SS.ffffff", which is not a
# valid file name on Windows.
t = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d_%H-%M-%S-%f')
df.to_csv('result_data/model_comperison_' + t +'.csv')
df

In [None]:
# Split the benchmark rows by whether the evaluated data was unseen (test)
# or the data the model was fitted on (train).
df_test = df.loc[df["unseen_data"].eq(True)]
df_train = df.loc[df["unseen_data"].eq(False)]

In [None]:
# Grouped bar chart of the unseen-split rows: the frame's "accuracy" column per
# algorithm, one bar per dataset, saved as precision.png.
sns.set_theme(style="whitegrid")

g = sns.catplot(
    kind="bar",
    data=df_test,
    x="algorithm",
    y="accuracy",
    hue="dataset",
    hue_order=['HDFS','BGL','Thunderbird'],
    height=6,
    alpha=1,
    ci=None
)
g.despine(left=True)
g.set_axis_labels("", "Präzision")
g.legend.set_title("")
g.savefig('result_data/precision.png')

In [None]:
# Grouped bar chart of the unseen-split rows: recall per algorithm, one bar per
# dataset, saved as recall.png.
sns.set_theme(style="whitegrid")

g = sns.catplot(
    kind="bar",
    data=df_test,
    x="algorithm",
    y="recall",
    hue="dataset",
    hue_order=['HDFS','BGL','Thunderbird'],
    height=6,
    alpha=1,
    ci=None
)
g.despine(left=True)
g.set_axis_labels("", "Recall")
g.legend.set_title("")
g.savefig('result_data/recall.png')

In [None]:
# Grouped bar chart of the unseen-split rows: F1 per algorithm, one bar per
# dataset, saved as f1.png.
sns.set_theme(style="whitegrid")

g = sns.catplot(
    kind="bar",
    data=df_test,
    x="algorithm",
    y="f1",
    hue="dataset",
    hue_order=['HDFS','BGL','Thunderbird'],
    height=6,
    alpha=1,
    ci=None
)
g.despine(left=True)
g.set_axis_labels("", "F1-score")
g.legend.set_title("")
g.savefig('result_data/f1.png')

In [None]:
# Grouped bar chart of the training-split rows: training time per algorithm,
# one bar per dataset. Only training rows carry a non-zero traintime.
sns.set_theme(style="whitegrid")

# hue_order lists the three datasets produced by loadDatasets, matching the
# other result plots; the former extra 'ABC' entry named no loaded dataset and
# only reserved an empty bar slot and legend entry.
g = sns.catplot(
    data=df_train, kind="bar",
    x="algorithm", y="traintime", hue="dataset", hue_order=['HDFS','BGL','Thunderbird'],
    ci=None, alpha=1, height=6
)
g.despine(left=True)
g.set_axis_labels("", "Laufzeit")
g.legend.set_title("")

In [None]:
# Export the complete benchmark table (train and test rows) as a LaTeX table.
df.to_latex('result_data/model-comperison.tex')

In [None]:
# Runtime table: rows evaluated on the training data only, reduced to the
# columns needed for the LaTeX export.
runtime_columns = ['dataset', 'algorithm', 'accuracy', 'traintime']
seen_rows = df[df['unseen_data'] == False]
g = seen_rows.get(runtime_columns)
g.to_latex('result_data/runtime.tex')