## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [25]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Datasets to cluster; each one is read from its own 2k-sample structured CSV.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# Cluster count reported by cluster() for every dataset, in `datasets` order.
num_list = []
# datasets = ['OpenStack']
for dataset in datasets:
    print(f'Processing {dataset} dataset...')
    # Read the raw messages and their ground-truth templates for this dataset.
    csv_path = f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv'
    df = pd.read_csv(csv_path)
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()

    # Pipeline: tokenize -> vectorize -> cluster (the reassign_clusters step is disabled).
    tokenized_logs = list(map(tokenize, logs))
    vectors = vectorize(tokenized_logs)
    labels, cluster_nums = cluster(vectors)
    num_list.append(cluster_nums)
    # labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

# After the loop, df / logs / templates / labels / cluster_nums hold the values
# of the LAST dataset processed only.

Processing BGL dataset...
Processing HDFS dataset...
Processing Linux dataset...




Processing HealthApp dataset...
Processing OpenStack dataset...
Processing OpenSSH dataset...




Processing Proxifier dataset...
Processing HPC dataset...
Processing Zookeeper dataset...




Processing Mac dataset...
Processing Hadoop dataset...




Processing Android dataset...
Processing Windows dataset...




Processing Apache dataset...
Processing Thunderbird dataset...
Processing Spark dataset...




In [27]:
# Show the per-dataset cluster counts gathered in num_list.
print(num_list)

# Adjusted mean: three fixed amounts are subtracted from the total before
# dividing by the number of entries.
# NOTE(review): the meaning of 14, 3 and 77 is not documented in this notebook —
# confirm what they correct for and give them names.
cluster_total = sum(num_list)
entry_count = len(num_list)
print((cluster_total - 14 - 3 - 77) / entry_count)

[44, 11, 33, 25, 40, 28, 6, 26, 18, 107, 28, 61, 8, 6, 26, 16]
24.3125


In [22]:
# Relies on cluster_nums, labels, logs, templates and df left behind by an
# earlier cell, i.e. on whichever dataset was processed last.
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# One record per cluster: [label, logs, indexs, ground_truth].
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]

# Distribute every log line into the record of its cluster label.
# NOTE(review): a label of -1 would index the last record through Python's
# negative indexing — confirm cluster() never returns -1 (e.g. for noise points).
for i, label in enumerate(labels):
    record = inputs[label]
    record[0] = label
    record[1].append(logs[i])
    record[2].append(i)
    # The ground truth of a cluster is the template of the first log assigned to it.
    if record[3] == '':
        record[3] = df['EventTemplate'][i]

# Print a summary of one cluster followed by its distinct log lines.
num = 17
selected = inputs[num]
print('cluster:', num)
print('length:', len(selected[1]))
print('template:', selected[3])
print('-'*20)
for log in set(selected[1]):
    print(log)
print('='*40)



num of clusters: 61
len of templates: 158
cluster: 17
length: 85
template: HBM brightnessOut =<*>
--------------------
HBM brightnessOut =38


In [None]:
# check the cluster k
# k = 0
# lengh_cluster = len(inputs[k][1])
# print('cluster ', k)
# print('length:', lengh_cluster)
# print('template:', inputs[k][3])
# print('-'*20)
# for log in set(inputs[k][1]):
#     print(log)

#      len
# Linux 0.5   tokenize '=' difference between (<*>) and () group first will help
# HealthApp: 1   same length, 2 words different(80 logs) refine by difference of words will help
# Zookeeper: 0 same length, 2 words different(12 logs)
# Hadoop: 0 same length 1 words different(118 logs)
# Spark: 0  same length 1 words different(149 logs)

# good cluster datasets
# HDFS OpenStack Proxifier HPC Mac Windows Apache Thunderbird
# length solved datasets
# BGL OpenSSH Android
# 

## evaluate

In [2]:
import time
from utils.evaluator import evaluate
import pandas as pd
# Evaluate the datasets in the order used by the results table.
# (The earlier alphabet-free `datasets` literal was overwritten before any use,
# so only the table order is kept.)
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache Proxifier OpenSSH OpenStack Mac'
datasets = table_order.split(' ')

# One list per value returned by evaluate(), filled in `datasets` order.
# m, n and p feed the averages printed below (group accuracy, message-level
# accuracy, edit distance); q stores the fourth return value, which is not
# used in this cell.
m,n,p,q = [],[],[],[]
for dataset in datasets:
    file = f'outputs/parser/Test/{dataset}.csv'  # Fifth_=_0.1
    # Alternative input kept for reference:
    # df = pd.read_csv(f'outputs/k_means/initial/{dataset}.csv')
    a,b,c,d = evaluate(file, dataset,mismatch=True)
    m.append(a)
    n.append(b)
    p.append(c)
    q.append(d)

# Unweighted mean of each metric over all evaluated datasets.
print('avg---------: group Accuracy: %.4f, Message-Level Accuracy: %.4f, Edit Distance: %.4f' % (sum(m)/len(m), sum(n)/len(n), sum(p)/len(p)))

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 1.0000, Edit Distance: 0.0000
      Hadoop: group Accuracy: 0.9860, Message-Level Accuracy: 0.8515, Edit Distance: 7.4530
       Spark: group Accuracy: 0.9225, Message-Level Accuracy: 0.9105, Edit Distance: 1.9430
   Zookeeper: group Accuracy: 0.9935, Message-Level Accuracy: 0.9825, Edit Distance: 0.2420
         BGL: group Accuracy: 0.9815, Message-Level Accuracy: 0.9575, Edit Distance: 0.7840
         HPC: group Accuracy: 0.8520, Message-Level Accuracy: 0.6380, Edit Distance: 3.4930
 Thunderbird: group Accuracy: 0.9815, Message-Level Accuracy: 0.9215, Edit Distance: 0.8130
     Windows: group Accuracy: 0.9945, Message-Level Accuracy: 0.9730, Edit Distance: 0.7345
       Linux: group Accuracy: 0.9330, Message-Level Accuracy: 0.6910, Edit Distance: 3.1410
     Android: group Accuracy: 0.9640, Message-Level Accuracy: 0.5745, Edit Distance: 14.9910
   HealthApp: group Accuracy: 0.8995, Message-Level Accuracy: 0.7650, Edit Dist

In [6]:
from utils.sample_byword import matches_template
# Sanity check: a wildcard-free string is compared against itself.
sample_line = 'Failed to start LSB: Bring up/down networking.'
print(matches_template(sample_line, sample_line))

True


## Find similarity in all datasets

In [None]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

count_logs = []
# Distinct templates across ALL datasets, in first-seen order.
count_templates = []
# Same content as count_templates, kept as a set so the membership test is
# O(1) instead of a linear scan of the list for every log line.
seen_templates = set()

for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for log, template in zip(logs, templates):
        # Each template is reported at most once, even when it occurs in
        # several datasets.
        if template in seen_templates:
            continue
        seen_templates.add(template)
        count_templates.append(template)
        # Print templates that still contain at least one literal digit.
        if any(char.isdigit() for char in template):
            print(f"{template}")

In [17]:
from utils.demonstrations_sample import sample_based_on_entropy

# Disabled variant that sampled one pair from every dataset:
# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])

# Sample from a single dataset and print the first two fields of each pair's
# leading element, followed by a separator line.
# NOTE(review): judging by the recorded output these fields are a raw log line
# and its template — confirm against sample_based_on_entropy.
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
separator = '-'*20
for pair in pairs:
    sampled = pair[0]
    print(f"{sampled[0]}\n{sampled[1]}\n{separator}")

2017-07-02 15:46:41.445 ksfetch[32435/0x7fff79824000] [lvl=2] main() ksfetch fetching URL (<NSMutableURLRequest: 0x1005110b0> { URL: https://tools.google.com/service/update2?cup2hreq=53f725cf03f511fab16f19e789ce64aa1eed72395fc246e9f1100748325002f4&cup2key=7:1132320327 }) to folder:/tmp/KSOutOfProcessFetcher.YH2CjY1tnx/download
<*> ksfetch[<*>] [lvl=<*>] main() ksfetch fetching URL (<NSMutableURLRequest: <*> { URL: <*> }) to folder:<*>
--------------------


Mutation
```
# if '3' in template:
#     Count[3] += 1
# if '4' in template:
#     Count[4] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
# if '5' in template:
#     Count[5] += 1
#     print(dataset, template)
#     # 1 + 2
# if '6' in template:
#     Count[6] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
```

In [4]:
from collections import Counter
import pandas as pd
import re
from utils.cluster import tokenize



datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


# NOTE(review): a, b and pattern are never read in this cell; they are kept
# unchanged in case another cell relies on them — remove if that is not so.
a,b = 0,0
pattern = r'^[a-zA-Z]+[0-9]+$'


for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    # Occurrence count of every template within this dataset.
    freq = Counter(templates)
    # Counter keys are unique and keep first-occurrence order, so they replace
    # the former "template not in list_read" scan, which was quadratic in the
    # number of rows.
    list_read = list(freq)
    for template in list_read:
        # NOTE(review): the tokenized form is not used below; call retained
        # as in the original — drop it if tokenize() has no needed side effect.
        tokens = tokenize(template)
        # Report templates that contain two wildcards separated by a slash.
        if '<*>/<*>' in template:
            print(f"{template}\nappear {freq[template]} times in {dataset}")



PCI: Using IRQ router PIIX/ICH [<*>/<*>] at <*>
appear 1 times in Linux
My election bind port: <*>/<*>:<*>
appear 1 times in Zookeeper
<*>@<*>/<*>/<*>:<*>:<*>:<*> WcpInitialize (wcp.dll version <*>) called (stack @<*>)
appear 6 times in Windows
<*>@<*>/<*>/<*>:<*>:<*>:<*> CSI Transaction @<*> initialized for deployment engine {<*>} with flags <*> and client id [<*>]"<*>/"
appear 1 times in Windows
<*>@<*>/<*>/<*>:<*>:<*>:<*> CSI Transaction @<*> destroyed
appear 1 times in Windows
<*>@<*>/<*>/<*>:<*>:<*>:<*> PopulateComponentFamiliesKey - Begin
appear 1 times in Windows
<*>@<*>/<*>/<*>:<*>:<*>:<*> PopulateComponentFamiliesKey - End
appear 1 times in Windows


In [9]:
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# datasets = ['Linux']
import pandas as pd
from utils.postprocess import  correct_single_template

# For every dataset, run correct_single_template over each ground-truth
# template and print the before/after pair whenever the result differs.
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_templates_corrected.csv')
    print(f"Processing {dataset} dataset...")
    print('-'*20)
    templates = df['EventTemplate'].tolist()
    for template in templates:
        corrected = correct_single_template(template)
        # Unchanged templates produce no output.
        if corrected == template:
            continue
        print(f"{template}\n{corrected}")
                

Processing BGL dataset...
--------------------
Processing HDFS dataset...
--------------------
Processing Linux dataset...
--------------------
authentication failure; logname= uid=<*> euid=<*> tty=<*> ruser= rhost=<*>  user=<*>
authentication failure; logname= uid=<*> euid=<*> tty=<*> ruser= rhost=<*> user=<*>
ROOT LOGIN ON <*>
<*> LOGIN ON <*>
ANONYMOUS FTP LOGIN FROM <*>,  (anonymous)
ANONYMOUS FTP LOGIN FROM <*>, (anonymous)
Kernel command line: ro root=<*>=<*> rhgb quiet
Kernel command line: ro <*>=<*>=<*> rhgb quiet
SELinux:  Initializing.
SELinux: Initializing.
SELinux:  Starting in permissive mode
SELinux: Starting in permissive mode
<*>:  Registering secondary module capability
<*>: Registering secondary module capability
Initializing random number generator:  succeeded
Initializing random number generator: succeeded
Starting pcmcia:  succeeded
Starting pcmcia: succeeded
Setting network parameters:  succeeded
Setting network parameters: succeeded
Bringing up loopback interface