## DBCSAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [5]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster
import time

# select the dataset
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# num_list = []
# datasets = ['OpenStack']
# for dataset in datasets:
dataset = 'Mac'
print(f'Processing {dataset} dataset...')
# load the dataset
df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs = df['Content'].tolist()
# logs.extend(df['Content'].tolist())
templates = df['EventTemplate'].tolist()

# tokenize -> vectorize -> cluster -> reassign_clusters
tokenized_logs = [tokenize(log) for log in logs]
labels, cluster_nums = cluster(vectorize(tokenized_logs))
labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)


Processing Mac dataset...




## Show Clusters

In [10]:
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# store the logs in the cluster
inputs = []
for i in range(cluster_nums):
    inputs.append([-1, [], [], '']) # label, logs, indexs, ground_truth
for i, label in enumerate(labels):
    inputs[label][0] = label
    inputs[label][1].append(logs[i])
    inputs[label][2].append(i)
    if inputs[label][3] == '':
        inputs[label][3] = df['EventTemplate'][i]

num = 122
print('cluster:', num)
print('length:', len(inputs[num][1]))
print('template:', inputs[num][3])
print('-'*20)
for log in set(inputs[num][1]):
    print(log)
print('='*40)



num of clusters: 366
len of templates: 341
cluster: 122
length: 3
template: [HID] [MT] AppleMultitouchDevice::willTerminate entered
--------------------
[HID] [MT] AppleMultitouchDevice::willTerminate entered


## Evaluate

In [12]:
from evaluate import evaluate_all_datasets
from IPython.display import HTML

table = evaluate_all_datasets('Test3_0125_pure', send_email=False)
HTML(table)

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 1.0000, Edit Distance: 0.0000
      Hadoop: group Accuracy: 0.9850, Message-Level Accuracy: 0.4495, Edit Distance: 12.2160
       Spark: group Accuracy: 0.9220, Message-Level Accuracy: 0.8820, Edit Distance: 2.3325
   Zookeeper: group Accuracy: 0.9915, Message-Level Accuracy: 0.9655, Edit Distance: 0.4070
         BGL: group Accuracy: 0.9790, Message-Level Accuracy: 0.9560, Edit Distance: 0.9920
         HPC: group Accuracy: 0.8760, Message-Level Accuracy: 0.8630, Edit Distance: 1.6555
 Thunderbird: group Accuracy: 0.9700, Message-Level Accuracy: 0.6235, Edit Distance: 4.7990
     Windows: group Accuracy: 0.9960, Message-Level Accuracy: 0.9735, Edit Distance: 0.8555
       Linux: group Accuracy: 0.4770, Message-Level Accuracy: 0.5990, Edit Distance: 3.3590
     Android: group Accuracy: 0.9365, Message-Level Accuracy: 0.6465, Edit Distance: 5.8900
   HealthApp: group Accuracy: 0.8715, Message-Level Accuracy: 0.7395, Edit Dist

dataset,GA,PA,ED
HDFS,1.0,1.0,0.0
Hadoop,0.985,0.45,12.216
Spark,0.922,0.882,2.332
Zookeeper,0.992,0.966,0.407
BGL,0.979,0.956,0.992
HPC,0.876,0.863,1.656
Thunderbird,0.97,0.624,4.799
Windows,0.996,0.974,0.856
Linux,0.477,0.599,3.359
Android,0.936,0.646,5.89


## Check out some strings' freq in the whole logs or templates

In [4]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables
from utils.postprocess import correct_single_template

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


count_templates = []
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for log, template in zip(logs, templates):
        for token in template.split():
            if '/' in token and '<*>' in token and template not in count_templates:
                print(template)
                count_templates.append(template)

Processing BGL ----------------
Processing HDFS ----------------
Processing Linux ----------------
PCI: Using IRQ router PIIX/ICH [<*>/<*>] at <*>
Processing HealthApp ----------------
Processing OpenStack ----------------
Processing OpenSSH ----------------
Processing Proxifier ----------------
Processing HPC ----------------
Processing Zookeeper ----------------
My election bind port: <*>/<*>
Processing Mac ----------------
-[UABestAppSuggestionManager notifyBestAppChanged:type:options:bundleIdentifier:activityType:dynamicIdentifier:when:confidence:deviceName:deviceIdentifier:deviceType:] (<*>) UASuggestedActionType=<*> (<*>)/(<*>) opts=(<*>) when=<*>
after trim url = https://www.google.com/_/chrome/newtab?rlz=<*>&espv=<*>&ie=UTF-<*>
Could not get event name for stream/token: com.apple.xpc.activity/<*>: <*>: Request for stale data
Processing Hadoop ----------------
Processing Android ----------------
printFreezingDisplayLogsopening app wtoken = AppWindowToken{<*> token=Token{<*> Acti

## sample based on entropy

In [None]:
from utils.demonstrations_sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    print(f"{pair[0][0]}\n{pair[0][1]}\n{'-'*20}")

## Mutation Count -- num

In [None]:

# 4: 15 + 28 + 5 + 18 + 10
# 5: 1 + 2
# 6: 15 + 28 + 5 + 18 + 10

In [None]:
from collections import Counter
import pandas as pd
import re
from utils.cluster import tokenize

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

a,b = 0,0
pattern = r'^[a-zA-Z]+[0-9]+$'

for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    list_log = []
    list_tmp = []
    print('-'*20)
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    logs = df['Content'].tolist()
    freq = Counter(templates)
    
    for template,log in zip(templates,logs):
        tokens = template.split()
        for token in tokens:
            if ':' in token and '<*>' in token:
                # print(f"{template}\n{log}\n{'-'*20}")
                list_tmp.append(template)
                list_log.append(log)
                break
            
    for tmp in list_tmp:
        print(tmp)

In [None]:
import pandas as pd
from utils.postprocess import correct_single_template

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',

            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# datasets = ['Linux']

count_list = []
shot = 0

for dataset in datasets:
    print(f"Processing {dataset} dataset...")
    with open (f'outputs/parser/Test/{dataset}.txt', 'r') as f:
        lines = f.readlines()
    Read = False
    count = 0
    for line in lines:
        if 'len=' in line and 'cluster' in line:
            if count > 5:
                print(length)
                for tmp in tmp_list:
                    print(tmp.strip('\n'))
                shot+=1
            count = 0
            tmp_list = []
            parts = line.strip().split("len=")
            if len(parts) == 2:  # 确保字符串中包含"len="
                tmp = parts[1]
                length = int(tmp)
            if length > 50:
                Read = True
            else:
                Read = False
        else:
            if Read:
                tmp_list.append(line)
                # print(line)
                count += 1

print(shot/len(datasets))
# [147, 49, 156, 98, 65, 202, 45, 61, 82, 361, 143, 164, 93, 43, 203, 69]
# [137, 49, 121, 94, 62, 56, 45, 56, 78, 322, 135, 141, 86, 43, 175, 63]

                

In [20]:
from utils.sample_byword import extract_variables

matches = extract_variables(
    '1 is over than 2 as result', '<*> is over than <*> as result')
if matches == []:
    print('no matches')
if matches == ():
    print("2")
else:
    print(matches)

no matches
[]


In [6]:
template = '<*> is <*>, logname=<*>'
list2 = ['1', '2', '']
list1 = template.split('<*>')
template2 = list1[0]
for index, tmp in enumerate(list2):
    if tmp != '':
        template2 += '<*>' + list1[index + 1]
    else:
        template2 += list1[index + 1]
print(template2)



<*> is <*>, logname=


In [2]:
from utils.postprocess import post_process

post_process('`{{1}} is over {{2}}, logname = {{3}}`', '1 is over 2, logname = ')

ok


('{{1}} is over {{2}}, logname = {{3}}', '<*> is over <*>, logname = ')