## DBCSAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [3]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster
import time


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

dataset = 'hadoop'
print(f'Clustering {dataset} dataset...')

# load the dataset
df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs = df['Content'].tolist()
templates = df['EventTemplate'].tolist()

# tokenize -> vectorize -> cluster -> reassign_clusters
tokenized_logs = [tokenize(log) for log in logs]
labels, cluster_nums = cluster(vectorize(tokenized_logs))
labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)


Executing with tokens
Clustering hadoop dataset...


## Show Clusters

In [35]:
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# store the logs in the cluster
inputs = []
for i in range(cluster_nums):
    inputs.append([-1, [], [], '']) # label, logs, indexs, ground_truth
for i, label in enumerate(labels):
    inputs[label][0] = label
    inputs[label][1].append(logs[i])
    inputs[label][2].append(i)
    if inputs[label][3] == '':
        inputs[label][3] = df['EventTemplate'][i]

num = 23
print('cluster:', num)
print('length:', len(inputs[num][1]))
print('template:', inputs[num][3])
print('len of set:', len(set(inputs[num][1])))
print('-'*20)
for log in set(inputs[num][1]):
    print(log)
print('='*40)

num of clusters: 116
len of templates: 114
cluster: 23
length: 289
template: Progress of TaskAttempt <*> is : <*>
len of set: 112
--------------------
Progress of TaskAttempt attempt_1445144423722_0020_m_000005_0 is : 0.32285523
Progress of TaskAttempt attempt_1445144423722_0020_m_000000_0 is : 0.29998285
Progress of TaskAttempt attempt_1445144423722_0020_m_000001_0 is : 0.1066108
Progress of TaskAttempt attempt_1445144423722_0020_m_000002_0 is : 0.22765201
Progress of TaskAttempt attempt_1445144423722_0020_m_000003_0 is : 0.6199081
Progress of TaskAttempt attempt_1445144423722_0020_m_000000_0 is : 0.023958297
Progress of TaskAttempt attempt_1445144423722_0020_m_000002_0 is : 0.26314905
Progress of TaskAttempt attempt_1445144423722_0020_m_000001_0 is : 0.079464614
Progress of TaskAttempt attempt_1445144423722_0020_m_000003_0 is : 0.36323506
Progress of TaskAttempt attempt_1445144423722_0020_m_000003_0 is : 0.5091932
Progress of TaskAttempt attempt_1445144423722_0020_m_000004_0 is : 0.4

## Evaluate

In [2]:
from evaluate import evaluate_all_datasets
from IPython.display import HTML

table = evaluate_all_datasets(
    'result_LILAC_2k_0_0_gpt-3.5-turbo-0613', send_email=False)
HTML(table)

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 0.9425, Edit Distance: 0.0575
      Hadoop: group Accuracy: 0.9575, Message-Level Accuracy: 0.8430, Edit Distance: 3.0310
       Spark: group Accuracy: 0.9980, Message-Level Accuracy: 0.8055, Edit Distance: 0.4050
   Zookeeper: group Accuracy: 0.9885, Message-Level Accuracy: 0.3740, Edit Distance: 3.3065
         BGL: group Accuracy: 0.9410, Message-Level Accuracy: 0.8695, Edit Distance: 3.7345
         HPC: group Accuracy: 0.9110, Message-Level Accuracy: 0.6405, Edit Distance: 2.8530
 Thunderbird: group Accuracy: 0.9565, Message-Level Accuracy: 0.8515, Edit Distance: 2.2670
     Windows: group Accuracy: 0.6940, Message-Level Accuracy: 0.0200, Edit Distance: 17.5835
       Linux: group Accuracy: 0.2975, Message-Level Accuracy: 0.3435, Edit Distance: 5.1890
     Android: group Accuracy: 0.9310, Message-Level Accuracy: 0.4805, Edit Distance: 10.8390
   HealthApp: group Accuracy: 0.9005, Message-Level Accuracy: 0.7450, Edit Dis

dataset,GA,PA,ED
HDFS,1.0,0.942,0.058
Hadoop,0.958,0.843,3.031
Spark,0.998,0.806,0.405
Zookeeper,0.989,0.374,3.306
BGL,0.941,0.87,3.734
HPC,0.911,0.64,2.853
Thunderbird,0.957,0.852,2.267
Windows,0.694,0.02,17.584
Linux,0.297,0.344,5.189
Android,0.931,0.48,10.839


In [2]:
from evaluate import evaluate_single_dataset

evaluate_single_dataset('outputs/parser/Test3/Hadoop.csv', 'Hadoop')

      Hadoop: group Accuracy: 0.9890, Message-Level Accuracy: 0.6344, Edit Distance: 11.4817


In [2]:
import re

pattern = r'\b0[xX][0-9a-fA-F]+\b'
text = "Here are some hex numbers: 0x1a3, 0X4D2, and 0xABCDEF."

matches = re.findall(pattern, text)
print(matches)

['0x1a3', '0X4D2', '0xABCDEF']


## Check out some strings' freq in the whole logs or templates

In [12]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables
from utils.postprocess import correct_single_template
from utils.postprocess import extract_variables

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


count_templates = []
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    # templates = list(set(templates))
    for log, template in zip(logs, templates):
        if template not in count_templates and '  ' in template:
            count_templates.append(template)
            print(template)
            print(log)
            print('-'*20)

Processing BGL ----------------
Processing HDFS ----------------
Processing Linux ----------------
authentication failure; logname= uid=<*> euid=<*> tty=<*> ruser= rhost=<*>  user=<*>
authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root
--------------------
ANONYMOUS FTP LOGIN FROM <*>,  (anonymous)
ANONYMOUS FTP LOGIN FROM 84.102.20.2,  (anonymous)
--------------------
SELinux:  Initializing.
SELinux:  Initializing.
--------------------
SELinux:  Starting in permissive mode
SELinux:  Starting in permissive mode
--------------------
<*>:  Registering secondary module capability
selinux_register_security:  Registering secondary module capability
--------------------
Initializing random number generator:  succeeded
Initializing random number generator:  succeeded
--------------------
Starting pcmcia:  succeeded
Starting pcmcia:  succeeded
--------------------
Setting network parameters:  succeeded
Setting network parameters: 

## sample based on entropy

In [None]:
from utils.demonstrations_sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    print(f"{pair[0][0]}\n{pair[0][1]}\n{'-'*20}")

## Mutation Count -- num

In [None]:

# 4: 15 + 28 + 5 + 18 + 10
# 5: 1 + 2
# 6: 15 + 28 + 5 + 18 + 10

In [None]:
from collections import Counter
import pandas as pd
import re
from utils.cluster import tokenize

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

a,b = 0,0
pattern = r'^[a-zA-Z]+[0-9]+$'

for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    list_log = []
    list_tmp = []
    print('-'*20)
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    logs = df['Content'].tolist()
    freq = Counter(templates)
    
    for template,log in zip(templates,logs):
        tokens = template.split()
        for token in tokens:
            if ':' in token and '<*>' in token:
                # print(f"{template}\n{log}\n{'-'*20}")
                list_tmp.append(template)
                list_log.append(log)
                break
            
    for tmp in list_tmp:
        print(tmp)

In [None]:
import pandas as pd
from utils.postprocess import correct_single_template

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',

            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# datasets = ['Linux']

count_list = []
shot = 0

for dataset in datasets:
    print(f"Processing {dataset} dataset...")
    with open (f'outputs/parser/Test/{dataset}.txt', 'r') as f:
        lines = f.readlines()
    Read = False
    count = 0
    for line in lines:
        if 'len=' in line and 'cluster' in line:
            if count > 5:
                print(length)
                for tmp in tmp_list:
                    print(tmp.strip('\n'))
                shot+=1
            count = 0
            tmp_list = []
            parts = line.strip().split("len=")
            if len(parts) == 2:  # 确保字符串中包含"len="
                tmp = parts[1]
                length = int(tmp)
            if length > 50:
                Read = True
            else:
                Read = False
        else:
            if Read:
                tmp_list.append(line)
                # print(line)
                count += 1

print(shot/len(datasets))
# [147, 49, 156, 98, 65, 202, 45, 61, 82, 361, 143, 164, 93, 43, 203, 69]
# [137, 49, 121, 94, 62, 56, 45, 56, 78, 322, 135, 141, 86, 43, 175, 63]

                

In [20]:
from utils.sample_byword import extract_variables

matches = extract_variables(
    '1 is over than 2 as result', '<*> is over than <*> as result')
if matches == []:
    print('no matches')
if matches == ():
    print("2")
else:
    print(matches)

no matches
[]


In [6]:
template = '<*> is <*>, logname=<*>'
list2 = ['1', '2', '']
list1 = template.split('<*>')
template2 = list1[0]
for index, tmp in enumerate(list2):
    if tmp != '':
        template2 += '<*>' + list1[index + 1]
    else:
        template2 += list1[index + 1]
print(template2)



<*> is <*>, logname=


In [2]:
from utils.postprocess import post_process

post_process('`{{1}} is over {{2}}, logname = {{3}}`', '1 is over 2, logname = ')

ok


('{{1}} is over {{2}}, logname = {{3}}', '<*> is over <*>, logname = ')