## DBCSAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [18]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# select the dataset
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# num_list = []
# datasets = ['OpenStack']
# for dataset in datasets:
dataset = 'Spark'
print(f'Processing {dataset} dataset...')
# load the dataset
df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs = df['Content'].tolist()[200:400]
templates = df['EventTemplate'].tolist()[200:400]

# tokenize -> vectorize -> cluster -> reassign_clusters
tokenized_logs = [tokenize(log) for log in logs]
labels, cluster_nums = cluster(vectorize(tokenized_logs))
labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

Processing Spark dataset...




In [31]:
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# store the logs in the cluster
inputs = []
for i in range(cluster_nums):
    inputs.append([-1, [], [], '']) # label, logs, indexs, ground_truth
for i, label in enumerate(labels):
    inputs[label][0] = label
    inputs[label][1].append(logs[i])
    inputs[label][2].append(i)
    if inputs[label][3] == '':
        inputs[label][3] = df['EventTemplate'][i]

num = 8
print('cluster:', num)
print('length:', len(inputs[num][1]))
print('template:', inputs[num][3])
print('-'*20)
for log in set(inputs[num][1]):
    print(log)
print('='*40)



num of clusters: 11
len of templates: 10
cluster: 8
length: 6
template: Started reading broadcast variable <*>
--------------------
Started reading broadcast variable 17
Started reading broadcast variable 19
Started reading broadcast variable 15
Started reading broadcast variable 16
Started reading broadcast variable 14
Started reading broadcast variable 18


In [None]:
# check the cluster k
# k = 0
# lengh_cluster = len(inputs[k][1])
# print('cluster ', k)
# print('length:', lengh_cluster)
# print('template:', inputs[k][3])
# print('-'*20)
# for log in set(inputs[k][1]):
#     print(log)

#      len
# Linux 0.5   tokenize '=' difference between (<*>) and () group first will help
# HealthApp: 1   same length, 2 words different(80 logs) refine by difference of words will help
# Zookeeper: 0 same length, 2 words different(12 logs)
# Hadoop: 0 same length 1 words different(118 logs)
# Spark: 0  same length 1 words different(149 logs)

# good cluster datasets
# HDFS OpenStack Proxifier HPC Mac Windows Apache Thunderbird
# length solved datasets
# BGL OpenSSH Android
# 

## evaluate

In [6]:
import os
from utils.evaluator import evaluate
import pandas as pd
from IPython.display import HTML


def calculate_avg(numbers):
    avg = sum(numbers) / len(numbers)
    numbers.append(avg)
    numbers = [round(num, 3) for num in numbers]
    return numbers

table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache Proxifier OpenSSH OpenStack Mac'
datasets = table_order.split(' ')

table_data = {
    'dataset': [],
    'GA': [],
    'PA': [],
    'ED': []
}

# Note: chage the file name to the name of the log file
results_file_name = ['0shot_bestTry', 'Test']
file_name = 'Test_batch_match_first'

result_table_path = f'outputs/parser/{file_name}/result_tabel.csv'
if os.path.exists(result_table_path):
    df = pd.read_csv(result_table_path)
else:
    ga, pa ,ed = [],[],[]
    for dataset in datasets:
        table_data['dataset'].append(dataset)
        file_path = f'outputs/parser/{file_name}/{dataset}.csv'
        
        a,b,c,d = evaluate(file_path, dataset)
        ga.append(a)
        pa.append(b)
        ed.append(c)

    table_data['dataset'].append('avg')
    table_data['GA'] = calculate_avg(ga)
    table_data['PA'] = calculate_avg(pa)
    table_data['ED'] = calculate_avg(ed)

    df = pd.DataFrame(table_data)
    df.to_csv(result_table_path, index=False)

table = df.to_html(index=False)
display(HTML(table))

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 1.0000, Edit Distance: 0.0000
      Hadoop: group Accuracy: 0.9835, Message-Level Accuracy: 0.8755, Edit Distance: 6.7150
       Spark: group Accuracy: 0.9785, Message-Level Accuracy: 0.9510, Edit Distance: 1.4710
   Zookeeper: group Accuracy: 0.9700, Message-Level Accuracy: 0.9615, Edit Distance: 0.4110
         BGL: group Accuracy: 0.9655, Message-Level Accuracy: 0.9205, Edit Distance: 2.1070
         HPC: group Accuracy: 0.8580, Message-Level Accuracy: 0.8460, Edit Distance: 1.8715
 Thunderbird: group Accuracy: 0.9640, Message-Level Accuracy: 0.5595, Edit Distance: 6.8510
     Windows: group Accuracy: 0.9905, Message-Level Accuracy: 0.8270, Edit Distance: 2.7905
       Linux: group Accuracy: 0.7445, Message-Level Accuracy: 0.7060, Edit Distance: 1.2305
     Android: group Accuracy: 0.7520, Message-Level Accuracy: 0.5290, Edit Distance: 9.3950
   HealthApp: group Accuracy: 0.8990, Message-Level Accuracy: 0.7610, Edit Dista

dataset,GA,PA,ED
HDFS,1.0,1.0,0.0
Hadoop,0.984,0.876,6.715
Spark,0.979,0.951,1.471
Zookeeper,0.97,0.962,0.411
BGL,0.966,0.92,2.107
HPC,0.858,0.846,1.872
Thunderbird,0.964,0.56,6.851
Windows,0.991,0.827,2.79
Linux,0.745,0.706,1.23
Android,0.752,0.529,9.395


## Find similarity in all datasets

In [6]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

count_logs = []
count_templates = []

for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for log, template in zip(logs, templates):
        if template not in count_templates:
            count_templates.append(template)
            if '<*>:' in template:
                print(f"{template}")

Processing BGL ----------------
ciod: Error loading <*>: invalid or missing program image, No such file or directory
ciod: Error creating node map from file <*>: No child processes
ciod: Error creating node map from file <*>: Bad file descriptor
ciod: Error creating node map from file <*>: Block device required
ciod: Error creating node map from file <*>: Permission denied
rts: kernel terminated for reason <*>: bad message header: invalid cpu, type=<*>, cpu=<*>, index=<*>, total=<*>
ciod: Error loading <*>: invalid or missing program image, Exec format error
ciod: Error reading message prefix after LOAD_MESSAGE on CioStream socket to <*>: Link has been severed
ciod: Error loading <*>: invalid or missing program image, Permission denied
ciod: Error loading <*>: program image too big, <*> > <*>
Processing HDFS ----------------
<*>:Got exception while serving <*> to <*>:
Processing Linux ----------------
<*>: Auto-detected intellimouse <*>
<*>: <*> - <*> (usable)
<*>: <*> - <*> (reserved)

In [None]:
from utils.demonstrations_sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    print(f"{pair[0][0]}\n{pair[0][1]}\n{'-'*20}")

Mutation
```
# if '3' in template:
#     Count[3] += 1
# if '4' in template:
#     Count[4] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
# if '5' in template:
#     Count[5] += 1
#     print(dataset, template)
#     # 1 + 2
# if '6' in template:
#     Count[6] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
```

In [4]:
from collections import Counter
import pandas as pd
import re
from utils.cluster import tokenize



datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


a,b = 0,0
pattern = r'^[a-zA-Z]+[0-9]+$'



for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    list_log = []
    list_tmp = []
    print('-'*20)
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    logs = df['Content'].tolist()
    freq = Counter(templates)
    
    for template,log in zip(templates,logs):
        if '{' in template or '}' in template and template not in list_tmp:
            list_tmp.append(template)
            list_log.append(log)
            
    for tmp in list_tmp:
        print(tmp)

Processing BGL ----------------
--------------------
Processing HDFS ----------------
--------------------
Processing Linux ----------------
--------------------
Processing HealthApp ----------------
--------------------
startSync hiSyncOption = HiSyncOption{syncAction=<*>, syncMethod=<*>, syncScope=<*>, syncDataType=<*>, syncModel=<*>, pushAction=<*>},app = <*> who = <*>
startSync hiSyncOption = HiSyncOption{syncAction=<*>, syncMethod=<*>, syncScope=<*>, syncDataType=<*>, syncModel=<*>, pushAction=<*>},app = <*> who = <*>
startSync hiSyncOption = HiSyncOption{syncAction=<*>, syncMethod=<*>, syncScope=<*>, syncDataType=<*>, syncModel=<*>, pushAction=<*>},app = <*> who = <*>
startSync hiSyncOption = HiSyncOption{syncAction=<*>, syncMethod=<*>, syncScope=<*>, syncDataType=<*>, syncModel=<*>, pushAction=<*>},app = <*> who = <*>
startSync hiSyncOption = HiSyncOption{syncAction=<*>, syncMethod=<*>, syncScope=<*>, syncDataType=<*>, syncModel=<*>, pushAction=<*>},app = <*> who = <*>
Processin

In [6]:
import pandas as pd
from utils.postprocess import correct_single_template

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',

            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']




# datasets = ['Linux']

count_list = []

shot = 0

for dataset in datasets:
    print(f"Processing {dataset} dataset...")
    with open (f'outputs/parser/Test/{dataset}.txt', 'r') as f:
        lines = f.readlines()
    Read = False
    count = 0
    for line in lines:
        if 'len=' in line and 'cluster' in line:
            if count > 5:
                print(length)
                shot+=1
            count = 0
            parts = line.strip().split("len=")
            if len(parts) == 2:  # 确保字符串中包含"len="
                tmp = parts[1]
                length = int(tmp)
            if length > 50:
                Read = True
            else:
                Read = False
        else:
            if Read:
                count += 1

print(shot/len(datasets))
# [147, 49, 156, 98, 65, 202, 45, 61, 82, 361, 143, 164, 93, 43, 203, 69]
# [137, 49, 121, 94, 62, 56, 45, 56, 78, 322, 135, 141, 86, 43, 175, 63]

                

Processing BGL dataset...
60
51
Processing HDFS dataset...
Processing Linux dataset...
372
216
118
Processing HealthApp dataset...
242
144
Processing OpenStack dataset...
931
Processing OpenSSH dataset...
384
Processing Proxifier dataset...
954
947
Processing HPC dataset...
394
91
60
Processing Zookeeper dataset...
266
Processing Mac dataset...
Processing Hadoop dataset...
476
326
Processing Android dataset...
200
Processing Windows dataset...
280
224
224
Processing Apache dataset...
Processing Thunderbird dataset...
568
62
Processing Spark dataset...
80
1.5


In [14]:
101 // 50

2