## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [25]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Cluster every benchmark dataset and record how many clusters each produces.
# select the dataset
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# One cluster count per dataset, in the same order as `datasets`.
num_list = []
# datasets = ['OpenStack']
for dataset in datasets:
    print(f'Processing {dataset} dataset...')
    # load the dataset
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()

    # tokenize -> vectorize -> cluster -> reassign_clusters
    # (the reassign_clusters step is currently disabled, see the commented line below)
    tokenized_logs = [tokenize(log) for log in logs]
    labels, cluster_nums = cluster(vectorize(tokenized_logs))
    num_list.append(cluster_nums)
    # labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)
# NOTE: df, logs, templates, tokenized_logs, labels and cluster_nums are
# overwritten on every iteration, so after the loop they hold the values of
# the last dataset processed only.

Processing BGL dataset...
Processing HDFS dataset...
Processing Linux dataset...




Processing HealthApp dataset...
Processing OpenStack dataset...
Processing OpenSSH dataset...




Processing Proxifier dataset...
Processing HPC dataset...
Processing Zookeeper dataset...




Processing Mac dataset...
Processing Hadoop dataset...




Processing Android dataset...
Processing Windows dataset...




Processing Apache dataset...
Processing Thunderbird dataset...
Processing Spark dataset...




In [27]:
# Show the per-dataset cluster counts collected in num_list.
print(num_list)
# Mean cluster count per dataset after subtracting the hard-coded offsets
# 14, 3 and 77 from the total.
# NOTE(review): nothing in this file explains these constants; presumably
# they are manual corrections for specific datasets -- confirm before reuse.
print((sum(num_list)-14-3-77)/len(num_list))

[44, 11, 33, 25, 40, 28, 6, 26, 18, 107, 28, 61, 8, 6, 26, 16]
24.3125


In [22]:
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# Group the logs by cluster label.
# Each record is [label, logs, row indexes, ground-truth template]; the
# ground truth is the template of the first log that lands in the cluster.
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
for row, label in enumerate(labels):
    record = inputs[label]
    record[0] = label
    record[1].append(logs[row])
    record[2].append(row)
    if record[3] == '':
        record[3] = df['EventTemplate'][row]

# Dump one cluster: its size, its ground-truth template and its distinct logs.
num = 17
chosen = inputs[num]
print('cluster:', num)
print('length:', len(chosen[1]))
print('template:', chosen[3])
print('-' * 20)
for unique_log in set(chosen[1]):
    print(unique_log)
print('=' * 40)



num of clusters: 61
len of templates: 158
cluster: 17
length: 85
template: HBM brightnessOut =<*>
--------------------
HBM brightnessOut =38


In [None]:
# check the cluster k
# k = 0
# lengh_cluster = len(inputs[k][1])
# print('cluster ', k)
# print('length:', lengh_cluster)
# print('template:', inputs[k][3])
# print('-'*20)
# for log in set(inputs[k][1]):
#     print(log)

#      len
# Linux 0.5   tokenize '=' difference between (<*>) and () group first will help
# HealthApp: 1   same length, 2 words different(80 logs) refine by difference of words will help
# Zookeeper: 0 same length, 2 words different(12 logs)
# Hadoop: 0 same length 1 words different(118 logs)
# Spark: 0  same length 1 words different(149 logs)

# good cluster datasets
# HDFS OpenStack Proxifier HPC Mac Windows Apache Thunderbird
# length solved datasets
# BGL OpenSSH Android
# 

## evaluate

In [28]:
import time
from utils.evaluator import evaluate
import pandas as pd
# Evaluate the parser output of every dataset and print the averaged metrics.
# NOTE: this first list is replaced a few lines below by the order given in
# table_order, so only table_order determines the iteration order.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache Proxifier OpenSSH OpenStack Mac'

datasets = table_order.split(' ')
# Accumulators for the four values returned by evaluate(), one entry per dataset.
# The summary print below labels m as group accuracy, n as message-level
# accuracy and p as edit distance; q is collected but not used in this cell.
m,n,p,q = [],[],[],[]
for dataset in datasets:
    file = f'outputs/parser/Test1/{dataset}.csv'  # Fifth_=_0.1
    # df = pd.read_csv(f'outputs/k_means/initial/{dataset}.csv')
    # df2 =
    # NOTE(review): the meaning of the fourth return value and of
    # mismatch=True is defined in utils.evaluator -- confirm there.
    a,b,c,d = evaluate(file, dataset,mismatch=True)
    m.append(a)
    n.append(b)
    p.append(c)
    q.append(d)

# Unweighted mean of each metric over all datasets.
print('avg---------: group Accuracy: %.4f, Message-Level Accuracy: %.4f, Edit Distance: %.4f' % (sum(m)/len(m), sum(n)/len(n), sum(p)/len(p)))

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 0.9995, Edit Distance: 0.0025
      Hadoop: group Accuracy: 0.9925, Message-Level Accuracy: 0.9855, Edit Distance: 0.4690
       Spark: group Accuracy: 0.9985, Message-Level Accuracy: 0.9960, Edit Distance: 0.1100
   Zookeeper: group Accuracy: 0.9945, Message-Level Accuracy: 0.9900, Edit Distance: 0.0930
         BGL: group Accuracy: 0.9900, Message-Level Accuracy: 0.9835, Edit Distance: 0.4530
         HPC: group Accuracy: 0.9345, Message-Level Accuracy: 0.9670, Edit Distance: 0.3395
 Thunderbird: group Accuracy: 0.9865, Message-Level Accuracy: 0.9770, Edit Distance: 0.2580
     Windows: group Accuracy: 0.9955, Message-Level Accuracy: 0.9880, Edit Distance: 0.9870
       Linux: group Accuracy: 0.6305, Message-Level Accuracy: 0.8950, Edit Distance: 1.3375
     Android: group Accuracy: 0.9520, Message-Level Accuracy: 0.8615, Edit Distance: 2.2785
   HealthApp: group Accuracy: 1.0000, Message-Level Accuracy: 0.9960, Edit Dista

2000


## Calculate the information entropy

In [None]:
import math
from collections import Counter
import pandas as pd
import re

def extract_variables(log, template):
    """Extract the substrings of ``log`` that fill the ``<*>`` slots of ``template``.

    The template is split on ``<*>``, every literal piece is regex-escaped,
    and the pieces are rejoined with a non-greedy capture group. The pattern
    is anchored at both ends so the whole log has to match the template.

    Returns the tuple of captured groups when the log matches, otherwise an
    empty list.
    """
    literal_pieces = map(re.escape, template.split("<*>"))
    anchored_pattern = "^" + "(.*?)".join(literal_pieces) + "$"

    found = re.search(anchored_pattern, log)
    return found.groups() if found else []

def calculate_entropy(lst):
    """Return the Shannon entropy (base 2) of the element frequencies in ``lst``.

    Each distinct element contributes ``p * log2(p)``, where ``p`` is its
    relative frequency; the result is the negated sum of those terms.
    """
    total = len(lst)
    weighted_logs = []
    for occurrences in Counter(lst).values():
        share = occurrences / total
        weighted_logs.append(share * math.log2(share))

    return -sum(weighted_logs)
def select_log_template_pairs_based_on_entropy(pairs, num_examples):
    """Return the ``num_examples`` pairs with the highest character entropy.

    A pair is scored by the entropy of the characters of its first element
    followed by the characters of its second element. Pairs are ranked from
    highest to lowest score; pairs with equal scores keep their input order.
    """
    def char_entropy(pair):
        # Score over the concatenated characters of both elements.
        return calculate_entropy(list(pair[0]) + list(pair[1]))

    ranked = sorted(pairs, key=char_entropy, reverse=True)
    return ranked[:num_examples]

# Build the demonstration pool: for every distinct template across all
# datasets, keep the first (log, template) pair encountered, then print the
# ten pairs with the highest character entropy.
# discard the target dataset
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
            'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# datasets.remove('BGL')
demonstration_templates = []
demonstration_logs = []
pairs = []
# Set mirror of demonstration_templates for O(1) duplicate checks.
seen_templates = set()
for d in datasets:
    # Forward slashes work on every platform; the previous backslash path
    # relied on the invalid escape sequence `\{` and only resolved on Windows.
    df = pd.read_csv(f'dataset/{d}/{d}_2k.log_structured_corrected.csv')
    list1 = df['Content'].tolist()
    list2 = df['EventTemplate'].tolist()
    for log, template in zip(list1, list2):
        if template in seen_templates:
            continue
        seen_templates.add(template)
        pairs.append((log, template))
        demonstration_templates.append(template)
        demonstration_logs.append(log)

# Do not name this result `list`: that shadows the builtin, which
# select_log_template_pairs_based_on_entropy calls, and breaks any re-run.
selected_pairs = select_log_template_pairs_based_on_entropy(pairs, 10)
for log, template in selected_pairs:
    print(log)
    print(template)
    

## Find similarity in all datasets

In [None]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
    'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

count_logs = []
count_templates = []

for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for log, template in zip(logs, templates):
        if template not in count_templates:
            count_templates.append(template)
            if any(char.isdigit() for char in template):
                print(f"{template}")