## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [None]:
import pandas as pd
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Candidate dataset names; each one is read from
# dataset/<name>/<name>_2k.log_structured_corrected.csv below.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# Override: restrict this run to a single dataset. Remove this line to
# process the full list above.
datasets = ['BGL']

for dataset in datasets:
    print('-' * 50)
    print(f'Clustering {dataset} dataset...')

    # load the dataset
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()

    # tokenize -> vectorize -> cluster -> reassign_clusters
    tokenized_logs = [tokenize(log) for log in logs]
    # Vectorize once and reuse the result, so the printed vectors are exactly
    # the ones handed to cluster() and the work is not done twice.
    vectorized_logs = vectorize(tokenized_logs)
    print(vectorized_logs)
    labels, cluster_nums = cluster(vectorized_logs)
    labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

    # store the logs in the cluster
    # inputs = []
    # for i in range(cluster_nums):
    #     inputs.append([-1, [], [], '']) # label, logs, indexs, ground_truth
    # for i, label in enumerate(labels):
    #     inputs[label][0] = label
    #     inputs[label][1].append(logs[i])
    #     inputs[label][2].append(i)
    #     if inputs[label][3] == '':
    #         inputs[label][3] = df['EventTemplate'][i]

    # clusters = []
    # for input in inputs:
    #     c = Cluster(*input, remove_duplicate= True)
    #     clusters.append(c)
    # Count = 0
    # for c in clusters:    
    #     if len(c.indexs) > 10:
    #         print(c.logs[0])
    #         print(c.oracle_template)
    #         print(len(c.indexs))
    #         print('=' * 50)
    #         Count+=1
    # print(f'Count : {Count}')

## Show Clusters

In [None]:
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# Group the logs by cluster label. Each entry holds the positional arguments
# passed to Cluster below: [label, logs, indexs, ground_truth].
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
for i, label in enumerate(labels):
    entry = inputs[label]
    entry[0] = label
    entry[1].append(logs[i])
    entry[2].append(i)
    # Fill the ground-truth slot only while it still holds the '' placeholder.
    if entry[3] == '':
        entry[3] = df['EventTemplate'][i]

# Build the Cluster objects without binding the name `input`, which would
# shadow the builtin input() for the rest of the session.
clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

# Inspect one cluster: its size, its ground-truth template and its distinct
# static logs.
num = 1
selected = clusters[num]
print('cluster:', num)
print('length:', len(selected.indexs))
print('template:', selected.oracle_template)
set_logs = set(selected.static_logs)
print('len of set:', len(set_logs))
print('-'*20)
for log in set_logs:
    print(log)
print('='*40)

# for cluster in clusters:
#     if len(set(cluster.logs)) == 1 and not any(char.isdigit() for char in cluster.logs[0]):
#         print(f"{cluster.logs[0]}\n{cluster.oracle_template}")
#         print('='*40)

## Evaluate

In [59]:
from evaluate import evaluate_all_datasets
from IPython.display import HTML

# Name of the run to evaluate; passed to evaluate_all_datasets unchanged.
run_name = 'LogBatcher_0shot_32candidate_10batchsize_with_smilarity_sample'
table = evaluate_all_datasets(run_name, send_email=False)
# Last expression of the cell: wrap the returned table for HTML display.
HTML(table)

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 1.0000, Edit Distance: 0.0000, Normalized Edit Distance: 1.000000
      Hadoop: group Accuracy: 0.9885, Message-Level Accuracy: 0.8850, Edit Distance: 6.6165, Normalized Edit Distance: 0.951579
       Spark: group Accuracy: 0.9790, Message-Level Accuracy: 0.9535, Edit Distance: 0.4600, Normalized Edit Distance: 0.987549
   Zookeeper: group Accuracy: 0.9745, Message-Level Accuracy: 0.8340, Edit Distance: 1.9040, Normalized Edit Distance: 0.933939
         BGL: group Accuracy: 0.9925, Message-Level Accuracy: 0.9455, Edit Distance: 1.8415, Normalized Edit Distance: 0.988423
         HPC: group Accuracy: 0.9525, Message-Level Accuracy: 0.9370, Edit Distance: 0.4215, Normalized Edit Distance: 0.994895
 Thunderbird: group Accuracy: 0.9190, Message-Level Accuracy: 0.8585, Edit Distance: 2.0180, Normalized Edit Distance: 0.953605
     Windows: group Accuracy: 0.9960, Message-Level Accuracy: 0.6075, Edit Distance: 9.0435, Normalized 

dataset,GA,PA,ED,N_ED
HDFS,1.0,1.0,0.0,1.0
Hadoop,0.9885,0.885,6.6165,0.95158
Spark,0.979,0.9535,0.46,0.98755
Zookeeper,0.9745,0.834,1.904,0.93394
BGL,0.9925,0.9455,1.8415,0.98842
HPC,0.9525,0.937,0.4215,0.99489
Thunderbird,0.919,0.8585,2.018,0.9536
Windows,0.996,0.6075,9.0435,0.86132
Linux,0.996,0.2755,9.637,0.81082
Android,0.836,0.7725,4.4225,0.94423


In [None]:
from evaluate import evaluate_single_dataset
# Datasets whose parser output is evaluated one by one.
datasets = ['BGL', 'HDFS', 'HealthApp', 'OpenStack', 'OpenSSH', 'HPC', 'Zookeeper']
for dataset in datasets:
    # Path of the structured parser output for this dataset.
    result_path = f'outputs/parser/Test_10shot_with_pruning/{dataset}_2k.log_structured.csv'
    evaluate_single_dataset(result_path, dataset)

## Check out some strings' freq in the whole logs or templates

In [None]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables
from utils.postprocess import correct_single_template
from utils.postprocess import extract_variables

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# Count the distinct EventTemplate values across all datasets.
count = 0
count_templates = []     # distinct templates, in first-seen order
seen_templates = set()   # same contents as count_templates, for O(1) membership tests
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    # Not read in this loop; kept so `logs` still reflects the last dataset
    # after the cell has run.
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for template in templates:
        if template not in seen_templates:
            seen_templates.add(template)
            count_templates.append(template)
            count += 1
print(count)

## sample based on entropy

In [None]:
from utils.sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])
# Sample from a single dataset and print the first two fields of each pair's
# leading element, followed by a separator line.
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
separator = '-' * 20
for pair in pairs:
    head = pair[0]
    print(f"{head[0]}\n{head[1]}\n{separator}")

## Token Calculation

In [None]:
import json
import tiktoken
import pandas as pd


def count_prompt_tokens(prompt, model_name):
    """Return the number of tokens `prompt` encodes to for `model_name`.

    Only "gpt-4" and "gpt-3.5-turbo" are accepted; any other model name
    raises ValueError before an encoder is looked up.
    """
    if model_name not in ("gpt-4", "gpt-3.5-turbo"):
        raise ValueError("未知的模型名称")

    # Load the encoder that matches the model, then count the encoded tokens.
    tokenizer = tiktoken.encoding_for_model(model_name)
    return len(tokenizer.encode(prompt))


def count_message_tokens(messages, model_name):
    """Return the total token count of a list of chat messages.

    Each message is a mapping with 'role' and 'content' entries. Its cost is
    the encoded length of both strings plus 4 tokens for the message
    separators. Only "gpt-4" and "gpt-3.5-turbo" are accepted; any other
    model name raises ValueError before an encoder is looked up.
    """
    if model_name not in ("gpt-4", "gpt-3.5-turbo"):
        raise ValueError("未知的模型名称")

    # Load the encoder that matches the model.
    tokenizer = tiktoken.encoding_for_model(model_name)

    # Sum the per-message cost: role tokens + content tokens + 4 separator tokens.
    return sum(
        len(tokenizer.encode(msg['role'])) + len(tokenizer.encode(msg['content'])) + 4
        for msg in messages
    )

## Calculate Cost

In [None]:



# Datasets, in the order in which their figures are printed below.
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache OpenSSH OpenStack Mac'
datasets = table_order.split(' ')
logs = {}
counts_token = {}
counts_message = {}
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs[dataset] = df['Content'].tolist()
    counts_token[dataset] = 0
    counts_message[dataset] = 0
# Set view of every dataset's logs so each membership test below is O(1)
# instead of a scan over the whole list.
log_lookup = {dataset: set(logs[dataset]) for dataset in datasets}
# NOTE(review): a loop used to call log.strip() on every HealthApp log and
# discard the result, so it had no effect and was dropped. If stripping was
# intended, apply it to logs['HealthApp'] before building log_lookup.

# Parsed message lists, one per top-level JSON array found in the cost file.
message_list = []
# The file is read as a sequence of pretty-printed JSON arrays: a line that
# is exactly '[' starts an array and a line that is exactly ']' ends it.
cost_path = 'outputs/cost/LogBatcher_3shot_32candidate_10batchsize.json'
# Initialise the parser state up front so a file that does not start with
# '[' cannot trigger a NameError.
list_str = ''
start_load = False
with open(cost_path, 'r') as cost_file:
    for line in cost_file:
        stripped = line.strip()
        if stripped == '[':
            list_str = ''
            start_load = True
        if stripped == ']':
            list_str += line
            message_list.append(json.loads(list_str))
            start_load = False
        if start_load:
            list_str += line

for message in message_list:
    # for LILAC: the log is the first line of the last message, with the
    # 'Log message: `' prefix and any backticks removed.
    # (for LogBatcher the first line would be used as-is.)
    log = message[-1]['content'].split('\n')[0].replace('Log message: `', '').replace('`', '')
    for dataset in datasets:
        if log in log_lookup[dataset]:
            counts_token[dataset] += count_message_tokens(message, 'gpt-3.5-turbo')
            counts_message[dataset] += 1
            break
    else:
        # No dataset contains this log: print it so it can be inspected.
        print(log)

# Per-dataset token total and average tokens per message (0.0 when a dataset
# matched no message, instead of dividing by zero).
for dataset in datasets:
    matched = counts_message[dataset]
    average = counts_token[dataset] / matched if matched else 0.0
    print(counts_token[dataset], average)

# Overall token total and average tokens per message.
total_tokens = sum(counts_token.values())
total_messages = sum(counts_message.values())
print(total_tokens, total_tokens / total_messages if total_messages else 0.0)

# remove the same log messages

# def make_hashable(log_list):

#     return tuple(tuple(sorted(d.items())) for d in log_list)
# unique_lists = list(set(make_hashable(log_list) for log_list in message_list))

# unique_big_list = [list(map(dict, log_list)) for log_list in unique_lists]
# print(len(unique_big_list))

In [None]:


# Datasets, in the order in which their figures are printed below.
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache OpenSSH OpenStack Mac'
datasets = table_order.split(' ')
logs = {}  # reset only; not read in this cell
counts_token = {}
counts_message = {}
for dataset in datasets:
    counts_token[dataset] = 0
    counts_message[dataset] = 0
    # Each file is loaded as one JSON document and iterated as a collection
    # of prompt strings.
    with open(f'DivLog/cost/cost_divlog_for_{dataset}.json', 'r') as prompt_file:
        prompt_list = json.load(prompt_file)
    for prompt in prompt_list:
        counts_token[dataset] += count_prompt_tokens(prompt, 'gpt-3.5-turbo')
        counts_message[dataset] += 1
    # Token total and average per prompt (0.0 for an empty prompt list,
    # instead of dividing by zero).
    n_prompts = counts_message[dataset]
    average = counts_token[dataset] / n_prompts if n_prompts else 0.0
    print(counts_token[dataset], average)

# print the overall token total and the average token count per prompt
total_tokens = sum(counts_token.values())
total_prompts = sum(counts_message.values())
print(total_tokens, total_tokens / total_prompts if total_prompts else 0.0)

In [None]:
# Token count of the fixed extraction instruction under gpt-3.5-turbo.
instruction = (
    'For each log after <prompt> tag, extract one log template'
    '(substitute variable tokens in the log as <*> and remain constant tokens '
    'to construct the template)'
    'and put the template after <extraction> tag and between <START> and <END> tags.'
)
print(count_prompt_tokens(instruction, 'gpt-3.5-turbo'))

In [None]:
from utils.sample_byword import extract_variables
# Try extract_variables on one log/template pair and print what it returns.
sample_log = "488205 floating point alignment exceptions"
sample_template = "<*> floating point alignment exceptions"
print(extract_variables(sample_log, sample_template))

In [None]:
list1 = [1, 2, 3, 4, 5]


class test:
    """Small holder whose `list2` attribute starts as [1, 2, 3, 4, 7]."""

    def __init__(self):
        self.list2 = [1, 2, 3, 4, 7]


t_class = test()
# Walk list1 and remove the value 4 from the other list (list2); list1 itself
# is never modified while it is being iterated.
for value in list1:
    if value != 4:
        continue
    t_class.list2.remove(value)
print(t_class.list2)
print(list1)