## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [5]:
import pandas as pd
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Full list of benchmark datasets; kept so that a complete run only requires
# removing the override below.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# Override: only HealthApp is clustered in this run.
datasets = ['HealthApp']

for dataset in datasets:
    print('-' * 50)
    print(f'Clustering {dataset} dataset...')

    # Load the raw log contents and their ground-truth templates.
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()

    # tokenize -> vectorize -> cluster -> reassign_clusters
    tokenized_logs = [tokenize(log) for log in logs]
    # Vectorize once and reuse the result for both the debug print and the
    # clustering step (assumes vectorize is deterministic -- TODO confirm).
    vectors = vectorize(tokenized_logs)
    print(vectors)
    labels, cluster_nums = cluster(vectors)
    labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

    # Collect, per cluster label, the positional arguments for Cluster:
    # [label, logs, indexs, ground_truth].
    inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
    for i, label in enumerate(labels):
        inputs[label][0] = label
        inputs[label][1].append(logs[i])
        inputs[label][2].append(i)
        # The first log seen for a cluster supplies its ground-truth template.
        if inputs[label][3] == '':
            inputs[label][3] = templates[i]

    # The comprehension variable stays local, so the builtin `input` is not
    # shadowed in the notebook namespace.
    clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

    # Report only the clusters that hold more than 10 log indexes.
    Count = 0
    for c in clusters:
        if len(c.indexs) > 10:
            print(c.logs[0])
            print(c.oracle_template)
            print(len(c.indexs))
            print('=' * 50)
            Count += 1
    print(f'Count : {Count}')

--------------------------------------------------
Clustering HealthApp dataset...
  (0, 100)	1.0
  (1, 97)	1.0
  (2, 16)	0.6248332566193072
  (2, 11)	0.5520794333347835
  (2, 98)	0.5520794333347835
  (3, 12)	0.8517032026021748
  (3, 105)	0.5240244790820354
  (4, 45)	0.5668201002003888
  (4, 121)	0.5825439786011101
  (4, 64)	0.5825439786011101
  (5, 72)	1.0
  (6, 125)	1.0
  (7, 100)	1.0
  (8, 156)	0.7071067811865475
  (8, 31)	0.7071067811865475
  (9, 155)	0.7071067811865475
  (9, 30)	0.7071067811865475
  (10, 0)	0.7064072898835545
  (10, 8)	0.7078055812151892
  (11, 97)	1.0
  (12, 100)	1.0
  (13, 100)	1.0
  (14, 97)	1.0
  (15, 72)	1.0
  (16, 125)	1.0
  :	:
  (1987, 105)	0.6960412725776134
  (1988, 13)	0.7180017735831414
  (1988, 105)	0.6960412725776134
  (1989, 13)	0.7180017735831414
  (1989, 105)	0.6960412725776134
  (1990, 13)	0.7180017735831414
  (1990, 105)	0.6960412725776134
  (1991, 13)	0.7180017735831414
  (1991, 105)	0.6960412725776134
  (1992, 13)	0.7180017735831414
  (1992, 1

## Show Clusters

In [26]:
# Compare the number of clusters found with the number of distinct
# ground-truth templates.
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# Collect, per cluster label, the positional arguments for Cluster:
# [label, logs, indexs, ground_truth].
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
for i, label in enumerate(labels):
    entry = inputs[label]
    entry[0] = label
    entry[1].append(logs[i])
    entry[2].append(i)
    # The first log seen for a cluster supplies its ground-truth template.
    if entry[3] == '':
        entry[3] = df['EventTemplate'][i]

# The comprehension variable stays local, so the builtin `input` is not
# shadowed in the notebook namespace.
clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

# for cluster in clusters:
#     print(f'cluster {cluster.label}: {len(cluster.logs)} logs, {len(cluster.indexs)} indexs')

# num = 27
# print('cluster:', num)
# print('length:', len(clusters[num].indexs))
# print('template:', clusters[num].oracle_template)
# print('len of set:', len(clusters[num].logs))
# print('-'*20)
# for log in clusters[num].logs:
#     print(log)
# print('='*40)

# for cluster in clusters:
#     if len(set(cluster.logs)) == 1 and not any(char.isdigit() for char in cluster.logs[0]):
#         print(f"{cluster.logs[0]}\n{cluster.oracle_template}")
#         print('='*40)

num of clusters: 75
len of templates: 75
onReceive action: android.intent.action.SCREEN_ON
onReceive action: android.intent.action.SCREEN_ON
processHandleBroadcastAction action:android.intent.action.SCREEN_ON
processHandleBroadcastAction action:android.intent.action.SCREEN_ON
flush sensor data
flush sensor data
onReceive action: android.intent.action.SCREEN_OFF
onReceive action: android.intent.action.SCREEN_OFF
processHandleBroadcastAction action:android.intent.action.TIME_TICK
processHandleBroadcastAction action:android.intent.action.TIME_TICK
getBinderPackageName packageName = com.huawei.health
getBinderPackageName packageName = com.huawei.health
needAutoSync autoSyncSwitch is open
needAutoSync autoSyncSwitch is open
initDataPrivacy the dataPrivacy switch is open, start push health data!
initDataPrivacy the dataPrivacy switch is open, start push health data!
initDataPrivacy the dataPrivacy is true
initDataPrivacy the dataPrivacy is <*>
initUserPrivacy the userPrivacy switch is open, 

## Evaluate

In [1]:
from evaluate import evaluate_all_datasets
from IPython.display import HTML

# Evaluate the named run on all datasets; e-mail notification is disabled.
table = evaluate_all_datasets(
    'LogBatcher_0shot_32candidate_10batchsize_with_random',
    send_email=False,
)
# Last expression of the cell: wrap the returned table for HTML display.
HTML(table)

        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 1.0000, Edit Distance: 0.0000, Normalized Edit Distance: 1.000000
      Hadoop: group Accuracy: 0.9890, Message-Level Accuracy: 0.8865, Edit Distance: 6.5760, Normalized Edit Distance: 0.951896
       Spark: group Accuracy: 0.9230, Message-Level Accuracy: 0.9135, Edit Distance: 0.6600, Normalized Edit Distance: 0.984732
   Zookeeper: group Accuracy: 0.9745, Message-Level Accuracy: 0.9675, Edit Distance: 0.5705, Normalized Edit Distance: 0.991814
         BGL: group Accuracy: 0.9925, Message-Level Accuracy: 0.9375, Edit Distance: 1.9295, Normalized Edit Distance: 0.986919
         HPC: group Accuracy: 0.9525, Message-Level Accuracy: 0.9430, Edit Distance: 0.3755, Normalized Edit Distance: 0.995286
 Thunderbird: group Accuracy: 0.9135, Message-Level Accuracy: 0.8525, Edit Distance: 2.2000, Normalized Edit Distance: 0.951577
     Windows: group Accuracy: 1.0000, Message-Level Accuracy: 0.6095, Edit Distance: 12.0005, Normalized

dataset,GA,PA,ED,N_ED
HDFS,1.0,1.0,0.0,1.0
Hadoop,0.989,0.8865,6.576,0.9519
Spark,0.923,0.9135,0.66,0.98473
Zookeeper,0.9745,0.9675,0.5705,0.99181
BGL,0.9925,0.9375,1.9295,0.98692
HPC,0.9525,0.943,0.3755,0.99529
Thunderbird,0.9135,0.8525,2.2,0.95158
Windows,1.0,0.6095,12.0005,0.79537
Linux,0.996,0.728,2.4395,0.96777
Android,0.8605,0.734,4.526,0.94588


In [2]:
from evaluate import evaluate_single_dataset
# Datasets whose parsed output from the Test_10shot_with_pruning run is scored.
datasets = ['BGL', 'HDFS', 'HealthApp', 'OpenStack', 'OpenSSH', 'HPC', 'Zookeeper']
for dataset in datasets:
    # Path of the structured parsing result for this dataset.
    result_csv = f'outputs/parser/Test_10shot_with_pruning/{dataset}_2k.log_structured.csv'
    evaluate_single_dataset(result_csv, dataset)

         BGL: group Accuracy: 0.9870, Message-Level Accuracy: 0.9410, Edit Distance: 1.2245, Normalized Edit Distance: 0.989453
        HDFS: group Accuracy: 1.0000, Message-Level Accuracy: 1.0000, Edit Distance: 0.0000, Normalized Edit Distance: 1.000000
   HealthApp: group Accuracy: 0.9195, Message-Level Accuracy: 0.9140, Edit Distance: 2.5880, Normalized Edit Distance: 0.961141
   OpenStack: group Accuracy: 1.0000, Message-Level Accuracy: 0.9820, Edit Distance: 0.3585, Normalized Edit Distance: 0.995158
     OpenSSH: group Accuracy: 1.0000, Message-Level Accuracy: 0.9755, Edit Distance: 1.0610, Normalized Edit Distance: 0.989056
         HPC: group Accuracy: 0.9525, Message-Level Accuracy: 0.9430, Edit Distance: 0.3350, Normalized Edit Distance: 0.995339
   Zookeeper: group Accuracy: 0.9945, Message-Level Accuracy: 0.9875, Edit Distance: 0.4305, Normalized Edit Distance: 0.994671


## Check out some strings' freq in the whole logs or templates

In [22]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables
from utils.postprocess import correct_single_template
from utils.postprocess import extract_variables

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# Count the distinct ground-truth templates across all datasets.
# `count_templates` keeps them in first-seen order; `seen_templates` mirrors it
# as a set so each membership test is O(1) instead of a list scan.
count_templates = []
seen_templates = set()
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    # templates = list(set(templates))
    for template in templates:
        if template not in seen_templates:
            seen_templates.add(template)
            count_templates.append(template)
count = len(count_templates)
print(count)

Processing BGL ----------------
Processing HDFS ----------------
Processing Linux ----------------
Processing HealthApp ----------------
Processing OpenStack ----------------
Processing OpenSSH ----------------
Processing HPC ----------------
Processing Zookeeper ----------------
Processing Mac ----------------
Processing Hadoop ----------------
Processing Android ----------------
Processing Windows ----------------
Processing Apache ----------------
Processing Thunderbird ----------------
Processing Spark ----------------
1334


## sample based on entropy

In [None]:
from utils.sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])

# Sample from a single dataset and print the first two fields of each
# result's leading element, followed by a separator line.
# NOTE(review): presumably these two fields are a log and its template -- confirm
# against sample_based_on_entropy.
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    head = pair[0]
    print(f"{head[0]}\n{head[1]}\n{'-'*20}")

## Mutation Count -- num

In [None]:

# 4: 15 + 28 + 5 + 18 + 10
# 5: 1 + 2
# 6: 15 + 28 + 5 + 18 + 10

## Calculate Cost

In [28]:
import json
import tiktoken
import pandas as pd




def count_message_tokens(messages, model_name):
    """Return the total token count of a list of chat messages.

    Args:
        messages: iterable of dicts, each with a 'role' and a 'content' entry
            holding text to be encoded.
        model_name: either "gpt-4" or "gpt-3.5-turbo"; selects the tiktoken
            encoder.

    Returns:
        The sum over all messages of the role tokens, the content tokens and
        a fixed overhead of 4 tokens per message.

    Raises:
        ValueError: if ``model_name`` is not one of the two supported models.
    """
    # Reject unsupported models before loading any encoder.
    if model_name not in ("gpt-4", "gpt-3.5-turbo"):
        raise ValueError("未知的模型名称")
    encoder = tiktoken.encoding_for_model(model_name)

    # Per message: role tokens + content tokens + 4 tokens for the special
    # message separators.
    return sum(
        len(encoder.encode(message['role']))
        + len(encoder.encode(message['content']))
        + 4
        for message in messages
    )


# Datasets in the order used for the printed per-dataset lines.
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache OpenSSH OpenStack Mac'
datasets = table_order.split(' ')
logs = {}
counts_token = {}
counts_message = {}
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs[dataset] = df['Content'].tolist()
    counts_token[dataset] = 0
    counts_message[dataset] = 0

# Set view of each dataset's logs so the per-message lookup below is O(1)
# instead of scanning a list.
log_sets = {dataset: set(logs[dataset]) for dataset in datasets}

# Recorded prompt file to analyse (another recorded file: 'cost_lilac_32_3.json').
cost_file = 'outputs/cost/cost_logbatcher_32_1.json'

# Parsed message lists, one per top-level JSON array found in the file.
message_list = []
# Initialise the parser state so a file that does not begin with '[' cannot
# raise NameError.
list_str = ''
start_load = False
with open(cost_file, 'r') as cost_fh:
    for line in cost_fh:
        stripped = line.strip()
        # A line holding only '[' starts a new array.
        if stripped == '[':
            list_str = ''
            start_load = True
        # A line holding only ']' ends the array: parse the accumulated text.
        if stripped == ']':
            list_str += line
            message_list.append(json.loads(list_str))
            start_load = False
        if start_load:
            list_str += line
# print(len(message_list))

for message in message_list:
    # for LILAC
    log = message[-1]['content'].split('\n')[0].replace('Log message: `', '').replace('`', '')
    # for LogBatcher
    # log = message[-1]['content'].split('\n')[0]
    # print(log)
    for dataset in datasets:
        if log in log_sets[dataset]:
            counts_token[dataset] += count_message_tokens(message, 'gpt-3.5-turbo')
            counts_message[dataset] += 1
            break
    else:
        # No dataset contains this log: print it so it can be inspected.
        print(log)

# Per dataset: total tokens and average tokens per matched message
# (0.0 when no message matched, to avoid dividing by zero).
for dataset in datasets:
    matched = counts_message[dataset]
    average = counts_token[dataset] / matched if matched else 0.0
    print(counts_token[dataset], average)

# Overall totals across all datasets.
total_tokens = sum(counts_token.values())
total_messages = sum(counts_message.values())
print(total_tokens, total_tokens / total_messages if total_messages else 0.0)

# remove the same log messages

# def make_hashable(log_list):

#     return tuple(tuple(sorted(d.items())) for d in log_list)
# unique_lists = list(set(make_hashable(log_list) for log_list in message_list))

# unique_big_list = [list(map(dict, log_list)) for log_list in unique_lists]
# print(len(unique_big_list))

7139 509.92857142857144
22342 204.9724770642202
6609 200.27272727272728
9222 174.0
23509 195.90833333333333
6639 144.32608695652175
25522 157.54320987654322
8501 170.02
17485 158.95454545454547
31405 196.28125
10799 143.98666666666668
1234 205.66666666666666
5853 225.1153846153846
16935 413.0487804878049
73047 216.7566765578635
266241 198.39120715350222
