## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [1]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Benchmark datasets available for this experiment; only the one assigned to
# `dataset` below is processed (loop over `datasets` to process all of them).
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
dataset = 'Proxifier'
print(f'Processing {dataset} dataset...')

# Read the structured log file and pull out the 'Content' and 'EventTemplate' columns.
df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs, templates = (df[column].tolist() for column in ('Content', 'EventTemplate'))

# Pipeline: tokenize -> vectorize -> cluster -> reassign_clusters
tokenized_logs = list(map(tokenize, logs))
vectors = vectorize(tokenized_logs)
labels, cluster_nums = cluster(vectors)
labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

Processing Proxifier dataset...




In [20]:
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# Group the logs by cluster label.
# Each slot is [label, logs, indexes, ground_truth]; ground_truth is the
# template of the first log that landed in the cluster.
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
# NOTE(review): a negative label would silently index from the end of `inputs`
# (e.g. -1 lands in the last cluster) -- confirm reassign_clusters never
# returns negative labels.
for i, label in enumerate(labels):
    entry = inputs[label]
    entry[0] = label
    entry[1].append(logs[i])
    entry[2].append(i)
    if entry[3] == '':
        # Use the `templates` list rather than `df`: other cells reassign `df`,
        # so reading the column from it here depends on execution order.
        entry[3] = templates[i]

# Inspect a single cluster: its size, ground-truth template and distinct logs.
num = 10
print('cluster:', num)
print('length:', len(inputs[num][1]))
print('template:', inputs[num][3])
print('-'*20)
for log in set(inputs[num][1]):
    print(log)
print('='*40)



num of clusters: 11
len of templates: 8
cluster: 10
length: 31
template: <*> open through proxy <*> SOCKS5
--------------------
16.client-channel.google.com:443 error : Could not connect to proxy proxy.cse.cuhk.edu.hk:5070 - connection attempt failed with error 10061
13.client-channel.google.com:443 error : Could not connect to proxy proxy.cse.cuhk.edu.hk:5070 - connection attempt failed with error 10061
www.google.com:443 error : Could not connect to proxy proxy.cse.cuhk.edu.hk:5070 - connection attempt failed with error 10061
183.62.156.108:22 open through proxy socks.cse.cuhk.edu.hk:5070 SOCKS5
bolt.dropbox.com:443 error : Could not connect to proxy proxy.cse.cuhk.edu.hk:5070 - connection attempt failed with error 10061
config.pinyin.sogou.com:80 error : Could not connect through proxy proxy.cse.cuhk.edu.hk:5070 - Proxy closed the connection unexpectedly.
qa.sockets.stackexchange.com:443 error : Could not connect to proxy proxy.cse.cuhk.edu.hk:5070 - connection attempt failed with e

In [None]:
# check the cluster k
# k = 0
# lengh_cluster = len(inputs[k][1])
# print('cluster ', k)
# print('length:', lengh_cluster)
# print('template:', inputs[k][3])
# print('-'*20)
# for log in set(inputs[k][1]):
#     print(log)

#      len
# Linux 0.5   tokenize '=' difference between (<*>) and () group first will help
# HealthApp: 1   same length, 2 words different(80 logs) refine by difference of words will help
# Zookeeper: 0 same length, 2 words different(12 logs)
# Hadoop: 0 same length 1 words different(118 logs)
# Spark: 0  same length 1 words different(149 logs)

# good cluster datasets
# HDFS OpenStack Proxifier HPC Mac Windows Apache Thunderbird
# length solved datasets
# BGL OpenSSH Android
# 

## evaluate

In [4]:
import os
from utils.evaluator import evaluate
import pandas as pd
from IPython.display import HTML


def calculate_avg(numbers):
    """Return ``numbers`` followed by their mean, every value rounded to 3 decimals.

    A new list is returned; the caller's ``numbers`` list is left unmodified.

    Args:
        numbers: non-empty sequence of numeric values.

    Returns:
        list: the rounded input values with the rounded average appended last.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    avg = sum(numbers) / len(numbers)
    # Build a fresh list instead of appending to the argument in place.
    return [round(num, 3) for num in [*numbers, avg]]

# Datasets, in the order the result table lists them.
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache Proxifier OpenSSH OpenStack Mac'
datasets = table_order.split(' ')

# Column-oriented container for the result table.
table_data = {'dataset': [], 'GA': [], 'PA': [], 'ED': []}

# Note: change the file name to the name of the log file
file_name = '0shot_bestTry'

# Reuse the stored table when it exists; otherwise evaluate every dataset,
# append an 'avg' row and store the table for the next run.
result_table_path = f'outputs/parser/{file_name}/result_tabel.csv'
if not os.path.exists(result_table_path):
    ga, pa, ed = [], [], []
    for dataset in datasets:
        table_data['dataset'].append(dataset)
        file_path = f'outputs/parser/{file_name}/{dataset}.csv'

        # evaluate() returns four values; only the first three are tabulated.
        ga_score, pa_score, ed_score, _unused = evaluate(file_path, dataset)
        ga.append(ga_score)
        pa.append(pa_score)
        ed.append(ed_score)

    table_data['dataset'].append('avg')
    for column, scores in (('GA', ga), ('PA', pa), ('ED', ed)):
        table_data[column] = calculate_avg(scores)

    df = pd.DataFrame(table_data)
    df.to_csv(result_table_path, index=False)
else:
    df = pd.read_csv(result_table_path)

# Render the table as HTML in the notebook output.
table = df.to_html(index=False)
display(HTML(table))

dataset,GA,PA,ED
HDFS,1.0,1.0,0.0
Hadoop,0.986,0.852,7.453
Spark,0.922,0.91,1.943
Zookeeper,0.994,0.666,1.51
BGL,0.982,0.958,0.784
HPC,0.852,0.638,3.493
Thunderbird,0.982,0.922,0.813
Windows,0.995,0.973,0.734
Linux,0.933,0.691,3.141
Android,0.964,0.574,14.991


## Find similarity in all datasets

In [None]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

count_logs = []
count_templates = []     # distinct templates across all datasets, in first-seen order
seen_templates = set()   # same content as count_templates, for O(1) membership tests

# Print every distinct template (across all datasets) that contains a digit.
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    for template in templates:
        if template in seen_templates:
            continue
        seen_templates.add(template)
        count_templates.append(template)
        if any(char.isdigit() for char in template):
            print(f"{template}")

In [None]:
from utils.demonstrations_sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])
# Sample from a single dataset and print the first two fields of each
# pair's leading element, followed by a separator line.
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    head = pair[0]
    print(f"{head[0]}\n{head[1]}\n{'-'*20}")

Mutation
```
# if '3' in template:
#     Count[3] += 1
# if '4' in template:
#     Count[4] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
# if '5' in template:
#     Count[5] += 1
#     print(dataset, template)
#     # 1 + 2
# if '6' in template:
#     Count[6] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
```

In [29]:
from collections import Counter
import pandas as pd
import re
from utils.cluster import tokenize



datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


# Collect one example log per template among the logs that contain "mb"
# (case-insensitive), across all datasets.
list_log = []            # first matching log for each collected template
list_tmp = []            # collected templates, in first-seen order
seen_templates = set()   # same content as list_tmp, for O(1) membership tests
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    logs = df['Content'].tolist()

    for template, log in zip(templates, logs):
        if 'mb' in log.lower() and template not in seen_templates:
            seen_templates.add(template)
            list_tmp.append(template)
            list_log.append(log)

for log in list_log:
    print(log)


1 ddr errors(s) detected and corrected on rank 0, symbol 25, bit 1
Can not get assembly information for node card
ciod: Error loading /bgl/apps/scaletest/performance/MINIBEN/mb_243_0810/allreduce.rts: invalid or missing program image, Exec format error
ciod: generated 64 core files for program IMB-MPI1.2MB_perf
ciod: Error loading /bgl/apps/swl-prep/rky-swl/MPI-PERF/IMB/perf_tests/IMB-MPI1.5124KB: invalid or missing program image, No such file or directory
1 ddr error(s) detected and corrected on rank 0, symbol 24 over 335 seconds
minus normalized number..................0
0MB HIGHMEM available.
126MB LOWMEM available.
Initializing random number generator:  succeeded
Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]
[instance: 96abccce-8d1f-4e07-b6d1-4b2ab87e23b4] Attempting claim: memory 2048 MB, disk 20 GB, vcpus 1 CPU
[instance: 96abccce-8d1f-4e07-b6d1-4b2ab87e23b

In [34]:
import pandas as pd
from utils.postprocess import correct_single_template

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# datasets = ['Linux']

# NOTE(review): 50 is presumably the number of logs handled per group/request
# by the parser -- confirm.
GROUP_SIZE = 50

# For every dataset, sum ceil(len / GROUP_SIZE) over all lines that mention
# both 'cluster' and 'len=' in the parser's text output.
count_list = []

for dataset in datasets:
    count = 0
    with open(f'outputs/parser/Test/{dataset}.txt', 'r') as f:
        for line in f:
            if 'len=' not in line or 'cluster' not in line:
                continue
            parts = line.strip().split("len=")
            # Make sure the line contains exactly one "len=" marker.
            if len(parts) != 2:
                # Raise instead of calling exit(), which would shut down the
                # notebook kernel without saying which line was malformed.
                raise ValueError(f"unexpected cluster line in {dataset}.txt: {line!r}")
            cluster_len = int(parts[1])
            # Ceiling division: full groups plus one more for any remainder.
            count += -(-cluster_len // GROUP_SIZE)
    count_list.append(count)
print(count_list)

# [147, 49, 156, 98, 65, 202, 45, 61, 82, 361, 143, 164, 93, 43, 203, 69]
# [137, 49, 121, 94, 62, 56, 45, 56, 78, 322, 135, 141, 86, 43, 175, 63]
print(sum(count_list)/len(count_list))
                

[137, 49, 121, 94, 62, 56, 45, 56, 78, 322, 135, 141, 86, 43, 175, 63]
103.9375


In [14]:
# Floor division drops the remainder: 101 // 50 evaluates to 2.
101 // 50

2