## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [2]:
import pandas as pd
import re
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Names of all datasets available for this experiment; only one is processed below.
datasets = [
    'BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
    'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark',
]

# Choose the single dataset to inspect. To process every dataset instead,
# wrap the remainder of this cell in `for dataset in datasets:`.
dataset = 'Proxifier'
print(f'Processing {dataset} dataset...')

# Read the structured log file and extract the raw messages and their templates.
df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs = df['Content'].tolist()
templates = df['EventTemplate'].tolist()

# Pipeline: tokenize -> vectorize -> cluster -> reassign_clusters
tokenized_logs = list(map(tokenize, logs))
vectors = vectorize(tokenized_logs)
labels, cluster_nums = cluster(vectors)
labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

Processing Proxifier dataset...




In [8]:
# Compare the number of clusters with the number of distinct templates.
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# One record per cluster, laid out as [label, logs, indexs, ground_truth].
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
for i, label in enumerate(labels):
    record = inputs[label]
    record[0] = label
    record[1].append(logs[i])
    record[2].append(i)
    # The first log that lands in a cluster supplies the cluster's template.
    if record[3] == '':
        record[3] = df['EventTemplate'][i]

# Print the size, the template and the distinct logs of one chosen cluster.
num = 7
chosen = inputs[num]
print('cluster:', num)
print('length:', len(chosen[1]))
print('template:', chosen[3])
print('-'*20)
for log in set(chosen[1]):
    print(log)
print('='*40)



num of clusters: 14
len of templates: 8
cluster: 7
length: 9
template: <*> close, <*> sent, <*> received, lifetime <*>
--------------------
proxy.cse.cuhk.edu.hk:5070 close, 31257 bytes (30.5 KB) sent, 1846301 bytes (1.76 MB) received, lifetime 01:33
r6---sn-i3b7kn7d.googlevideo.com:443 close, 12789 bytes (12.4 KB) sent, 13833013 bytes (13.1 MB) received, lifetime 01:01
proxy.cse.cuhk.edu.hk:5070 close, 2933 bytes (2.86 KB) sent, 11721005 bytes (11.1 MB) received, lifetime 02:48
proxy.cse.cuhk.edu.hk:5070 close, 47856 bytes (46.7 KB) sent, 4090387 bytes (3.90 MB) received, lifetime 01:01
video-hkg3-2.xx.fbcdn.net:443 close, 58373 bytes (57.0 KB) sent, 8896991 bytes (8.48 MB) received, lifetime 02:25
r2---sn-i3b7kne6.googlevideo.com:443 close, 17742 bytes (17.3 KB) sent, 11581393 bytes (11.0 MB) received, lifetime 01:08
proxy.cse.cuhk.edu.hk:5070 close, 7437 bytes (7.26 KB) sent, 2235596 bytes (2.13 MB) received, lifetime 00:07
r1---sn-i3belnez.googlevideo.com:443 close, 10046 bytes (9.

In [None]:
# check the cluster k
# k = 0
# lengh_cluster = len(inputs[k][1])
# print('cluster ', k)
# print('length:', lengh_cluster)
# print('template:', inputs[k][3])
# print('-'*20)
# for log in set(inputs[k][1]):
#     print(log)

#      len
# Linux 0.5   tokenize '=' difference between (<*>) and () group first will help
# HealthApp: 1   same length, 2 words different(80 logs) refine by difference of words will help
# Zookeeper: 0 same length, 2 words different(12 logs)
# Hadoop: 0 same length 1 words different(118 logs)
# Spark: 0  same length 1 words different(149 logs)

# good cluster datasets
# HDFS OpenStack Proxifier HPC Mac Windows Apache Thunderbird
# length solved datasets
# BGL OpenSSH Android
# 

## Evaluate

In [1]:
import os
from utils.evaluator import evaluate
import pandas as pd
from IPython.display import HTML


def calculate_avg(numbers):
    """Return the given values followed by their mean, all rounded to 3 decimals.

    Args:
        numbers: Non-empty sequence of numeric values.

    Returns:
        A new list containing every input value plus, as the last element,
        the arithmetic mean of the inputs; each entry is rounded to three
        decimal places. The mean is computed from the unrounded inputs.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    avg = sum(numbers) / len(numbers)
    # Build a fresh list rather than appending to the caller's list: mutating
    # the argument would make a second call on the same list average in the
    # previously appended mean.
    return [round(num, 3) for num in [*numbers, avg]]

# Row order of the result table, one dataset name per row.
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache Proxifier OpenSSH OpenStack Mac'
datasets = table_order.split(' ')

# Column name -> column values of the table that is built when no cached CSV exists.
table_data = {'dataset': [], 'GA': [], 'PA': [], 'ED': []}

# Note: change the file name to the name of the log file
file_name = 'Test'

# Reuse the cached result table when it exists; otherwise evaluate every
# dataset, append an 'avg' row and write the table to disk.
result_table_path = f'outputs/parser/{file_name}/result_tabel.csv'
if not os.path.exists(result_table_path):
    ga, pa, ed = [], [], []
    for dataset in datasets:
        table_data['dataset'].append(dataset)
        file_path = f'outputs/parser/{file_name}/{dataset}.csv'

        # evaluate() returns four values; only the first three are tabulated.
        ga_value, pa_value, ed_value, _extra = evaluate(file_path, dataset)
        ga.append(ga_value)
        pa.append(pa_value)
        ed.append(ed_value)

    table_data['dataset'].append('avg')
    table_data['GA'] = calculate_avg(ga)
    table_data['PA'] = calculate_avg(pa)
    table_data['ED'] = calculate_avg(ed)

    df = pd.DataFrame(table_data)
    df.to_csv(result_table_path, index=False)
else:
    df = pd.read_csv(result_table_path)

# Render the table as HTML in the notebook output.
table = df.to_html(index=False)
display(HTML(table))

dataset,GA,PA,ED
HDFS,1.0,1.0,0.0
Hadoop,0.984,0.858,7.432
Spark,0.941,0.913,1.294
Zookeeper,0.991,0.94,0.587
BGL,0.963,0.935,1.19
HPC,0.866,0.85,1.66
Thunderbird,0.971,0.864,2.322
Windows,0.999,0.973,0.552
Linux,0.934,0.702,3.516
Android,0.94,0.666,4.352


## Find similarity in all datasets

In [6]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

count_logs = []
# Distinct templates across all datasets, in first-seen order.
count_templates = []
# Same contents as count_templates, kept as a set so the membership test in
# the inner loop is O(1) instead of a linear scan of the list per log line.
seen_templates = set()

# Print every distinct template (across all datasets) that contains '<*>:'.
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for template in templates:
        if template in seen_templates:
            continue
        seen_templates.add(template)
        count_templates.append(template)
        if '<*>:' in template:
            print(f"{template}")

Processing BGL ----------------
ciod: Error loading <*>: invalid or missing program image, No such file or directory
ciod: Error creating node map from file <*>: No child processes
ciod: Error creating node map from file <*>: Bad file descriptor
ciod: Error creating node map from file <*>: Block device required
ciod: Error creating node map from file <*>: Permission denied
rts: kernel terminated for reason <*>: bad message header: invalid cpu, type=<*>, cpu=<*>, index=<*>, total=<*>
ciod: Error loading <*>: invalid or missing program image, Exec format error
ciod: Error reading message prefix after LOAD_MESSAGE on CioStream socket to <*>: Link has been severed
ciod: Error loading <*>: invalid or missing program image, Permission denied
ciod: Error loading <*>: program image too big, <*> > <*>
Processing HDFS ----------------
<*>:Got exception while serving <*> to <*>:
Processing Linux ----------------
<*>: Auto-detected intellimouse <*>
<*>: <*> - <*> (usable)
<*>: <*> - <*> (reserved)

In [None]:
from utils.demonstrations_sample import sample_based_on_entropy

# Call sample_based_on_entropy for one dataset and print, for every returned
# pair, the first two fields of the pair's first element.
# (To cover every dataset, loop over
#  ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#   'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
#  and call sample_based_on_entropy(dataset, 1) for each.)
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
separator = '-' * 20
for pair in pairs:
    head = pair[0]
    print(f"{head[0]}\n{head[1]}\n{separator}")

Mutation
```
# if '3' in template:
#     Count[3] += 1
# if '4' in template:
#     Count[4] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
# if '5' in template:
#     Count[5] += 1
#     print(dataset, template)
#     # 1 + 2
# if '6' in template:
#     Count[6] += 1
#     # Mac 15 + 28 + 5 + 18 + 10
```

In [29]:
from collections import Counter
import pandas as pd
import re
from utils.cluster import tokenize



datasets = [
    'BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
    'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark',
]

a, b = 0, 0
pattern = r'^[a-zA-Z]+[0-9]+$'

# Collect one example log per template among the logs whose text contains
# "mb" (case-insensitive); list_tmp holds the templates already represented.
list_log = []
list_tmp = []
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    templates = df['EventTemplate'].tolist()
    logs = df['Content'].tolist()
    freq = Counter(templates)

    for template, log in zip(templates, logs):
        if 'mb' not in log.lower():
            continue
        if template in list_tmp:
            continue
        list_tmp.append(template)
        list_log.append(log)

# Print the collected example logs, one per line.
for log in list_log:
    print(log)


1 ddr errors(s) detected and corrected on rank 0, symbol 25, bit 1
Can not get assembly information for node card
ciod: Error loading /bgl/apps/scaletest/performance/MINIBEN/mb_243_0810/allreduce.rts: invalid or missing program image, Exec format error
ciod: generated 64 core files for program IMB-MPI1.2MB_perf
ciod: Error loading /bgl/apps/swl-prep/rky-swl/MPI-PERF/IMB/perf_tests/IMB-MPI1.5124KB: invalid or missing program image, No such file or directory
1 ddr error(s) detected and corrected on rank 0, symbol 24 over 335 seconds
minus normalized number..................0
0MB HIGHMEM available.
126MB LOWMEM available.
Initializing random number generator:  succeeded
Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]
[instance: 96abccce-8d1f-4e07-b6d1-4b2ab87e23b4] Attempting claim: memory 2048 MB, disk 20 GB, vcpus 1 CPU
[instance: 96abccce-8d1f-4e07-b6d1-4b2ab87e23b

In [6]:
import pandas as pd
from utils.postprocess import correct_single_template

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# datasets = ['Linux']

count_list = []

# Number of clusters, over all datasets, whose header reports len > 50 and
# that are followed by more than 5 lines before the next cluster header.
shot = 0

for dataset in datasets:
    print(f"Processing {dataset} dataset...")
    with open(f'outputs/parser/Test/{dataset}.txt', 'r') as f:
        lines = f.readlines()
    Read = False   # True while the current cluster's body lines are being counted
    count = 0      # body lines seen for the current cluster
    length = 0     # reset per file so a value from the previous dataset is never reused
    for line in lines:
        if 'len=' in line and 'cluster' in line:
            # A new cluster header closes the previous cluster: report it
            # before its counters are overwritten.
            if count > 5:
                print(length)
                shot += 1
            count = 0
            parts = line.strip().split("len=")
            if len(parts) == 2:  # make sure the line contains exactly one "len="
                length = int(parts[1])
            # Only count body lines of clusters larger than 50 logs.
            Read = length > 50
        elif Read:
            count += 1
    # Flush the last cluster of the file: no further header follows it, so
    # without this check it would never be evaluated.
    if count > 5:
        print(length)
        shot += 1

# Average number of reported clusters per dataset.
print(shot/len(datasets))
# [147, 49, 156, 98, 65, 202, 45, 61, 82, 361, 143, 164, 93, 43, 203, 69]
# [137, 49, 121, 94, 62, 56, 45, 56, 78, 322, 135, 141, 86, 43, 175, 63]

                

Processing BGL dataset...
60
51
Processing HDFS dataset...
Processing Linux dataset...
372
216
118
Processing HealthApp dataset...
242
144
Processing OpenStack dataset...
931
Processing OpenSSH dataset...
384
Processing Proxifier dataset...
954
947
Processing HPC dataset...
394
91
60
Processing Zookeeper dataset...
266
Processing Mac dataset...
Processing Hadoop dataset...
476
326
Processing Android dataset...
200
Processing Windows dataset...
280
224
224
Processing Apache dataset...
Processing Thunderbird dataset...
568
62
Processing Spark dataset...
80
1.5


In [14]:
# Scratch check: floor (integer) division of 101 by 50.
101 // 50

2