## DBSCAN Clustering
`另一种聚类方式：将所有数字替换为0，不经过分词直接聚类`
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [13]:
import pandas as pd
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# Benchmark datasets to cluster. Each one is read from
# dataset/<name>/<name>_2k.log_structured_corrected.csv.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# Only clusters with more than this many member indexes are printed and counted.
MIN_CLUSTER_SIZE = 10

# NOTE: df, logs, templates, labels and cluster_nums are left at kernel scope
# holding the values of the LAST dataset in the list once the loop finishes.
for dataset in datasets:
    print('-' * 50)
    print(f'Clustering {dataset} dataset...')

    # load the dataset
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()

    # tokenize -> vectorize -> cluster -> reassign_clusters
    tokenized_logs = [tokenize(log) for log in logs]
    labels, cluster_nums = cluster(vectorize(tokenized_logs))
    labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

    # One record per cluster: [label, logs, indexs, ground_truth].
    # A comprehension builds a fresh pair of lists for every record.
    inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
    for i, label in enumerate(labels):
        record = inputs[label]
        record[0] = label
        record[1].append(logs[i])
        record[2].append(i)
        # The first log assigned to a cluster supplies its ground-truth template.
        if record[3] == '':
            record[3] = templates[i]

    # `cluster_args` rather than `input`, so the builtin input() is not rebound.
    clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

    # Report every sufficiently large cluster: first log, oracle template, size.
    Count = 0
    for c in clusters:
        if len(c.indexs) > MIN_CLUSTER_SIZE:
            print(c.logs[0])
            print(c.oracle_template)
            print(len(c.indexs))
            print('=' * 50)
            Count += 1
    print(f'Count : {Count}')

--------------------------------------------------
Clustering BGL dataset...
instruction cache parity error corrected
instruction cache parity error corrected
42
63543 double-hummer alignment exceptions
<*> double-hummer alignment exceptions
109
CE sym 2, at 0x0b85eee0, mask 0x05
CE sym <*>, at <*>, mask <*>
92
generating core.2275
generating core.<*>
721
ciod: LOGIN chdir(/p/gb2/glosli/8M_5000K/t800) failed: No such file or directory
ciod: LOGIN chdir(<*>) failed: No such file or directory
16
1 ddr errors(s) detected and corrected on rank 0, symbol 25, bit 1
<*> ddr errors(s) detected and corrected on rank <*>, symbol <*>, bit <*>
18
data TLB error interrupt
data TLB error interrupt
60
data storage interrupt
data storage interrupt
30
instruction address: 0x00004ed8
instruction address: <*>
21
total of 1 ddr error(s) detected and corrected
total of <*> ddr error(s) detected and corrected
13
ciod: Error loading /home/draeger/testQboxhang-nozerobytebug-nosleepyescomm: invalid or missing 

## Show Clusters

In [26]:
# This cell defines none of cluster_nums, templates, labels, logs or df itself;
# it uses whatever values the clustering cell left in the kernel.
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# Pull the ground-truth column out once instead of indexing the Series per row.
oracle_templates = df['EventTemplate'].tolist()

# One record per cluster: [label, logs, indexs, ground_truth].
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
for i, label in enumerate(labels):
    record = inputs[label]
    record[0] = label
    record[1].append(logs[i])
    record[2].append(i)
    # The first log assigned to a cluster supplies its ground-truth template.
    if record[3] == '':
        record[3] = oracle_templates[i]

# `cluster_args` rather than `input`, so the builtin input() is not rebound.
clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

# for cluster in clusters:
#     print(f'cluster {cluster.label}: {len(cluster.logs)} logs, {len(cluster.indexs)} indexs')

# num = 27
# print('cluster:', num)
# print('length:', len(clusters[num].indexs))
# print('template:', clusters[num].oracle_template)
# print('len of set:', len(clusters[num].logs))
# print('-'*20)
# for log in clusters[num].logs:
#     print(log)
# print('='*40)

# for cluster in clusters:
#     if len(set(cluster.logs)) == 1 and not any(char.isdigit() for char in cluster.logs[0]):
#         print(f"{cluster.logs[0]}\n{cluster.oracle_template}")
#         print('='*40)

num of clusters: 75
len of templates: 75
onReceive action: android.intent.action.SCREEN_ON
onReceive action: android.intent.action.SCREEN_ON
processHandleBroadcastAction action:android.intent.action.SCREEN_ON
processHandleBroadcastAction action:android.intent.action.SCREEN_ON
flush sensor data
flush sensor data
onReceive action: android.intent.action.SCREEN_OFF
onReceive action: android.intent.action.SCREEN_OFF
processHandleBroadcastAction action:android.intent.action.TIME_TICK
processHandleBroadcastAction action:android.intent.action.TIME_TICK
getBinderPackageName packageName = com.huawei.health
getBinderPackageName packageName = com.huawei.health
needAutoSync autoSyncSwitch is open
needAutoSync autoSyncSwitch is open
initDataPrivacy the dataPrivacy switch is open, start push health data!
initDataPrivacy the dataPrivacy switch is open, start push health data!
initDataPrivacy the dataPrivacy is true
initDataPrivacy the dataPrivacy is <*>
initUserPrivacy the userPrivacy switch is open, 

## Evaluate

In [9]:
from evaluate import evaluate_all_datasets
from IPython.display import HTML

# Evaluate the 'Test_20shot' run without sending an e-mail, then render the
# returned table as HTML (the bare last expression is what the cell displays).
# NOTE(review): the first argument looks like a run/output folder name - confirm.
run_name = 'Test_20shot'
table = evaluate_all_datasets(run_name, send_email=False)
HTML(table)

dataset,GA,PA,ED
HDFS,1.0,1.0,0.0
Hadoop,0.988,0.888,6.529
Spark,0.978,0.955,1.484
Zookeeper,0.993,0.988,0.222
BGL,0.976,0.881,2.864
HPC,0.951,0.938,0.356
Thunderbird,0.885,0.534,8.714
Windows,1.0,0.606,12.278
Linux,0.936,0.784,0.633
Android,0.947,0.786,2.328


In [34]:
from evaluate import evaluate_single_dataset

# Score one parsed output file against the BGL dataset.
parsed_csv = 'outputs/parser/Test_10shot/BGL_2k.log_structured.csv'
evaluate_single_dataset(parsed_csv, 'BGL')

         BGL: group Accuracy: 0.9845, Message-Level Accuracy: 0.8818, Edit Distance: 3.5548


In [2]:
import re

# Word-bounded hexadecimal literal: "0x" or "0X" followed by one or more hex digits.
pattern = r'\b0[xX][0-9a-fA-F]+\b'
text = "Here are some hex numbers: 0x1a3, 0X4D2, and 0xABCDEF."

# The pattern has no capture groups, so each match's full text is the result.
matches = [hit.group() for hit in re.finditer(pattern, text)]
print(matches)

['0x1a3', '0X4D2', '0xABCDEF']


## Check the frequency of certain strings across all logs or templates

In [12]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables
from utils.postprocess import correct_single_template
from utils.postprocess import extract_variables

# Benchmark datasets to scan. Each one is read from
# dataset/<name>/<name>_2k.log_structured_corrected.csv.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


# Print every distinct template that contains two consecutive spaces, together
# with the first log line it was seen on. Deduplication spans all datasets.
count_templates = []     # reported templates, in first-seen order
seen_templates = set()   # same contents; gives O(1) membership tests
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    for log, template in zip(logs, templates):
        # Skip templates already reported, and those without a double space.
        if template in seen_templates or '  ' not in template:
            continue
        seen_templates.add(template)
        count_templates.append(template)
        print(template)
        print(log)
        print('-'*20)

Processing BGL ----------------
Processing HDFS ----------------
Processing Linux ----------------
authentication failure; logname= uid=<*> euid=<*> tty=<*> ruser= rhost=<*>  user=<*>
authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root
--------------------
ANONYMOUS FTP LOGIN FROM <*>,  (anonymous)
ANONYMOUS FTP LOGIN FROM 84.102.20.2,  (anonymous)
--------------------
SELinux:  Initializing.
SELinux:  Initializing.
--------------------
SELinux:  Starting in permissive mode
SELinux:  Starting in permissive mode
--------------------
<*>:  Registering secondary module capability
selinux_register_security:  Registering secondary module capability
--------------------
Initializing random number generator:  succeeded
Initializing random number generator:  succeeded
--------------------
Starting pcmcia:  succeeded
Starting pcmcia:  succeeded
--------------------
Setting network parameters:  succeeded
Setting network parameters: 

## Sample based on entropy

In [None]:
from utils.sample import sample_based_on_entropy

# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])
# Draw entropy-based samples for a single dataset and print, for each returned
# item, the first two fields of its first element followed by a separator line.
# NOTE(review): presumably those fields are (log, template) and the second
# argument is a shot count - confirm against utils.sample.
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    first = pair[0]
    print(f"{first[0]}\n{first[1]}\n{'-'*20}")

## Mutation Count -- num

In [None]:

# 4: 15 + 28 + 5 + 18 + 10
# 5: 1 + 2
# 6: 15 + 28 + 5 + 18 + 10