## DBSCAN Clustering
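Alternative clustering approach: replace every number with 0 and cluster the raw text directly, without tokenizing (另一种聚类方式：将所有数字替换为0，不经过分词直接聚类):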
``` python
re.sub(r'\d+(\.\d+)?', '0', text)
```

In [5]:
import pandas as pd
from utils.cluster import reassign_clusters, cluster, vectorize, tokenize,Cluster

# All dataset names available to this cell; kept for reference / full runs.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']

# Override: restrict this run to a single dataset. Remove this line to process all of them.
datasets = ['HealthApp']

for dataset in datasets:
    print('-' * 50)
    print(f'Clustering {dataset} dataset...')

    # Load the log messages ('Content') and their ground-truth templates ('EventTemplate').
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()

    # tokenize -> vectorize -> cluster -> reassign_clusters
    tokenized_logs = [tokenize(log) for log in logs]
    # Vectorize once and reuse the result for both the debug print and the clustering
    # call, instead of running the vectorizer twice on the same input.
    vectors = vectorize(tokenized_logs)
    print(vectors)
    labels, cluster_nums = cluster(vectors)
    labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)

    # Group the logs by cluster label.
    # Each entry is [label, logs, indexs, ground_truth]; ground_truth is the template
    # of the first log assigned to that label.
    # NOTE(review): assumes every label is a valid index in range(cluster_nums) after
    # reassign_clusters -- confirm that no negative "noise" label can remain.
    inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
    for i, label in enumerate(labels):
        entry = inputs[label]
        entry[0] = label
        entry[1].append(logs[i])
        entry[2].append(i)
        if entry[3] == '':
            entry[3] = templates[i]

    # Build the Cluster objects. The comprehension variable stays local, so the
    # builtin `input` is no longer overwritten in the kernel namespace.
    clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

    # Report every cluster that holds more than 10 log lines, then the number of such clusters.
    Count = 0
    for c in clusters:
        if len(c.indexs) > 10:
            print(c.logs[0])
            print(c.oracle_template)
            print(len(c.indexs))
            print('=' * 50)
            Count += 1
    print(f'Count : {Count}')

--------------------------------------------------
Clustering HealthApp dataset...
  (0, 100)	1.0
  (1, 97)	1.0
  (2, 16)	0.6248332566193072
  (2, 11)	0.5520794333347835
  (2, 98)	0.5520794333347835
  (3, 12)	0.8517032026021748
  (3, 105)	0.5240244790820354
  (4, 45)	0.5668201002003888
  (4, 121)	0.5825439786011101
  (4, 64)	0.5825439786011101
  (5, 72)	1.0
  (6, 125)	1.0
  (7, 100)	1.0
  (8, 156)	0.7071067811865475
  (8, 31)	0.7071067811865475
  (9, 155)	0.7071067811865475
  (9, 30)	0.7071067811865475
  (10, 0)	0.7064072898835545
  (10, 8)	0.7078055812151892
  (11, 97)	1.0
  (12, 100)	1.0
  (13, 100)	1.0
  (14, 97)	1.0
  (15, 72)	1.0
  (16, 125)	1.0
  :	:
  (1987, 105)	0.6960412725776134
  (1988, 13)	0.7180017735831414
  (1988, 105)	0.6960412725776134
  (1989, 13)	0.7180017735831414
  (1989, 105)	0.6960412725776134
  (1990, 13)	0.7180017735831414
  (1990, 105)	0.6960412725776134
  (1991, 13)	0.7180017735831414
  (1991, 105)	0.6960412725776134
  (1992, 13)	0.7180017735831414
  (1992, 1

## Show Clusters

In [26]:
# Compare the number of clusters found with the number of distinct ground-truth templates.
# NOTE: this cell relies on `cluster_nums`, `labels`, `logs` and `templates` left in the
# kernel by the clustering cell; run that cell first.
print('num of clusters:', cluster_nums)
print('len of templates:', len(set(templates)))

# Group the logs by cluster label.
# Each entry is [label, logs, indexs, ground_truth]; ground_truth is the template
# of the first log assigned to that label.
inputs = [[-1, [], [], ''] for _ in range(cluster_nums)]
for i, label in enumerate(labels):
    entry = inputs[label]
    entry[0] = label
    entry[1].append(logs[i])
    entry[2].append(i)
    if entry[3] == '':
        entry[3] = templates[i]

# Build the Cluster objects. The comprehension variable stays local, so the
# builtin `input` is no longer overwritten in the kernel namespace.
clusters = [Cluster(*cluster_args, remove_duplicate=True) for cluster_args in inputs]

# Optional inspection snippets (uncomment one to use it). The loop variable is named
# `c` rather than `cluster` so that enabling a snippet does not overwrite the
# `cluster` function imported from utils.cluster.

# for c in clusters:
#     print(f'cluster {c.label}: {len(c.logs)} logs, {len(c.indexs)} indexs')

# num = 27
# print('cluster:', num)
# print('length:', len(clusters[num].indexs))
# print('template:', clusters[num].oracle_template)
# print('len of set:', len(clusters[num].logs))
# print('-'*20)
# for log in clusters[num].logs:
#     print(log)
# print('='*40)

# for c in clusters:
#     if len(set(c.logs)) == 1 and not any(char.isdigit() for char in c.logs[0]):
#         print(f"{c.logs[0]}\n{c.oracle_template}")
#         print('='*40)

num of clusters: 75
len of templates: 75
onReceive action: android.intent.action.SCREEN_ON
onReceive action: android.intent.action.SCREEN_ON
processHandleBroadcastAction action:android.intent.action.SCREEN_ON
processHandleBroadcastAction action:android.intent.action.SCREEN_ON
flush sensor data
flush sensor data
onReceive action: android.intent.action.SCREEN_OFF
onReceive action: android.intent.action.SCREEN_OFF
processHandleBroadcastAction action:android.intent.action.TIME_TICK
processHandleBroadcastAction action:android.intent.action.TIME_TICK
getBinderPackageName packageName = com.huawei.health
getBinderPackageName packageName = com.huawei.health
needAutoSync autoSyncSwitch is open
needAutoSync autoSyncSwitch is open
initDataPrivacy the dataPrivacy switch is open, start push health data!
initDataPrivacy the dataPrivacy switch is open, start push health data!
initDataPrivacy the dataPrivacy is true
initDataPrivacy the dataPrivacy is <*>
initUserPrivacy the userPrivacy switch is open, 

## Evaluate

In [1]:
from evaluate import evaluate_all_datasets
from IPython.display import HTML

# Evaluate the 'Test_10shot' run across datasets (send_email=False) and show the
# returned table as rich HTML output of the cell.
table = evaluate_all_datasets('Test_10shot', send_email=False)
HTML(table)

dataset,GA,PA,ED,N_ED
HDFS,1.0,1.0,0.0,1.0
Hadoop,0.989,0.8765,6.6965,0.94737
Spark,0.974,0.9695,0.507,0.97628
Zookeeper,0.993,0.989,0.2205,0.99629
BGL,0.9805,0.9075,3.5035,0.97453
HPC,0.8375,0.824,0.9365,0.90846
Thunderbird,0.886,0.8125,5.3875,0.89564
Windows,0.9995,0.607,12.038,0.79458
Linux,0.936,0.773,0.7855,0.98472
Android,0.924,0.774,2.634,0.94142


In [34]:
from evaluate import evaluate_single_dataset

# Evaluate only the BGL result file from the 'Test_10shot' run.
evaluate_single_dataset('outputs/parser/Test_10shot/BGL_2k.log_structured.csv', 'BGL')

         BGL: group Accuracy: 0.9845, Message-Level Accuracy: 0.8818, Edit Distance: 3.5548


In [2]:
import re

# Hexadecimal literal: a word-bounded "0x"/"0X" prefix followed by one or more hex digits.
pattern = r'\b0[xX][0-9a-fA-F]+\b'
text = "Here are some hex numbers: 0x1a3, 0X4D2, and 0xABCDEF."

# The pattern has no capture groups, so the full text of each match is exactly
# what re.findall would return.
matches = [hit.group(0) for hit in re.finditer(pattern, text)]
print(matches)

['0x1a3', '0X4D2', '0xABCDEF']


## Check how often specific strings occur across the logs or templates

In [12]:
import pandas as pd
from utils.cluster import tokenize
from utils.sample_byword import extract_variables
from utils.postprocess import correct_single_template
from utils.postprocess import extract_variables

datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']


# Collect, in first-seen order, every distinct template that contains two
# consecutive spaces, and print it next to the first log line it was seen with.
count_templates = []
# Companion set for O(1) "already reported?" checks; `count_templates` keeps the order.
seen_templates = set()
for dataset in datasets:
    print(f"Processing {dataset} ----------------")
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    logs = df['Content'].tolist()
    templates = df['EventTemplate'].tolist()
    # templates = list(set(templates))
    for log, template in zip(logs, templates):
        # Report each matching template only once, even across datasets.
        if template not in seen_templates and '  ' in template:
            seen_templates.add(template)
            count_templates.append(template)
            print(template)
            print(log)
            print('-'*20)

Processing BGL ----------------
Processing HDFS ----------------
Processing Linux ----------------
authentication failure; logname= uid=<*> euid=<*> tty=<*> ruser= rhost=<*>  user=<*>
authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root
--------------------
ANONYMOUS FTP LOGIN FROM <*>,  (anonymous)
ANONYMOUS FTP LOGIN FROM 84.102.20.2,  (anonymous)
--------------------
SELinux:  Initializing.
SELinux:  Initializing.
--------------------
SELinux:  Starting in permissive mode
SELinux:  Starting in permissive mode
--------------------
<*>:  Registering secondary module capability
selinux_register_security:  Registering secondary module capability
--------------------
Initializing random number generator:  succeeded
Initializing random number generator:  succeeded
--------------------
Starting pcmcia:  succeeded
Starting pcmcia:  succeeded
--------------------
Setting network parameters:  succeeded
Setting network parameters: 

## Sample based on entropy

In [None]:
from utils.sample import sample_based_on_entropy

# Variant that samples one pair from every dataset (disabled):
# datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
#         'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# for dataset in datasets:
#     pair = sample_based_on_entropy(dataset, 1)
#     print(pair[0][0])

# Sample from a single dataset and print the first two fields of each result's
# leading element, followed by a separator line.
# NOTE(review): presumably these two fields are (log, template) -- confirm against
# utils.sample.sample_based_on_entropy.
dataset = 'HDFS'
pairs = sample_based_on_entropy(dataset, 1)
for pair in pairs:
    head = pair[0]
    print(f"{head[0]}\n{head[1]}\n{'-'*20}")

## Mutation Count -- num

In [None]:

# 4: 15 + 28 + 5 + 18 + 10
# 5: 1 + 2
# 6: 15 + 28 + 5 + 18 + 10

In [None]:
# NOTE(review): scratch cell holding a hand-written token list; it is not assigned
# to anything -- confirm whether it is still needed.
['Found', 'block', 'rdd_2_2', 'locally']