In [1]:
import re
import sys
sys.path.append('../')
import os
import pandas as pd
from collections import Counter, defaultdict
import datetime
from tqdm import tqdm
import string
from copy import deepcopy

import hashlib
import time

pd.set_option('display.max_rows', None)

In [2]:
def hasNumbers(s):
    return any(char.isdigit() for char in s)

def isDynamicToken(token: str, special_tokens: set) -> bool:
    """ Kiểm tra xem token có phải là token động hay không """

    # 1. Kiểm tra nếu token.lower() nằm trong special_tokens
    if token.lower() in special_tokens:
        return True

    # 2. Kiểm tra nếu toàn bộ token là số HEX hợp lệ (ít nhất 8 chữ số hex)
    if re.fullmatch(r'0x[0-9a-fA-F]+', token) or re.fullmatch(r'[0-9a-fA-F]{8,}', token):
        return True

    # 3. Kiểm tra nếu token là số (có thể âm, có thể thập phân)
    if re.fullmatch(r'-?\d+(\.\d+)?', token):
        return True

    return False

In [3]:
def splitSubToken(s, seps):
    placeholder = "~~WILDCARD~~"
    s = s.replace("<*>", placeholder)

    pattern = '|'.join(re.escape(sep) for sep in seps)

    tokensL = re.split(f'({pattern})', s)
    tokensL = [tok.replace(placeholder, "<*>") for tok in tokensL if tok.strip() != '']

    return tokensL
def processingSubToken(tok):
    # 1. Kiểm tra nếu toàn bộ token là số HEX hợp lệ (ít nhất 8 chữ số hex)
    if re.fullmatch(r'0x[0-9a-fA-F]+', tok) or re.fullmatch(r'[0-9a-fA-F]{8,}', tok):
        return True
    
    if not hasNumbers(tok):
        return False

    # 2. Kiểm tra nếu token là số (có thể âm, có thể thập phân)
    if re.fullmatch(r'-?\d+(\.\d+)?', tok):
        return True
    
    number_groups = re.findall(r'\d+', tok)
    if len(number_groups) > 1:
        return True
    
    matches = list(re.finditer(r'[a-zA-Z]+[0-9]+', tok))
    if len(matches) == 1:
        end = matches[0].end()
        if end == len(tok) or not tok[end].isalnum():
            return False  # tĩnh

    return True

In [4]:
log_df = pd.read_csv('test.csv')

special_tokens = ['true', 'false', 'null', 'root']
punctuationL = "[](){}=:,#"
special_set = set(tok.lower() for tok in special_tokens)
regexs = [
    [r'(?:[0-9a-fA-F]{2}:){3,}[0-9a-fA-F]{2}', "<*>"],
    [r'([\\\/]([\w.-_@#!]+?)){2,}[\\\/]([\w.-_@#!]+)', "<*>"],
]

logTemplate = defaultdict(list)

for idx, row in log_df.iterrows():
    content_str = row['Content']
    for pattern, *replacement in regexs:
        replacement = replacement[0] if replacement else "<*>"
        content_str = re.sub(pattern, replacement, content_str)
    special_set = set(tok.lower() for tok in special_tokens)
    
    tokensL = str(content_str).strip().split()
    
    new_tokens = []
    idx_dynamic_token = []
    static_tokenL = []
    
    for idx_tok, token in enumerate(tokensL):
        sub_tokensL = splitSubToken(token, punctuationL)
        for idx_sub, sub_token in enumerate(sub_tokensL):
            if sub_token in special_set:
                sub_tokensL[idx_sub] = "<*>"
                continue
            
            if processingSubToken(sub_token):
                sub_tokensL[idx_sub] = "<*>"
        
        if len(sub_tokensL) <= 1:
            new_tokens.append(sub_tokensL[0])
        else:
            new_tokens.append("".join(sub_tokensL))
            idx_dynamic_token.append(idx_tok)
            static_tokenL.append(sub_tokensL)
    logTemplate[" ".join(new_tokens)].append(idx)
    print(f"Row {idx}: {row['Content']}\n{new_tokens}")


    

Row 0: session closed for user root
['session', 'closed', 'for', 'user', '<*>']
Row 1: session opened for user root by (uid=0)
['session', 'opened', 'for', 'user', '<*>', 'by', '(uid=<*>)']
Row 2: (root) CMD (run-parts /etc/cron.hourly)
['(<*>)', 'CMD', '(run-parts', '/etc/cron.hourly)']
Row 3: session closed for user root
['session', 'closed', 'for', 'user', '<*>']
Row 4: session opened for user root by (uid=0)
['session', 'opened', 'for', 'user', '<*>', 'by', '(uid=<*>)']
Row 5: (root) CMD (run-parts /etc/cron.hourly)
['(<*>)', 'CMD', '(run-parts', '/etc/cron.hourly)']
Row 6: session closed for user root
['session', 'closed', 'for', 'user', '<*>']
Row 7: session opened for user root by (uid=0)
['session', 'opened', 'for', 'user', '<*>', 'by', '(uid=<*>)']
Row 8: (root) CMD (run-parts /etc/cron.hourly)
['(<*>)', 'CMD', '(run-parts', '/etc/cron.hourly)']
Row 9: session closed for user root
['session', 'closed', 'for', 'user', '<*>']
Row 10: session opened for user root by (uid=0)
['ses

In [5]:
# for key, val in logTemplate.items():
#     print(f"{key}:\n{val}")


trust_df = pd.read_csv("./logs2k/HDFS/HDFS_2k.log_structured_corrected.csv")

grouped_indices = trust_df.groupby('EventTemplate').apply(lambda x: x.index.tolist())

# for key, val in grouped_indices.items():
#     print(f"{key}:\n{val}")

temp = []
print(len(grouped_indices), len(logTemplate))
for key, val in grouped_indices.items():
    if key in logTemplate.keys():
        temp.append(key)
        # print(f"Trust: {val}\nParse: {logTemplate[key]}")

for key, val in logTemplate.items():
    if key not in temp:
        print(f"{key}:\n{val}")
        


14 267
session closed for user <*>:
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 42, 96, 107, 112, 115, 126, 1568, 1578, 1581, 1586, 1591]
session opened for user <*> by (uid=<*>):
[1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 43, 1099, 1101, 1103, 1105, 1107]
(<*>) CMD (run-parts /etc/cron.hourly):
[2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 44]
data_thread() got not answer from any [Thunderbird_A8] datasource:
[39, 104, 203, 268, 315, 353, 399, 462, 537, 599, 650, 708, 766, 819, 875, 930, 993, 1046, 1097, 1172, 1414, 1524, 1585, 1642, 1693, 1752, 1804, 1856, 1904, 1946, 1989]
data_thread() got not answer from any [Thunderbird_B8] datasource:
[40, 87, 172, 258, 310, 354, 416, 470, 549, 597, 647, 704, 763, 813, 874, 931, 985, 1038, 1090, 1151, 1422, 1468, 1549, 1625, 1672, 1713, 1763, 1816, 1863, 1905, 1945]
data_thread() got not answer from any [Thunderbird_C5] datasource:
[41, 118, 186, 264, 322, 370, 422, 481, 550, 590, 643, 701, 754, 814, 866, 925, 988, 1040, 1080, 11

  grouped_indices = trust_df.groupby('EventTemplate').apply(lambda x: x.index.tolist())


In [6]:
def processingSubToken(tok):
    # 1. Kiểm tra nếu toàn bộ token là số HEX hợp lệ (ít nhất 8 chữ số hex)
    if re.fullmatch(r'0x[0-9a-fA-F]+', tok) or re.fullmatch(r'[0-9a-fA-F]{8,}', tok):
        return True
    
    if not hasNumbers(tok):
        return False

    # 2. Kiểm tra nếu token là số (có thể âm, có thể thập phân)
    if re.fullmatch(r'-?\d+(\.\d+)?', tok):
        return True
    
    number_groups = re.findall(r'\d+', tok)
    if len(number_groups) > 1:
        return True
    
    matches = list(re.finditer(r'[a-zA-Z]+[0-9]+', tok))
    if len(matches) == 1:
        end = matches[0].end()
        if end == len(tok) or not tok[end].isalnum():
            return False  # tĩnh

    return True

token = ["abcd123s", "abcd123", "abcd", "123", "0x1234567890abcdef", "0x1234567890abcdefg", "0x12345678", "0x1234567", "0x1234567a"]
for tok in token:
    print(tok, processingSubToken(tok))

abcd123s True
abcd123 False
abcd False
123 True
0x1234567890abcdef True
0x1234567890abcdefg True
0x12345678 True
0x1234567 True
0x1234567a True
