#### **0. CHUẨN BỊ**

##### **0.1. CẤU HÌNH CÁC THAM SỐ**

In [1]:
import re
import sys
sys.path.append('../')
import os
import pandas as pd
from collections import Counter, defaultdict
import datetime
from tqdm import tqdm
import string
from copy import deepcopy

import hashlib
import time

pd.set_option('display.max_rows', None)

##### **0.2. CLASS**

In [2]:
class LogCluster:
    """A set of log lines that share one template.

    Attributes:
        keyGroup: identifier of the cluster (callers in this notebook pass
            the MD5 hex digest of the template string).
        logTemplate: template as a single space-joined string.
        tokens: template as a list of tokens.
        length: number of tokens in the template.
        logIDL: ids of the log lines that belong to this cluster.
    """

    def __init__(self, keyGroup, logTemplate, tokens, length, logIDL=None):
        # A fresh list per instance avoids sharing one default list.
        if logIDL is None:
            logIDL = []
        self.keyGroup = keyGroup
        self.logTemplate = logTemplate
        self.tokens = tokens
        self.length = length
        self.logIDL = logIDL

    def __str__(self):
        """Return a multi-line, human-readable summary of the cluster."""
        summary_lines = [
            f"Key: {self.keyGroup}\n",
            f"Template: {self.logTemplate}\n",
            f"Tokens: {self.tokens}\n",
            f"Length: {self.length}\n",
            f"Len LogIDs: {len(self.logIDL)}\n",
        ]
        return "".join(summary_lines)

##### **0.3. CÁC PHƯƠNG THỨC ĐỌC DỮ LIỆU**

In [3]:
# ================================= READ DATA ================================= #
def log_to_dataframe(log_file, regex, headers):
    """Parse a raw log file into a DataFrame.

    Args:
        log_file: path of the log file (read as UTF-8).
        regex: compiled pattern with one named group per entry in ``headers``.
        headers: names of the groups to extract, in column order.

    Returns:
        A DataFrame with a 1-based ``LineId`` column followed by one column
        per header. Lines that do not match ``regex`` are skipped, so
        ``LineId`` numbers the *parsed* lines, not the physical file lines.
    """
    log_messages = []
    with open(log_file, 'r', encoding="utf8") as fin:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Best-effort parsing: a line not following the format is
                # skipped explicitly rather than via a blanket `except`.
                continue
            log_messages.append([match.group(header) for header in headers])

    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

def generate_logformat_regex(logformat):
    """Build a line-matching regex from a log format description.

    Example format: ``'<Date> <Time> <Pid> <Level> <Component>: <Content>'``.
    Every ``<Name>`` field becomes a lazy named group ``(?P<Name>.*?)``; the
    literal text between fields is kept as-is (so it may itself contain
    regex syntax such as ``\\[``), except that runs of spaces become ``\\s+``.

    Args:
        logformat: the format string.

    Returns:
        ``(headers, regex)``: the field names in order of appearance and the
        compiled pattern anchored with ``^`` and ``$``.
    """
    headers = []
    pattern_parts = []
    # re.split with a capture group alternates: literal, <Field>, literal, ...
    for k, piece in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if k % 2 == 0:
            # Raw string: the replacement template `\\s+` emits a literal
            # `\s+`, without relying on the invalid escape '\s' in a normal
            # string literal.
            pattern_parts.append(re.sub(' +', r'\\s+', piece))
        else:
            header = piece.strip('<').strip('>')
            pattern_parts.append('(?P<%s>.*?)' % header)
            headers.append(header)
    regex = re.compile('^' + ''.join(pattern_parts) + '$')
    return headers, regex

def load_data(logfile, logformat):
    """Read ``logfile`` and return it as a DataFrame laid out per ``logformat``."""
    headers, line_regex = generate_logformat_regex(logformat)
    return log_to_dataframe(logfile, line_regex, headers)

#### **1. CÁC PHƯƠNG THỨC SỬ DỤNG**

##### **1.1. TIỀN XỬ LÝ MỨC TOKEN**

In [4]:
# ============================ CLUSTERING TOKENs =========================== #
def hasNumbers(s):
    """Return True when at least one character of ``s`` is a digit."""
    for char in s:
        if char.isdigit():
            return True
    return False

# def isDynamicToken(token: str, special_tokens: set) -> bool:
#     """ Kiểm tra xem token có phải là token động hay không """

#     # 1. Kiểm tra nếu token.lower() nằm trong special_tokens
#     if token.lower() in special_tokens:             
#         return True

#     # 2. Kiểm tra nếu toàn bộ token là số HEX hợp lệ (ít nhất 8 chữ số hex)
#     if re.fullmatch(r'0x[0-9a-fA-F]+', token) or re.fullmatch(r'[0-9a-fA-F]{8,}', token):
#         return True

#     # 3. Kiểm tra nếu token là số (có thể âm, có thể thập phân)
#     if re.fullmatch(r'-?\d+(\.\d+)?', token):
#         return True

#     return False

##### **1.2. TIỀN XỬ LÝ MỨC SUBTOKEN**

In [None]:
# ============================ CLUSTERING SUB_TOKENs =========================== #
def splitSubToken(s, seps):
    """Split a token on separator strings while keeping ``<*>`` intact.

    Separators are kept in the output as their own items; pieces that are
    empty or whitespace-only are dropped. A ``<*>`` wildcard is never split,
    even when ``<``, ``*`` or ``>`` are separators, and it stays attached to
    adjacent non-separator text (``"a<*>b"`` is one piece).

    Args:
        s: the token to split.
        seps: iterable of separator strings (e.g. a set of punctuation chars).

    Returns:
        List of sub-tokens and separators, in order of appearance.
    """
    # Longest first so a multi-character separator wins over its own prefix
    # and the result does not depend on set iteration order.
    ordered_seps = sorted((sep for sep in seps if sep), key=len, reverse=True)
    if not ordered_seps:
        # An empty alternation would split between every character.
        return [s] if s.strip() != '' else []

    splitter = re.compile('(' + '|'.join(re.escape(sep) for sep in ordered_seps) + ')')

    # Protect wildcards by splitting around them instead of substituting a
    # textual placeholder: a placeholder can itself contain separator
    # characters (e.g. '~') and then gets torn apart and never restored.
    pieces = []
    current = ''
    for seg_idx, segment in enumerate(s.split('<*>')):
        if seg_idx > 0:
            current += '<*>'
        # With one capture group the result alternates: text, sep, text, ...
        parts = splitter.split(segment)
        current += parts[0]
        for j in range(1, len(parts), 2):
            pieces.append(current)
            pieces.append(parts[j])
            current = parts[j + 1]
    pieces.append(current)

    return [piece for piece in pieces if piece.strip() != '']

def processingSubToken(tok):
    """Decide whether a sub-token is a variable (True) or static text (False).

    Rules, applied in order:
      1. eight or more hex characters                        -> variable
      2. no digit at all                                     -> static
      3. a plain number (optional sign / decimal part)       -> variable
      4. more than one separate run of digits                -> variable
      5. exactly one "letters followed by digits" run that is followed by
         the end of the token or a non-alphanumeric character -> static
      6. anything else containing a digit                    -> variable
    """
    # 1. Long hexadecimal identifiers.
    if re.fullmatch(r'[0-9a-fA-F]{8,}', tok) is not None:
        return True

    # 2. Tokens without digits are never variables here.
    if not hasNumbers(tok):
        return False

    # 3. Integers / decimals, possibly negative.
    if re.fullmatch(r'-?\d+(\.\d+)?', tok) is not None:
        return True

    # 4. Several digit runs, e.g. "a1b2" or "1.2.3".
    if len(re.findall(r'\d+', tok)) > 1:
        return True

    # 5. A single name-like run such as "eth0" or "core1" counts as static
    #    when nothing alphanumeric follows it.
    word_number_runs = [m for m in re.finditer(r'[a-zA-Z]+[0-9]+', tok)]
    if len(word_number_runs) != 1:
        return True
    run_end = word_number_runs[0].end()
    ends_cleanly = run_end == len(tok) or not tok[run_end].isalnum()
    return not ends_cleanly
        
def mergeSpecialTok(token_str, seps):
    """Collapse neighbouring ``<*>`` wildcards inside one token string.

    Two wildcards are fused into one when they are adjacent or separated
    only by repetitions of a single separator from ``seps``
    (``<*>:<*>`` and ``<*>::<*>`` both become ``<*>``). The rewriting is
    repeated until the string stops changing.

    Returns:
        The collapsed token as a string.
    """
    alternatives = [re.escape(sep) for sep in seps]
    # <*> + (one separator repeated any number of times) + <*>
    bridged_pair = re.compile(r'(<\*>)((%s)\3*)(<\*>)' % '|'.join(alternatives))
    # Runs of directly adjacent wildcards.
    adjacent_run = re.compile(r'(<\*>)+')

    while True:
        collapsed = bridged_pair.sub(r'<*>', token_str)
        collapsed = adjacent_run.sub(r'<*>', collapsed)
        if collapsed == token_str:
            return collapsed
        token_str = collapsed

##### **1.3. TẠO DATAFRAME TỪ DỮ LIỆU**

In [6]:
# ========================= PROCESSING LOGS 2 DATAFRAME ========================= #
# def processLine(line, regexs, punctuationL, special_tokens=set()):
#     """ Phương thức hỗ trợ xử lý từng dòng log """
#     special_set = set(tok.lower() for tok in special_tokens)
    
#     tokensL = str(line["Content"]).strip().split()
    
#     new_tokens = []
#     idx_dynamic_token = []
#     static_tokenL = []
    
#     for idx_tok, token in enumerate(tokensL):
#         # 1. Xử lý token với các regex đã cho
#         for pattern, *replacement in regexs:
#             replacement = replacement[0] if replacement else "<*>"
#             token = re.sub(pattern, replacement, token)
        
#         # 2. Nếu tồn tại ký tự đặc biệt trong token, thì xử lý qua phương thức splitSubToken
#         sub_tokensL = splitSubToken(token, punctuationL)
#         for idx_sub, sub_token in enumerate(sub_tokensL):
#             if sub_token in special_set:
#                 sub_tokensL[idx_sub] = "<*>"
#                 continue
            
#             if processingSubToken(sub_token):
#                 sub_tokensL[idx_sub] = "<*>"
        
#         if len(sub_tokensL) <= 1:
#             new_tokens.append(sub_tokensL[0])
#         else:
#             new_tokens.append("<*>")
#             idx_dynamic_token.append(idx_tok)
#             static_tokenL.append(sub_tokensL)
    
#     # print("NEW TOKENS is: ")
#     # print(new_tokens)
#     # print("DYNAMIC IDX TOKEN is: ")
#     # print(idx_dynamic_token)
#     # print("STATIC TOKENS is:")
#     # print(static_tokenL)
    
#     groupTem_str = f"{' '.join(new_tokens)} : {len(new_tokens)} : {' '.join(str(idx) for idx in idx_dynamic_token)} : {' '.join([str(len(i)) for i in static_tokenL])}"

#     return pd.Series({
#         'GroupTemplate': hashlib.md5(groupTem_str.encode('utf-8')).hexdigest(),
#         'GroupTokens': new_tokens,
#         'idxDynamicTok': idx_dynamic_token,
#         'StaticTokList': static_tokenL,
#         'EventTemplate': f"{' '.join(new_tokens)}",
#     })

def processLine(line, regexs, punctuationL, special_tokens=set()):
    """Pre-process one log row into its grouping template.

    Steps:
      1. apply every ``[pattern, replacement]`` pair of ``regexs`` to the
         row's ``Content`` (replacement defaults to ``<*>``);
      2. split the result on whitespace into tokens;
      3. split each token into sub-tokens on ``punctuationL`` and mask a
         sub-token with ``<*>`` when it is a special token or when
         ``processingSubToken`` classifies it as a variable;
      4. a token made of a single sub-token is kept as that sub-token; a
         token made of several is replaced by ``<*>`` and its sub-token
         list is recorded for later refinement.

    Args:
        line: a row (mapping/Series) with a ``Content`` entry.
        regexs: iterable of ``[pattern]`` or ``[pattern, replacement]``.
        punctuationL: separator characters for sub-token splitting.
        special_tokens: words always treated as variables. Matching is
            case-insensitive.

    Returns:
        ``pd.Series`` with ``GroupTemplate`` (MD5 of the grouping signature),
        ``GroupTokens``, ``idxDynamicTok``, ``StaticTokList`` and
        ``EventTemplate``.
    """
    # Coerce before the regex pass so a non-string Content (e.g. NaN) cannot
    # make re.sub raise.
    content_str = str(line['Content'])
    for pattern, *replacement in regexs:
        replacement = replacement[0] if replacement else "<*>"
        content_str = re.sub(pattern, replacement, content_str)
    special_set = {tok.lower() for tok in special_tokens}

    new_tokens = []
    idx_dynamic_token = []
    static_tokenL = []

    for idx_tok, token in enumerate(content_str.strip().split()):
        sub_tokensL = splitSubToken(token, punctuationL)
        for idx_sub, sub_token in enumerate(sub_tokensL):
            # The special set is lower-cased, so the candidate must be too;
            # otherwise "True" / "NULL" would never match.
            if sub_token.lower() in special_set or processingSubToken(sub_token):
                sub_tokensL[idx_sub] = "<*>"

        if not sub_tokensL:
            # Defensive: keep the raw token if splitting yielded nothing.
            new_tokens.append(token)
        elif len(sub_tokensL) == 1:
            new_tokens.append(sub_tokensL[0])
        else:
            new_tokens.append("<*>")
            idx_dynamic_token.append(idx_tok)
            static_tokenL.append(sub_tokensL)

    # Grouping signature: template text, token count, positions of composite
    # tokens, and how many sub-tokens each composite token has.
    groupTem_str = f"{' '.join(new_tokens)} : {len(new_tokens)} : {' '.join(str(idx) for idx in idx_dynamic_token)} : {' '.join([str(len(i)) for i in static_tokenL])}"

    return pd.Series({
        'GroupTemplate': hashlib.md5(groupTem_str.encode('utf-8')).hexdigest(),
        'GroupTokens': new_tokens,
        'idxDynamicTok': idx_dynamic_token,
        'StaticTokList': static_tokenL,
        'EventTemplate': f"{' '.join(new_tokens)}",
    })
    
def regexAndCreateDf(datasets, special_tokens=['true', 'false'], punctuationL = set('(),<>:;{}[]~=')):
    """Load a dataset's log file and run token-level pre-processing on it.

    Args:
        datasets: settings dict providing ``log_file``, ``log_format`` and
            ``token_regexs``.
        special_tokens: words that are always masked as variables.
        punctuationL: separator characters used to split tokens.

    Returns:
        A copy of the parsed log DataFrame extended with the columns
        ``GroupTemplate``, ``GroupTokens``, ``idxDynamicTok``,
        ``StaticTokList`` and ``EventTemplate`` produced by ``processLine``.
    """
    parse_df = load_data(datasets['log_file'], datasets['log_format']).copy()

    # Create the result columns up front so their order is fixed.
    row_count = len(parse_df)
    parse_df['GroupTemplate'] = ""          # template hash used for grouping
    for list_column in ('GroupTokens',      # token list of the group template
                        'idxDynamicTok',    # positions of composite tokens
                        'StaticTokList'):   # sub-token lists at those positions
        parse_df[list_column] = [[] for _ in range(row_count)]
    parse_df['EventTemplate'] = ""          # final template after processing

    tqdm.pandas(desc="Tiền xử lý ở mức TOKEN!")
    lowered_specials = set(s.lower() for s in special_tokens)

    def _process_row(row):
        return processLine(row, datasets['token_regexs'], punctuationL, lowered_specials)

    results = parse_df.progress_apply(_process_row, axis=1)

    # Copy every produced column back onto the frame.
    for column_name in results.columns:
        parse_df[column_name] = results[column_name]

    return parse_df

##### **1.4. TẠO CÁC PHÂN CỤM CHO DỮ LIỆU**

In [7]:
# ================================ TẠO CÁC NHÓM GROUP ================================ #
def generateStaticSubToken(group_staticL, n_merge=3):
    """Generalise sub-tokens that vary across the rows of one group.

    ``group_staticL`` holds one entry per log line; each entry is a list of
    "layers" (one per composite token) and each layer is a list of
    sub-tokens. For every layer and sub-token position, the values are
    compared across all rows; the position is replaced by ``<*>`` in every
    row when it shows at least ``n_merge`` distinct values, or when it shows
    more than one value and one of them is already ``<*>``.

    Args:
        group_staticL: list of rows as described above. Not modified.
        n_merge: minimum number of distinct values that triggers masking.

    Returns:
        A deep copy of ``group_staticL`` with the generalised positions
        masked. An empty input yields an empty list.
    """
    if not group_staticL:
        # Nothing to compare; the original indexed [0] and raised here.
        return []

    generalized = deepcopy(group_staticL)

    for layer_idx in range(len(group_staticL[0])):
        # Same layer taken from every row, then transposed so that each
        # item is the column of values at one sub-token position.
        layer_rows = [row[layer_idx] for row in group_staticL]
        for subtok_idx, column in enumerate(zip(*layer_rows)):
            distinct = set(column)
            if len(distinct) >= n_merge or (len(distinct) > 1 and "<*>" in distinct):
                for row in generalized:
                    row[layer_idx][subtok_idx] = "<*>"

    return generalized

def _makeLogCluster(tokens, log_ids):
    """Build a LogCluster whose key is the MD5 of its space-joined template."""
    logTemplate = " ".join(tokens)
    return LogCluster(
        keyGroup=hashlib.md5(logTemplate.encode('utf-8')).hexdigest(),
        logTemplate=logTemplate,
        tokens=list(tokens),
        length=len(tokens),
        logIDL=list(log_ids),
    )


def createGroupClust(parse_df, punctuationL, n_merge = 3, merge_special=False):
    """Turn the pre-processed rows into initial LogCluster objects.

    Rows are grouped by ``GroupTemplate``. A group without composite tokens
    becomes one cluster. Otherwise the groups' sub-token lists are
    generalised with ``generateStaticSubToken`` and the rows are split by
    the resulting sub-token shape; each shape yields one cluster whose
    composite tokens are rebuilt by concatenating the sub-tokens.

    Args:
        parse_df: DataFrame holding ``GroupTemplate``, ``GroupTokens``,
            ``idxDynamicTok`` and ``StaticTokList`` columns. Not modified.
        punctuationL: separators passed to ``mergeSpecialTok``.
        n_merge: threshold passed to ``generateStaticSubToken``.
        merge_special: when True, collapse neighbouring wildcards inside a
            rebuilt token with ``mergeSpecialTok``.

    Returns:
        List of LogCluster; ``logIDL`` holds index labels of ``parse_df``.
    """
    log_clusters_list = []

    unique_groups = parse_df.groupby("GroupTemplate")
    print(len(unique_groups))  # number of groups before refinement

    for _, group_val in unique_groups:
        first_row = group_val.iloc[0]
        # Copy: the list object lives inside the DataFrame cell, and writing
        # into it would silently rewrite that row's GroupTokens.
        base_tokens = list(first_row['GroupTokens'])
        dynamic_positions = first_row["idxDynamicTok"]

        if len(dynamic_positions) == 0:
            # No composite token: the whole group is one cluster.
            log_clusters_list.append(_makeLogCluster(base_tokens, group_val.index.tolist()))
            continue

        group_idL = group_val.index.tolist()
        process_staticL = generateStaticSubToken(group_val['StaticTokList'].to_list(), n_merge)

        # Bucket rows by their generalised sub-token shape. A tuple key is
        # hashable and needs no str()/eval() round trip on log-derived text.
        ids_by_shape = defaultdict(list)
        for line_id, row in zip(group_idL, process_staticL):
            shape_key = tuple(tuple(layer) for layer in row)
            ids_by_shape[shape_key].append(line_id)

        for shape_key, ids in ids_by_shape.items():
            tokens = list(base_tokens)
            for layer_idx, position in enumerate(dynamic_positions):
                rebuilt = "".join(shape_key[layer_idx])
                if merge_special:
                    rebuilt = mergeSpecialTok(rebuilt, punctuationL)
                tokens[position] = rebuilt
            log_clusters_list.append(_makeLogCluster(tokens, ids))

    return log_clusters_list

##### **1.5. NHÓM CÁC PHÂN CỤM**

In [8]:
# ======================================= TẠO CLASS ======================================= #
class MergeGroupTemplate:
    """Merge similar log clusters into more general templates (Drain-style).

    Attributes:
        ST: similarity threshold a cluster must reach to join a group.
        N_MERGE: minimum number of distinct tokens at one position before
            that position is generalised by sub-token comparison.
        TEMPLATE_GR: current list of clusters; replaced by ``mergeGroup``.
        punctuationL: separators used when splitting tokens into sub-tokens.
    """

    def __init__(self, st=0.6, n_merge=3, template_gr=None, punctuationL=set()):
        self.ST = st
        self.N_MERGE = n_merge
        self.TEMPLATE_GR = template_gr if template_gr is not None else []
        self.punctuationL = punctuationL

    def similarySeq(self, seq1, seq2):
        """Compare two equal-length token sequences.

        Wildcard positions of ``seq1`` are counted but never count as equal.

        Returns:
            ``(similarity, num_wildcards)`` where similarity is the share of
            positions with identical non-wildcard tokens.
        """
        assert len(seq1) == len(seq2)
        if len(seq1) == 0:
            # Two empty sequences are identical; avoids dividing by zero.
            return 1.0, 0

        simTokens = 0
        numOfPar = 0
        for token1, token2 in zip(seq1, seq2):
            if token1 == "<*>":
                numOfPar += 1
                continue
            if token1 == token2:
                simTokens += 1

        return float(simTokens) / len(seq1), numOfPar

    def fastMatchCLuster(self, seqGroupL, seq):
        """Return the group most similar to ``seq``, or None below ``ST``.

        Each group is represented by its first member. Ties on similarity
        are broken in favour of the representative with more wildcards.
        """
        best_group = None
        best_sim = -1
        best_num_par = -1

        for gr in seqGroupL:
            cur_sim, cur_num_par = self.similarySeq(gr[0].tokens, seq.tokens)
            if cur_sim > best_sim or (cur_sim == best_sim and cur_num_par > best_num_par):
                best_sim = cur_sim
                best_num_par = cur_num_par
                best_group = gr

        return best_group if best_sim >= self.ST else None

    @staticmethod
    def _wildcard2Regex(pattern_str):
        """Translate a token containing ``<*>`` into an anchored regex."""
        # Escape the literal parts and let every wildcard match anything.
        parts = pattern_str.split('<*>')
        return '^' + '.*'.join(re.escape(p) for p in parts) + '$'

    def findGeneralToken(self, strings):
        """Return the first candidate whose wildcard pattern matches all the
        other strings, or None when no such candidate exists."""
        strings = list(strings)

        for candidate in strings:
            regex = self._wildcard2Regex(candidate)
            if all(re.fullmatch(regex, s) for s in strings if s != candidate):
                return candidate

        return None

    def _mergeBySubTokens(self, unique_token):
        """Generalise differing tokens by comparing their sub-tokens.

        Returns ``<*>`` when the tokens split into different numbers of
        sub-tokens; otherwise a token where every differing sub-token
        position is ``<*>`` and consecutive wildcards are collapsed.
        """
        sub_tokensL = [splitSubToken(token, self.punctuationL) for token in unique_token]
        if len(set(len(sub_token) for sub_token in sub_tokensL)) > 1:
            return "<*>"

        merged_parts = []
        for col_sub in zip(*sub_tokensL):
            unique_sub = set(col_sub)
            merged_parts.append("<*>" if len(unique_sub) > 1 else next(iter(unique_sub)))
        merged = "".join(merged_parts)
        while "<*><*>" in merged:
            merged = merged.replace("<*><*>", "<*>")
        return merged

    def generalizeGroup(self, group):
        """Build common templates for a group of similar clusters.

        For every token position with more than one distinct value:
          * if one value is ``<*>`` the position becomes ``<*>``;
          * else if one value generalises all others (see
            ``findGeneralToken``) that value is used;
          * else if there are at least ``N_MERGE`` distinct values the
            position is generalised at sub-token level;
          * otherwise the position is left untouched.
        Clusters that end up with identical token lists are merged: the
        first one is kept and receives the log ids of all of them.

        The clusters in ``group`` are modified in place.
        """
        mask_positions = {}                 # position -> replacement token
        tokensL = [s.tokens for s in group]

        for idx, col in enumerate(zip(*tokensL)):
            unique_token = set(col)
            if len(unique_token) <= 1:
                continue
            if "<*>" in unique_token:
                mask_positions[idx] = "<*>"
                continue
            # Sorted so the chosen candidate does not depend on set order.
            common_token = self.findGeneralToken(sorted(unique_token))
            if common_token is not None:
                mask_positions[idx] = common_token
            elif len(unique_token) >= self.N_MERGE:
                mask_positions[idx] = self._mergeBySubTokens(unique_token)

        # Apply the masks to every cluster of the group.
        for seq in group:
            seq.tokens = [mask_positions[i] if i in mask_positions else token
                          for i, token in enumerate(seq.tokens)]
            new_template = " ".join(seq.tokens)
            if new_template != seq.logTemplate:
                seq.logTemplate = new_template
                # Keep the key in sync with the template it identifies.
                seq.keyGroup = hashlib.md5(new_template.encode('utf-8')).hexdigest()

        # Regroup clusters that now share the same pattern.
        pattern_dict = defaultdict(list)
        for seq in group:
            pattern_dict[tuple(seq.tokens)].append(seq)

        result = []
        for values in pattern_dict.values():
            if len(values) != 1:
                logIDL = []
                for x in values:
                    logIDL.extend(x.logIDL)
                values[0].logIDL = logIDL
            result.append(values[0])
        return result

    def mergeGroup(self, printL=False):
        """Merge the clusters in ``TEMPLATE_GR`` and store/return the result.

        Clusters are bucketed by token count; inside a bucket each cluster
        joins the most similar existing group (threshold ``ST``) or starts a
        new one, and every group with several members is generalised.

        Args:
            printL: when True, print the resulting templates.
        """
        grouped_by_length = defaultdict(list)
        for cluster in self.TEMPLATE_GR:
            grouped_by_length[cluster.length].append(cluster)

        newClusterGroupsL = []

        for groups_len in grouped_by_length.values():
            groupsSimTemL = []
            for log_clust in groups_len:
                matched_gr = self.fastMatchCLuster(groupsSimTemL, log_clust)
                if matched_gr is not None:
                    matched_gr.append(log_clust)
                else:
                    groupsSimTemL.append([log_clust])
            for group in groupsSimTemL:
                if len(group) == 1:
                    newClusterGroupsL.extend(group)
                else:
                    newClusterGroupsL.extend(self.generalizeGroup(group))

        self.TEMPLATE_GR = newClusterGroupsL

        if printL:
            self.printList()

        return newClusterGroupsL

    def printList(self):
        """Print the number of clusters, then each template sorted by
        (length, template text)."""
        print(len(self.TEMPLATE_GR))

        sorted_list = sorted(self.TEMPLATE_GR, key=lambda log: (log.length, log.logTemplate))
        for e in sorted_list:
            print(f"{e.length:3} {e.logTemplate}")
# ======================================= END CLASS ======================================= #

#### **2. LÀM VIỆC CHÍNH**

##### **2.0. PARAMETER DATASET**

In [9]:
# SETTING_PARAMS_TEST = {
#     'Apache': {
#         'log_file': './logs2k/Apache/Apache_2k.log',
#         'log_template': './logs2k/Apache/Apache_2k.log_templates.csv',
#         'log_structure': './logs2k/Apache/Apache_2k.log_structured_corrected.csv',
#         'log_format': '\[<Time>\] \[<Level>\] <Content>',
#         'token_regexs': [
#             [r'\/(?:\w+\/){2,}\w+\.\w+$', "<*>"],
#             [r'\/(?:[^\/\s]+\/)*[^\/\s]*', "<*>"],
#             [r'(?:[0-9a-fA-F]{2,}:){3,}[0-9a-fA-F]{2,}', "<*>"],
#         ],   
#         'n_merge': 3,
#         'st':0.6,
#         'merge_special': True,  
#         'punctuationL': "()\{\}[]=:,@_",
#         'special_tokens': ['true', 'false', 'null', 'root'],
        
#     },
#     'BGL': {
#             'log_file': './logs2k/BGL/BGL_2k.log',
#             'log_template': './logs2k/BGL/BGL_2k.log_templates_corrected.csv',
#             'log_structure': './logs2k/BGL/BGL_2k.log_structured_corrected.csv',
#             'log_format': '<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>',
#             'token_regexs': [
#                 [r"core\.\d+", "core.<*>"],
#                 [r'(?:[0-9a-fA-F]{2,}:){3,}[0-9a-fA-F]{2,}', "<*>"],
#                 [r'(\.{2,})\d+', r'\1<*>'],
#             ],  
#             'n_merge': 4,
#             'st':0.7,
#             'merge_special': False,  
#             'punctuationL': "[]<>()\{\}=:,@",
#             'special_tokens': ['true', 'false', 'null', 'root'],  
#         },
#     'Hadoop': {
#         'log_file': './logs2k/Hadoop/Hadoop_2k.log',
#         'log_template': './logs2k/Hadoop/Hadoop_2k.log_templates.csv',
#         'log_structure': './logs2k/Hadoop/Hadoop_2k.log_structured_corrected.csv',
#         'log_format': '<Date> <Time> <Level> \[<Process>\] <Component>: <Content>',
#         'token_regexs': [
#             # [r'\[.*?(_.*?)+\]', "<*>"],
#             [ r'^(?:[\\\/]?[^\\\/]+[\\\/]){2,}[^\\\/]+\.\w+$', "<*>"],
#         ],   
#         'n_merge': 3,
#         'st':0.6,
#         'merge_special': False,  
#         'punctuationL': '[]<>(){}=:,@#/',
#         'special_tokens': ['true', 'false', 'null', 'root'], 
#     },
#     'HDFS': {
#             'log_file': './logs2k/HDFS/HDFS_2k.log',
#             'log_template': './logs2k/HDFS/HDFS_2k.log_templates.csv',
#             'log_structure': './logs2k/HDFS/HDFS_2k.log_structured_corrected.csv',
#             'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
#             'token_regexs': [
#             ],    
#             'n_merge': 3,
#             'st':0.6,
#             'merge_special': False,  
#             'punctuationL': '[]<>(){}=:,@#',
#             'special_tokens': ['true', 'false', 'null', 'root'], 
#     },
#     'HealthApp': {
#         'log_file': './logs2k/HealthApp/HealthApp_2k.log',
#         'log_template': './logs2k/HealthApp/HealthApp_2k.log_templates.csv',
#         'log_structure': './logs2k/HealthApp/HealthApp_2k.log_structured_corrected.csv',
#         'log_format': '<Time>\|<Component>\|<Pid>\|<Content>',
#         'token_regexs': [
            
#         ],  
#         'n_merge': 4,
#         'st':0.6,
#         'merge_special': False,  
#         'punctuationL': '[]<>(){}=:,#',
#         'special_tokens': ['true', 'false', 'null', 'root'], 
#     },
# }

In [10]:
# Per-dataset settings consumed by mainDrainDS and the evaluation cell.
#   log_file / log_template / log_structure: input and ground-truth paths
#   log_format:     line layout, turned into a regex by generate_logformat_regex
#   token_regexs:   [pattern, replacement] pairs applied to each Content
#   n_merge, st:    merge threshold and similarity threshold
#   merge_special:  collapse neighbouring wildcards inside a token
#   punctuationL:   separator characters for sub-token splitting
#   special_tokens: words always treated as variables
SETTING_PARAMS_TEST = {
    'Proxifier': {
        'log_file': './logs2k/Proxifier/Proxifier_2k.log',
        'log_template': './logs2k/Proxifier/Proxifier_2k.log_templates.csv',
        'log_structure': './logs2k/Proxifier/Proxifier_2k.log_structured_corrected.csv',
        # Raw string: same value as before, without invalid '\[' escapes.
        'log_format': r'\[<Time>\] <Program> - <Content>',
        'token_regexs': [
            [r'<\d+\ssec', "<*>"],
            # Sizes such as "(1.19 KB)"; the dot is now literal and the
            # fractional part optional, so "(12 KB)" is masked as well.
            [r'\(\d+(?:\.\d+)? [KGTM]B\)', "(<*>)"],
            [r'(?:\b|^)[\w.-]+\.cuhk\.edu\.hk', "<*>"],
        ],
        'n_merge': 4,
        'st':0.6,
        'merge_special': False,
        'punctuationL': '[]<>(){}=:,#',
        'special_tokens': ['true', 'false', 'null', 'root'],
    },
}

##### **2.1. LUỒNG LÀM VIỆC**

In [11]:
# DICT_SPECIAL_TOKEN = ['true', 'false', 'null', 'root']
# PUNCTUATIONL = '(){}[]=:;,#$@'

# N_MERGE = 4
# ST = 0.6

def mainDrainDS(SETTING_PARAMS, output_dir="./res/DrainDS/"):
    """Run the whole parsing pipeline for every dataset in the settings.

    For each dataset: pre-process the log, build the initial clusters, merge
    similar clusters, write the final template of every line back into the
    ``EventTemplate`` column and save ``<name>_structured.csv``.

    Args:
        SETTING_PARAMS: mapping of dataset name -> settings dict (see
            SETTING_PARAMS_TEST for the expected keys).
        output_dir: directory receiving the CSV files; created if missing.
    """
    # to_csv does not create directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    for name_dataset, dataset in SETTING_PARAMS.items():
        print('\n================ Processing on %s =====================' % name_dataset)
        start_time = time.time()
        PUNCTUATION_CHAR = set(dataset['punctuationL'])
        SPECIAL_TOKENS = set(dataset['special_tokens'])
        N_MERGE = dataset['n_merge']
        ST = dataset['st']

        # ============== TOKEN-LEVEL PRE-PROCESSING ============== #
        parse_df = regexAndCreateDf(dataset, special_tokens=SPECIAL_TOKENS, punctuationL=PUNCTUATION_CHAR)

        # ============== CREATE GROUPS =============== #
        log_clusters_list = createGroupClust(parse_df, PUNCTUATION_CHAR, n_merge=N_MERGE, merge_special=dataset['merge_special'])

        # ============== MERGE TEMPLATES ============= #
        # Same idea as Drain: merge clusters whose token sequences are similar.
        merge_group = MergeGroupTemplate(st=ST, n_merge=N_MERGE, template_gr=log_clusters_list, punctuationL=PUNCTUATION_CHAR)
        new_groupL = merge_group.mergeGroup(printL=False)
        print("NUM of GROUP: ", len(new_groupL))
        for val in new_groupL:
            print(val)

        # logIDL holds index labels of parse_df, so .loc assigns per cluster.
        for item in merge_group.TEMPLATE_GR:
            parse_df.loc[item.logIDL, "EventTemplate"] = item.logTemplate
        parse_df.to_csv(os.path.join(output_dir, name_dataset+"_structured.csv"), index=False)
        elapsed_time = time.time() - start_time
        print(f"Hoàn thành xong {name_dataset}: ", elapsed_time)
        print("-"*80)

mainDrainDS(SETTING_PARAMS_TEST)




Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 4491.24it/s]

16
NUM of GROUP:  11
Key: c8855f5bd756e07b826b967f2a885a2e
Template: <*>:<*> error : A connection request was canceled before the completion.
Tokens: ['<*>:<*>', 'error', ':', 'A', 'connection', 'request', 'was', 'canceled', 'before', 'the', 'completion.']
Length: 11
Len LogIDs: 31

Key: dfec893318f4a9ce7787e0d3919d7a1b
Template: <*>:<*> close, <*> bytes (<*>) sent, <*> bytes received, lifetime <*>
Tokens: ['<*>:<*>', 'close,', '<*>', 'bytes', '(<*>)', 'sent,', '<*>', 'bytes', 'received,', 'lifetime', '<*>']
Length: 11
Len LogIDs: 121

Key: 7238cc078b651c98f0afb6809b642ffe
Template: <*>:<*> close, <*> bytes sent, <*> bytes (<*>) received, lifetime <*>
Tokens: ['<*>:<*>', 'close,', '<*>', 'bytes', 'sent,', '<*>', 'bytes', '(<*>)', 'received,', 'lifetime', '<*>']
Length: 11
Len LogIDs: 168

Key: 451b6141564a3eb9a1d78f5e0fdb7fb5
Template: <*>:<*> error : Could not connect through proxy <*>:<*> - Proxy server cannot establish a connection with the target, status code <*>
Tokens: ['<*>:<*>'




##### **2.2. ĐÁNH GIÁ BENCHMARK**

In [12]:
from tqdm import tqdm

from evaluation.settings import benchmark_settings
from evaluation.utils.common import common_args
from evaluation.utils.evaluator_main import *
from evaluation.utils.postprocess import post_average

from evaluation.utils.GA_calculator import evaluate
from evaluation.utils.template_level_analysis import evaluate_template_level
from evaluation.utils.PA_calculator import calculate_parsing_accuracy

import importlib
import evaluation.utils.evaluator_main as evaluator_main
importlib.reload(evaluator_main)

# Benchmark the parser output against the ground truth of every dataset in
# SETTING_PARAMS_TEST and append one CSV line per dataset to the result file.

# Remove a previous result file so rows from an earlier run are not kept.
file_path = './benchmark/parsing_accuracy.csv'
if os.path.exists(file_path):
    os.remove(file_path)
# NOTE(review): prepare_results presumably creates the result file with its
# header and returns the file name — confirm in evaluation.utils.evaluator_main.
result_file = evaluator_main.prepare_results(output_dir="./benchmark")
for name_dataset, dataset_setting in SETTING_PARAMS_TEST.items():
    print('\n================ Evaluation on %s =====================' % name_dataset)
    # Both frames are read as strings so templates are compared textually.
    groundtruth = pd.read_csv(dataset_setting["log_structure"], dtype=str)
    
    # Output written by mainDrainDS for this dataset.
    parsedresult = os.path.join("./res/DrainDS/", name_dataset + "_structured.csv")
    parsedresult = pd.read_csv(parsedresult, dtype=str)
    parsedresult.fillna("", inplace=True)
    
    tqdm.pandas()
    print("Start to align with null values")
    # NOTE(review): align_with_null_values is not defined in this notebook;
    # presumably it comes from the star import of evaluator_main — confirm.
    groundtruth['EventTemplate'] = groundtruth.progress_apply(align_with_null_values, axis=1)
    # groundtruth['EventTemplate'] = groundtruth['EventTemplate'].map(correct_template_general)
    parsedresult['EventTemplate'] = parsedresult.progress_apply(align_with_null_values, axis=1)
    
    # No template filtering: every template is evaluated.
    filter_templates = None
    
    # =============== BENCHMARK GA =============== #
    start_time = time.time()
    GA, FGA = evaluate(groundtruth, parsedresult, filter_templates)

    GA_end_time = time.time() - start_time
#     print('Grouping Accuracy calculation done. [Time taken: {:.3f}]'.format(GA_end_time))
    
    # =============== BENCHMARK PA =============== #
    start_time = time.time()
    PA = calculate_parsing_accuracy(groundtruth, parsedresult, filter_templates)
    
    # Print every row whose parsed template differs from the ground truth.
    # correctly_parsed_messages = parsedresult[['EventTemplate']].eq(truth_template[['EventTemplate']]).values.sum()
    mismatch_mask = ~parsedresult[['EventTemplate']].eq(groundtruth[['EventTemplate']]).squeeze()
    mismatch_indices = parsedresult.index[mismatch_mask].tolist()
    for idx in mismatch_indices:
        trust_template = groundtruth.at[idx, 'EventTemplate']
        parsed_template = parsedresult.at[idx, 'EventTemplate']
        print(f"Row index: {idx}")
        print(f"  Trust: {trust_template}")
        print(f"  Parse: {parsed_template}")
        print("-" * 40)
    print(f"NUM of FAILURE PA: {len(mismatch_indices)}")
    print("="*45 + " END " + "="*45 + "\n")
        
    # This elapsed time also covers the mismatch printing above.
    PA_end_time = time.time() - start_time
#     print('Parsing Accuracy calculation done. [Time taken: {:.3f}]'.format(PA_end_time))

    # # =============== BENCHMARK TEMPLATE-LEVEL-ACCURACY =============== #
    start_time = time.time()
    identified_templates, ground_templates, FTA, PTA, RTA = evaluate_template_level(name_dataset, groundtruth, parsedresult, filter_templates)
    
    TA_end_time = time.time() - start_time
#     print('Template-level accuracy calculation done. [Time taken: {:.3f}]'.format(TA_end_time))

    # One CSV line: dataset, identified templates, ground-truth templates,
    # GA, PA, FGA, PTA, RTA, FTA (metrics formatted with three decimals).
    result = name_dataset + ',' + \
            str(identified_templates) + ',' + \
            str(ground_templates) + ',' + \
            "{:.3f}".format(GA) + ',' + \
            "{:.3f}".format(PA) + ',' + \
            "{:.3f}".format(FGA) + ',' + \
            "{:.3f}".format(PTA) + ',' + \
            "{:.3f}".format(RTA) + ',' + \
            "{:.3f}".format(FTA) + '\n'

    # Append so results of several datasets accumulate in the same file.
    with open(os.path.join("./benchmark", result_file), 'a') as summary_file:
        summary_file.write(result)

# Show the accumulated results of all evaluated datasets.
result_df = pd.read_csv("./benchmark/parsing_accuracy.csv")
print(result_df)


Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 72981.23it/s]
100%|██████████| 2000/2000 [00:00<00:00, 81629.83it/s]
100%|██████████| 8/8 [00:00<?, ?it/s]


Grouping_Accuracy (GA): 0.5265, FGA: 0.7368,
Parsing_Accuracy (PA): 0.5135
Row index: 3
  Trust: <*> close, <*> <*>, <*> <*>, lifetime <*>
  Parse: <*>:<*> close, <*> bytes sent, <*> bytes received, lifetime <*>
----------------------------------------
Row index: 7
  Trust: <*> close, <*> <*>, <*> <*>, lifetime <*>
  Parse: <*>:<*> close, <*> bytes sent, <*> bytes received, lifetime <*>
----------------------------------------
Row index: 10
  Trust: <*> close, <*> <*>, <*> <*>, lifetime <*>
  Parse: <*>:<*> close, <*> bytes sent, <*> bytes (<*>) received, lifetime <*>
----------------------------------------
Row index: 11
  Trust: <*> close, <*> <*>, <*> <*>, lifetime <*>
  Parse: <*>:<*> close, <*> bytes sent, <*> bytes (<*>) received, lifetime <*>
----------------------------------------
Row index: 13
  Trust: <*> close, <*> <*>, <*> <*>, lifetime <*>
  Parse: <*>:<*> close, <*> bytes (<*>) sent, <*> bytes (<*>) received, lifetime <*>
----------------------------------------
Row inde

100%|██████████| 11/11 [00:00<?, ?it/s]

PTA: 0.5455, RTA: 0.7500 FTA: 0.6316
Identify : 11, Groundtruth : 8
     Dataset  parse_gr  truth_gr     GA     PA    FGA    PTA   RTA    FTA
0  Proxifier        11         8  0.526  0.513  0.737  0.545  0.75  0.632





##### **2.4. BẢNG ĐIỂM**

In [13]:
# Load the per-dataset scores and append an "Average" and a "Std" row.
result_df = pd.read_csv("./benchmark/parsing_accuracy.csv")
# Only numeric columns take part in the mean / standard deviation.
numeric_cols = result_df.select_dtypes(include='number').columns

# Mean of every numeric column; the count columns are blanked afterwards.
avg_row = result_df[numeric_cols].mean().round(3)
for column_name, cell_value in (('Dataset', 'Average'), ('parse_gr', ''), ('truth_gr', '')):
    avg_row[column_name] = cell_value

# Standard deviation, labelled and blanked the same way.
std_row = result_df[numeric_cols].std().round(3)
for column_name, cell_value in (('Dataset', 'Std'), ('parse_gr', ''), ('truth_gr', '')):
    std_row[column_name] = cell_value

# Append both summary rows below the per-dataset rows.
summary_rows = pd.DataFrame([avg_row, std_row])
result_df = pd.concat([result_df, summary_rows], ignore_index=True)
print(result_df)

     Dataset parse_gr truth_gr     GA     PA    FGA    PTA   RTA    FTA
0  Proxifier       11        8  0.526  0.513  0.737  0.545  0.75  0.632
1    Average                    0.526  0.513  0.737  0.545  0.75  0.632
2        Std                      NaN    NaN    NaN    NaN   NaN    NaN


##### **2.5. SO SÁNH LỖI SAI**

In [14]:
def compare_templates(datasets, parse_df):
    """Print the ground-truth templates that the parser did not reproduce exactly.

    For each distinct ground-truth ``EventTemplate`` the parsed templates found
    on the same row labels are collected. A template is reported as a
    difference when those rows map to more than one parsed template, or to a
    single parsed template that is not identical to the ground truth.

    Args:
        datasets: Settings mapping of ONE dataset. Only
            ``datasets['log_structure']`` is read: the path of the ground-truth
            structured CSV, which must have ``Content`` and ``EventTemplate``
            columns.
        parse_df: Parser output with an ``EventTemplate`` column. It must be
            indexed by the same row labels as the ground-truth file, because
            rows are matched with ``parse_df.loc``.

    Returns:
        None. The report is written to stdout.
    """
    # Read every column as text and blank out empty cells. Otherwise an empty
    # Content/EventTemplate cell becomes a float NaN, which breaks the string
    # join / split below and can never equal a parsed template string.
    structured_df = pd.read_csv(datasets['log_structure'], dtype=str).fillna("")
    unique_templates = structured_df['EventTemplate'].unique()
    print(f"SHAPE: {structured_df.shape}")
    print(f"SHAPE PARSER: {parse_df.shape}")
    print(f"Num of truth templates: {len(unique_templates)}")

    # One record per ground-truth template. Templates are already unique, so a
    # plain list is enough; a hash() key could collide and silently drop one.
    records = []
    for template, rows in structured_df.groupby('EventTemplate', sort=False):
        arr_index = rows.index.tolist()
        parse_template_unique = parse_df.loc[arr_index, 'EventTemplate'].unique().tolist()
        # Up to five raw messages are shown as examples for the template
        content_list = rows['Content'].head(5).tolist()
        records.append({
            'ground_truth': template,
            'parse': parse_template_unique,
            'content_lst': "[\n\t" + "\n\t".join(content_list) + "\n]",
            'length': len(template.strip().split()),
            'nums': len(arr_index),
        })

    # Report order: shortest templates first, ties broken alphabetically
    records.sort(key=lambda rec: (rec['length'], rec['ground_truth']))

    num_dif = 0
    for idx, value in enumerate(records, 1):
        is_exact = len(value['parse']) == 1 and value['parse'][0] == value['ground_truth']
        if is_exact:
            continue
        num_dif += 1
        print(f"No. {idx}")
        print(f"Length: {value['length']}, Nums: {value['nums']}")
        print(f"Ground truth  : {value['ground_truth']}")
        print(f"Parse templs  : {value['parse']}")
        print(f"Content List: {value['content_lst']}")
        print(f"Length parse: {len(value['parse'])}")
        print("-" * 40)
    print(f"Total differences found: {num_dif}")
    
# Directory that holds the parser outputs, one "<dataset>_structured.csv" per dataset
result_path_dir = "./res/DrainDS/"
# Only datasets named here are compared; any other entry of SETTING_PARAMS_TEST is skipped
choose_dataset = ["Apache", "BGL", "Hadoop", "HDFS", "HealthApp", "HPC", "Linux", "Mac", "OpenSSH", "OpenStack", "Proxifier", "Spark", "Thunderbird", "Zookeeper"]
for name_dataset, dataset_setting in SETTING_PARAMS_TEST.items():
    if name_dataset not in choose_dataset:
        continue
    print("="*40 + f" COMPARE {name_dataset} " + "="*40)
    parsed_path = os.path.join(result_path_dir, name_dataset + "_structured.csv")
    # Load every column as text and replace missing cells with "" so templates
    # are compared as plain strings
    parsedresult = pd.read_csv(parsed_path, dtype=str).fillna("")

    # The ground truth is not loaded here: compare_templates reads it itself
    # from dataset_setting["log_structure"]
    compare_templates(dataset_setting, parsedresult)
    print("="*45 + " END " + "="*45 + "\n")

SHAPE: (2000, 6)
SHAPE PARSER: (2000, 9)
Num of truth templates: 8
No. 3
Length: 8, Nums: 947
Ground truth  : <*> close, <*> <*>, <*> <*>, lifetime <*>
Parse templs  : ['<*>:<*> close, <*> bytes sent, <*> bytes received, lifetime <*>', '<*>:<*> close, <*> bytes sent, <*> bytes (<*>) received, lifetime <*>', '<*>:<*> close, <*> bytes (<*>) sent, <*> bytes (<*>) received, lifetime <*>', '<*>:<*> close, <*> bytes (<*>) sent, <*> bytes received, lifetime <*>']
Content List: [
	proxy.cse.cuhk.edu.hk:5070 close, 0 bytes sent, 0 bytes received, lifetime 00:01
	proxy.cse.cuhk.edu.hk:5070 close, 403 bytes sent, 426 bytes received, lifetime <1 sec
	proxy.cse.cuhk.edu.hk:5070 close, 451 bytes sent, 18846 bytes (18.4 KB) received, lifetime <1 sec
	proxy.cse.cuhk.edu.hk:5070 close, 445 bytes sent, 5174 bytes (5.05 KB) received, lifetime <1 sec
	proxy.cse.cuhk.edu.hk:5070 close, 1190 bytes (1.16 KB) sent, 1671 bytes (1.63 KB) received, lifetime 00:02
]
Length parse: 4
-------------------------------

In [15]:
# NOTE: this whole cell is disabled (every line below is commented out).
# If re-enabled, it walks the selected datasets and, for every row whose parsed
# EventTemplate differs from the ground-truth EventTemplate at the same row,
# prints the row index followed by both templates
# ("Trust" = ground-truth template, "Parse" = parser template).
# NOTE(review): it prints one entry per mismatching row, so the output can be
# very long; consider deleting the cell or capping the number of printed rows.
# result_path_dir = "./res/DrainDS/"
# choose_dataset = ["Apache", "BGL", "Hadoop", "HDFS", "HealthApp", "HPC", "Linux", "Mac", "OpenSSH", "OpenStack", "Proxifier", "Spark", "Thunderbird", "Zookeeper"]
# for name_dataset, dataset_setting in SETTING_PARAMS_TEST.items():
#     if name_dataset not in choose_dataset:
#         continue
#     print("="*40 + f" COMPARE {name_dataset} PA " + "="*40)
#     parsedresult = os.path.join(result_path_dir, name_dataset + "_structured.csv")
#     parsedresult = pd.read_csv(parsedresult, dtype=str)
#     parsedresult.fillna("", inplace=True)
    
#     truth_template = pd.read_csv(dataset_setting["log_structure"], dtype=str)
    
#     # correctly_parsed_messages = parsedresult[['EventTemplate']].eq(truth_template[['EventTemplate']]).values.sum()
#     mismatch_mask = ~parsedresult[['EventTemplate']].eq(truth_template[['EventTemplate']]).squeeze()
#     mismatch_indices = parsedresult.index[mismatch_mask].tolist()
#     for idx in mismatch_indices:
#         trust_template = truth_template.at[idx, 'EventTemplate']
#         parsed_template = parsedresult.at[idx, 'EventTemplate']
#         print(f"Row index: {idx}")
#         print(f"  Trust: {trust_template}")
#         print(f"  Parse: {parsed_template}")
#         print("-" * 40)
#     print("="*45 + " END " + "="*45 + "\n")

In [16]:
# Load the Proxifier log with its configured file path and log format
proxifier_setting = SETTING_PARAMS_TEST['Proxifier']
df = load_data(proxifier_setting['log_file'], proxifier_setting['log_format'])
# Write only the Content column to test.csv, without the index column
content_column = df['Content']
content_column.to_csv('test.csv', index=False)