#### **0. CHUẨN BỊ**

##### **0.1. CẤU HÌNH CÁC THAM SỐ**

In [1]:
import re
import sys
sys.path.append('../')
import os
import pandas as pd
from collections import Counter, defaultdict
import datetime
from tqdm import tqdm
import string
from copy import deepcopy

import hashlib
import time

pd.set_option('display.max_rows', None)



##### **0.2. CLASS**

In [2]:
class LogCluster:
    """A group of log lines that share one template.

    Attributes are stored exactly as passed in:
      keyGroup    -- identifier of the group
      logTemplate -- template string of the group
      tokens      -- template tokens
      length      -- token count supplied by the caller
      logIDL      -- ids of the member log lines (a fresh empty list when omitted)
    """

    def __init__(self, keyGroup, logTemplate, tokens, length, logIDL=None):
        if logIDL is None:
            # Give every instance its own list instead of sharing a default.
            logIDL = []
        self.keyGroup = keyGroup
        self.logTemplate = logTemplate
        self.tokens = tokens
        self.length = length
        self.logIDL = logIDL

    def __str__(self):
        """Return a multi-line summary, one "label: value" pair per line."""
        summary = (
            ("Key", self.keyGroup),
            ("Template", self.logTemplate),
            ("Tokens", self.tokens),
            ("Length", self.length),
            ("Len LogIDs", len(self.logIDL)),
        )
        return "".join(f"{label}: {value}\n" for label, value in summary)

In [3]:
# =============================== READ DATA =============================== #
def log_to_dataframe(log_file, regex, headers):
    """Convert a raw log file into a DataFrame.

    Each line is stripped and searched with `regex`; for a matching line the
    named groups listed in `headers` become one row. Lines that do not match
    are skipped.

    Args:
        log_file: path of the UTF-8 encoded log file.
        regex: compiled pattern exposing one named group per header.
        headers: group names to extract, in column order.

    Returns:
        DataFrame with a leading 1-based 'LineId' column (numbering only the
        lines that matched) followed by one column per header.
    """
    log_messages = []
    with open(log_file, 'r', encoding="utf8") as fin:
        # Iterate the handle directly so the file is streamed, not loaded whole.
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Unparseable line: skip it, but let real errors (for example a
                # header that is not a group of `regex`) surface.
                continue
            log_messages.append([match.group(header) for header in headers])

    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

def generate_logformat_regex(logformat):
    """Build a compiled regex from a log-format expression.

    Example format: '<Date> <Time> <Pid> <Level> <Component>: <Content>'.
    Every `<Header>` becomes a lazy named group; the literal text between
    headers is kept as regex source, except that each run of spaces is
    replaced by `\\s+`.

    Args:
        logformat: format expression mixing `<Header>` fields and literal
            (regex) text.

    Returns:
        (headers, regex): the header names in order of appearance and the
        compiled pattern anchored with `^` and `$`.
    """
    headers = []
    regex_parts = []
    # re.split with a capturing group alternates literal text (even indexes)
    # and `<Header>` fields (odd indexes).
    for k, piece in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if k % 2 == 0:
            # Raw replacement template: `\\` yields one backslash, so each run
            # of spaces becomes the two characters `\s` followed by `+`.
            regex_parts.append(re.sub(' +', r'\\s+', piece))
        else:
            header = piece.strip('<').strip('>')
            regex_parts.append('(?P<%s>.*?)' % header)
            headers.append(header)
    return headers, re.compile('^' + ''.join(regex_parts) + '$')

def load_data(logfile, logformat):
    """Return the DataFrame parsed from `logfile` according to `logformat`.

    The format expression is first turned into (headers, regex) by
    generate_logformat_regex, then handed to log_to_dataframe.
    """
    headers, compiled_regex = generate_logformat_regex(logformat)
    return log_to_dataframe(logfile, compiled_regex, headers)

In [4]:
def hasNumbers(s):
    """Return True when at least one character of `s` satisfies str.isdigit()."""
    for ch in s:
        if ch.isdigit():
            return True
    return False

def isDynamicToken(token: str, special_tokens: set) -> bool:
    """Decide whether a whole token is dynamic (a variable value).

    A token is dynamic when its lower-cased form is in `special_tokens`, or
    when the entire token matches one of these shapes:
      * a `0x`-prefixed hex literal,
      * a run of at least 8 hex digits,
      * a decimal number, optionally negative and/or with a fractional part.
    """
    if token.lower() in special_tokens:
        return True

    whole_token_shapes = (
        r'0x[0-9a-fA-F]+',
        r'[0-9a-fA-F]{8,}',
        r'-?\d+(\.\d+)?',
    )
    for shape in whole_token_shapes:
        if re.fullmatch(shape, token):
            return True
    return False

def isDynamicSubTok(token, special_tokens):
    """Decide whether a sub-token (a piece of a split token) is dynamic.

    Returns True when the lower-cased sub-token is in `special_tokens`, or
    when any of the following holds (checked in this order):
      * it contains a `0x`-prefixed hex literal,
      * it is entirely a decimal number (optional sign / fraction),
      * it contains digits immediately followed by letters,
      * it contains no ASCII letter at all,
      * it contains three or more digits anywhere.
    """
    if token.lower() in special_tokens:
        return True

    return bool(
        re.search(r'0x[0-9a-fA-F]+', token)
        or re.fullmatch(r'-?\d+(\.\d+)?', token)
        or re.search(r'[a-zA-Z]*\d+[a-zA-Z]+', token)
        or not re.search(r'[a-zA-Z]', token)
        or len(re.findall(r'\d', token)) >= 3
    )



def splitSpecialTok(s, seps, special_tokens=[]):
    """Split `s` on the separators in `seps` and mask its dynamic pieces.

    Returns a 2-tuple whose shape depends on whether `s` contains a separator:
      * no separator: ("<*>" if the whole string is dynamic else `s`, None);
      * otherwise: (pieces, masked) where `pieces` are the non-blank parts of
        `s` (separators included) and `masked` is the same list with every
        dynamic non-separator piece replaced by "<*>".
    """
    sep_regex = '|'.join(re.escape(sep) for sep in seps)
    lowered_specials = {tok.lower() for tok in special_tokens}

    if re.search(sep_regex, s) is None:
        # Nothing to split on: classify the token as a whole.
        return ("<*>" if isDynamicSubTok(s, lowered_specials) else s), None

    # The capturing group keeps the separators themselves in the result.
    pieces = [part for part in re.split(f'({sep_regex})', s) if part.strip() != '']
    masked = [
        part if (part in seps or not isDynamicSubTok(part, lowered_specials)) else "<*>"
        for part in pieces
    ]
    return pieces, masked

def mergeSpecialTok(token_str, seps):
    """Collapse neighbouring "<*>" wildcards, then split the string into pieces.

    Two wildcards are merged into one when they are adjacent or separated only
    by one or more repetitions of the same separator (for example
    "<*>::<*>" -> "<*>"). Merging repeats until the string stops changing.
    The result is then split on the separators, with "<*>" kept intact.

    Args:
        token_str: string that may contain "<*>" wildcards and separators.
        seps: iterable of separator strings; may be empty.

    Returns:
        List of non-blank pieces (separators included). With no separators the
        merged string is returned as a single piece.
    """
    sep_pattern = '|'.join(re.escape(sep) for sep in seps)

    prev = None
    while token_str != prev:
        prev = token_str
        # <*> + (repetitions of one separator) + <*>  ->  <*>
        token_str = re.sub(rf'(<\*>)(({sep_pattern})\3*)(<\*>)', r'<*>', token_str)
        # Runs of directly adjacent wildcards -> a single wildcard.
        token_str = re.sub(r'(<\*>)+', r'<*>', token_str)

    if not sep_pattern:
        # An empty alternation would make re.split cut between every
        # character (and shred the placeholder), so there is nothing to split.
        return [token_str] if token_str.strip() != '' else []

    # Shield the wildcard so separators such as '<', '*' or '>' cannot split it.
    placeholder = "__WILDCARD__"
    pieces = re.split(f'({sep_pattern})', token_str.replace("<*>", placeholder))
    return [piece.replace(placeholder, "<*>") for piece in pieces if piece.strip() != '']

def processLine(line, regexs, punctuationL, special_set=set(), merge_special=False):
    """Tokenise one log row and derive its grouping key and provisional template.

    Args:
        line: row (mapping) providing a "Content" entry.
        regexs: iterable of [pattern] or [pattern, replacement] rules applied
            in order to every non-dynamic token; the replacement defaults
            to "<*>".
        punctuationL: separators used to split tokens that contain digits.
        special_set: lower-cased words treated as dynamic at token level.
        merge_special: accepted but not used by this function.

    Returns:
        pd.Series with GroupTemplate (md5 of the grouping string), GroupTokens,
        idxDynamicTok, DynamicTokList, StaticTokList and EventTemplate.
    """
    WILDCARD = "<*>"

    masked_tokens = []      # one entry per token, split tokens shown as "<*>"
    split_positions = []    # indexes of tokens that were split into sub-tokens
    split_pieces = []       # raw sub-tokens of each split token
    split_masked = []       # the same sub-tokens with dynamic ones masked

    # str.split() without arguments already ignores leading/trailing whitespace.
    for position, current in enumerate(str(line["Content"]).split()):
        if isDynamicToken(current, special_set):
            masked_tokens.append(WILDCARD)
            continue

        # 1. Apply the dataset-specific token rules in order.
        for pattern, *rest in regexs:
            substitute = rest[0] if rest else WILDCARD
            current = re.sub(pattern, substitute, current)

        # 2. Only tokens that still contain a digit go through sub-token splitting.
        if hasNumbers(current):
            # NOTE(review): special_set is not forwarded here, so sub-tokens are
            # checked against an empty special-word set — confirm this is intended.
            pieces, masked_pieces = splitSpecialTok(current, punctuationL)
            if masked_pieces is None:
                # No separator inside the token: `pieces` is the final token.
                masked_tokens.append(pieces)
            else:
                masked_tokens.append(WILDCARD)
                split_positions.append(position)
                split_pieces.append(pieces)
                split_masked.append(list(masked_pieces))
        else:
            masked_tokens.append(current)

    # Template tokens: split tokens are shown with their masked sub-tokens glued back.
    template_tokens = list(masked_tokens)
    for position, masked_pieces in zip(split_positions, split_masked):
        template_tokens[position] = "".join(masked_pieces)

    event_template = " ".join(template_tokens)
    positions_field = " ".join(str(position) for position in split_positions)
    widths_field = " ".join(str(len(masked_pieces)) for masked_pieces in split_masked)
    group_key = f"{event_template} : {len(template_tokens)} : {positions_field} : {widths_field}"

    return pd.Series({
        'GroupTemplate': hashlib.md5(group_key.encode('utf-8')).hexdigest(),
        'GroupTokens': masked_tokens,
        'idxDynamicTok': split_positions,
        'DynamicTokList': split_pieces,
        'StaticTokList': split_masked,
        'EventTemplate': f"{event_template}"
    })
    
def regexAndCreateDf(datasets, DICT_SPECIAL_TOKEN=['true', 'false'], punctuationL = set('(),<>:;{}[]~=')):
    """Load one dataset and run token-level preprocessing on every row.

    Args:
        datasets: settings dict providing 'log_file', 'log_format' and
            'token_regexs'.
        DICT_SPECIAL_TOKEN: words treated as dynamic tokens (compared
            lower-cased).
        punctuationL: separators used when splitting tokens into sub-tokens.

    Returns:
        A copy of the loaded DataFrame extended with the columns produced by
        processLine: GroupTemplate, GroupTokens, idxDynamicTok,
        DynamicTokList, StaticTokList and EventTemplate.
    """
    parse_df = load_data(datasets['log_file'], datasets['log_format']).copy()

    # Pre-create the result columns; they are overwritten below with the
    # values returned by processLine.
    n_rows = len(parse_df)
    parse_df['GroupTemplate'] = ""                      # key used for grouping
    for list_column in ('GroupTokens', 'idxDynamicTok', 'DynamicTokList', 'StaticTokList'):
        parse_df[list_column] = [[] for _ in range(n_rows)]
    parse_df['EventTemplate'] = ""                      # template after preprocessing

    tqdm.pandas(desc="Tiền xử lý ở mức TOKEN!")
    lowered_specials = {word.lower() for word in DICT_SPECIAL_TOKEN}

    def _process_row(row):
        return processLine(row, datasets['token_regexs'], punctuationL, lowered_specials)

    processed = parse_df.progress_apply(_process_row, axis=1)

    for column in processed.columns:
        parse_df[column] = processed[column]

    return parse_df

In [5]:
# ================================ TẠO CÁC NHÓM GROUP ================================ #
def splitSubToken(s, seps):
    """Split `s` into sub-tokens on the separators in `seps`, keeping "<*>" intact.

    Args:
        s: token to split; may contain "<*>" wildcards.
        seps: iterable of separator strings; may be empty.

    Returns:
        List of non-blank pieces with the separators included as their own
        items. When `seps` is empty or `s` contains no separator, `[s]` is
        returned unchanged.
    """
    # Shield the wildcard so separators such as '<', '*' or '>' cannot split it.
    placeholder = "__WILDCARD__"
    shielded = s.replace("<*>", placeholder)

    sep_pattern = '|'.join(re.escape(sep) for sep in seps)
    # An empty alternation matches everywhere and would split between every
    # character, so treat "no separators" the same as "no separator found".
    if not sep_pattern or not re.search(sep_pattern, shielded):
        # Return the original text, not the shielded one, so the placeholder
        # never leaks to the caller.
        return [s]

    pieces = re.split(f'({sep_pattern})', shielded)
    return [piece.replace(placeholder, "<*>") for piece in pieces if piece.strip() != '']

def createGroupClust(parse_df, punctuationL, merge_special=False):
    """Build one LogCluster per distinct GroupTemplate value.

    For every group the tokens of its first row are the starting template.
    Each token position listed in that row's idxDynamicTok is rebuilt from the
    StaticTokList of all rows in the group: a sub-token position that holds
    more than one distinct value across the group becomes "<*>", otherwise the
    shared value is kept.

    Args:
        parse_df: DataFrame with GroupTemplate, GroupTokens, idxDynamicTok and
            StaticTokList columns.
        punctuationL: separators forwarded to mergeSpecialTok.
        merge_special: when True, neighbouring wildcards inside a rebuilt
            token are collapsed with mergeSpecialTok.

    Returns:
        List of LogCluster objects; `logIDL` holds the DataFrame index labels
        of the rows in the group. The number of groups is printed.
    """
    log_clusters_list = []

    unique_groups = parse_df.groupby("GroupTemplate")
    print(len(unique_groups))  # number of groups before merging

    for _, group_val in unique_groups:
        first_row = group_val.iloc[0]
        # Copy: the Series holds a reference to the list stored in parse_df,
        # and the assignments below must not alter that cell.
        tokens = list(first_row['GroupTokens'])

        if len(first_row["idxDynamicTok"]) != 0:
            group_staticL = group_val['StaticTokList'].to_list()

            for idx in range(len(group_staticL[0])):
                # Transpose so each item holds, for one sub-token position,
                # the values seen across every row of the group.
                sub_columns = zip(*[row_static[idx] for row_static in group_staticL])
                rebuilt = "".join(
                    "<*>" if len(set(column)) > 1 else column[0]
                    for column in sub_columns
                )

                if merge_special:
                    # mergeSpecialTok returns a list of pieces; glue them back
                    # into one token so the template join below gets strings.
                    rebuilt = "".join(mergeSpecialTok(rebuilt, punctuationL))
                tokens[first_row["idxDynamicTok"][idx]] = rebuilt

        logTemplate = " ".join(tokens)

        log_clusters_list.append(LogCluster(
            keyGroup=hashlib.md5(logTemplate.encode('utf-8')).hexdigest(),
            logTemplate=logTemplate,
            tokens=tokens,
            length=len(tokens),
            logIDL=group_val.index.tolist(),
        ))

    return log_clusters_list

In [6]:
# ===================== TẠO CLASS ===================== #
class MergeGroupTemplate:
    """Merge log clusters whose templates are similar, in the spirit of Drain.

    Clusters are bucketed by token count, greedily grouped by token-wise
    similarity against the first member of each group, and every group of
    more than one cluster is generalised into shared templates.

    Attributes:
        ST: similarity threshold a cluster must reach to join a group.
        N_MERGE: minimum number of distinct tokens at one position before
            that position is generalised at sub-token level.
        TEMPLATE_GR: current list of clusters (replaced by mergeGroup).
        punctuationL: separators used for sub-token splitting.
    """

    # The shared default set for punctuationL is never mutated by this class.
    def __init__(self, st=0.6, n_merge=3, template_gr=None, punctuationL=set()):
        self.ST = st
        self.N_MERGE = n_merge
        self.TEMPLATE_GR = template_gr if template_gr is not None else []
        self.punctuationL = punctuationL

    def similarySeq(self, seq1, seq2):
        """Compare two equal-length token sequences.

        Returns:
            (similarity, numOfPar): the fraction of positions where both
            tokens are equal and `seq1` is not "<*>", and the number of "<*>"
            tokens in `seq1`. Two empty sequences are reported as (1.0, 0).
        """
        assert len(seq1) == len(seq2)
        if len(seq1) == 0:
            # Avoid dividing by zero: two empty sequences are identical.
            return 1.0, 0

        simTokens = 0
        numOfPar = 0
        for token1, token2 in zip(seq1, seq2):
            if token1 == "<*>":
                numOfPar += 1
            elif token1 == token2:
                simTokens += 1

        return simTokens / len(seq1), numOfPar

    def fastMatchCLuster(self, seqGroupL, seq):
        """Return the group most similar to `seq`, or None below the threshold.

        Every group is represented by its first member. Ties on similarity
        are broken in favour of the representative with more "<*>" tokens;
        remaining ties keep the earliest group.
        """
        maxSim = -1
        maxNumOfPara = -1
        maxGroup = None

        for gr in seqGroupL:
            curSim, curNumOfPara = self.similarySeq(gr[0].tokens, seq.tokens)
            if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
                maxSim = curSim
                maxNumOfPara = curNumOfPara
                maxGroup = gr

        return maxGroup if maxSim >= self.ST else None

    def findGeneralToken(self, strings):
        """Return a member of `strings` that covers all the others, else None.

        A candidate covers another string when, reading each "<*>" in the
        candidate as "any run of characters", it matches that string entirely.
        """
        def wildcard2Regex(pattern_str):
            # Escape the literal parts and join them with `.*` for each wildcard.
            return '.*'.join(re.escape(part) for part in pattern_str.split('<*>'))

        strings = list(strings)
        for candidate in strings:
            regex = wildcard2Regex(candidate)
            if all(re.fullmatch(regex, s) for s in strings if s != candidate):
                return candidate
        return None

    def _generalizeColumn(self, unique_token):
        """Return the replacement for one token position, or None to keep it as is."""
        if "<*>" in unique_token:
            return "<*>"

        common_token = self.findGeneralToken(unique_token)
        if common_token is not None:
            return common_token

        if len(unique_token) < self.N_MERGE:
            # Too few distinct values to justify generalising this position.
            return None

        sub_tokensL = [splitSubToken(token, self.punctuationL) for token in unique_token]
        if len({len(sub_tokens) for sub_tokens in sub_tokensL}) > 1:
            # Different sub-token counts: the whole position becomes a wildcard.
            return "<*>"

        merged = "".join(
            "<*>" if len(set(column)) > 1 else column[0]
            for column in zip(*sub_tokensL)
        )
        while "<*><*>" in merged:
            merged = merged.replace("<*><*>", "<*>")
        return merged

    def generalizeGroup(self, group):
        """Generalise the differing token positions of `group` and fold duplicates.

        Every position that holds more than one distinct token across the
        group is passed to _generalizeColumn. The `tokens` and `logTemplate`
        of each cluster are rewritten in place (`keyGroup` is left unchanged);
        clusters that end up with identical tokens are folded into the first
        one, which receives the concatenated `logIDL`.

        Returns:
            List of the surviving clusters, one per distinct token sequence.
        """
        mask_positions = {}
        for idx, col in enumerate(zip(*[cluster.tokens for cluster in group])):
            unique_token = set(col)
            if len(unique_token) > 1:
                replacement = self._generalizeColumn(unique_token)
                if replacement is not None:
                    mask_positions[idx] = replacement

        # Apply the generalised positions to every cluster.
        for seq in group:
            seq.tokens = [mask_positions.get(i, token) for i, token in enumerate(seq.tokens)]
            seq.logTemplate = " ".join(seq.tokens)

        # Regroup the clusters by their (now possibly identical) token sequence.
        pattern_dict = defaultdict(list)
        for seq in group:
            pattern_dict[tuple(seq.tokens)].append(seq)

        result = []
        for values in pattern_dict.values():
            if len(values) != 1:
                logIDL = []
                for cluster in values:
                    logIDL.extend(cluster.logIDL)
                values[0].logIDL = logIDL
            result.append(values[0])
        return result

    def mergeGroup(self, printL=False):
        """Merge the clusters in TEMPLATE_GR and store the result back.

        Args:
            printL: when True, print the merged templates with printList.

        Returns:
            The new list of clusters (also assigned to TEMPLATE_GR).
        """
        grouped_by_length = defaultdict(list)
        for cluster in self.TEMPLATE_GR:
            grouped_by_length[cluster.length].append(cluster)

        newClusterGroupsL = []

        # Only templates with the same token count are compared.
        for groups_len in grouped_by_length.values():
            groupsSimTemL = []
            for log_clust in groups_len:
                matched_gr = self.fastMatchCLuster(groupsSimTemL, log_clust)
                if matched_gr is not None:
                    matched_gr.append(log_clust)
                else:
                    groupsSimTemL.append([log_clust])

            for group in groupsSimTemL:
                if len(group) == 1:
                    newClusterGroupsL.extend(group)
                else:
                    newClusterGroupsL.extend(self.generalizeGroup(group))

        self.TEMPLATE_GR = newClusterGroupsL

        if printL:
            self.printList()

        return newClusterGroupsL

    def printList(self):
        """Print the cluster count, then each template sorted by (length, template)."""
        print(len(self.TEMPLATE_GR))

        sorted_list = sorted(self.TEMPLATE_GR, key=lambda log: (log.length, log.logTemplate))
        for e in sorted_list:
            print(f"{e.length:3} {e.logTemplate}")

In [7]:
# Per-dataset settings. Each entry provides:
#   log_file      -- raw log to parse
#   log_template  -- CSV of reference templates
#   log_structure -- reference structured CSV
#   log_format    -- '<Header>' layout; the text between headers is regex source
#   token_regexs  -- [pattern, replacement] rules applied to every token
# Strings containing backslashes are raw literals so that sequences such as
# `\[` or `\w` are not parsed as (invalid) string escapes.
SETTING_PARAMS_TEST = {
    'Apache': {
        'log_file': './logs2k/Apache/Apache_2k.log',
        'log_template': './logs2k/Apache/Apache_2k.log_templates.csv',
        'log_structure': './logs2k/Apache/Apache_2k.log_structured_corrected.csv',
        'log_format': r'\[<Time>\] \[<Level>\] <Content>',
        'token_regexs': [
            [r'\/(?:\w+\/){2,}\w+\.\w+$', "<*>"],
            [r'\/(?:[^\/\s]+\/)*[^\/\s]*', "<*>"],
            [r'(?:[0-9a-fA-F]{2,}:){3,}[0-9a-fA-F]{2,}', "<*>"],
        ],
        'subToken_regexs': [

        ],
    },
    'BGL': {
        'log_file': './logs2k/BGL/BGL_2k.log',
        'log_template': './logs2k/BGL/BGL_2k.log_templates.csv',
        'log_structure': './logs2k/BGL/BGL_2k.log_structured_corrected.csv',
        'log_format': '<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>',
        'token_regexs': [
            [r"core\.\d+", "core.<*>"],
            [r'(?:[0-9a-fA-F]{2,}:){3,}[0-9a-fA-F]{2,}', "<*>"],
            [r'(\.{2,})\d+', r'\1<*>']
        ],
        'subToken_regexs':[

        ],
    },
    'Hadoop': {
        'log_file': './logs2k/Hadoop/Hadoop_2k.log',
        'log_template': './logs2k/Hadoop/Hadoop_2k.log_templates.csv',
        'log_structure': './logs2k/Hadoop/Hadoop_2k.log_structured_corrected.csv',
        'log_format': r'<Date> <Time> <Level> \[<Process>\] <Component>: <Content>',
        'token_regexs': [
            [r'\[.*?(_.*?)+\]', "<*>"],
        ],
    },
    'HDFS': {
        'log_file': './logs2k/HDFS/HDFS_2k.log',
        'log_template': './logs2k/HDFS/HDFS_2k.log_templates.csv',
        'log_structure': './logs2k/HDFS/HDFS_2k.log_structured_corrected.csv',
        'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
        'token_regexs': [
            [r'blk_-?\d+', "<*>"],
            [r'[/]?(\d+\.){3}\d+(:\d+)?', "<*>"],
        ],
    },
    'HealthApp':{
        'log_file': './logs2k/HealthApp/HealthApp_2k.log',
        'log_template': './logs2k/HealthApp/HealthApp_2k.log_templates.csv',
        'log_structure': './logs2k/HealthApp/HealthApp_2k.log_structured_corrected.csv',
        'log_format': r'<Time>\|<Component>\|<Pid>\|<Content>',
        'token_regexs': [],
    },
    'HPC':{
        'log_file': './logs2k/HPC/HPC_2k.log',
        'log_template': './logs2k/HPC/HPC_2k.log_templates.csv',
        'log_structure': './logs2k/HPC/HPC_2k.log_structured_corrected.csv',
        'log_format': '<LogId> <Node> <Component> <State> <Time> <Flag> <Content>',
        'token_regexs': [
            [r'=\d+', "<*>"],
        ],
    },
    'Linux': {
        'log_file': './logs2k/Linux/Linux_2k.log',
        'log_template': './logs2k/Linux/Linux_2k.log_templates.csv',
        'log_structure': './logs2k/Linux/Linux_2k.log_structured_corrected.csv',
        'log_format': r'<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>',
        'token_regexs': [
            [r'(\d+\.){3}\d+', "<*>"],
            [r'\d{2}:\d{2}:\d{2}', "<*>"],
        ],
    },
    'Mac': {
        'log_file': './logs2k/Mac/Mac_2k.log',
        'log_template': './logs2k/Mac/Mac_2k.log_templates.csv',
        'log_structure': './logs2k/Mac/Mac_2k.log_structured_corrected.csv',
        'log_format': r'<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'token_regexs': [
            [r'([\w-]+\.){2,}[\w-]+', "<*>"],
            [r'https?:\/\/(?:[^\/\s]+\/?)*', "<*>"],
            [r'\S*\/(?:[^\/\s]+\/){1,}[^\/\s]*', "<*>"],
        ],
    },
    'OpenSSH': {
        'log_file': './logs2k/OpenSSH/OpenSSH_2k.log',
        'log_template': './logs2k/OpenSSH/OpenSSH_2k.log_templates.csv',
        'log_structure': './logs2k/OpenSSH/OpenSSH_2k.log_structured_corrected.csv',
        'log_format': r'<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>',
        'token_regexs': [
            [r"(\d+):", "<*>"],
        ],
    },
    'OpenStack': {
        'log_file': './logs2k/OpenStack/OpenStack_2k.log',
        'log_template': './logs2k/OpenStack/OpenStack_2k.log_templates.csv',
        'log_structure': './logs2k/OpenStack/OpenStack_2k.log_structured_corrected.csv',
        'log_format': r'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>',
        'token_regexs': [
            [r"(\w+-\w+-\w+-\w+-\w+)", "<*>"],
            [r'HTTP\/\d+\.\d+', "<*>"],
        ],
    },
    'Proxifier': {
        'log_file': './logs2k/Proxifier/Proxifier_2k.log',
        'log_template': './logs2k/Proxifier/Proxifier_2k.log_templates.csv',
        'log_structure': './logs2k/Proxifier/Proxifier_2k.log_structured_corrected.csv',
        'log_format': r'\[<Time>\] <Program> - <Content>',
        'token_regexs': [
            [r'<\d+\ssec', "<*>"],
            [r'([\w-]+\.)+[\w-]+(:\d+)?', "<*>"],
            [r'\d{2}:\d{2}(:\d{2})*', "<*>"],
            [r'[KGTM]B', "<*>"],
        ],
    },
    'Spark': {
        'log_file': './logs2k/Spark/Spark_2k.log',
        'log_template': './logs2k/Spark/Spark_2k.log_templates.csv',
        'log_structure': './logs2k/Spark/Spark_2k.log_structured_corrected.csv',
        'log_format': '<Date> <Time> <Level> <Component>: <Content>',
        'token_regexs': [],
    },
    'Thunderbird': {
        'log_file': './logs2k/Thunderbird/Thunderbird_2k.log',
        'log_template': './logs2k/Thunderbird/Thunderbird_2k.log_templates.csv',
        'log_structure': './logs2k/Thunderbird/Thunderbird_2k.log_structured_corrected.csv',
        'log_format': r'<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>',
        'token_regexs': [
            [r'(\d+\.){3}\d+', "<*>"],
        ],
    },
    'Zookeeper': {
        'log_file': './logs2k/Zookeeper/Zookeeper_2k.log',
        'log_template': './logs2k/Zookeeper/Zookeeper_2k.log_templates.csv',
        'log_structure': './logs2k/Zookeeper/Zookeeper_2k.log_structured_corrected.csv',
        'log_format': r'<Date> <Time> - <Level>  \[<Node>:<Component>@<Id>\] - <Content>',
        'filters': [],
        'token_regexs': [
            [r"(/|)(\d+\.){3}\d+(:\d+)?", "<*>"],
        ],
    },
}

In [8]:
DICT_SPECIAL_TOKEN = ['true', 'false', 'null', 'root']
PUNCTUATIONL = '(){}[]=:;,#$'

N_MERGE = 4
ST = 0.6

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first dataset is written.
OUTPUT_DIR = "./res/DrainDS/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

punctuation_set = set(PUNCTUATIONL)

for name_dataset, dataset in SETTING_PARAMS_TEST.items():
    print('\n================ Processing on %s =====================' % name_dataset)
    start_time = time.time()

    # Token and sub-token preprocessing.
    parse_df = regexAndCreateDf(dataset, DICT_SPECIAL_TOKEN=DICT_SPECIAL_TOKEN, punctuationL=punctuation_set)

    # One cluster per distinct group key.
    log_clusters_list = createGroupClust(parse_df, punctuation_set)

    # Merge similar templates, following the same idea as Drain.
    merge_group = MergeGroupTemplate(st=ST, n_merge=N_MERGE, template_gr=log_clusters_list, punctuationL=punctuation_set)
    new_groupL = merge_group.mergeGroup(printL=False)
    print(f"Số lượng nhóm sau khi merge: {len(new_groupL)}")

    # Write the merged template back to every member row and save the result.
    for item in merge_group.TEMPLATE_GR:
        parse_df.loc[item.logIDL, "EventTemplate"] = item.logTemplate
    parse_df.to_csv(os.path.join(OUTPUT_DIR, name_dataset+"_structured.csv"), index=False)
    elapsed_time = time.time() - start_time
    print(f"Hoàn thành xong {name_dataset}: ", elapsed_time)
    print("-"*80)

    # Number of distinct templates in the reference file, for comparison.
    structured_df = pd.read_csv(dataset['log_structure'])
    unique_templates = structured_df['EventTemplate'].unique()
    print(len(unique_templates))




Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 5938.72it/s]


6
Số lượng nhóm sau khi merge: 6
Hoàn thành xong Apache:  0.39594364166259766
--------------------------------------------------------------------------------
6



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 5502.26it/s]


159
Số lượng nhóm sau khi merge: 123
Hoàn thành xong BGL:  0.4569060802459717
--------------------------------------------------------------------------------
120



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6173.16it/s]


133
Số lượng nhóm sau khi merge: 116
Hoàn thành xong Hadoop:  0.40076327323913574
--------------------------------------------------------------------------------
114



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6042.14it/s]


80
Số lượng nhóm sau khi merge: 16
Hoàn thành xong HDFS:  0.3757791519165039
--------------------------------------------------------------------------------
14



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6598.55it/s]


75
Số lượng nhóm sau khi merge: 75
Hoàn thành xong HealthApp:  0.36646032333374023
--------------------------------------------------------------------------------
75



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6727.66it/s] 


126
Số lượng nhóm sau khi merge: 47
Hoàn thành xong HPC:  0.3482792377471924
--------------------------------------------------------------------------------
46



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6834.01it/s]


151
Số lượng nhóm sau khi merge: 131
Hoàn thành xong Linux:  0.4028608798980713
--------------------------------------------------------------------------------
118



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 5956.49it/s]


369
Số lượng nhóm sau khi merge: 346
Hoàn thành xong Mac:  0.4900963306427002
--------------------------------------------------------------------------------
341



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6008.14it/s]


195
Số lượng nhóm sau khi merge: 26
Hoàn thành xong OpenSSH:  0.39605283737182617
--------------------------------------------------------------------------------
26



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6080.19it/s]


60
Số lượng nhóm sau khi merge: 43
Hoàn thành xong OpenStack:  0.3983323574066162
--------------------------------------------------------------------------------
43



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 5735.46it/s]


22
Số lượng nhóm sau khi merge: 15
Hoàn thành xong Proxifier:  0.3936328887939453
--------------------------------------------------------------------------------
8



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6309.57it/s]


110
Số lượng nhóm sau khi merge: 34
Hoàn thành xong Spark:  0.37720727920532227
--------------------------------------------------------------------------------
36



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6062.43it/s]


253
Số lượng nhóm sau khi merge: 171
Hoàn thành xong Thunderbird:  0.44397783279418945
--------------------------------------------------------------------------------
149



Tiền xử lý ở mức TOKEN!: 100%|██████████| 2000/2000 [00:00<00:00, 6311.65it/s]

57
Số lượng nhóm sau khi merge: 57
Hoàn thành xong Zookeeper:  0.36471056938171387
--------------------------------------------------------------------------------
50





#### **1.4. PHƯƠNG THỨC HỖ TRỢ NHÓM GROUP**

- **class `MergeGroupTemplate`**

#### **2. KHUNG LÀM VIỆC CHÍNH**

In [9]:
# def mainDrainDS(SETTING_PARAMS, DICT_SPECIAL_TOKEN=['true', 'false'], punctuationL = set(PUNCTUATIONL), N_MERGE=3, ST=0.6):
#     for name_dataset, dataset in SETTING_PARAMS.items():
#         print('\n================ Processing on %s =====================' % name_dataset)
#         start_time = time.time()
#         # ============== PROCESSING 0 ================ #
#         parse_df = regexAndCreateDf(dataset, DICT_SPECIAL_TOKEN=DICT_SPECIAL_TOKEN, punctuationL = set(PUNCTUATIONL))

#         # ============== CREATE GROUP =============== #
#         log_clusters_list = createGroupClust(parse_df, set(PUNCTUATIONL))

#         # ============== MERGE TEMPLATE ============= #
#         # Sử dụng ý tưởng giống như Drain, như sau:
#         merge_group = MergeGroupTemplate(st=ST, n_merge=N_MERGE, template_gr=log_clusters_list)
#         new_groupL = merge_group.mergeGroup(printL=False)
#         print("NUM of GROUP: ", len(new_groupL))

#         for item in merge_group.TEMPLATE_GR:
#             parse_df.loc[item.logIDL, "EventTemplate"] = item.logTemplate
#         parse_df.to_csv(os.path.join("./res/DrainDS/", name_dataset+"_structured.csv"), index=False)
#         elapsed_time = time.time() - start_time
#         print(f"Hoàn thành xong {name_dataset}: ", elapsed_time)
#         print("-"*80)
        
# mainDrainDS(SETTING_PARAMS_TEST, DICT_SPECIAL_TOKEN=['true', 'false', 'null', 'root'], punctuationL = set(PUNCTUATIONL), N_MERGE=3, ST=0.6)

In [10]:
# parse_df = pd.read_csv(os.path.join("./res/DrainDS/", "BGL_structured.csv"))
# structured_df = pd.read_csv(SETTING_PARAMS_TEST['BGL']['log_structure']) 
# unique_templates = structured_df['EventTemplate'].unique()

# template_compare = {}
# for template in unique_templates:
#     arr_index = structured_df[structured_df['EventTemplate'] == template].index.tolist()
#     parse_template_series = parse_df.loc[arr_index, 'EventTemplate']
#     parse_template_unique = parse_template_series.unique().tolist()
#     content_list = structured_df.loc[arr_index, 'Content'].tolist()
#     content_str = "[\n\t" + "\n\t".join(content_list) + "\n]"
#     staticTok_list = parse_df.loc[arr_index, 'StaticTokList'].tolist()
#     static_str = "[\n\t" + "\n\t".join(staticTok_list) + "\n]"

#     hash_key = hash(template)
#     template_compare[hash_key] = {
#         'ground_truth': template,
#         'parse': parse_template_unique,
#         'content_lst': content_str,
#         'static_str': static_str,
#         'index': arr_index,
#         'length': len(template.strip().split()),
#         'nums': len(arr_index),
#     }
    
# sorted_items = sorted(
#         template_compare.items(),
#         key=lambda item: (item[1]['length'], item[1]['ground_truth'])
#     )

# num_dif = 0
# for idx, (key, value) in enumerate(sorted_items, 1):
#     if len(value['parse']) != 1 or value['parse'][0] != value['ground_truth']:
#         num_dif += 1
#         print(f"No. {idx}")
#         print(f"Length: {value['length']}, Nums: {value['nums']}")
#         print(f"Ground truth  : {value['ground_truth']}")
#         print(f"Parse templs  : {value['parse']}")
#         print(f"Content List: {value['content_lst']}")
#         print(f"Static List: {value['static_str']}")
#         print(f"Length parse: {len(value['parse'])}")
#         print("-" * 40)
# print(f"Total differences found: {num_dif}")

##### **2.2. HOÀN CHỈNH**

In [11]:
from tqdm import tqdm

from evaluation.settings import benchmark_settings
from evaluation.utils.common import common_args
from evaluation.utils.evaluator_main import *
from evaluation.utils.postprocess import post_average

from evaluation.utils.GA_calculator import evaluate
from evaluation.utils.template_level_analysis import evaluate_template_level
from evaluation.utils.PA_calculator import calculate_parsing_accuracy

import importlib
import evaluation.utils.evaluator_main as evaluator_main
importlib.reload(evaluator_main)

# Benchmark every dataset in SETTING_PARAMS_TEST: compare the parser output in
# ./res/DrainDS/ with the ground-truth structured log and append one CSV row of
# metrics per dataset to the summary file in ./benchmark.
benchmark_dir = "./benchmark"
parsed_result_dir = "./res/DrainDS/"

# Remove the summary of a previous run so that old rows are not mixed in.
file_path = './benchmark/parsing_accuracy.csv'
if os.path.exists(file_path):
    os.remove(file_path)
# NOTE(review): prepare_results presumably (re)creates the summary file with its
# header and returns the file name - confirm in evaluation.utils.evaluator_main.
result_file = evaluator_main.prepare_results(output_dir=benchmark_dir)

# Registering `progress_apply` on pandas objects does not depend on the dataset,
# so it is done once instead of on every iteration.
tqdm.pandas()

for name_dataset, dataset_setting in SETTING_PARAMS_TEST.items():
    print('\n================ Evaluation on %s =====================' % name_dataset)
    groundtruth = pd.read_csv(dataset_setting["log_structure"], dtype=str)

    # Keep the path and the loaded frame under different names.
    parsed_path = os.path.join(parsed_result_dir, name_dataset + "_structured.csv")
    parsedresult = pd.read_csv(parsed_path, dtype=str).fillna("")

    print("Start to align with null values")
    groundtruth['EventTemplate'] = groundtruth.progress_apply(align_with_null_values, axis=1)
    # groundtruth['EventTemplate'] = groundtruth['EventTemplate'].map(correct_template_general)
    parsedresult['EventTemplate'] = parsedresult.progress_apply(align_with_null_values, axis=1)

    # No template filtering: every template takes part in the evaluation.
    filter_templates = None

    # =============== BENCHMARK GA =============== #
    start_time = time.time()
    GA, FGA = evaluate(groundtruth, parsedresult, filter_templates)
    GA_end_time = time.time() - start_time
#     print('Grouping Accuracy calculation done. [Time taken: {:.3f}]'.format(GA_end_time))

    # =============== BENCHMARK PA =============== #
    start_time = time.time()
    PA = calculate_parsing_accuracy(groundtruth, parsedresult, filter_templates)
    PA_end_time = time.time() - start_time
#     print('Parsing Accuracy calculation done. [Time taken: {:.3f}]'.format(PA_end_time))

    # =============== BENCHMARK TEMPLATE-LEVEL-ACCURACY =============== #
    start_time = time.time()
    identified_templates, ground_templates, FTA, PTA, RTA = evaluate_template_level(
        name_dataset, groundtruth, parsedresult, filter_templates
    )
    TA_end_time = time.time() - start_time
#     print('Template-level accuracy calculation done. [Time taken: {:.3f}]'.format(TA_end_time))

    # One CSV row: dataset name, the two template counts, then the six metrics
    # formatted with three decimals (same column order as before).
    metric_fields = ["{:.3f}".format(metric) for metric in (GA, PA, FGA, PTA, RTA, FTA)]
    result = ','.join(
        [name_dataset, str(identified_templates), str(ground_templates)] + metric_fields
    ) + '\n'

    with open(os.path.join(benchmark_dir, result_file), 'a') as summary_file:
        summary_file.write(result)

result_df = pd.read_csv(file_path)
print(result_df)


Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 98668.61it/s]
100%|██████████| 2000/2000 [00:00<00:00, 93752.60it/s]
100%|██████████| 6/6 [00:00<00:00, 2336.44it/s]


Grouping_Accuracy (GA): 1.0000, FGA: 1.0000,
Parsing_Accuracy (PA): 1.0000


100%|██████████| 6/6 [00:00<00:00, 6040.76it/s]


PTA: 1.0000, RTA: 1.0000 FTA: 1.0000
Identify : 6, Groundtruth : 6

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 52634.73it/s]
100%|██████████| 2000/2000 [00:00<00:00, 88724.21it/s]
100%|██████████| 120/120 [00:00<00:00, 3575.40it/s]


Grouping_Accuracy (GA): 0.9930, FGA: 0.9712,
Parsing_Accuracy (PA): 0.9850


100%|██████████| 123/123 [00:00<00:00, 31971.95it/s]


PTA: 0.9187, RTA: 0.9417 FTA: 0.9300
Identify : 123, Groundtruth : 120

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 62492.89it/s]
100%|██████████| 2000/2000 [00:00<00:00, 62835.54it/s]
100%|██████████| 114/114 [00:00<00:00, 3197.84it/s]


Grouping_Accuracy (GA): 0.9925, FGA: 0.9739,
Parsing_Accuracy (PA): 0.4050


100%|██████████| 116/116 [00:00<00:00, 38571.37it/s]


PTA: 0.7328, RTA: 0.7456 FTA: 0.7391
Identify : 116, Groundtruth : 114

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 83531.90it/s]
100%|██████████| 2000/2000 [00:00<00:00, 86436.83it/s]
100%|██████████| 14/14 [00:00<00:00, 2728.64it/s]


Grouping_Accuracy (GA): 0.9975, FGA: 0.8667,
Parsing_Accuracy (PA): 0.3690


100%|██████████| 16/16 [00:00<?, ?it/s]


PTA: 0.2500, RTA: 0.2857 FTA: 0.2667
Identify : 16, Groundtruth : 14

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 88582.74it/s]
100%|██████████| 2000/2000 [00:00<00:00, 112499.10it/s]
100%|██████████| 75/75 [00:00<00:00, 4440.36it/s]


Grouping_Accuracy (GA): 1.0000, FGA: 1.0000,
Parsing_Accuracy (PA): 0.8750


100%|██████████| 75/75 [00:00<00:00, 15006.10it/s]


PTA: 0.8933, RTA: 0.8933 FTA: 0.8933
Identify : 75, Groundtruth : 75

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 121665.72it/s]
100%|██████████| 2000/2000 [00:00<00:00, 94941.01it/s]
100%|██████████| 46/46 [00:00<00:00, 3250.36it/s]


Grouping_Accuracy (GA): 0.9255, FGA: 0.8172,
Parsing_Accuracy (PA): 0.7020


100%|██████████| 47/47 [00:00<?, ?it/s]


PTA: 0.5957, RTA: 0.6087 FTA: 0.6022
Identify : 47, Groundtruth : 46

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 68105.38it/s]
100%|██████████| 2000/2000 [00:00<00:00, 76709.87it/s]
100%|██████████| 119/119 [00:00<00:00, 3547.65it/s]


Grouping_Accuracy (GA): 0.5375, FGA: 0.8359,
Parsing_Accuracy (PA): 0.3540


100%|██████████| 137/137 [00:00<00:00, 25965.64it/s]


PTA: 0.6788, RTA: 0.7815 FTA: 0.7266
Identify : 137, Groundtruth : 119

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 32828.14it/s]
100%|██████████| 2000/2000 [00:00<00:00, 28457.86it/s]
100%|██████████| 341/341 [00:00<00:00, 3642.77it/s]


Grouping_Accuracy (GA): 0.9695, FGA: 0.9549,
Parsing_Accuracy (PA): 0.6365


100%|██████████| 346/346 [00:00<00:00, 74659.39it/s]


PTA: 0.6792, RTA: 0.6891 FTA: 0.6841
Identify : 346, Groundtruth : 341

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 82066.66it/s]
100%|██████████| 2000/2000 [00:00<00:00, 98616.41it/s]
100%|██████████| 26/26 [00:00<00:00, 3207.41it/s]


Grouping_Accuracy (GA): 1.0000, FGA: 1.0000,
Parsing_Accuracy (PA): 0.7625


100%|██████████| 26/26 [00:00<00:00, 25964.74it/s]


PTA: 0.6154, RTA: 0.6154 FTA: 0.6154
Identify : 26, Groundtruth : 26

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 63178.17it/s]
100%|██████████| 2000/2000 [00:00<00:00, 73731.74it/s]
100%|██████████| 43/43 [00:00<00:00, 4226.74it/s]


Grouping_Accuracy (GA): 0.4800, FGA: 0.8837,
Parsing_Accuracy (PA): 0.3935


100%|██████████| 43/43 [00:00<00:00, 21343.80it/s]


PTA: 0.6744, RTA: 0.6744 FTA: 0.6744
Identify : 43, Groundtruth : 43

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 79118.40it/s]
100%|██████████| 2000/2000 [00:00<00:00, 108058.84it/s]
100%|██████████| 8/8 [00:00<00:00, 2266.89it/s]


Grouping_Accuracy (GA): 0.5265, FGA: 0.6087,
Parsing_Accuracy (PA): 0.0000


100%|██████████| 15/15 [00:00<?, ?it/s]


PTA: 0.0000, RTA: 0.0000 FTA: 0.0000
Identify : 15, Groundtruth : 8

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 90245.69it/s]
100%|██████████| 2000/2000 [00:00<00:00, 101178.50it/s]
100%|██████████| 36/36 [00:00<00:00, 3658.98it/s]


Grouping_Accuracy (GA): 0.9230, FGA: 0.8286,
Parsing_Accuracy (PA): 0.7305


100%|██████████| 34/34 [00:00<?, ?it/s]


PTA: 0.5294, RTA: 0.5000 FTA: 0.5143
Identify : 34, Groundtruth : 36

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 67646.79it/s]
100%|██████████| 2000/2000 [00:00<00:00, 47986.45it/s]
100%|██████████| 149/149 [00:00<00:00, 3753.60it/s]


Grouping_Accuracy (GA): 0.9560, FGA: 0.8125,
Parsing_Accuracy (PA): 0.8955


100%|██████████| 171/171 [00:00<00:00, 26526.59it/s]


PTA: 0.5731, RTA: 0.6577 FTA: 0.6125
Identify : 171, Groundtruth : 149

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 85831.01it/s]
100%|██████████| 2000/2000 [00:00<00:00, 101277.44it/s]
100%|██████████| 50/50 [00:00<00:00, 3599.33it/s]


Grouping_Accuracy (GA): 0.9945, FGA: 0.9159,
Parsing_Accuracy (PA): 0.6765


100%|██████████| 57/57 [00:00<00:00, 33493.32it/s]

PTA: 0.6316, RTA: 0.7200 FTA: 0.6729
Identify : 57, Groundtruth : 50
        Dataset  parse_gr  truth_gr     GA     PA    FGA    PTA    RTA    FTA
0        Apache         6         6  1.000  1.000  1.000  1.000  1.000  1.000
1           BGL       123       120  0.993  0.985  0.971  0.919  0.942  0.930
2        Hadoop       116       114  0.993  0.405  0.974  0.733  0.746  0.739
3          HDFS        16        14  0.998  0.369  0.867  0.250  0.286  0.267
4     HealthApp        75        75  1.000  0.875  1.000  0.893  0.893  0.893
5           HPC        47        46  0.925  0.702  0.817  0.596  0.609  0.602
6         Linux       137       119  0.537  0.354  0.836  0.679  0.782  0.727
7           Mac       346       341  0.970  0.636  0.955  0.679  0.689  0.684
8       OpenSSH        26        26  1.000  0.762  1.000  0.615  0.615  0.615
9     OpenStack        43        43  0.480  0.394  0.884  0.674  0.674  0.674
10    Proxifier        15         8  0.526  0.000  0.609  0.000  0.000  0




In [12]:
result_df = pd.read_csv("./benchmark/parsing_accuracy.csv")


def _summary_row(stats, label):
    """Round a per-column statistic to 3 decimals and label it as a table row."""
    row = stats.round(3)
    row['Dataset'] = label
    # The template-count columns are blanked out in the summary rows.
    for count_col in ('parse_gr', 'truth_gr'):
        row[count_col] = ''
    return row


# Only the numeric columns take part in the mean / standard deviation.
numeric_cols = result_df.select_dtypes(include='number').columns
numeric_part = result_df[numeric_cols]

avg_row = _summary_row(numeric_part.mean(), 'Average')
std_row = _summary_row(numeric_part.std(), 'Std')

# Append the two summary rows below the per-dataset rows.
summary_rows = pd.DataFrame([avg_row, std_row])
result_df = pd.concat([result_df, summary_rows], ignore_index=True)
print(result_df)

        Dataset parse_gr truth_gr     GA     PA    FGA    PTA    RTA    FTA
0        Apache        6        6  1.000  1.000  1.000  1.000  1.000  1.000
1           BGL      123      120  0.993  0.985  0.971  0.919  0.942  0.930
2        Hadoop      116      114  0.993  0.405  0.974  0.733  0.746  0.739
3          HDFS       16       14  0.998  0.369  0.867  0.250  0.286  0.267
4     HealthApp       75       75  1.000  0.875  1.000  0.893  0.893  0.893
5           HPC       47       46  0.925  0.702  0.817  0.596  0.609  0.602
6         Linux      137      119  0.537  0.354  0.836  0.679  0.782  0.727
7           Mac      346      341  0.970  0.636  0.955  0.679  0.689  0.684
8       OpenSSH       26       26  1.000  0.762  1.000  0.615  0.615  0.615
9     OpenStack       43       43  0.480  0.394  0.884  0.674  0.674  0.674
10    Proxifier       15        8  0.526  0.000  0.609  0.000  0.000  0.000
11        Spark       34       36  0.923  0.731  0.829  0.529  0.500  0.514
12  Thunderb

In [13]:
def compare_templates(datasets, parse_df):
    """Print every ground-truth template whose parsed templates differ from it.

    The ground-truth structured log is read from ``datasets['log_structure']``.
    For each distinct ground-truth ``EventTemplate`` the rows carrying it are
    looked up (by index label) in ``parse_df`` and the distinct parsed templates
    of those rows are collected. A template is reported when the parser produced
    more than one template for its rows, or a single one that is not equal to
    the ground truth. Reports are ordered by token count of the ground-truth
    template, then by the template text, and show up to three sample contents.

    Args:
        datasets: mapping with a ``'log_structure'`` entry holding the path of
            the ground-truth CSV (needs ``EventTemplate`` and ``Content``).
        parse_df: parser output with an ``EventTemplate`` column; its index
            labels must cover the index of the ground-truth frame.

    Returns:
        None. Everything is written to stdout.
    """
    # Read everything as text and blank out missing cells, the same way the
    # parsed result is loaded by the callers in this notebook; otherwise a
    # missing Content/EventTemplate arrives as float NaN and breaks the string
    # handling below.
    structured_df = pd.read_csv(datasets['log_structure'], dtype=str).fillna("")
    print(f"SHAPE: {structured_df.shape}")
    print(f"SHAPE PARSER: {parse_df.shape}")

    # A single grouping pass yields the row labels of every template, instead of
    # one boolean scan of the whole frame per template.
    rows_by_template = structured_df.groupby('EventTemplate', sort=False).groups
    print(f"Num of truth templates: {len(rows_by_template)}")

    template_compare = []
    for template, row_labels in rows_by_template.items():
        arr_index = row_labels.tolist()
        parse_template_unique = parse_df.loc[arr_index, 'EventTemplate'].unique().tolist()
        # Only the first three contents are kept as examples.
        content_list = structured_df.loc[arr_index[:3], 'Content'].tolist()
        content_str = "[\n\t" + "\n\t".join(content_list) + "\n]"

        template_compare.append({
            'ground_truth': template,
            'parse': parse_template_unique,
            'content_lst': content_str,
            'index': arr_index,
            'length': len(template.strip().split()),
            'nums': len(arr_index),
        })

    template_compare.sort(key=lambda entry: (entry['length'], entry['ground_truth']))

    num_dif = 0
    for idx, value in enumerate(template_compare, 1):
        if len(value['parse']) != 1 or value['parse'][0] != value['ground_truth']:
            num_dif += 1
            print(f"No. {idx}")
            print(f"Length: {value['length']}, Nums: {value['nums']}")
            print(f"Ground truth  : {value['ground_truth']}")
            print(f"Parse templs  : {value['parse']}")
            print(f"Content List: {value['content_lst']}")
            print(f"Length parse: {len(value['parse'])}")
            print("-" * 40)
    print(f"Total differences found: {num_dif}")
    
# Run the template comparison for every selected dataset, reading the parser
# output from result_path_dir.
result_path_dir = "./res/DrainDS/"
choose_dataset = ["Apache", "BGL", "Hadoop", "HDFS", "HealthApp", "HPC", "Linux", "Mac", "OpenSSH", "OpenStack", "Proxifier", "Spark", "Thunderbird", "Zookeeper"]
for name_dataset, dataset_setting in SETTING_PARAMS_TEST.items():
    if name_dataset not in choose_dataset:
        continue
    print("="*40 + f" COMPARE {name_dataset} " + "="*40)
    # Parser output is loaded as text with missing cells blanked out.
    parsed_path = os.path.join(result_path_dir, name_dataset + "_structured.csv")
    parsedresult = pd.read_csv(parsed_path, dtype=str).fillna("")

    # compare_templates reads the ground-truth file itself, so it is not
    # loaded a second time here.
    compare_templates(dataset_setting, parsedresult)
    print("="*45 + " END " + "="*45 + "\n")

SHAPE: (2000, 6)
SHAPE PARSER: (2000, 10)
Num of truth templates: 6
Total differences found: 0

SHAPE: (2000, 13)
SHAPE PARSER: (2000, 17)
Num of truth templates: 120
No. 23
Length: 3, Nums: 3
Ground truth  : problem state (<*>=sup,<*>=usr).......<*>
Parse templs  : ['problem state (<*>=sup,<*>=usr)<*>']
Content List: [
	problem state (0=sup,1=usr).......0
	problem state (0=sup,1=usr).......0
	problem state (0=sup,1=usr).......0
]
Length parse: 1
----------------------------------------
No. 47
Length: 5, Nums: 2
Ground truth  : guaranteed instruction cache block touch.<*>
Parse templs  : ['guaranteed instruction cache block touch.0']
Content List: [
	guaranteed instruction cache block touch.0
	guaranteed instruction cache block touch.0
]
Length parse: 1
----------------------------------------
No. 64
Length: 8, Nums: 9
Ground truth  : Lustre mount FAILED : <*> : point <*>
Parse templs  : ['Lustre mount FAILED : <*> : point /p/gb1']
Content List: [
	Lustre mount FAILED : bglio559 : poin

In [14]:
import re

def find_general_pattern(strings):
    """Return a string of the collection that, read as a wildcard pattern, covers all the others.

    Every ``<*>`` in a candidate stands for any run of characters (possibly
    empty, not spanning a newline); the rest of the candidate is literal text.
    A candidate qualifies when it matches, from start to end, every other
    distinct string of the collection.

    When several candidates qualify, the choice is deterministic: fewest
    wildcards first, then shortest, then lexicographic order. It therefore does
    not depend on the iteration order of the input (which for a set changes
    between interpreter runs).

    Args:
        strings: iterable of strings (for example a set of templates).

    Returns:
        The selected candidate, or ``None`` when no string covers all the
        others (including when ``strings`` is empty).
    """
    def wildcard_to_regex(pattern_str):
        # Split on the wildcard and escape each literal part, so that regex
        # metacharacters in the template are matched literally.
        parts = pattern_str.split('<*>')
        return '.*'.join(re.escape(p) for p in parts)

    distinct = set(strings)
    ordered_candidates = sorted(distinct, key=lambda s: (s.count('<*>'), len(s), s))

    for candidate in ordered_candidates:
        # Compile once per candidate; fullmatch already anchors both ends.
        matcher = re.compile(wildcard_to_regex(candidate))
        if all(matcher.fullmatch(other) for other in distinct if other != candidate):
            return candidate

    return None

# # Case with a general pattern
# s1 = {"<*>:", "hellod/asdsa/sd/:", "jbfb:", "dks:"}
# print(find_general_pattern(s1))

# # Case without a general pattern
# s2 = {"hellod/asdsa/sd/:", "jbfb:", "dks:"}
# print(find_general_pattern(s2))

# Case where a general pattern exists: "mane(<*>):" and "mane(<*><*>):" each
# match every other string in the set.
s2 = {"mane(<*>):", "mane():", "mane(123):", "mane(pwd):", "mane(123.das9):", "mane(<*><*>):"}
print(find_general_pattern(s2))


mane(<*>):
