#### **0. CHUẨN BỊ**

##### **0.1. CẤU HÌNH CÁC THAM SỐ**

In [None]:
import re
import sys
sys.path.append('../')
import os
import pandas as pd
from collections import Counter, defaultdict
import datetime
from tqdm import tqdm
import string
from copy import deepcopy

import hashlib
import time

pd.set_option('display.max_rows', None)

# Per-dataset configuration for the full log files.
#   log_file      : path to the raw log file.
#   log_template  : path to the ground-truth template CSV.
#   log_structure : path to the ground-truth structured CSV.
#   log_format    : header layout; each <Name> becomes a named capture group and
#                   the text between headers is used as regex (hence the escapes).
#   filters       : regexes whose matches are removed from the content.
#   regexs        : regexes whose matches are replaced by "<*>".
# Every pattern is a raw string so that regex escapes such as \[ or \w are not
# interpreted (and warned about) as invalid Python string escapes.
SETTING_PARAMS = {
    'Apache': {
        'log_file': './logs/Apache/Apache_full.log',
        'log_template': './logs/Apache/Apache_full.log_templates.csv',
        'log_structure': './logs/Apache/Apache_full.log_structured.csv',
        'log_format': r'\[<Time>\] \[<Level>\] <Content>',
        'filters': [],
        'regexs': [
            r'\/(?:\w+\/){2,}\w+\.\w+$',
            r'\/(?:[^\/\s]+\/)*[^\/\s]*'
        ],
    },
    'BGL': {
        'log_file': './logs/BGL/BGL_full.log',
        'log_template': './logs/BGL/BGL_full.log_templates.csv',
        'log_structure': './logs/BGL/BGL_full.log_structured.csv',
        'log_format': '<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>',
        'filters': [],
        'regexs': [
            r"core\.\d+",
        ],
    },
    'Hadoop': {
        'log_file': './logs/Hadoop/Hadoop_full.log',
        'log_template': './logs/Hadoop/Hadoop_full.log_templates.csv',
        'log_structure': './logs/Hadoop/Hadoop_full.log_structured.csv',
        'log_format': r'<Date> <Time> <Level> \[<Process>\] <Component>: <Content>',
        'filters': [],
        'regexs': [
            r'\[.*?(_.*?)+\]',
        ],
    },
    'HDFS': {
        'log_file': './logs/HDFS/HDFS_full.log',
        'log_template': './logs/HDFS/HDFS_full.log_templates.csv',
        'log_structure': './logs/HDFS/HDFS_full.log_structured.csv',
        'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
        'filters': [],
        'regexs': [
            r'blk_-?\d+',
            r'[/]?(\d+\.){3}\d+(:\d+)?',
        ],
    },
    'HealthApp': {
        'log_file': './logs/HealthApp/HealthApp_full.log',
        'log_template': './logs/HealthApp/HealthApp_full.log_templates.csv',
        'log_structure': './logs/HealthApp/HealthApp_full.log_structured.csv',
        'log_format': r'<Time>\|<Component>\|<Pid>\|<Content>',
        'filters': [],
        'regexs': [],
    },
    'HPC': {
        'log_file': './logs/HPC/HPC_full.log',
        'log_template': './logs/HPC/HPC_full.log_templates.csv',
        'log_structure': './logs/HPC/HPC_full.log_structured.csv',
        'log_format': '<LogId> <Node> <Component> <State> <Time> <Flag> <Content>',
        'filters': [],
        'regexs': [
            r'=\d+',
        ],
    },
    'Linux': {
        'log_file': './logs/Linux/Linux_full.log',
        'log_template': './logs/Linux/Linux_full.log_templates.csv',
        'log_structure': './logs/Linux/Linux_full.log_structured.csv',
        'log_format': r'<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>',
        'filters': [],
        'regexs': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}'],
    },
    'Mac': {
        'log_file': './logs/Mac/Mac_full.log',
        'log_template': './logs/Mac/Mac_full.log_templates.csv',
        'log_structure': './logs/Mac/Mac_full.log_structured.csv',
        'log_format': r'<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'filters': [],
        'regexs': [
            r'([\w-]+\.){2,}[\w-]+',
            r'https?:\/\/(?:[^\/\s]+\/?)*',
            r'\S*\/(?:[^\/\s]+\/){1,}[^\/\s]*'
        ],
    },
    'OpenSSH': {
        'log_file': './logs/OpenSSH/OpenSSH_full.log',
        'log_template': './logs/OpenSSH/OpenSSH_full.log_templates.csv',
        'log_structure': './logs/OpenSSH/OpenSSH_full.log_structured.csv',
        'log_format': r'<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>',
        'filters': [],
        'regexs': [r"(\d+):"],
    },
    'OpenStack': {
        'log_file': './logs/OpenStack/OpenStack_full.log',
        'log_template': './logs/OpenStack/OpenStack_full.log_templates.csv',
        'log_structure': './logs/OpenStack/OpenStack_full.log_structured.csv',
        'log_format': r'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>',
        'filters': [
            r'HTTP\/\d+\.\d+',
        ],
        'regexs': [
            r"(\w+-\w+-\w+-\w+-\w+)",
            r'HTTP\/\d+\.\d+',
        ],
    },
    'Proxifier': {
        'log_file': './logs/Proxifier/Proxifier_full.log',
        'log_template': './logs/Proxifier/Proxifier_full.log_templates.csv',
        'log_structure': './logs/Proxifier/Proxifier_full.log_structured.csv',
        'log_format': r'\[<Time>\] <Program> - <Content>',
        'filters': [
            r'HTTP\/\d+\.\d+',
        ],
        'regexs': [
            r'<\d+\ssec',
            r'([\w-]+\.)+[\w-]+(:\d+)?',
            r'\d{2}:\d{2}(:\d{2})*',
            r'[KGTM]B',
        ],
    },
    'Spark': {
        'log_file': './logs/Spark/Spark_full.log',
        'log_template': './logs/Spark/Spark_full.log_templates.csv',
        'log_structure': './logs/Spark/Spark_full.log_structured.csv',
        'log_format': '<Date> <Time> <Level> <Component>: <Content>',
        'filters': [],
        'regexs': [],
    },
    'Thunderbird': {
        'log_file': './logs/Thunderbird/Thunderbird_full.log',
        'log_template': './logs/Thunderbird/Thunderbird_full.log_templates.csv',
        'log_structure': './logs/Thunderbird/Thunderbird_full.log_structured.csv',
        'log_format': r'<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>',
        'filters': [],
        'regexs': [
            r'(\d+\.){3}\d+',
        ],
    },
    'Zookeeper': {
        'log_file': './logs/Zookeeper/Zookeeper_full.log',
        'log_template': './logs/Zookeeper/Zookeeper_full.log_templates.csv',
        'log_structure': './logs/Zookeeper/Zookeeper_full.log_structured.csv',
        'log_format': r'<Date> <Time> - <Level>  \[<Node>:<Component>@<Id>\] - <Content>',
        'filters': [],
        'regexs': [
            r"(/|)(\d+\.){3}\d+(:\d+)?",
        ],
    },
}

In [44]:
# Derive the 2k-sample configuration from SETTING_PARAMS: every dataset keeps its
# format/filters/regexs (shallow copy) and only the three file paths are redirected
# to the ./logs2k/ tree and its "_corrected" ground-truth CSVs.
SETTING_PARAMS_2k = {}
for name_dataset, dataset in SETTING_PARAMS.items():
    path_par = "./logs2k/"
    temp = dict(dataset)
    temp['log_file'] = f"{path_par}{name_dataset}/{name_dataset}_2k.log"
    temp.update(
        log_template=temp['log_file'] + "_templates_corrected.csv",
        log_structure=temp['log_file'] + "_structured_corrected.csv",
    )
    SETTING_PARAMS_2k[name_dataset] = temp

##### **0.2. CLASS**

In [45]:
class LogCluster:
    """A group of log lines that share one template.

    Attributes:
        keyGroup: identifier of the cluster (callers pass a hash of the template).
        logTemplate: the template as a single string.
        tokens: the template as a list of tokens.
        length: number of tokens.
        logIDL: row labels of the log lines assigned to this cluster.
        dynamic_tokenL: mapping shown under "Dynamic Tokens" in __str__.
        static_tokenL: mapping shown under "Cover Tokens" in __str__.
    """

    def __init__(self, keyGroup, logTemplate, tokens, length, logIDL=None, dynamic_tokenL=None, static_tokenL=None):
        self.keyGroup = keyGroup
        self.logTemplate = logTemplate
        self.tokens = tokens
        self.length = length
        # None defaults are replaced by fresh containers so instances never share state.
        self.logIDL = [] if logIDL is None else logIDL
        self.dynamic_tokenL = {} if dynamic_tokenL is None else dynamic_tokenL
        self.static_tokenL = {} if static_tokenL is None else static_tokenL

    @staticmethod
    def _render_mapping(title, mapping):
        """Render a mapping as a titled, brace-delimited block with one entry per line."""
        entries = "".join(f"  {key}: {value},\n" for key, value in mapping.items())
        return f"{title}: {{\n{entries}}}"

    def __str__(self):
        sections = [
            f"Key: {self.keyGroup}",
            f"Template: {self.logTemplate}",
            f"Tokens: {self.tokens}",
            f"Length: {self.length}",
            f"Len LogIDs: {len(self.logIDL)}",
            self._render_mapping("Dynamic Tokens", self.dynamic_tokenL),
            self._render_mapping("Cover Tokens", self.static_tokenL),
        ]
        return "\n".join(sections) + "\n"

#### **1. Các phương thức sử dụng**

##### **1.1. Read data**
- **`log_to_dataframe()`**

- **`generate_logformat_regex()`**

- **`load_data()`**

In [46]:
# =============================== READ DATA =============================== #
def log_to_dataframe(log_file, regex, headers):
    """Parse a log file into a DataFrame with one row per line matching `regex`.

    Args:
        log_file: path of the UTF-8 log file.
        regex: compiled pattern exposing one named group per entry of `headers`.
        headers: names of the groups to extract, in column order.

    Returns:
        A DataFrame with a 1-based `LineId` column followed by one column per
        header. Lines that do not match `regex` are skipped, so `LineId`
        numbers the kept rows, not the lines of the file.
    """
    log_messages = []
    with open(log_file, 'r', encoding="utf8") as fin:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Best-effort parsing: a line that does not fit the format is dropped.
                continue
            log_messages.append([match.group(header) for header in headers])
    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

def generate_logformat_regex(logformat):
    """Build the line-parsing regex for a log format description.

    Example format: '<Date> <Time> <Pid> <Level> <Component>: <Content>'.

    Each `<Name>` placeholder becomes a lazy named group `(?P<Name>.*?)`. The
    text between placeholders is inserted as-is (so it may already contain
    regex syntax such as `\\[`), except that every run of spaces is turned
    into `\\s+`.

    Returns:
        (headers, regex): the placeholder names in order, and the compiled
        pattern anchored with `^` and `$`.
    """
    headers = []
    regex = ''
    # re.split with a capturing group alternates literal text (even indices)
    # and placeholders (odd indices).
    for k, part in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if k % 2 == 0:
            # The replacement template r'\\s+' yields the two characters "\s" plus "+".
            regex += re.sub(' +', r'\\s+', part)
        else:
            header = part[1:-1]  # drop the surrounding "<" and ">"
            regex += '(?P<%s>.*?)' % header
            headers.append(header)
    return headers, re.compile('^' + regex + '$')

def load_data(logfile, logformat):
    """Read `logfile` into a DataFrame using the header layout given by `logformat`."""
    headers, line_regex = generate_logformat_regex(logformat)
    return log_to_dataframe(logfile, line_regex, headers)

##### **1.2. PRE_PROCESSING_0**
- **`pre0_hasNumbers()`**

- **`pre0_regexAndFilter()`**

- **`createSpecialRex()`**

In [47]:
# ========================== PRE_PROCESSING_0 ========================== #
def pre0_hasNumbers(s):
    """Return True if at least one character of `s` is a digit (per str.isdigit)."""
    for ch in s:
        if ch.isdigit():
            return True
    return False

def pre0_regexAndFilter(line, regexs=(), filters=()):
    """Clean one log message before tokenisation.

    Matches of every pattern in `filters` are deleted first, then matches of
    every pattern in `regexs` are replaced by "<*>". Patterns are applied in
    the order given. The result is stripped of surrounding whitespace.

    Args:
        line: the raw message text.
        regexs: patterns to mask with "<*>" (None is treated as empty).
        filters: patterns to remove (None is treated as empty).
    """
    # Immutable defaults avoid the shared-mutable-default pitfall.
    for currentFil in filters or ():
        line = re.sub(currentFil, '', line)
    for currentRex in regexs or ():
        line = re.sub(currentRex, "<*>", line)
    return line.strip()

# ======================= CÁC PHƯƠNG THỨC BỔ TRỢ ======================= #
def createSpecialRex(dict_special_token):
    """Compile a case-insensitive regex matching any of the given literal keywords.

    Each keyword is escaped, so it is matched literally. With no keywords the
    returned pattern never matches; an empty alternation would otherwise match
    the empty string at every position and corrupt every token on `.sub`.
    """
    keywords = '|'.join(re.escape(k) for k in dict_special_token)
    if not keywords:
        # "(?!)" is an always-failing lookahead: a pattern that matches nothing.
        return re.compile(r'(?!)', flags=re.IGNORECASE)
    return re.compile(f'({keywords})', flags=re.IGNORECASE)


##### **1.3. PREPROCESSING_1**

- **`splitSpecialTok()`**

- **`mergeSpecialTok()`**

In [48]:
# ========================== PRE_PROCESSING_1 ========================== #
def splitSpecialTok(s, seps):
    """Split `s` on the separators in `seps`, keeping the separators as tokens.

    Returns:
        (None, None) when `s` contains none of the separators. Otherwise a pair
        (sep_token, static_tokenL): the non-empty pieces of `s` in order, and
        the same list with every piece containing a digit replaced by "<*>".
    """
    alternatives = '|'.join(re.escape(sep) for sep in seps)
    if re.search(alternatives, s) is None:
        return None, None

    # The capturing group makes re.split return the separators too.
    sep_token = [piece for piece in re.split(f'({alternatives})', s) if piece]
    static_tokenL = []
    for piece in sep_token:
        static_tokenL.append("<*>" if pre0_hasNumbers(piece) else piece)
    return sep_token, static_tokenL

# Phương thức gộp chuỗi
def mergeSpecialTok(token_str, seps):
    """Collapse wildcard runs inside an already-masked token string.

    Two rewrites are applied repeatedly until the string stops changing:
      * "<*>" + one separator repeated one or more times + "<*>"  ->  "<*>"
      * several consecutive "<*>"                                  ->  "<*>"

    Args:
        token_str: the token text in which dynamic parts are already "<*>".
        seps: the separator strings that may sit between two wildcards.
    """
    alternatives = '|'.join(re.escape(sep) for sep in seps)
    # Group 3 is the separator; \3* only allows repeats of that same separator.
    wildcard_sep_wildcard = re.compile(rf'(<\*>)(({alternatives})\3*)(<\*>)')
    wildcard_run = re.compile(r'(<\*>)+')

    while True:
        collapsed = wildcard_sep_wildcard.sub(r'<*>', token_str)
        collapsed = wildcard_run.sub(r'<*>', collapsed)
        if collapsed == token_str:
            return collapsed
        token_str = collapsed

##### **1.4. PHƯƠNG THỨC HỖ TRỢ NHÓM GROUP**

- **class `MergeGroupTemplate`**

In [49]:
# ===================== TẠO CLASS ===================== #
class MergeGroupTemplate:
    """Merge log clusters whose templates are similar, following the Drain idea.

    Clusters are bucketed by token count, greedily grouped by similarity to the
    first cluster of each group, and each group is then generalised by masking
    the positions where the group disagrees enough.

    Attributes:
        ST: similarity threshold; a cluster joins a group only when its
            similarity to the group's first cluster is at least ST.
        N_MERGE: a position is masked with "<*>" when a group shows at least
            this many distinct tokens there.
        TEMPLATE_GR: the clusters to merge; replaced by the result of mergeGroup().
    """

    def __init__(self, st=0.6, n_merge=3, template_gr=None):
        self.ST = st
        self.N_MERGE = n_merge
        self.TEMPLATE_GR = template_gr if template_gr is not None else []

    def similarySeq(self, seq1, seq2):
        """Compare two token sequences of equal length.

        Returns:
            (similarity, numOfPar): the share of positions where both tokens are
            equal and seq1's token is not "<*>", and the number of "<*>" tokens
            in seq1. Two empty sequences are reported as (1.0, 0).
        """
        assert len(seq1) == len(seq2)
        if not seq1:
            # Nothing to compare: avoid dividing by zero and treat them as identical.
            return 1.0, 0

        simTokens = 0
        numOfPar = 0
        for token1, token2 in zip(seq1, seq2):
            if token1 == "<*>":
                # Wildcards are counted separately and never add to the similarity.
                numOfPar += 1
            elif token1 == token2:
                simTokens += 1

        return float(simTokens) / len(seq1), numOfPar

    def fastMatchCLuster(self, seqGroupL, seq):
        """Return the group of `seqGroupL` that best matches `seq`, or None.

        Each group is represented by its first cluster. The best group has the
        highest similarity, ties being broken by the larger wildcard count; it
        is returned only if that similarity reaches ST.
        """
        maxSim = -1
        maxNumOfPara = -1
        maxGroup = None

        for gr in seqGroupL:
            curSim, curNumOfPara = self.similarySeq(gr[0].tokens, seq.tokens)
            if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
                maxSim = curSim
                maxNumOfPara = curNumOfPara
                maxGroup = gr

        return maxGroup if maxSim >= self.ST else None

    def generalizeGroup(self, group):
        """Mask the disagreeing positions of a group and merge identical results.

        A position is masked when the group holds at least N_MERGE distinct
        tokens there. The clusters are modified in place; clusters that end up
        with the same tokens are folded into the first of them, which receives
        the concatenated logIDL.

        Returns:
            The surviving clusters, one per distinct generalised token sequence.
        """
        mask_positions = {
            idx
            for idx, col in enumerate(zip(*(s.tokens for s in group)))
            if len(set(col)) >= self.N_MERGE
        }

        # Apply the mask to every cluster of the group.
        for seq in group:
            seq.tokens = ["<*>" if i in mask_positions else token for i, token in enumerate(seq.tokens)]
            seq.logTemplate = " ".join(seq.tokens)

        # Regroup by the generalised pattern.
        pattern_dict = defaultdict(list)
        for seq in group:
            pattern_dict[tuple(seq.tokens)].append(seq)

        result = []
        for values in pattern_dict.values():
            survivor = values[0]
            if len(values) != 1:
                logIDL = []
                for x in values:
                    logIDL.extend(x.logIDL)
                survivor.logIDL = logIDL
            result.append(survivor)
        return result

    def mergeGroup(self, printL=False):
        """Merge TEMPLATE_GR, store the result back in TEMPLATE_GR and return it.

        Args:
            printL: when True, print the merged templates with printList().
        """
        grouped_by_length = defaultdict(list)
        for cluster in self.TEMPLATE_GR:
            grouped_by_length[cluster.length].append(cluster)

        newClusterGroupsL = []

        # Only clusters with the same number of tokens can be merged.
        for groups_len in grouped_by_length.values():
            groupsSimTemL = []
            for log_clust in groups_len:
                matched_gr = self.fastMatchCLuster(groupsSimTemL, log_clust)
                if matched_gr is not None:
                    matched_gr.append(log_clust)
                else:
                    groupsSimTemL.append([log_clust])
            for group in groupsSimTemL:
                if len(group) == 1:
                    newClusterGroupsL.extend(group)
                else:
                    newClusterGroupsL.extend(self.generalizeGroup(group))

        self.TEMPLATE_GR = newClusterGroupsL

        if printL:
            self.printList()

        return newClusterGroupsL

    def printList(self):
        """Print the number of clusters, then each template sorted by length and text."""
        print(len(self.TEMPLATE_GR))

        sorted_list = sorted(self.TEMPLATE_GR, key=lambda log: (log.length, log.logTemplate))
        for e in sorted_list:
            print(f"{e.length:3} {e.logTemplate}")

In [50]:
def createGroupClust(parse_df, punctuationL):
    """Turn the rows of `parse_df` into one LogCluster per `GroupTemplate` value.

    For each group, the tokens of its first row are used as the template. Every
    position listed in `idxDynamicTok` is refined: the split pieces recorded in
    `StaticTokList` are compared across all rows of the group, a piece that
    differs between rows becomes "<*>", and the result is collapsed with
    mergeSpecialTok().

    Args:
        parse_df: DataFrame with the columns `GroupTemplate`, `GroupTokens`,
            `idxDynamicTok` and `StaticTokList`.
        punctuationL: separators handed to mergeSpecialTok().

    Returns:
        The list of LogCluster objects; `logIDL` holds the index labels of the
        rows of each group.
    """
    log_clusters_list = []

    unique_groups = parse_df.groupby("GroupTemplate")
    print(len(unique_groups))  # number of groups before any merging

    for _, group_val in unique_groups:
        first_row = group_val.iloc[0]
        # Copy the list: the cell object is shared with parse_df, and refining it
        # in place would silently rewrite GroupTokens for the group's first row.
        tokens = list(first_row['GroupTokens'])
        dynamic_positions = first_row["idxDynamicTok"]

        if len(dynamic_positions) != 0:  # some tokens still need refining
            group_staticL = group_val['StaticTokList'].to_list()

            for slot, position in enumerate(dynamic_positions):
                # Column-wise view: the k-th piece of this token in every row.
                piece_columns = zip(*[row_static[slot] for row_static in group_staticL])
                merged_pieces = [
                    column[0] if len(set(column)) == 1 else "<*>"
                    for column in piece_columns
                ]
                tokens[position] = mergeSpecialTok("".join(merged_pieces), punctuationL)

        logTemplate = " ".join(tokens)

        log_clusters_list.append(LogCluster(
            keyGroup=hashlib.md5(logTemplate.encode('utf-8')).hexdigest(),
            logTemplate=logTemplate,
            tokens=tokens,
            length=len(tokens),
            logIDL=group_val.index.tolist(),
            static_tokenL=None
        ))

    return log_clusters_list

##### **1.5. MAIN FUNCTION**

- **`processLine()`**

In [51]:
# Viết lại phương thức xử lý theo từng dòng
def processLine(line, regexs, filters, pattern_special, punctuationL):
    """Tokenise one log row and compute its grouping key.

    The row's `Content` is cleaned with pre0_regexAndFilter() and split on
    whitespace. In every token the special keywords are replaced by "<*>"; a
    token that still contains a digit becomes "<*>" in the group template and,
    if it contains a separator from `punctuationL`, its split pieces are kept
    for later refinement.

    Returns:
        A pandas Series with the keys `GroupTemplate` (MD5 of the grouping
        string), `GroupTokens`, `idxDynamicTok`, `DynamicTokList`,
        `StaticTokList` and an empty `EventTemplate`.
    """
    cleaned = pre0_regexAndFilter(line['Content'], regexs=regexs, filters=filters)
    raw_tokens = str(cleaned).strip().split()

    groups_token = []        # template tokens used for grouping
    idx_dynamic_token = []   # positions of digit-bearing tokens that could be split
    dynamic_tokenL = []      # their pieces, as found in the line
    static_tokenL = []       # their pieces, with digit-bearing pieces masked

    for position, raw in enumerate(raw_tokens):
        masked = pattern_special.sub('<*>', raw)
        if not pre0_hasNumbers(masked):
            groups_token.append(masked)
            continue

        groups_token.append("<*>")
        pieces, masked_pieces = splitSpecialTok(masked, punctuationL)
        if pieces is None:
            continue
        idx_dynamic_token.append(position)
        dynamic_tokenL.append(pieces)
        static_tokenL.append(list(masked_pieces))

    # The grouping string is: template, token count and, when some tokens were
    # split, their positions, their piece counts and the first piece of each.
    key_fields = [' '.join(groups_token), str(len(groups_token))]
    if idx_dynamic_token:
        key_fields.append(' '.join(str(pos) for pos in idx_dynamic_token))
        key_fields.append(' '.join(str(len(item)) for item in static_tokenL))
        key_fields.append(' '.join(item[0] for item in static_tokenL))
    groupTem_str = ' : '.join(key_fields)

    return pd.Series({
        'GroupTemplate': hashlib.md5(groupTem_str.encode('utf-8')).hexdigest(),
        'GroupTokens': groups_token,
        'idxDynamicTok': idx_dynamic_token,
        'DynamicTokList': dynamic_tokenL,
        'StaticTokList': static_tokenL,
        'EventTemplate': ""
    })
    
def regexAndCreateDf(datasets, DICT_SPECIAL_TOKEN=['true', 'false'], punctuationL = set('(),<>:;{}[]~='), N_MERGE=3, ST=0.6):
    """Load a dataset and run the per-line preprocessing on every row.

    Args:
        datasets: one dataset configuration; the keys `log_file`, `log_format`,
            `regexs` and `filters` are read.
        DICT_SPECIAL_TOKEN: keywords masked as "<*>" inside tokens.
        punctuationL: separators used to split digit-bearing tokens.
        N_MERGE, ST: accepted for signature compatibility; not used here.

    Returns:
        The parsed log DataFrame extended with the columns produced by
        processLine().
    """
    parse_df = load_data(datasets['log_file'], datasets['log_format']).copy()

    # Create the result columns up front; each list column gets its own empty
    # list per row.
    row_count = len(parse_df)
    parse_df['GroupTemplate'] = ""        # template key used for grouping
    for list_column in ('GroupTokens', 'idxDynamicTok', 'DynamicTokList', 'StaticTokList'):
        parse_df[list_column] = [[] for _ in range(row_count)]
    parse_df['EventTemplate'] = ""        # final template, filled in later

    tqdm.pandas(desc="Tiền xử lý giai đoạn 0 và 1")
    pattern_special = createSpecialRex(DICT_SPECIAL_TOKEN)

    def _parse_row(row):
        return processLine(row, datasets['regexs'], datasets['filters'], pattern_special, punctuationL)

    results = parse_df.progress_apply(_parse_row, axis=1)

    for column in results.columns:
        parse_df[column] = results[column]

    return parse_df

##### **1.6. PHƯƠNG THỨC KHÁC**

- **`write_df_to_txt()`**

- **`compare_templates()`**

In [52]:
def write_df_to_txt(df, filename):
    """Write `df` to a UTF-8 text file as tab-separated, left-aligned columns.

    Every cell is padded to the width of the longest value of its column (the
    column name included). The first line is the header; the index is not written.
    """
    widths = []
    for col in df.columns:
        longest = len(str(col))
        for cell in df[col].tolist():
            longest = max(longest, len(str(cell)))
        widths.append(longest)

    def render(values):
        padded = [str(value).ljust(width) for value, width in zip(values, widths)]
        return '\t'.join(padded) + '\n'

    with open(filename, 'w', encoding='utf-8') as out:
        out.write(render(df.columns))
        for record in df.itertuples(index=False):
            out.write(render(record))
            
def compare_templates(datasets, parse_df):
    """Print every ground-truth template that the parser did not reproduce exactly.

    The ground truth is read from `datasets['log_structure']` (columns
    `EventTemplate` and `Content`). For each ground-truth template, the
    templates assigned by the parser to the same rows are collected from
    `parse_df['EventTemplate']`; the template is reported unless the parser
    produced exactly one template and it equals the ground truth.

    Args:
        datasets: dataset configuration providing `log_structure`.
        parse_df: parser output; its index labels must match the row labels of
            the ground-truth CSV.
    """
    structured_df = pd.read_csv(datasets['log_structure'])
    # One pass over the file instead of one boolean scan per template. groupby
    # also leaves out rows whose template is missing (NaN).
    by_template = structured_df.groupby('EventTemplate', sort=False)
    print(f"SHAPE: {structured_df.shape}")
    print(f"SHAPE PARSER: {parse_df.shape}")
    print(f"Num of truth templates: {by_template.ngroups}")

    comparisons = []
    for template, rows in by_template:
        arr_index = rows.index.tolist()
        parse_template_unique = parse_df.loc[arr_index, 'EventTemplate'].unique().tolist()
        # Show up to three sample messages; str() keeps missing contents printable.
        content_list = [str(content) for content in rows['Content'].head(3)]
        comparisons.append({
            'ground_truth': template,
            'parse': parse_template_unique,
            'content_lst': "[\n\t" + "\n\t".join(content_list) + "\n]",
            'index': arr_index,
            'length': len(str(template).split()),
            'nums': len(arr_index),
        })

    # Templates are unique, so they are compared directly rather than through
    # hash(template), which could collide.
    comparisons.sort(key=lambda entry: (entry['length'], entry['ground_truth']))

    num_dif = 0
    for idx, value in enumerate(comparisons, 1):
        if len(value['parse']) != 1 or value['parse'][0] != value['ground_truth']:
            num_dif += 1
            print(f"No. {idx}")
            print(f"Length: {value['length']}, Nums: {value['nums']}")
            print(f"Ground truth  : {value['ground_truth']}")
            print(f"Parse templs  : {value['parse']}")
            print(f"Content List: {value['content_lst']}")
            print(f"Length parse: {len(value['parse'])}")
            print("-" * 40)
    print(f"Total differences found: {num_dif}")

#### **2. KHUNG LÀM VIỆC CHÍNH**

In [53]:
# Keywords masked as "<*>" inside tokens (matched case-insensitively by createSpecialRex).
DICT_SPECIAL_TOKEN = ['true', 'false']
# Separator characters used to split tokens that contain digits.
PUNCTUATIONL = '(),<>:;{}[]~=#'

# NOTE(review): the mainDrainDS(...) call in this cell passes N_MERGE=3 and ST=0.6
# explicitly, so the N_MERGE value below is not the one used by that run.
N_MERGE = 2
ST = 0.6

def mainDrainDS(SETTING_PARAMS, DICT_SPECIAL_TOKEN=['true', 'false'], punctuationL = set(PUNCTUATIONL), N_MERGE=3, ST=0.6):
    """Parse every configured dataset and write its structured result to CSV.

    For each dataset: preprocess the log lines, group them into clusters, merge
    similar clusters, store the resulting template of every line in the
    `EventTemplate` column and save the DataFrame as
    ./res/DrainDS/<name>_structured.csv.

    Args:
        SETTING_PARAMS: mapping of dataset name to dataset configuration.
        DICT_SPECIAL_TOKEN: keywords masked as "<*>" inside tokens.
        punctuationL: separators used to split and re-merge dynamic tokens.
        N_MERGE: number of distinct tokens at a position that triggers masking
            when clusters are merged.
        ST: similarity threshold for merging clusters.
    """
    output_dir = "./res/DrainDS/"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for name_dataset, dataset in SETTING_PARAMS.items():
        print('\n================ Processing on %s =====================' % name_dataset)
        start_time = time.time()

        # Preprocessing. The caller's arguments are forwarded instead of the
        # module-level PUNCTUATIONL and hard-coded N_MERGE/ST values.
        parse_df = regexAndCreateDf(dataset, DICT_SPECIAL_TOKEN=DICT_SPECIAL_TOKEN, punctuationL=punctuationL, N_MERGE=N_MERGE, ST=ST)

        # Build the initial clusters.
        log_clusters_list = createGroupClust(parse_df, punctuationL)

        # Merge similar templates (same idea as Drain).
        merge_group = MergeGroupTemplate(st=ST, n_merge=N_MERGE, template_gr=log_clusters_list)
        new_groupL = merge_group.mergeGroup(printL=False)
        print("NUM of GROUP: ", len(new_groupL))

        for item in merge_group.TEMPLATE_GR:
            parse_df.loc[item.logIDL, "EventTemplate"] = item.logTemplate
        parse_df.to_csv(os.path.join(output_dir, name_dataset+"_structured.csv"), index=False)
        elapsed_time = time.time() - start_time
        print(f"Hoàn thành xong {name_dataset}: ", elapsed_time)
        print("-"*80)
        

# Run the parser on the 2k-sample datasets; the per-dataset progress and group
# counts are printed by the call itself.
mainDrainDS(SETTING_PARAMS_2k, DICT_SPECIAL_TOKEN=['true', 'false'], punctuationL = set(PUNCTUATIONL), N_MERGE=3, ST=0.6)




Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 4857.68it/s]


6
NUM of GROUP:  6
Hoàn thành xong Apache:  0.4476292133331299
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 7023.48it/s]


134
NUM of GROUP:  115
Hoàn thành xong BGL:  0.3652808666229248
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5757.13it/s]


117
NUM of GROUP:  117
Hoàn thành xong Hadoop:  0.42699527740478516
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5529.50it/s]


17
NUM of GROUP:  17
Hoàn thành xong HDFS:  0.3978757858276367
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5458.05it/s]


75
NUM of GROUP:  75
Hoàn thành xong HealthApp:  0.430403470993042
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5048.65it/s]


54
NUM of GROUP:  43
Hoàn thành xong HPC:  0.4499385356903076
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 6167.18it/s]


148
NUM of GROUP:  123
Hoàn thành xong Linux:  0.4107532501220703
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5138.82it/s]


349
NUM of GROUP:  335
Hoàn thành xong Mac:  0.5258331298828125
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5147.71it/s]


180
NUM of GROUP:  34
Hoàn thành xong OpenSSH:  0.452012300491333
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 4375.68it/s]


60
NUM of GROUP:  56
Hoàn thành xong OpenStack:  0.5414953231811523
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 5125.19it/s]


17
NUM of GROUP:  11
Hoàn thành xong Proxifier:  0.43251562118530273
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 7219.31it/s]


38
NUM of GROUP:  34
Hoàn thành xong Spark:  0.3199479579925537
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 6415.70it/s]


198
NUM of GROUP:  163
Hoàn thành xong Thunderbird:  0.473663330078125
--------------------------------------------------------------------------------



Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 2000/2000 [00:00<00:00, 6221.72it/s]

56
NUM of GROUP:  54
Hoàn thành xong Zookeeper:  0.37224602699279785
--------------------------------------------------------------------------------





In [54]:
from tqdm import tqdm

from evaluation.settings import benchmark_settings
from evaluation.utils.common import common_args
from evaluation.utils.evaluator_main import *
from evaluation.utils.postprocess import post_average

from evaluation.utils.GA_calculator import evaluate
from evaluation.utils.template_level_analysis import evaluate_template_level
from evaluation.utils.PA_calculator import calculate_parsing_accuracy

import importlib
import evaluation.utils.evaluator_main as evaluator_main
importlib.reload(evaluator_main)

# ---------------------------------------------------------------------------
# Benchmark the DrainDS parsing output against the ground truth of every
# dataset in SETTING_PARAMS_2k and collect one CSV row of metrics per dataset.
# ---------------------------------------------------------------------------
benchmark_dir = "./benchmark"
parsed_dir = "./res/DrainDS/"

# Remove the summary of a previous run before a fresh one is prepared.
file_path = './benchmark/parsing_accuracy.csv'
if os.path.exists(file_path):
    os.remove(file_path)
result_file = evaluator_main.prepare_results(output_dir=benchmark_dir)
# Single source of truth for the summary location: rows are appended to and
# later read back from exactly the file that prepare_results reported.
summary_path = os.path.join(benchmark_dir, result_file)

# Registering `progress_apply` on pandas objects is loop-invariant: do it once.
tqdm.pandas()

# `importlib.reload` does not rebind names that were star-imported earlier, so
# the helpers are looked up on the reloaded module to pick up its current code.
# NOTE(review): assumes both helpers are defined in evaluator_main - confirm.
align_with_nulls = evaluator_main.align_with_null_values
fix_template = evaluator_main.correct_template_general

for name_dataset, dataset_setting in SETTING_PARAMS_2k.items():
    print('\n================ Evaluation on %s =====================' % name_dataset)
    groundtruth = pd.read_csv(dataset_setting["log_structure"], dtype=str)

    parsedresult = os.path.join(parsed_dir, name_dataset + "_structured.csv")
    parsedresult = pd.read_csv(parsedresult, dtype=str)
    parsedresult.fillna("", inplace=True)

    print("Start to align with null values")
    groundtruth['EventTemplate'] = groundtruth.progress_apply(align_with_nulls, axis=1)
    groundtruth['EventTemplate'] = groundtruth['EventTemplate'].map(fix_template)
    parsedresult['EventTemplate'] = parsedresult.progress_apply(align_with_nulls, axis=1)

    # No template filtering: every ground-truth template takes part.
    filter_templates = None

    # =============== GROUPING ACCURACY =============== #
    GA, FGA = evaluate(groundtruth, parsedresult, filter_templates)

    # =============== PARSING ACCURACY =============== #
    PA = calculate_parsing_accuracy(groundtruth, parsedresult, filter_templates)

    # =============== TEMPLATE-LEVEL ACCURACY =============== #
    identified_templates, ground_templates, FTA, PTA, RTA = evaluate_template_level(
        name_dataset, groundtruth, parsedresult, filter_templates
    )

    # One CSV row: dataset name, the two template counts, then the six
    # metrics rendered with three decimals.
    metrics = (GA, PA, FGA, PTA, RTA, FTA)
    result = ",".join(
        [name_dataset, str(identified_templates), str(ground_templates)]
        + ["{:.3f}".format(value) for value in metrics]
    ) + "\n"

    with open(summary_path, 'a') as summary_file:
        summary_file.write(result)

result_df = pd.read_csv(summary_path)
print(result_df)


Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 94889.46it/s]
100%|██████████| 2000/2000 [00:00<00:00, 119300.41it/s]
100%|██████████| 6/6 [00:00<00:00, 1345.19it/s]


Grouping_Accuracy (GA): 1.0000, FGA: 1.0000,
Parsing_Accuracy (PA): 0.5700


100%|██████████| 6/6 [00:00<00:00, 5991.86it/s]


PTA: 0.5000, RTA: 0.5000 FTA: 0.5000
Identify : 6, Groundtruth : 6

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 72114.16it/s]
100%|██████████| 2000/2000 [00:00<00:00, 88047.19it/s]
100%|██████████| 120/120 [00:00<00:00, 2582.05it/s]


Grouping_Accuracy (GA): 0.9815, FGA: 0.9277,
Parsing_Accuracy (PA): 0.5590


100%|██████████| 115/115 [00:00<00:00, 28753.80it/s]

PTA: 0.5652, RTA: 0.5417 FTA: 0.5532
Identify : 115, Groundtruth : 120

Start to align with null values



100%|██████████| 2000/2000 [00:00<00:00, 59611.63it/s]
100%|██████████| 2000/2000 [00:00<00:00, 70087.88it/s]
100%|██████████| 114/114 [00:00<00:00, 2961.53it/s]


Grouping_Accuracy (GA): 0.9880, FGA: 0.9610,
Parsing_Accuracy (PA): 0.4020


100%|██████████| 117/117 [00:00<?, ?it/s]


PTA: 0.6752, RTA: 0.6930 FTA: 0.6840
Identify : 117, Groundtruth : 114

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 77784.65it/s]
100%|██████████| 2000/2000 [00:00<00:00, 90133.21it/s]
100%|██████████| 14/14 [00:00<00:00, 1998.17it/s]


Grouping_Accuracy (GA): 0.8660, FGA: 0.7742,
Parsing_Accuracy (PA): 0.9335


100%|██████████| 17/17 [00:00<00:00, 6788.19it/s]


PTA: 0.7647, RTA: 0.9286 FTA: 0.8387
Identify : 17, Groundtruth : 14

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 46705.61it/s]
100%|██████████| 2000/2000 [00:00<00:00, 77055.85it/s]
100%|██████████| 75/75 [00:00<00:00, 3383.89it/s]


Grouping_Accuracy (GA): 1.0000, FGA: 1.0000,
Parsing_Accuracy (PA): 0.6580


100%|██████████| 75/75 [00:00<00:00, 37543.00it/s]


PTA: 0.8400, RTA: 0.8400 FTA: 0.8400
Identify : 75, Groundtruth : 75

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 105759.20it/s]
100%|██████████| 2000/2000 [00:00<00:00, 130645.36it/s]
100%|██████████| 46/46 [00:00<00:00, 3321.99it/s]


Grouping_Accuracy (GA): 0.8800, FGA: 0.7865,
Parsing_Accuracy (PA): 0.6570


100%|██████████| 43/43 [00:00<00:00, 18480.90it/s]


PTA: 0.6047, RTA: 0.5652 FTA: 0.5843
Identify : 43, Groundtruth : 46

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 65038.05it/s]
100%|██████████| 2000/2000 [00:00<00:00, 72872.18it/s]
100%|██████████| 119/119 [00:00<00:00, 2720.93it/s]


Grouping_Accuracy (GA): 0.3510, FGA: 0.8226,
Parsing_Accuracy (PA): 0.2800


100%|██████████| 129/129 [00:00<00:00, 118863.18it/s]


PTA: 0.6589, RTA: 0.7143 FTA: 0.6855
Identify : 129, Groundtruth : 119

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 30770.22it/s]
100%|██████████| 2000/2000 [00:00<00:00, 40125.94it/s]
100%|██████████| 341/341 [00:00<00:00, 3489.06it/s]


Grouping_Accuracy (GA): 0.8440, FGA: 0.9172,
Parsing_Accuracy (PA): 0.5055


100%|██████████| 335/335 [00:00<00:00, 29172.47it/s]


PTA: 0.5761, RTA: 0.5660 FTA: 0.5710
Identify : 335, Groundtruth : 341

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 69620.20it/s]
100%|██████████| 2000/2000 [00:00<00:00, 61986.31it/s]
100%|██████████| 26/26 [00:00<00:00, 2028.08it/s]


Grouping_Accuracy (GA): 0.7625, FGA: 0.7333,
Parsing_Accuracy (PA): 0.3070


100%|██████████| 34/34 [00:00<00:00, 8736.53it/s]


PTA: 0.3235, RTA: 0.4231 FTA: 0.3667
Identify : 34, Groundtruth : 26

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 77742.12it/s]
100%|██████████| 2000/2000 [00:00<00:00, 62611.36it/s]
100%|██████████| 43/43 [00:00<00:00, 3035.21it/s]


Grouping_Accuracy (GA): 0.4800, FGA: 0.7835,
Parsing_Accuracy (PA): 0.4010


100%|██████████| 54/54 [00:00<00:00, 13738.47it/s]

PTA: 0.5556, RTA: 0.6977 FTA: 0.6186
Identify : 54, Groundtruth : 43






Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 64205.25it/s]
100%|██████████| 2000/2000 [00:00<00:00, 65208.94it/s]
100%|██████████| 8/8 [00:00<00:00, 1462.13it/s]


Grouping_Accuracy (GA): 0.5265, FGA: 0.7368,
Parsing_Accuracy (PA): 0.5255


100%|██████████| 11/11 [00:00<?, ?it/s]


PTA: 0.5455, RTA: 0.7500 FTA: 0.6316
Identify : 11, Groundtruth : 8

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 77602.60it/s]
100%|██████████| 2000/2000 [00:00<00:00, 86688.73it/s]
100%|██████████| 36/36 [00:00<00:00, 3515.03it/s]


Grouping_Accuracy (GA): 0.9230, FGA: 0.8286,
Parsing_Accuracy (PA): 0.8810


100%|██████████| 34/34 [00:00<00:00, 7814.05it/s]


PTA: 0.5882, RTA: 0.5556 FTA: 0.5714
Identify : 34, Groundtruth : 36

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 62947.29it/s]
100%|██████████| 2000/2000 [00:00<00:00, 40133.23it/s]
100%|██████████| 149/149 [00:00<00:00, 3268.03it/s]


Grouping_Accuracy (GA): 0.9595, FGA: 0.8526,
Parsing_Accuracy (PA): 0.5765


100%|██████████| 163/163 [00:00<00:00, 46400.95it/s]


PTA: 0.5583, RTA: 0.6107 FTA: 0.5833
Identify : 163, Groundtruth : 149

Start to align with null values


100%|██████████| 2000/2000 [00:00<00:00, 74571.37it/s]
100%|██████████| 2000/2000 [00:00<00:00, 88292.77it/s]
100%|██████████| 50/50 [00:00<00:00, 3290.42it/s]


Grouping_Accuracy (GA): 0.9900, FGA: 0.8846,
Parsing_Accuracy (PA): 0.9890


100%|██████████| 54/54 [00:00<00:00, 26947.34it/s]

PTA: 0.7963, RTA: 0.8600 FTA: 0.8269
Identify : 54, Groundtruth : 50
        Dataset  parse_gr  truth_gr     GA     PA    FGA    PTA    RTA    FTA
0        Apache         6         6  1.000  0.570  1.000  0.500  0.500  0.500
1           BGL       115       120  0.982  0.559  0.928  0.565  0.542  0.553
2        Hadoop       117       114  0.988  0.402  0.961  0.675  0.693  0.684
3          HDFS        17        14  0.866  0.933  0.774  0.765  0.929  0.839
4     HealthApp        75        75  1.000  0.658  1.000  0.840  0.840  0.840
5           HPC        43        46  0.880  0.657  0.787  0.605  0.565  0.584
6         Linux       129       119  0.351  0.280  0.823  0.659  0.714  0.685
7           Mac       335       341  0.844  0.505  0.917  0.576  0.566  0.571
8       OpenSSH        34        26  0.762  0.307  0.733  0.324  0.423  0.367
9     OpenStack        54        43  0.480  0.401  0.784  0.556  0.698  0.619
10    Proxifier        11         8  0.526  0.525  0.737  0.545  0.750  0




In [57]:
# Reload the benchmark summary and append two extra rows to it: the mean and
# the standard deviation of every numeric column.
result_df = pd.read_csv("./benchmark/parsing_accuracy.csv")

# Only numeric columns take part in the statistics.
numeric_cols = result_df.select_dtypes(include='number').columns
numeric_part = result_df[numeric_cols]

avg_row = numeric_part.mean()   # per-column mean
std_row = numeric_part.std()    # per-column standard deviation

# Label each summary row and blank out the two template-count columns.
for row_label, summary_row in (('Average', avg_row), ('Std', std_row)):
    summary_row['Dataset'] = row_label
    summary_row['parse_gr'] = ''
    summary_row['truth_gr'] = ''

# Append both summary rows below the per-dataset rows.
summary_rows = pd.DataFrame([avg_row, std_row])
result_df = pd.concat([result_df, summary_rows], ignore_index=True)
print(result_df)

        Dataset parse_gr truth_gr        GA        PA       FGA       PTA  \
0        Apache        6        6  1.000000  0.570000  1.000000  0.500000   
1           BGL      115      120  0.982000  0.559000  0.928000  0.565000   
2        Hadoop      117      114  0.988000  0.402000  0.961000  0.675000   
3          HDFS       17       14  0.866000  0.933000  0.774000  0.765000   
4     HealthApp       75       75  1.000000  0.658000  1.000000  0.840000   
5           HPC       43       46  0.880000  0.657000  0.787000  0.605000   
6         Linux      129      119  0.351000  0.280000  0.823000  0.659000   
7           Mac      335      341  0.844000  0.505000  0.917000  0.576000   
8       OpenSSH       34       26  0.762000  0.307000  0.733000  0.324000   
9     OpenStack       54       43  0.480000  0.401000  0.784000  0.556000   
10    Proxifier       11        8  0.526000  0.525000  0.737000  0.545000   
11        Spark       34       36  0.923000  0.881000  0.829000  0.588000   

#### **3. Kiểm tra vì độ chính xác quá thấp**

In [55]:
# For every selected dataset, load the DrainDS parsing result and pass it to
# compare_templates (defined elsewhere in this notebook) together with the
# dataset settings.
result_path_dir = "./res/DrainDS/"
choose_dataset = ["Apache", "BGL", "Hadoop", "HDFS", "HealthApp", "HPC", "Linux", "Mac", "OpenSSH", "OpenStack", "Proxifier", "Spark", "Thunderbird", "Zookeeper"]
for name_dataset, dataset_setting in SETTING_PARAMS_2k.items():
    if name_dataset in choose_dataset:
        print(f"{'=' * 40} COMPARE {name_dataset} {'=' * 40}")

        # Parsed output is read as strings; missing cells become "".
        parsedresult = pd.read_csv(
            os.path.join(result_path_dir, f"{name_dataset}_structured.csv"),
            dtype=str,
        ).fillna("")

        # Ground-truth templates of the dataset.
        # NOTE(review): not passed to compare_templates below; kept because a
        # helper defined elsewhere may read these notebook globals - confirm.
        truth_template = pd.read_csv(dataset_setting["log_structure"], dtype=str)
        unique_templates = truth_template['EventTemplate'].unique()

        compare_templates(dataset_setting, parsedresult)
        print(f"{'=' * 45} END {'=' * 45}\n")
    

SHAPE: (2000, 6)
SHAPE PARSER: (2000, 10)
Num of truth templates: 6
No. 3
Length: 7, Nums: 12
Ground truth  : jk2_init() Can't find child <*> in scoreboard
Parse templs  : ["<*>() Can't find child <*> in scoreboard"]
Content List: [
	jk2_init() Can't find child 1566 in scoreboard
	jk2_init() Can't find child 1567 in scoreboard
	jk2_init() Can't find child 2082 in scoreboard
]
Length parse: 1
----------------------------------------
No. 6
Length: 8, Nums: 836
Ground truth  : jk2_init() Found child <*> in scoreboard slot <*>
Parse templs  : ['<*>() Found child <*> in scoreboard slot <*>']
Content List: [
	jk2_init() Found child 6725 in scoreboard slot 10
	jk2_init() Found child 6726 in scoreboard slot 8
	jk2_init() Found child 6728 in scoreboard slot 6
]
Length parse: 1
----------------------------------------
Total differences found: 2

SHAPE: (2000, 13)
SHAPE PARSER: (2000, 17)
Num of truth templates: 120
No. 1
Length: 1, Nums: 1
Ground truth  : fpr29=<*>
Parse templs  : ['<*> ffffffff

In [56]:
# # ================================ EXTEND DRAIN ================================ #
# class Node:
#     def __init__(self, childD=None, depth=0, digitOrtoken=None):
#         if childD is None:
#             childD = dict()
#         self.childD = childD
#         self.depth = depth
#         self.digitOrtoken = digitOrtoken

# class DrainTree:
#     def __init__(self, depth=5, st=1, maxChild=20):
#         self.depth = depth - 2
#         self.st = st
#         self.maxChild = maxChild
#         self.rootNode = Node()

    
#     def treeSearch(self, rn, seq):
#         retLogClust = None

#         seqLen = len(seq)
#         if seqLen not in rn.childD:
#             return retLogClust

#         parentn = rn.childD[seqLen]

#         currentDepth = 1
#         for token in seq:
#             if currentDepth >= self.depth or currentDepth > seqLen:
#                 break

#             if token in parentn.childD:
#                 parentn = parentn.childD[token]
#             elif "<*>" in parentn.childD:
#                 parentn = parentn.childD["<*>"]
#             else:
#                 return retLogClust
#             currentDepth += 1

#         logClustL = parentn.childD

#         retLogClust = self.fastMatch(logClustL, seq)

#         return retLogClust

#     def addSeqToPrefixTree(self, rn, logClust):
#         seqLen = len(logClust.tokens)
#         if seqLen not in rn.childD:
#             firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
#             rn.childD[seqLen] = firtLayerNode
#         else:
#             firtLayerNode = rn.childD[seqLen]

#         parentn = firtLayerNode

#         currentDepth = 1
#         for token in logClust.tokens:
#             # Add current log cluster to the leaf node
#             if currentDepth >= self.depth or currentDepth > seqLen:
#                 if len(parentn.childD) == 0:
#                     parentn.childD = [logClust]
#                 else:
#                     parentn.childD.append(logClust)
#                 break

#             # If token not matched in this layer of existing tree.
#             if token not in parentn.childD:
#                 if "<*>" in parentn.childD:
#                     if len(parentn.childD) < self.maxChild:
#                         newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
#                         parentn.childD[token] = newNode
#                         parentn = newNode
#                     else:
#                         parentn = parentn.childD["<*>"]
#                 else:
#                     if len(parentn.childD) + 1 < self.maxChild:
#                         newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
#                         parentn.childD[token] = newNode
#                         parentn = newNode
#                     elif len(parentn.childD) + 1 == self.maxChild:
#                         newNode = Node(depth=currentDepth + 1, digitOrtoken="<*>")
#                         parentn.childD["<*>"] = newNode
#                         parentn = newNode
#                     else:
#                         parentn = parentn.childD["<*>"]

#             # If the token is matched
#             else:
#                 parentn = parentn.childD[token]

#             currentDepth += 1

#     def seqDist(self, seq1, seq2):
#         assert len(seq1) == len(seq2)
#         simTokens = 0
#         numOfPar = 0

#         for token1, token2 in zip(seq1, seq2):
#             if token1 == "<*>":
#                 numOfPar += 1
#                 continue
#             if token1 == token2:
#                 simTokens += 1

#         retVal = float(simTokens) / len(seq1)

#         return retVal, numOfPar

#     def fastMatch(self, logClustL, seq):
#         retLogClust = None

#         maxSim = -1
#         maxNumOfPara = -1
#         maxClust = None

#         for logClust in logClustL:
#             curSim, curNumOfPara = self.seqDist(logClust.tokens, seq)
#             if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
#                 maxSim = curSim
#                 maxNumOfPara = curNumOfPara
#                 maxClust = logClust

#         if maxSim >= self.st:
#             retLogClust = maxClust

#         return retLogClust

#     def printTree(self, node, dep):
#         pStr = ""
#         for i in range(dep):
#             pStr += "\t"

#         if node.depth == 0:
#             pStr += "Root"
#         elif node.depth == 1:
#             pStr += "<" + str(node.digitOrtoken) + ">"
#         else:
#             pStr += node.digitOrtoken

#         print(pStr)

#         if node.depth == self.depth:
#             return 1
#         for child in node.childD:
#             self.printTree(node.childD[child], dep + 1)

#     def match_template(self, row):
#         matchCluster = self.treeSearch(self.rootNode, row["GroupTokens"])
#         return matchCluster.logTemplate if matchCluster else ""
    
#     def parse(self, logs_df):
#         # print("Parsing file: " + os.path.join(self.path, logName))
#         start_time = datetime.now()
#         logs_df['EventTemplate'] = ""             # Lưu kết quả xử lý

#         tqdm.pandas(desc="So khớp ...")
#         logs_df['EventTemplate'] = logs_df.progress_apply(self.match_template, axis=1)
#         print("Parsing done. [Time taken: {!s}]".format(datetime.now() - start_time))
#         return logs_df
    
#     def createTree(self, newGroupL):
#         self.rootNode = Node()
#         for item in newGroupL:
#             # matchCluster = self.treeSearch(self.rootNode, item.tokens)
#             # if matchCluster is None:
#             #     self.addSeqToPrefixTree(self.rootNode, item)
#             # else: 
#             #     matchCLuster.logIDL.extend(item.logIDL)
            
#             self.addSeqToPrefixTree(self.rootNode, item)      # Chỉ cần dòng này là đủ
    
    
# drain_tree = DrainTree(depth=5, st=1, maxChild=20)


##### **@.2. HOÀN CHỈNH**