In [2]:
# Khai báo sử dụng các thư viện. Thực hiện chạy ngay trên CPU, do đó, chỉ thực hiện mô phỏng, nếu ổn định sẽ thực hiện chạy trên kaggle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import regex as re
import os
import sys
import hashlib
from datetime import datetime
import hashlib
from tabulate import tabulate
import copy

from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

##### =========================================================================
##### Copyright (C) 2016-2023 LOGPAI (https://github.com/logpai).
##### 
##### Tiếp tục thực hiện chỉnh sửa lại DRAIN sao cho phù hợp với việc phân tích.
##### Quy trình thực hiện sẽ giữ lại một số cấu trúc và tạo thêm một số cấu trúc, phương thức mới trong DRAIN.
#### =========================================================================

In [3]:
# Lớp Logcluster và Node được giữ lại:
class Logcluster:
    """A group of log lines that share one template, with per-level bookkeeping."""

    def __init__(self, logTemplate="", logLevel="", logIDL=None):
        # Template shared by every line in the cluster.
        self.logTemplate = logTemplate
        # Distinct levels seen so far, in first-seen order.
        self.logLevel = [logLevel]
        # Line IDs of the cluster; a caller-supplied list is kept by reference.
        self.logIDL = [] if logIDL is None else logIDL
        # Level -> line IDs; seeded with a copy of the initial IDs.
        self.logIDLevelL = {logLevel: list(self.logIDL)}
        # Level -> counter; this class only initialises it.
        self.totalOccurrences = {}

    def addIDLevel(self, logLevel, id):
        """Register line ``id`` under ``logLevel``, adding the level if it is new."""
        if logLevel in self.logLevel:
            self.logIDLevelL[logLevel].append(id)
            return
        self.logLevel.append(logLevel)
        self.logIDLevelL[logLevel] = [id]

    def resetValues(self):
        """Clear logLevel, logIDL and logIDLevelL (the template and totals are kept)."""
        self.logLevel = []
        self.logIDL = []
        self.logIDLevelL = {}
            

class Node:
    """One node of the parse tree."""

    def __init__(self, childD=None, depth=0, digitOrtoken=None):
        # Children container; every node gets its own dict unless one is passed in.
        self.childD = dict() if childD is None else childD
        # Level of the node inside the tree.
        self.depth = depth
        # Key under which the parent stores this node.
        self.digitOrtoken = digitOrtoken

class LogParser:
    """Drain-style log parser that groups log lines into templates.

    Each line is split into header fields with a regex built from
    ``log_format``. The ``Content`` field is tokenised and routed through a
    tree (token count first, then leading tokens) to find the cluster whose
    template is most similar to the message.
    """

    def __init__(
        self,
        log_format,
        indir="./",
        outdir="./result/",
        depth=4,
        st=0.4,
        maxChild=100,
        rex=None,
        keep_para=True,
        rootNode=None,
        logClusterList=None,
    ):
        """Store the parser configuration.

        Args:
            log_format: Header layout such as ``"<Date> <Time> <Content>"``.
            indir: Directory the log file is read from.
            outdir: Directory the CSV results are written to.
            depth: Tree depth setting; stored as ``depth - 2``.
            st: Minimum similarity a cluster needs to be accepted by ``fastMatch``.
            maxChild: Upper bound on the number of children of a tree node.
            rex: Regex patterns replaced with ``<*>`` before tokenising.
            keep_para: When true, ``outputResult`` adds a ``ParameterList`` column.
            rootNode: Tree root stored on the instance; a new ``Node`` when omitted.
            logClusterList: Cluster list stored on the instance; a new list when omitted.
        """
        self.path = indir
        self.depth = depth - 2
        self.st = st
        self.maxChild = maxChild
        self.logName = None
        self.savePath = outdir
        self.df_log = None
        self.log_format = log_format
        # Defaults are created per instance: a default written in the signature
        # is evaluated once and would be shared (and mutated) by every parser.
        # Objects passed in by the caller are still kept by reference.
        self.rex = [] if rex is None else rex
        self.keep_para = keep_para
        self.rootNode = Node() if rootNode is None else rootNode
        self.logClusterList = [] if logClusterList is None else logClusterList

    def hasNumbers(self, s):
        """Return True when at least one character of ``s`` is a digit."""
        return any(char.isdigit() for char in s)

    def treeSearch(self, rn, seq):
        """Find the cluster that best matches the token list ``seq``.

        Args:
            rn: Root node of the tree to search.
            seq: Tokens of the preprocessed log message.

        Returns:
            The matching cluster, or None when no branch or no sufficiently
            similar cluster exists.
        """
        seqLen = len(seq)
        if seqLen not in rn.childD:
            return None

        # First layer is keyed by token count.
        parentn = rn.childD[seqLen]

        currentDepth = 1
        for token in seq:
            if currentDepth >= self.depth or currentDepth > seqLen:
                break

            # Prefer the exact token, fall back to the wildcard branch.
            if token in parentn.childD:
                parentn = parentn.childD[token]
            elif "<*>" in parentn.childD:
                parentn = parentn.childD["<*>"]
            else:
                return None
            currentDepth += 1

        return self.fastMatch(parentn.childD, seq)

    def addSeqToPrefixTree(self, rn, logClust):
        """Insert ``logClust`` into the tree rooted at ``rn``.

        The path is chosen by the template's token count and then by its
        leading tokens; tokens containing digits always go to the ``<*>``
        branch, and ``maxChild`` limits how many children a node may get.

        NOTE(review): the cluster is only attached inside the loop once
        ``currentDepth >= self.depth``. A template with fewer tokens than
        ``self.depth`` exhausts the loop first and is therefore never stored
        in the tree -- confirm whether that is intended.
        """
        seqLen = len(logClust.logTemplate)
        if seqLen not in rn.childD:
            firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
            rn.childD[seqLen] = firtLayerNode
        else:
            firtLayerNode = rn.childD[seqLen]

        parentn = firtLayerNode

        currentDepth = 1
        for token in logClust.logTemplate:
            # Add current log cluster to the leaf node
            # (the leaf's childD switches from an empty dict to a list of clusters).
            if currentDepth >= self.depth or currentDepth > seqLen:
                if len(parentn.childD) == 0:
                    parentn.childD = [logClust]
                else:
                    parentn.childD.append(logClust)
                break

            # If token not matched in this layer of existing tree.
            if token not in parentn.childD:
                if not self.hasNumbers(token):
                    if "<*>" in parentn.childD:
                        if len(parentn.childD) < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD["<*>"]
                    else:
                        # Keep one slot free so the wildcard branch can still be added.
                        if len(parentn.childD) + 1 < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        elif len(parentn.childD) + 1 == self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken="<*>")
                            parentn.childD["<*>"] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD["<*>"]

                else:
                    # Tokens with digits are treated as parameters.
                    if "<*>" not in parentn.childD:
                        newNode = Node(depth=currentDepth + 1, digitOrtoken="<*>")
                        parentn.childD["<*>"] = newNode
                        parentn = newNode
                    else:
                        parentn = parentn.childD["<*>"]

            # If the token is matched
            else:
                parentn = parentn.childD[token]

            currentDepth += 1

    # seq1 is template
    def seqDist(self, seq1, seq2):
        """Compare a template with a message of the same length.

        Args:
            seq1: Template tokens (may contain ``<*>``).
            seq2: Message tokens.

        Returns:
            ``(similarity, numOfPar)`` where similarity is the share of
            positions with identical non-wildcard tokens and numOfPar is the
            number of ``<*>`` tokens in ``seq1``.
        """
        assert len(seq1) == len(seq2)
        # Two empty sequences are identical; this also avoids dividing by zero.
        if len(seq1) == 0:
            return 1.0, 0

        simTokens = 0
        numOfPar = 0
        for token1, token2 in zip(seq1, seq2):
            if token1 == "<*>":
                numOfPar += 1
                continue
            if token1 == token2:
                simTokens += 1

        return float(simTokens) / len(seq1), numOfPar

    def fastMatch(self, logClustL, seq):
        """Return the most similar cluster of ``logClustL``, or None below ``self.st``.

        Ties on similarity are broken in favour of the template with more
        ``<*>`` tokens.
        """
        maxSim = -1
        maxNumOfPara = -1
        maxClust = None

        for logClust in logClustL:
            curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
            if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
                maxSim = curSim
                maxNumOfPara = curNumOfPara
                maxClust = logClust

        return maxClust if maxSim >= self.st else None

    def getTemplate(self, seq1, seq2):
        """Merge two equal-length token lists, using ``<*>`` where they differ."""
        assert len(seq1) == len(seq2)
        return [word if word == other else "<*>" for word, other in zip(seq1, seq2)]

    def outputResult(self, logClustL):
        """Write ``<logName>_structured.csv`` and ``<logName>_templates.csv``.

        Adds ``EventId``, ``EventTemplate`` and (when ``keep_para`` is set)
        ``ParameterList`` columns to ``self.df_log``.

        Args:
            logClustL: Clusters whose ``logIDL`` hold 1-based line IDs.
        """
        lineTotal = self.df_log.shape[0]
        log_templates = [0] * lineTotal
        log_templateids = [0] * lineTotal
        for logClust in logClustL:
            template_str = " ".join(logClust.logTemplate)
            template_id = hashlib.md5(template_str.encode("utf-8")).hexdigest()[0:8]
            for logID in logClust.logIDL:
                # LineId is 1-based, list positions are 0-based.
                log_templates[logID - 1] = template_str
                log_templateids[logID - 1] = template_id

        self.df_log["EventId"] = log_templateids
        self.df_log["EventTemplate"] = log_templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(
                self.get_parameter_list, axis=1
            )
        self.df_log.to_csv(
            os.path.join(self.savePath, self.logName + "_structured.csv"), index=False
        )

        # The template table is derived from the structured frame itself.
        occ_dict = dict(self.df_log["EventTemplate"].value_counts())
        df_event = pd.DataFrame()
        df_event["EventTemplate"] = self.df_log["EventTemplate"].unique()
        df_event["EventId"] = df_event["EventTemplate"].map(
            lambda x: hashlib.md5(x.encode("utf-8")).hexdigest()[0:8]
        )
        df_event["Occurrences"] = df_event["EventTemplate"].map(occ_dict)
        df_event.to_csv(
            os.path.join(self.savePath, self.logName + "_templates.csv"),
            index=False,
            columns=["EventId", "EventTemplate", "Occurrences"],
        )

    def printTree(self, node, dep):
        """Print ``node`` and its subtree, indented with ``dep`` tab characters.

        Returns 1 when ``node`` sits at ``self.depth`` (recursion stops there),
        otherwise None.
        """
        pStr = "\t" * dep

        if node.depth == 0:
            pStr += "Root"
        elif node.depth == 1:
            pStr += "<" + str(node.digitOrtoken) + ">"
        else:
            pStr += node.digitOrtoken

        print(pStr)

        if node.depth == self.depth:
            return 1
        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logName):
        """Parse one log file from ``self.path`` and write the result CSVs.

        A fresh tree and cluster list are built for every call.
        """
        print("Parsing file: " + os.path.join(self.path, logName))
        start_time = datetime.now()
        self.logName = logName
        rootNode = Node()
        logCluL = []

        self.load_data()
        lineTotal = len(self.df_log)

        count = 0
        for _, line in self.df_log.iterrows():
            logID = line["LineId"]
            logmessageL = self.preprocess(line["Content"]).strip().split()
            matchCluster = self.treeSearch(rootNode, logmessageL)

            # Match no existing log cluster
            if matchCluster is None:
                newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
                logCluL.append(newCluster)
                self.addSeqToPrefixTree(rootNode, newCluster)

            # Add the new log message to the existing cluster
            else:
                newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
                matchCluster.logIDL.append(logID)
                if " ".join(newTemplate) != " ".join(matchCluster.logTemplate):
                    matchCluster.logTemplate = newTemplate

            count += 1
            if count % 1000 == 0 or count == lineTotal:
                print(
                    "Processed {0:.1f}% of log lines.".format(
                        count * 100.0 / lineTotal
                    )
                )

        os.makedirs(self.savePath, exist_ok=True)

        self.outputResult(logCluL)

        print("Parsing done. [Time taken: {!s}]".format(datetime.now() - start_time))

    def load_data(self):
        """Read ``self.logName`` from ``self.path`` into ``self.df_log``."""
        headers, regex = self.generate_logformat_regex(self.log_format)
        self.df_log = self.log_to_dataframe(
            os.path.join(self.path, self.logName), regex, headers, self.log_format
        )

    def preprocess(self, line):
        """Replace every match of the ``self.rex`` patterns in ``line`` with ``<*>``."""
        for currentRex in self.rex:
            line = re.sub(currentRex, "<*>", line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """Function to transform log file to dataframe

        Args:
            log_file: Path of the log file.
            regex: Compiled pattern with one named group per header.
            headers: Names of the groups to extract, in column order.
            logformat: Unused; kept for interface compatibility.

        Returns:
            DataFrame with a 1-based ``LineId`` column followed by ``headers``.
            Lines that do not match ``regex`` are skipped.
        """
        log_messages = []
        with open(log_file, "r") as fin:
            for line in fin:
                match = regex.search(line.strip())
                if match is None:
                    # Line does not follow the log format; leave it out.
                    continue
                log_messages.append([match.group(header) for header in headers])
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, "LineId", list(range(1, len(log_messages) + 1)))
        print("Total lines: ", len(logdf))
        return logdf

    def generate_logformat_regex(self, logformat):
        """Function to generate regular expression to split log messages

        Returns:
            ``(headers, regex)``: the ``<Name>`` placeholders of ``logformat``
            in order, and a compiled pattern with one lazy named group per
            placeholder. Runs of spaces between placeholders match any
            whitespace run.
        """
        headers = []
        splitters = re.split(r"(<[^<>]+>)", logformat)
        regex = ""
        for k, part in enumerate(splitters):
            if k % 2 == 0:
                # Literal text between placeholders (raw string: no invalid escapes).
                regex += re.sub(" +", r"\\s+", part)
            else:
                header = part.strip("<").strip(">")
                regex += "(?P<%s>.*?)" % header
                headers.append(header)
        regex = re.compile("^" + regex + "$")
        return headers, regex

    def get_parameter_list(self, row):
        """Extract the values that fill the ``<*>`` slots of a row's template.

        Args:
            row: Mapping with ``EventTemplate`` and ``Content`` entries.

        Returns:
            List of captured parameter strings; empty when the template has no
            wildcard or the content does not match it.
        """
        template_regex = re.sub(r"<.{1,5}>", "<*>", row["EventTemplate"])
        if "<*>" not in template_regex:
            return []
        # Escape every non-alphanumeric character, then relax spaces and wildcards.
        template_regex = re.sub(r"([^A-Za-z0-9])", r"\\\1", template_regex)
        template_regex = re.sub(r"\\ +", r"\\s+", template_regex)
        template_regex = "^" + template_regex.replace(r"\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        # One group yields a string, several groups yield a tuple.
        parameter_list = (
            list(parameter_list)
            if isinstance(parameter_list, tuple)
            else [parameter_list]
        )
        return parameter_list

In [4]:
# Một số phương thức hỗ trợ in dữ liệu trong class:

# Phương thức in các thuộc tính có trong đối tượng chỉ định
def printObj(obj):
    """Print the type of ``obj`` followed by each of its instance attributes."""
    banner = "###################################"
    print("\n" + banner)
    print("TYPE: " + str(type(obj)))
    for name, value in vars(obj).items():
        # For df_log only the length is shown instead of the whole value.
        shown = f"{name}:{len(value)}" if name == "df_log" else f"{name}: {value}"
        print(shown)
    print(banner)

In [5]:
class KeThuaLogParser(LogParser):
    """LogParser variant that keeps its tree and clusters across files.

    ``parse`` works on ``self.rootNode`` / ``self.logClusterList`` instead of
    fresh objects, records the log level of every line per cluster, and
    additionally exports the INFO context that precedes each WARN line.
    """

    def outputResult(self, logClustL):
        """Write the structured, template and WARN-context CSV files.

        Besides the columns written by the base class, the template file gets
        ``Level`` (levels seen for the template) and ``Details`` (level ->
        line IDs). Each cluster's ``totalOccurrences`` is increased by the
        per-level counts of the current file.

        Args:
            logClustL: Clusters whose ``logIDL`` hold 1-based line IDs of the
                file that was just parsed.
        """
        line_total = self.df_log.shape[0]
        log_templates = [0] * line_total      # EventTemplate column, by LineId - 1
        log_templateids = [0] * line_total    # EventId column, by LineId - 1
        # template string -> ([levels, first-seen order], {level: [line ids]}).
        # Keyed by the template text so that clusters sharing one template
        # string end up in a single row instead of multiplying rows.
        event_meta = {}
        for logClust in logClustL:
            template_str = " ".join(logClust.logTemplate)
            template_id = hashlib.md5(template_str.encode("utf-8")).hexdigest()[0:8]
            for logID in logClust.logIDL:
                log_templates[logID - 1] = template_str
                log_templateids[logID - 1] = template_id

            # Running per-level totals kept on the cluster object.
            for level, ids in logClust.logIDLevelL.items():
                logClust.totalOccurrences[level] = (
                    logClust.totalOccurrences.get(level, 0) + len(ids)
                )

            levels, details = event_meta.setdefault(template_str, ([], {}))
            for level in logClust.logLevel:
                if level not in levels:
                    levels.append(level)
            for level, ids in logClust.logIDLevelL.items():
                # extend() copies the IDs, so later resets cannot alter them.
                details.setdefault(level, []).extend(ids)

        self.df_log["EventId"] = log_templateids
        self.df_log["EventTemplate"] = log_templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(
                self.get_parameter_list, axis=1
            )
        self.df_log.to_csv(
            os.path.join(self.savePath, self.logName + "_structured.csv"), index=False
        )

        occ_dict = dict(self.df_log["EventTemplate"].value_counts())
        # Build the template table from the templates present in this file.
        df_event = pd.DataFrame()
        df_event["EventTemplate"] = self.df_log["EventTemplate"].unique()
        df_event["EventId"] = df_event["EventTemplate"].map(
            lambda x: hashlib.md5(x.encode("utf-8")).hexdigest()[0:8]
        )
        df_event["Occurrences"] = df_event["EventTemplate"].map(occ_dict)
        # Attach "Level" and "Details" by template text.
        no_meta = ([], {})
        df_event["Level"] = df_event["EventTemplate"].map(
            lambda t: event_meta.get(t, no_meta)[0]
        )
        df_event["Details"] = df_event["EventTemplate"].map(
            lambda t: event_meta.get(t, no_meta)[1]
        )
        print(f"Length DF_EVENTS = {df_event.shape[0]}")
        df_event.to_csv(
            os.path.join(self.savePath, self.logName + "_templates.csv"),
            index=False,
            columns=["EventId", "EventTemplate", "Occurrences", "Level", "Details"],
        )

        self.logExtractNgram(self.df_log, "WARN", 10)

    def parse(self, logName):
        """Parse one log file, reusing the tree and clusters of earlier calls.

        After the results are written, every cluster's per-file values are
        reset (templates and ``totalOccurrences`` are kept) and the cluster
        list is also published as ``self.allLogClusterList``.
        """
        print("Parsing file: " + os.path.join(self.path, logName))
        start_time = datetime.now()
        self.logName = logName
        self.df_log = None
        rootNode = self.rootNode
        logCluL = self.logClusterList
        # Template text -> position of the first cluster registered under it.
        # Used as a fallback when the tree search finds nothing.
        templateIndex = {}
        for pos, cluster in enumerate(logCluL):
            templateIndex.setdefault(" ".join(cluster.logTemplate), pos)

        # Load the data
        self.load_data()
        lineTotal = len(self.df_log)

        count = 0
        for _, line in self.df_log.iterrows():
            logID = line["LineId"]
            logmessageL = self.preprocess(line["Content"]).strip().split()
            logLevelL = str(line['Level'])
            matchCluster = self.treeSearch(rootNode, logmessageL)

            # Tree search failed: look for a cluster registered with exactly this text.
            if matchCluster is None:
                knownPos = templateIndex.get(" ".join(logmessageL))
                if knownPos is not None:
                    matchCluster = logCluL[knownPos]

            # Match no existing log cluster
            if matchCluster is None:
                newCluster = Logcluster(logTemplate=logmessageL, logLevel=logLevelL, logIDL=[logID])
                logCluL.append(newCluster)
                templateIndex.setdefault(" ".join(logmessageL), len(logCluL) - 1)
                self.addSeqToPrefixTree(rootNode, newCluster)

            # Add the new log message to the existing cluster
            else:
                newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
                matchCluster.addIDLevel(logLevelL, logID)
                matchCluster.logIDL.append(logID)
                if " ".join(newTemplate) != " ".join(matchCluster.logTemplate):
                    matchCluster.logTemplate = newTemplate

            count += 1
            if count % 10000 == 0 or count == lineTotal:
                print(
                    "Processed {0:.1f}% of log lines.".format(
                        count * 100.0 / lineTotal
                    )
                )

        os.makedirs(self.savePath, exist_ok=True)

        self.outputResult(logCluL)

        # Line IDs are per file, so they must not leak into the next parse.
        for item in logCluL:
            item.resetValues()
        self.allLogClusterList = logCluL

        print("Parsing done. [Time taken: {!s}]".format(datetime.now() - start_time))

    def logExtractNgram(self, df_root, extract="WARN", n_gram=10):
        """Export, for every ``extract``-level line, the INFO lines preceding it.

        Writes ``<logName>_<extract>_out.csv`` (``|``-separated) with the
        extra columns ``ListID``, ``InfoTemplate`` and ``InfoPara``. Nothing
        is written when no line has the requested level.

        Args:
            df_root: Structured log frame with ``Level``, ``LineId`` and
                ``EventId`` columns (``ParameterList`` is optional).
            extract: Level of the lines to export.
            n_gram: Maximum number of preceding INFO lines kept per line.
        """
        df_extract = df_root[df_root['Level'] == extract].copy()
        row_total = df_extract.shape[0]
        if row_total == 0:
            return
        df_extract['ListID'] = [[] for _ in range(row_total)]
        df_extract['InfoTemplate'] = [[] for _ in range(row_total)]
        df_extract['InfoPara'] = [[] for _ in range(row_total)]
        # ParameterList only exists when the parser ran with keep_para=True.
        has_para = "ParameterList" in df_root.columns
        for idx in df_extract.index:
            # NOTE(review): idx is an index label used as a position below;
            # this assumes df_root has the default 0..n-1 index -- confirm.
            start_idx = max(idx - 30, 0)
            while True:
                data_root = df_root.iloc[start_idx:idx]
                df_select = data_root[data_root['Level'] == "INFO"]
                if df_select.shape[0] >= n_gram or start_idx == 0:
                    break
                # Not enough INFO lines yet: widen the window backwards.
                start_idx = max(start_idx - 20, 0)
            # iloc[-0:] would select everything, so n_gram <= 0 is handled apart.
            df_select = df_select.iloc[-n_gram:] if n_gram > 0 else df_select.iloc[0:0]
            df_extract.at[idx, 'ListID'] = df_select["LineId"].tolist()
            df_extract.at[idx, 'InfoTemplate'] = df_select["EventId"].tolist()
            if has_para:
                df_extract.at[idx, 'InfoPara'] = df_select["ParameterList"].tolist()

        df_extract.to_csv(
            os.path.join(self.savePath, self.logName + "_" + extract + "_out.csv"), index=False, sep='|'
        )
    
    # def writingExcelSheet(self, df_root, extract="WARN"):
    #     out_file = extract + "_out_template.xlsx"
    #     unique_df = df_root['EventId'].unique()
    #     # Đọc file Excel hiện có và lấy tên các sheet đã tồn tại
    #     try:
    #         wb = load_workbook(out_file)
    #     except FileNotFoundError:
    #         wb = Workbook()

    #     existing_sheets = wb.sheetnames

    #     for template_id in unique_df:
    #         if str(template_id) in existing_sheets:
    #             # Nếu sheet đã tồn tại, mở sheet đó và thêm dữ liệu
    #             ws = wb[str(template_id)]
    #         else:
    #             # Nếu sheet chưa tồn tại, tạo mới sheet đó và thêm dữ liệu
    #             ws = wb.create_sheet(title=str(template_id))

    #         data_add = df_root[df_root['EventId'] == template_id]

    #         for r_idx, row in enumerate(dataframe_to_rows(data_add, index=False, header=True), 1):
    #             ws.append(row)
    #     wb.save(out_file)
        
            

# --- Run configuration -------------------------------------------------------
input_dir = path = './'        # directory that holds the input log files
output_dir = 'result/'         # directory that receives the parsing results
log_file = 'filelog1.log'      # input log file name (reassigned in the loop below)
log_format = '<Date> <Time>,<Pid> <Level> <Component>: <Content>'  # HDFS log format

# Patterns masked with <*> before tokenising.
regex = [
    r'(\/[\w\-.]{2,})+(:[0-9]+)?',                                    # path
    r'blk(_[\-0-9]+){,2}',                                            # block id
    r'0x[a-f0-9]+\b',                                                 # hexadecimal
    r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',  # UUID
    r'(\-+[0-9.]+)+\b',                                               # block pool
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',                         # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',          # numbers
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[\w\/]+)',                       # numbers
]
st = 0.5     # similarity threshold handed to the parser
depth = 4    # depth setting handed to the parser

# The tree root and cluster list live outside the parser so they can be
# inspected after each file.
allRootNode = Node()
allLogClusterList = []
parser = KeThuaLogParser(
    log_format,
    indir=input_dir,
    outdir=output_dir,
    depth=depth,
    st=st,
    rex=regex,
    rootNode=allRootNode,
    logClusterList=allLogClusterList,
)

# Parse filelog1.log .. filelog5.log in order, reporting the cluster count after each.
for i in range(1, 6):
    log_file = f"filelog{i}.log"
    parser.parse(log_file)
    print(len(allLogClusterList))


# print(len(allLogClusterList))
# log_file = 'filelog2.log'
# parser.parse(log_file)
# print(len(allLogClusterList))

# for obj in allLogClusterList:
#     for key, value in vars(obj).items():
#         if key == "logTemplate":
#             print(f"{key}: {str(''.join(value))}")
#         # if key == "totalOccurrences":
#         #     print(f"\t{key}:")
#         #     for k,v in value.items():
#         #         print(f"\t\t{k}: {v}")

# parser.printTree(allRootNode, 4)






Parsing file: ./filelog1.log
Total lines:  30000
Processed 33.3% of log lines.
Processed 66.7% of log lines.
Processed 100.0% of log lines.
Length DF_EVENTS = 72
Parsing done. [Time taken: 0:00:06.688826]
72
Parsing file: ./filelog2.log
Total lines:  33825
Processed 29.6% of log lines.
Processed 59.1% of log lines.
Processed 88.7% of log lines.
Processed 100.0% of log lines.
Length DF_EVENTS = 83
Parsing done. [Time taken: 0:00:07.932148]
86
Parsing file: ./filelog3.log
Total lines:  21399
Processed 46.7% of log lines.
Processed 93.5% of log lines.
Processed 100.0% of log lines.
Length DF_EVENTS = 17
Parsing done. [Time taken: 0:00:05.263692]
89
Parsing file: ./filelog4.log
Total lines:  41847
Processed 23.9% of log lines.
Processed 47.8% of log lines.
Processed 71.7% of log lines.
Processed 95.6% of log lines.
Processed 100.0% of log lines.
Length DF_EVENTS = 12
Parsing done. [Time taken: 0:00:09.004297]
89
Parsing file: ./filelog5.log
Total lines:  175434
Processed 5.7% of log lines.

In [6]:
# Chương trình chỉnh sửa sẽ sử dụng lại các thành phần ở trên, sử dụng lại 2 class: LogCluster và Node:
