#### **0. CHUẨN BỊ**

##### **0.1. CẤU HÌNH CÁC THAM SỐ**

In [1]:
import re
import sys
sys.path.append('../')
import os
import pandas as pd
from collections import Counter, defaultdict
import datetime
from tqdm import tqdm
import string

# Show every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)

# Per-dataset configuration, keyed by dataset name.
#   log_file      : path of the raw log file (read by load_data).
#   log_template  : path of a templates CSV (not read by the code in this notebook section).
#   log_structure : path of a structured CSV (not read by the code in this notebook section).
#   log_format    : line layout; each <Field> becomes a named regex group, the text
#                   between fields is regex source, and runs of spaces match \s+.
#   filters       : regex patterns whose matches are deleted from the content.
#   regexs        : regex patterns whose matches are replaced by the "<*>" wildcard.
# All patterns are raw strings so backslashes reach the regex engine verbatim
# and Python never sees an invalid string escape such as "\[".
SETTING_PARAMS = {
    'Apache': {
        'log_file': './logs/Apache/Apache_full.log',
        'log_template': './logs/Apache/Apache_full.log_templates.csv',
        'log_structure': './logs/Apache/Apache_full.log_structured.csv',
        'log_format': r'\[<Time>\] \[<Level>\] <Content>',
        'filters': [],
        'regexs': [r'\/(?:\w+\/){2,}\w+\.\w+$'],
    },
    'HealthApp': {
        'log_file': './logs/HealthApp/HealthApp_full.log',
        'log_template': './logs/HealthApp/HealthApp_full.log_templates.csv',
        'log_structure': './logs/HealthApp/HealthApp_full.log_structured.csv',
        'log_format': r'<Time>\|<Component>\|<Pid>\|<Content>',
        'filters': [],
        'regexs': [],
    },
    'Mac': {
        'log_file': './logs/Mac/Mac_full.log',
        'log_template': './logs/Mac/Mac_full.log_templates.csv',
        'log_structure': './logs/Mac/Mac_full.log_structured.csv',
        'log_format': r'<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'filters': [],
        'regexs': [r'([\w-]+\.){2,}[\w-]+'],
    },
    'Linux': {
        'log_file': './logs/Linux/Linux_full.log',
        'log_template': './logs/Linux/Linux_full.log_templates.csv',
        'log_structure': './logs/Linux/Linux_full.log_structured.csv',
        'log_format': r'<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>',
        'filters': [],
        'regexs': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}'],
    },
    'OpenSSH': {
        'log_file': './logs/OpenSSH/OpenSSH_full.log',
        'log_template': './logs/OpenSSH/OpenSSH_full.log_templates.csv',
        'log_structure': './logs/OpenSSH/OpenSSH_full.log_structured.csv',
        'log_format': r'<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>',
        'filters': [],
        'regexs': [r"(\d+):"],
    },
}

##### **0.2. CLASS**

In [2]:
class LogCluster:
    """One group of log lines sharing a grouping key and a template.

    The constructor stores its arguments unchanged. ``logIDL``,
    ``dynamic_tokenL`` and ``static_tokenL`` fall back to a new empty
    list / dict / dict when ``None`` is passed, so instances never share
    a default container.
    """

    def __init__(self, keyGroup, logTemplate, tokens, length, logIDL=None, dynamic_tokenL=None, static_tokenL=None):
        self.keyGroup = keyGroup
        self.logTemplate = logTemplate
        self.tokens = tokens
        self.length = length
        self.logIDL = [] if logIDL is None else logIDL
        self.dynamic_tokenL = {} if dynamic_tokenL is None else dynamic_tokenL
        self.static_tokenL = {} if static_tokenL is None else static_tokenL

    @staticmethod
    def _render_mapping(title, mapping):
        """Format ``mapping`` as a titled, brace-delimited block with one entry per line."""
        entries = "".join(f"  {key}: {value},\n" for key, value in mapping.items())
        return title + ": {\n" + entries + "}"

    def __str__(self):
        """Return a multi-line, human-readable summary of the cluster."""
        dynamic_section = self._render_mapping("Dynamic Tokens", self.dynamic_tokenL)
        static_section = self._render_mapping("Cover Tokens", self.static_tokenL)
        summary_lines = [
            f"Key: {self.keyGroup}",
            f"Template: {self.logTemplate}",
            f"Tokens: {self.tokens}",
            f"Length: {self.length}",
            f"Len LogIDs: {len(self.logIDL)}",
            dynamic_section,
            static_section,
        ]
        # Every line, including the last one, is newline-terminated.
        return "\n".join(summary_lines) + "\n"

#### **1. Các phương thức sử dụng**

##### **1.1. Read data**
- **`log_to_dataframe()`**

- **`generate_logformat_regex()`**

- **`load_data()`**

In [3]:
# =============================== READ DATA =============================== #
def log_to_dataframe(log_file, regex, headers):
    """Parse a log file into a DataFrame, one row per line that matches ``regex``.

    Args:
        log_file: Path of the log file; it is opened as UTF-8 text.
        regex: Compiled pattern exposing one named group per entry of ``headers``.
        headers: Names of the groups to extract, in output column order.

    Returns:
        A DataFrame whose first column is ``LineId`` (1-based, counting only
        the lines that matched) followed by one column per header. Lines the
        pattern does not match are skipped.

    Raises:
        IndexError: If ``headers`` names a group that ``regex`` does not define.
            This is a configuration error and is no longer silently swallowed.
    """
    log_messages = []
    with open(log_file, 'r', encoding="utf8") as fin:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Best-effort parsing: lines that do not fit the format are dropped.
                continue
            log_messages.append([match.group(header) for header in headers])

    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

def generate_logformat_regex(logformat):
    """Build the line-matching regex for a log format description.

    Example format: ``'<Date> <Time> <Pid> <Level> <Component>: <Content>'``.

    Each ``<Field>`` becomes a non-greedy named group ``(?P<Field>.*?)``.
    The text between fields is kept as regex source, with every run of
    spaces replaced by ``\\s+``.

    Args:
        logformat: Format string mixing ``<Field>`` placeholders and regex text.

    Returns:
        A tuple ``(headers, regex)``: the field names in order of appearance
        and the compiled pattern anchored with ``^`` and ``$``.
    """
    headers = []
    pattern = ''
    # Splitting on a capturing group alternates: even index = text between
    # fields, odd index = a "<Field>" placeholder.
    for idx, part in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if idx % 2 == 0:
            # r'\\s+' is the replacement template for a literal "\s+"; the raw
            # string avoids the invalid "\s" escape of a normal string literal.
            pattern += re.sub(' +', r'\\s+', part)
        else:
            header = part.strip('<').strip('>')
            pattern += '(?P<%s>.*?)' % header
            headers.append(header)
    return headers, re.compile('^' + pattern + '$')

def load_data(logfile, logformat):
    """Read ``logfile`` into a DataFrame using the layout described by ``logformat``.

    The format string is turned into field names plus a compiled regex, and
    both are handed to ``log_to_dataframe`` to parse the file.
    """
    field_names, line_regex = generate_logformat_regex(logformat)
    return log_to_dataframe(logfile, line_regex, field_names)

##### **1.2. PRE_PROCESSING_0**
- **`pre0_hasNumbers()`**
- **`pre0_regexAndFilter()`**

- **`create_special_pattern()`**
- **`process_line()`**

In [4]:
# ========================== PRE_PROCESSING_0 ========================== #
def pre0_hasNumbers(s):
    """Return True when at least one character of ``s`` is a digit, else False."""
    for char in s:
        if char.isdigit():
            return True
    return False

def pre0_regexAndFilter(line, regexs = [], filters = []):
    """Strip filter matches from ``line``, then mask regex matches with ``<*>``.

    Filters are applied first (their matches are deleted), then each pattern
    in ``regexs`` is substituted by the wildcard. The result is returned with
    surrounding whitespace removed. The default lists are only read, never
    modified.
    """
    # Build one ordered list of (pattern, replacement) pairs: filters, then regexs.
    substitutions = [(pattern, '') for pattern in filters]
    substitutions += [(pattern, "<*>") for pattern in regexs]

    cleaned = line
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# ======================= CÁC PHƯƠNG THỨC BỔ TRỢ ======================= #
def create_special_pattern(dict_special_token):
    """Compile a pattern matching a punctuation character followed by a special keyword.

    Group 1 captures the punctuation character and group 2 the keyword. The
    keyword must end at a word boundary, and matching is case-insensitive.
    Keywords are escaped, so they are matched literally.

    Args:
        dict_special_token: Iterable of keyword strings (e.g. ``['true', 'false']``).

    Returns:
        A compiled regular expression. When no keywords are given the pattern
        never matches. Without this guard the empty alternation would match
        any punctuation character that precedes a word character.
    """
    escaped_keywords = [re.escape(k) for k in dict_special_token]
    if not escaped_keywords:
        # "(?!)" always fails; the two empty groups keep replacement templates
        # that reference \1 or \2 valid.
        return re.compile(r"()()(?!)", flags=re.IGNORECASE)

    keywords = '|'.join(escaped_keywords)
    return re.compile(rf"([{re.escape(string.punctuation)}])({keywords})\b", flags=re.IGNORECASE)


def process_line(line, regexs, filters, dict_special_token, pattern_special):
    """Tokenize one log row and derive its grouping key.

    Args:
        line: Row-like object; only ``line['Content']`` is read.
        regexs: Patterns whose matches are replaced by ``<*>`` before tokenizing.
        filters: Patterns whose matches are removed before tokenizing.
        dict_special_token: Collection of lowercase keywords treated as variable values.
        pattern_special: Compiled pattern whose group 1 is kept and whose
            remainder is replaced by ``<*>`` inside otherwise static tokens.

    Returns:
        A ``pd.Series`` with:
          * ``Tokens1``: one entry per whitespace-separated token. Tokens that
            are already ``<*>``, contain a digit, or are a special keyword
            become ``<*>``; the others are kept after the ``pattern_special``
            substitution.
          * ``ParseTemplate1``: the kept (static) tokens joined by spaces,
            followed by ``" : "`` and the token count. Rows are later grouped
            by this string.
    """
    # tokens0: tokens after regex/filter pre-processing.
    # tokens1: tokens after wildcard masking.
    # groups_token: static tokens only, used to build the grouping key.
    parsed = pre0_regexAndFilter(line['Content'], regexs=regexs, filters=filters)
    tokens0 = str(parsed).strip().split()

    tokens1 = []
    groups_token = []

    for token in tokens0:
        if token == "<*>":
            # Already masked by a regex: emit exactly one wildcard.
            tokens1.append("<*>")
        elif not pre0_hasNumbers(token) and str(token).lower() not in dict_special_token:
            # Previously a plain "if", so "<*>" tokens fell through to this
            # branch too: they were appended twice and leaked into the key.
            temp = re.sub(pattern_special, r'\1<*>', token)
            groups_token.append(temp)
            tokens1.append(temp)
        else:
            tokens1.append("<*>")

    return pd.Series({
        # 'ParseTemplate0': parsed,
        # 'Tokens0': tokens0,
        'Tokens1': tokens1,
        'ParseTemplate1': f"{' '.join(groups_token)} : {len(tokens0)}"
    })

##### **1.3. PHƯƠNG THỨC HỖ TRỢ NHÓM GROUP**

- **`extractDynamicTok()`**

In [5]:
def extractDynamicTok(groups, tokens):
    """Collect the raw values found at every wildcard position of a template.

    Args:
        groups: DataFrame with a ``Content`` column of raw log messages.
        tokens: Template tokens; a position is dynamic when its token contains ``<*>``.

    Returns:
        A dict mapping each dynamic position to the list of whitespace-split
        words found at that position, in row order. Rows with too few words
        are skipped for that position, and a position with no values at all
        is left out of the dict.
    """
    words_per_row = [content.strip().split() for content in groups['Content']]

    collected = {}
    for pos, token in enumerate(tokens):
        if "<*>" not in token:
            continue
        values = [words[pos] for words in words_per_row if pos < len(words)]
        if values:
            collected[pos] = values
    return collected

##### **1.4. PHƯƠNG THỨC KHÁC**

- **`write_df_to_txt()`**

In [6]:
def write_df_to_txt(df, filename):
    """Write ``df`` to ``filename`` as tab-separated, width-padded UTF-8 text.

    Each column is left-justified to the width of its longest cell (header
    included). The first line holds the column names, followed by one line
    per row; the index is not written.
    """
    col_widths = []
    for column in df.columns:
        cells = [column] + df[column].tolist()
        col_widths.append(max(len(str(cell)) for cell in cells))

    def render(values):
        """Pad each value to its column width and join the cells with tabs."""
        padded = (str(value).ljust(width) for value, width in zip(values, col_widths))
        return '\t'.join(padded) + '\n'

    with open(filename, 'w', encoding='utf-8') as out:
        out.write(render(df.columns))
        for record in df.itertuples(index=False):
            out.write(render(record))

#### **2. KHUNG LÀM VIỆC CHÍNH**

In [8]:
# PARAMETER CONFIGURATION:
# Select the dataset to parse and the keywords treated as variable values.
datasets = SETTING_PARAMS['HealthApp']
DICT_SPECIAL_TOKEN = ['true', 'false', 'yes', 'no']

# Parse the raw log file into a DataFrame (LineId + one column per format field).
logs_df = load_data(datasets['log_file'], datasets['log_format'])

# ================================ PROCESSING 0 ================================ #
# Work on a copy so logs_df keeps the untouched parse result.
parse_df = logs_df.copy()
# parse_df['ParseTemplate0'] = ""
# parse_df['Tokens0'] = [[] for _ in range(len(parse_df))]
# Pre-create the output columns; the loop further down overwrites them, so this
# fixes the column order as ParseTemplate1 followed by Tokens1.
parse_df['ParseTemplate1'] = ""
parse_df['Tokens1'] = [[] for _ in range(len(parse_df))]

# Register DataFrame.progress_apply and set the progress-bar label.
tqdm.pandas(desc="Tiền xử lý giai đoạn 0 và 1")
pattern_special = create_special_pattern(DICT_SPECIAL_TOKEN)

# Tokenize every row; each call returns a Series with 'Tokens1' and 'ParseTemplate1'.
results = parse_df.progress_apply(
        lambda row: process_line(row, datasets['regexs'], datasets['filters'], DICT_SPECIAL_TOKEN, pattern_special),
        axis=1
    )

# Copy the computed columns back into parse_df.
for col in results.columns:
    parse_df[col] = results[col]

# write_df_to_txt(parse_df, 'z_parse_df.txt')

# ================================ GROUP LOG ================================ #
# NOTE(review): PUNCTUATIONL and N_MERGE are not used anywhere in the visible
# code — presumably reserved for the later "PROCESSING 1" stage; confirm.
PUNCTUATIONL = '(),<>:;{}[]~='
N_MERGE = 3

log_clusters_list = []

# Rows with the same static tokens and the same token count form one group.
unique_groups = parse_df.groupby("ParseTemplate1")
print(len(unique_groups))

for key, group_val in unique_groups:
    # The first row's tokens serve as the template for the whole group.
    tokens = group_val.iloc[0]['Tokens1']
    logTemplate = " ".join(tokens)
    # Index labels of the rows in parse_df (not the LineId column values).
    logIDL = group_val.index.tolist()

    # Raw values observed at each wildcard position of the template.
    dynamic_tokenL = extractDynamicTok(group_val, tokens)
    
    cluster = LogCluster(
        keyGroup=key,
        logTemplate=logTemplate,
        tokens=tokens,
        length=len(tokens),
        logIDL=logIDL,
        dynamic_tokenL=dynamic_tokenL,
        static_tokenL=None
    )
    log_clusters_list.append(cluster)

# ================================ PROCESSING 1 ================================ #

# =============================== DEBUG / INSPECTION OUTPUT ===============================
# Display the first 100 parsed rows as the cell output.
parse_df.head(100)

Tiền xử lý giai đoạn 0 và 1: 100%|██████████| 212394/212394 [00:27<00:00, 7598.02it/s] 


161


Unnamed: 0,LineId,Time,Component,Pid,Content,ParseTemplate1,Tokens1
0,1,20171223-22:15:29:606,Step_LSC,30002312,onStandStepChanged 3579,onStandStepChanged : 2,"[onStandStepChanged, <*>]"
1,2,20171223-22:15:29:633,Step_StandReportReceiver,30002312,onReceive action: android.intent.action.SCREEN_ON,onReceive action: android.intent.action.SCREEN...,"[onReceive, action:, android.intent.action.SCR..."
2,3,20171223-22:15:29:635,Step_LSC,30002312,processHandleBroadcastAction action:android.in...,processHandleBroadcastAction action:android.in...,"[processHandleBroadcastAction, action:android...."
3,4,20171223-22:15:29:635,Step_StandStepCounter,30002312,flush sensor data,flush sensor data : 3,"[flush, sensor, data]"
4,5,20171223-22:15:29:635,Step_SPUtils,30002312,getTodayTotalDetailSteps = 1514038440000##699...,getTodayTotalDetailSteps = : 3,"[getTodayTotalDetailSteps, =, <*>]"
5,6,20171223-22:15:29:636,Step_SPUtils,30002312,setTodayTotalDetailSteps=1514038440000##7007##...,: 1,[<*>]
6,7,20171223-22:15:29:636,Step_LSC,30002312,onStandStepChanged 3579,onStandStepChanged : 2,"[onStandStepChanged, <*>]"
7,8,20171223-22:15:29:645,Step_ExtSDM,30002312,calculateCaloriesWithCache totalCalories=126775,calculateCaloriesWithCache : 2,"[calculateCaloriesWithCache, <*>]"
8,9,20171223-22:15:29:648,Step_ExtSDM,30002312,calculateAltitudeWithCache totalAltitude=240,calculateAltitudeWithCache : 2,"[calculateAltitudeWithCache, <*>]"
9,10,20171223-22:15:29:649,Step_StandReportReceiver,30002312,REPORT : 7007 5002 150089 240,REPORT : : 6,"[REPORT, :, <*>, <*>, <*>, <*>]"


##### **2.1. HOÀN CHỈNH**

In [None]:
# def parse_DrainDS(datasets):
#     logs_df = load_data(datasets['log_file'], datasets['log_format'])
    
    
#     # =============================== CÁC CÂU LỆNH IN KIỂM TRA ===============================
#     logs_df.head(100)
    
    
    
# # =============== CHẠY CHƯƠNG TRÌNH ==============
# parse_DrainDS(SETTING_PARAMS['Mac'])