In [10]:
import re
import sys
sys.path.append('../')
import os
import pandas as pd
from collections import Counter
import datetime

# Show every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)

# Per-dataset configuration, keyed by dataset name. Each entry holds:
#   log_file      - path of the raw log file
#   log_template  - path of the templates CSV that accompanies the log
#   log_structure - path of the structured CSV that accompanies the log
#   log_format    - header layout of one log line; <Name> marks a field and
#                   everything else is regex syntax, hence the raw strings
#                   (non-raw '\[' / '\|' are invalid escape sequences and emit
#                   a SyntaxWarning on recent Python versions)
#   filters       - currently empty for every dataset
#   regexs        - dataset-specific regular expressions (raw strings)
#
# NOTE(review): only the HealthApp paths are relative to the parent directory
# ('../logs/...'); the other datasets use './logs/...'. Confirm which location
# is correct before switching datasets.
SETTING_PARAMS = {
    'Apache': {
        'log_file': './logs/Apache/Apache_full.log',
        'log_template': './logs/Apache/Apache_full.log_templates.csv',
        'log_structure': './logs/Apache/Apache_full.log_structured.csv',
        'log_format': r'\[<Time>\] \[<Level>\] <Content>',
        'filters': [],
        'regexs': [r'\/(?:\w+\/){2,}\w+\.\w+$'],
    },
    'HealthApp': {
        'log_file': '../logs/HealthApp/HealthApp_full.log',
        'log_template': '../logs/HealthApp/HealthApp_full.log_templates.csv',
        'log_structure': '../logs/HealthApp/HealthApp_full.log_structured.csv',
        'log_format': r'<Time>\|<Component>\|<Pid>\|<Content>',
        'filters': [],
        'regexs': [],
    },
    'Mac': {
        'log_file': './logs/Mac/Mac_full.log',
        'log_template': './logs/Mac/Mac_full.log_templates.csv',
        'log_structure': './logs/Mac/Mac_full.log_structured.csv',
        'log_format': r'<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'filters': [],
        'regexs': [r'([\w-]+\.){2,}[\w-]+'],
    },
    'Linux': {
        'log_file': './logs/Linux/Linux_full.log',
        'log_template': './logs/Linux/Linux_full.log_templates.csv',
        'log_structure': './logs/Linux/Linux_full.log_structured.csv',
        'log_format': r'<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>',
        'filters': [],
        'regexs': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}'],
    },
    'OpenSSH': {
        'log_file': './logs/OpenSSH/OpenSSH_full.log',
        'log_template': './logs/OpenSSH/OpenSSH_full.log_templates.csv',
        'log_structure': './logs/OpenSSH/OpenSSH_full.log_structured.csv',
        'log_format': r'<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>',
        'filters': [],
        'regexs': [r"(\d+):"],
    },
}

In [40]:
import importlib
import os
import sys
import time

# Extend the import path BEFORE importing ULP: appending after the import (as
# this cell previously did) cannot influence how that import is resolved.
# The guard keeps re-runs of the cell from piling up duplicate entries.
if '../../' not in sys.path:
    sys.path.append('../../')

import ULP.ULP as ulp

# Reload so that edits to the ULP module are picked up without a kernel restart.
importlib.reload(ulp)
pd.set_option('display.max_colwidth', None)

# Dataset to parse; must be a key of SETTING_PARAMS.
DATASET_NAME = 'HealthApp'

output_dir = '../res/ULP/'
datasets = SETTING_PARAMS[DATASET_NAME]

# Derive the input directory and log file name from the configured path rather
# than repeating them as literals, so the config stays the single source of
# truth. For HealthApp this yields '../logs/HealthApp/' and 'HealthApp_full.log'.
log_dir, log_name = os.path.split(datasets['log_file'])
parser = ulp.LogParser(
    datasets['log_format'],
    indir=log_dir + '/',
    outdir=output_dir,
    rex=datasets['regexs'],
)
parser.logname = log_name

# NOTE(review): the commented-out code below looks like a manual, step-by-step
# variant of what parser.parse() does; it is kept for reference only. Confirm
# it is no longer needed and delete it.
#
# start_timeBig = time.time()
# print("Parsing file: " + os.path.join(parser.path, datasets['log_file']))

# parser.load_data()
# print("SHAPE: ", parser.df_log.shape)
# parser.tokenize()

# parser.df_log["EventId"] = parser.df_log["event_label"].map(
#             lambda x: parser.remove_word_with_special(str(x))
#         )
# groups = parser.df_log.groupby("EventId")
# keys = groups.groups.keys()             # Get the list of grouped EventIds
# stock = pd.DataFrame()
# count = 0

# re_list2 = ["[ ]{1,}[-]*[0-9]+[ ]{1,}", ' "\d+" ']  # Regexes for finding dynamic (variable) tokens in the log
# generic_re = re.compile("|".join(re_list2))         # Combined regex for finding dynamic tokens in the log

# for i in keys:
#     l = []
#     slc = groups.get_group(i)                       # Get the log group for this EventId

#     template = slc["event_label"][0:1].to_list()[0] # Take the first template in the group
#     count += 1
#     if slc.size > 1:
#         l = parser.getDynamicVars2(slc.head(10))
#         pat = r"\b(?:{})\b".format("|".join(str(v) for v in l))
#         if len(l) > 0:
#             template = template.lower()
#             template = re.sub(pat, "<*>", template)

#     template = re.sub(generic_re, " <*> ", template)
#     slc["event_label"] = [template] * len(slc["event_label"].to_list())

#     stock = pd.concat([stock, slc])
#     stock = stock.sort_index()

# parser.df_log = stock

# parser.df_log["EventTemplate"] = parser.df_log["event_label"]
# if not os.path.exists(parser.savePath):
#     os.makedirs(parser.savePath)
# parser.df_log.to_csv(
#     os.path.join(parser.savePath, "ULP_structured.csv"), index=False
# )
# elapsed_timeBig = time.time() - start_timeBig
# print(f"Parsing done in {elapsed_timeBig} sec")

# Run the parser on the selected log; the return value is shown as the cell output.
parser.parse(parser.logname)

Parsing file: ../logs/HealthApp/HealthApp_full.log
Parsing done in 14.523539781570435 sec


0

In [41]:
from tqdm import tqdm

# Compare the ground-truth templates (structured CSV) with the templates the
# parser produced, row by row, and report every ground-truth template whose
# rows were not all mapped to exactly that same template string.
structured_df = pd.read_csv(datasets['log_structure'])
unique_templates = structured_df['EventTemplate'].unique()
print(f"SHAPE: {structured_df.shape}")
print(f"SHAPE PARSER: {parser.df_log.shape}")

# Rows are matched through index labels below, so both frames must describe the
# same set of log lines; a differing row count means the comparison is invalid.
assert len(structured_df) == len(parser.df_log), (
    f"Row count mismatch: ground truth has {len(structured_df)} rows, "
    f"parser output has {len(parser.df_log)}"
)

# Rows without a ground-truth template cannot be compared (and a NaN template
# would break the token count below); groupby drops them, so report how many.
missing_templates = int(structured_df['EventTemplate'].isna().sum())
if missing_templates:
    print(f"Skipping {missing_templates} ground-truth rows without an EventTemplate")

# One pass over the frame instead of one full boolean scan per template.
# sort=False keeps the templates in order of first appearance.
rows_by_template = structured_df.groupby('EventTemplate', sort=False).groups

template_compare = {}
for template, row_labels in tqdm(
    rows_by_template.items(),
    total=len(rows_by_template),
    desc="Processing templates",
):
    arr_index = row_labels.tolist()
    # Distinct templates the parser assigned to the rows of this ground-truth template.
    parse_template_unique = (
        parser.df_log.loc[arr_index, 'EventTemplate'].unique().tolist()
    )

    # NOTE(review): keys are hash(template), kept as-is in case later cells
    # look entries up that way; the template string itself would be a
    # collision-free and session-stable key.
    hash_key = hash(template)
    template_compare[hash_key] = {
        'ground_truth': template,
        'parse': parse_template_unique,
        'index': arr_index,
        'length': len(template.strip().split()),  # whitespace-separated token count
        'nums': len(arr_index),                    # number of log lines with this template
    }

# Shortest templates first, ties broken alphabetically, for a stable report order.
sorted_items = sorted(
    template_compare.items(),
    key=lambda item: (item[1]['length'], item[1]['ground_truth'])
)

num_dif = 0
for idx, (key, value) in enumerate(sorted_items, 1):
    # A template matches only if the parser produced exactly one template for
    # its rows and that template equals the ground truth verbatim.
    if len(value['parse']) != 1 or value['parse'][0] != value['ground_truth']:
        num_dif += 1
        print(f"No. {idx}")
        print(f"Length: {value['length']}, Nums: {value['nums']}")
        print(f"Ground truth  : {value['ground_truth']}")
        print(f"Parse templs  : {value['parse']}")
        print(f"Length parse: {len(value['parse'])}")
        print("-" * 40)
print(f"Total differences found: {num_dif}")

SHAPE: (212394, 4)
SHAPE PARSER: (212394, 8)


Processing templates: 100%|██████████| 156/156 [00:01<00:00, 105.78it/s]

No. 1
Length: 1, Nums: 10
Ground truth  : FAILED_ERROR_DATA
Parse templs  : ['[ closenotification... ] ']
Length parse: 1
----------------------------------------
No. 2
Length: 1, Nums: 11
Ground truth  : clear()
Parse templs  : ['[ clear (  )  ] ']
Length parse: 1
----------------------------------------
No. 3
Length: 1, Nums: 15
Ground truth  : closeNotification...
Parse templs  : ['[ closenotification... ] ']
Length parse: 1
----------------------------------------
No. 4
Length: 1, Nums: 5
Ground truth  : context=<*>
Parse templs  : ['[ context = <*> ] ']
Length parse: 1
----------------------------------------
No. 5
Length: 1, Nums: 4
Ground truth  : createHealthNotification()
Parse templs  : ['[ createHealthNotification (  )  ] ']
Length parse: 1
----------------------------------------
No. 6
Length: 1, Nums: 15
Ground truth  : deleteHealthNotification()
Parse templs  : ['[ deleteHealthNotification (  )  ] ']
Length parse: 1
----------------------------------------
No. 7
Length: 1


