In [42]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
import httpx
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm import tqdm
from openai import OpenAI

# Shared OpenAI client used by get_response.
#
# The API key is read from the OPENAI_API_KEY environment variable so that no credential is
# stored in the notebook; a missing variable raises KeyError here instead of failing later on
# the first request.
# NOTE(review): a key was previously hard-coded in this cell -- treat it as leaked and revoke it.
CLIENT = OpenAI(
    api_key=os.environ['OPENAI_API_KEY'],
    http_client=httpx.Client(
        # Requests go through a proxy; OPENAI_PROXY overrides the local default.
        # NOTE(review): recent httpx releases expect `proxy=` instead of `proxies=` --
        # confirm against the installed httpx version before upgrading.
        proxies=os.environ.get('OPENAI_PROXY', 'http://127.0.0.1:7890'),
    ),
)

# In-memory cache filled by get_response: prompt text -> model output.
# NOTE(review): this name shadows the builtin `map`; it is kept because get_response
# refers to the cache by this name.
map = {}


@retry(stop=stop_after_attempt(20), wait=wait_random_exponential(min=1, max=60))
def get_response(prompt, dataset):
    """Return the chat-completion text for ``prompt``, reusing a cached answer when present.

    The module-level ``map`` dict acts as the cache: a prompt that is already a key is
    answered from it without calling the API. Otherwise one request is sent through
    ``CLIENT`` (model ``gpt-3.5-turbo-0125``, temperature 0.0), the first choice's message
    content is stripped of leading/trailing newlines, stored in ``map`` and returned.

    Args:
        prompt: Text sent as the single user message; also the cache key.
        dataset: Not used by the body; accepted so the function can be mapped over
            (prompt, dataset) pairs.

    Returns:
        The cached or freshly fetched response text.

    Any exception raised inside the body is handled by the ``retry`` decorator, which is
    configured for at most 20 attempts with a randomised exponential wait (min=1, max=60).
    """
    # Cache hit: no request needed.
    if prompt in map:
        return map[prompt]

    completion = CLIENT.chat.completions.create(
        model='gpt-3.5-turbo-0125',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.0,
    )
    answer = completion.choices[0].message.content.strip('\n')
    map[prompt] = answer
    return answer


if __name__ == '__main__':
    # Datasets queried in this run.
    # NOTE(review): the cell used to assign the full 15-benchmark list and then immediately
    # overwrite it with this subset, so BGL, HDFS, OpenSSH and HPC are not processed here.
    datasets = ['HealthApp', 'OpenStack', 'Zookeeper', 'Mac', 'Hadoop', 'Android',
                'Windows', 'Apache', 'Thunderbird', 'Spark', 'Linux']

    # Create the output directory before any request is sent, so a missing folder cannot
    # make the final write fail after all prompts of a dataset have been answered.
    os.makedirs('map', exist_ok=True)

    for dataset in datasets:
        print(f'Processing {dataset} dataset')

        # The prompt file holds a JSON list; every entry is coerced to str before use.
        prompt_list_path = f'cost_divlog_for_{dataset}.json'
        with open(prompt_list_path, 'r') as f:
            prompt_list = [str(prompt) for prompt in json.load(f)]

        # executor.map yields results in input order, so outputs[i] answers prompt_list[i].
        with ThreadPoolExecutor(max_workers=16) as executor:
            outputs = list(tqdm(
                executor.map(get_response, prompt_list, [dataset] * len(prompt_list)),
                total=len(prompt_list)))

        # Persist only this dataset's prompt -> output pairs. Dumping the shared cache
        # instead would copy every previously processed dataset into each file.
        with open(f'map/{dataset}.json', 'w') as f:
            json.dump(dict(zip(prompt_list, outputs)), f, indent=4)

Processing HealthApp dataset


100%|██████████| 2000/2000 [01:30<00:00, 22.01it/s]


Processing OpenStack dataset


100%|██████████| 2000/2000 [02:35<00:00, 12.88it/s]


Processing Zookeeper dataset


100%|██████████| 2000/2000 [00:51<00:00, 38.72it/s] 


Processing Mac dataset


100%|██████████| 2000/2000 [01:50<00:00, 18.04it/s]


Processing Hadoop dataset


100%|██████████| 2000/2000 [01:06<00:00, 30.08it/s]


Processing Android dataset


100%|██████████| 2000/2000 [01:26<00:00, 23.13it/s]


Processing Windows dataset


100%|██████████| 2000/2000 [01:18<00:00, 25.39it/s] 


Processing Apache dataset


100%|██████████| 2000/2000 [01:04<00:00, 30.79it/s]


Processing Thunderbird dataset


100%|██████████| 2000/2000 [00:31<00:00, 63.25it/s] 


Processing Spark dataset


100%|██████████| 2000/2000 [02:03<00:00, 16.19it/s]


Processing Linux dataset


  3%|▎         | 57/2000 [00:05<03:04, 10.52it/s]


KeyboardInterrupt: 

In [46]:
import re
import pandas as pd

def extractResultTemplate(text):
    """Return the text enclosed by the first ``<START> ... <END>`` pair found in ``text``.

    The capture is greedy and does not cross line breaks, so when one line carries several
    marker pairs everything from the first ``<START> `` to the last `` <END>`` on that line
    is returned. An empty string is returned when no marker pair is present.
    """
    # Earlier variant used for ChatGPT output: '<START> <Event\d> (.+) <END>'
    found = re.search('<START> (.+) <END>', text)
    if found is None:
        return ""
    return found.group(1)

def correct_single_template(template, user_strings=None):
    """Normalise a single log template by running a fixed pipeline of rewrite rules.

    The rules run in this order:

    DS (Double Space)                    -- strip the template and squeeze every whitespace
                                            run into one space.
    PS (Path-like String)                -- a chunk made only of ``/segment`` parts
                                            becomes ``<*>``.
    BL (Boolean) / US (User String)      -- a token equal, ignoring case, to ``true``,
                                            ``false``, ``null``, ``root``, ``admin`` or an
                                            entry of ``user_strings`` becomes ``<*>``.
    DG (Digit)                           -- an all-digit token, or a token that begins with
                                            a hexadecimal literal such as ``0x1F``,
                                            becomes ``<*>``.
    WV (Word concatenated with Variable) -- a token that contains ``<*>`` and no whitespace
                                            or ``/`` becomes ``<*>``.
    DV (Dot-separated Variables)         -- ``<*>.<*>`` collapses to ``<*>``.
    CV (Consecutive Variables)           -- ``<*><*>`` collapses to ``<*>``.

    After these, ``<*>:<*>`` and ``<*>/<*>`` are collapsed to ``<*>`` as well.

    Args:
        template: The template string to clean up.
        user_strings: Optional iterable of extra strings handled like the built-in
            ``null`` / ``root`` / ``admin`` entries.

    Returns:
        The rewritten template string.
    """
    wildcard = '<*>'

    # Regex fragments. The reduced set splits the text for the path check; the full set
    # (reduced set plus a few more symbols) splits it for every remaining token rule.
    path_delimiters = {
        r'\s', r'\,', r'\!', r'\;', r'\:',
        r'\=', r'\|', r'\"', r'\'',
        r'\[', r'\]', r'\(', r'\)', r'\{', r'\}'
    }
    token_delimiters = path_delimiters.union({
        r'\.', r'\-', r'\+', r'\@', r'\#', r'\$', r'\%', r'\&',
    })

    # Words that are replaced outright (BL + US).
    replaceable = {'true', 'false'}.union({'null', 'root', 'admin'})
    if user_strings:
        replaceable = replaceable.union(user_strings)

    # DS
    template = re.sub(r'\s+', ' ', template.strip())

    # PS -- the capturing group keeps the delimiters, so joining restores the text.
    path_splitter = '(' + '|'.join(path_delimiters) + ')'
    template = ''.join(
        wildcard if re.match(r'^(\/[^\/]+)+$', piece) else piece
        for piece in re.split(path_splitter, template)
    )

    replaceable_lower = {word.lower() for word in replaceable}

    def _rewrite_token(token):
        # BL, US
        if token.lower() in replaceable_lower:
            token = wildcard

        # DG (hexadecimal numbers are common in logs as well)
        if re.match(r'^\d+$', token) or re.match(r'\b0[xX][0-9a-fA-F]+\b', token):
            token = wildcard

        # WV
        glued_to_wildcard = re.match(r'^[^\s\/]*<\*>[^\s\/]*$', token)
        slash_mix = '<*>' in token and '/' in token and ('.' in token or ':' in token)
        if glued_to_wildcard or slash_mix:
            token = wildcard

        return token

    token_splitter = '(' + '|'.join(token_delimiters) + ')'
    template = ''.join(_rewrite_token(token) for token in re.split(token_splitter, template))

    # DV -- repeat until a pass changes nothing.
    before = None
    while before != template:
        before = template
        template = re.sub(r'<\*>\.<\*>', '<*>', template)

    # CV -- has to run after the token rules, again until stable.
    before = None
    while before != template:
        before = template
        template = re.sub(r'<\*><\*>', '<*>', template)

    # A similar collapse of "#<*>#" was disabled by the original author with the remark
    # "incorrect in HealthApp", so only ':' and '/' joined wildcards are merged here.
    for joined in ("<*>:<*>", "<*>/<*>"):
        while joined in template:
            template = template.replace(joined, "<*>")

    return template


# Benchmarks whose cached model answers are turned into structured CSV files.
datasets = [
    'BGL', 'HDFS', 'HealthApp', 'OpenStack', 'OpenSSH', 'HPC', 'Zookeeper',
    'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark', 'Linux',
]

# One "<dataset>_2k.log_structured.csv" is written into this directory per dataset.
output_dir = 'result'
os.makedirs(output_dir, exist_ok=True)

for dataset in datasets:
    # Inputs: the corrected structured log, the prompt list, and the prompt -> answer map.
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
    with open(f'cost_divlog_for_{dataset}.json', 'r') as f:
        prompt_list = json.load(f)
    with open(f'map/{dataset}.json', 'r') as f:
        look_up_map = json.load(f)

    # Pass 1: map each queried log line to its post-processed template.
    look_up_map2 = {}
    for prompt in tqdm(prompt_list):
        prompt_lines = prompt.split('\n')
        # Second-to-last prompt line, minus its '<prompt>:' tag, is the log being parsed.
        log = prompt_lines[-2].replace('<prompt>:', '').strip()
        # Fourth-from-last line, minus its markers, serves as the fallback template
        # (presumably the extraction of the last in-prompt example -- verify against the
        # prompt format).
        template = prompt_lines[-4].replace('<extraction>: <START>', '').replace('<END>', '').strip()

        if prompt in look_up_map:
            result = correct_single_template(extractResultTemplate(look_up_map[prompt]))
            if result == '':
                # Nothing usable came back from the model: use the fallback template.
                print('alert2')
                result = template
        else:
            # No cached answer for this prompt: the log gets an empty template.
            print('alert1')
            result = ''
        look_up_map2[log] = result

    # Pass 2: line the templates up with the rows of the structured log.
    results = []
    for log in df['Content']:
        key = log.strip()
        if key in look_up_map2:
            results.append(look_up_map2[key])
        else:
            results.append('')
            print('alert3')

    df['EventTemplate'] = results
    df[['Content', 'EventTemplate']].to_csv(f'{output_dir}/{dataset}_2k.log_structured.csv', index=False)

100%|██████████| 2000/2000 [00:00<00:00, 11508.49it/s]


alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2


100%|██████████| 2000/2000 [00:00<00:00, 13909.63it/s]
100%|██████████| 2000/2000 [00:00<00:00, 19197.61it/s]
100%|██████████| 2000/2000 [00:00<00:00, 7779.89it/s]
100%|██████████| 2000/2000 [00:00<00:00, 8549.04it/s]
100%|██████████| 2000/2000 [00:00<00:00, 26569.10it/s]
100%|██████████| 2000/2000 [00:00<00:00, 14730.94it/s]
100%|██████████| 2000/2000 [00:00<00:00, 8004.68it/s]


alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2


100%|██████████| 2000/2000 [00:00<00:00, 10063.02it/s]


alert2
alert2
alert2


 89%|████████▊ | 1772/2000 [00:00<00:00, 8822.76it/s]

alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2
alert2


100%|██████████| 2000/2000 [00:00<00:00, 8939.59it/s]
100%|██████████| 2000/2000 [00:00<00:00, 9574.56it/s]


alert2


100%|██████████| 2000/2000 [00:00<00:00, 14439.67it/s]
100%|██████████| 2000/2000 [00:00<00:00, 11892.53it/s]
100%|██████████| 2000/2000 [00:00<00:00, 10715.30it/s]
100%|██████████| 2000/2000 [00:00<00:00, 11675.01it/s]

alert2



