Sampling demonstrations from the other 15 datasets (DPP + KNN)

In [None]:
import csv
import json
import pandas as pd
import numpy as np
import math
import time

from openai.embeddings_utils import cosine_similarity
import tqdm

def dpp(kernel_matrix, max_length, epsilon=1E-10):
    """Greedily pick a diverse subset of items from a DPP kernel.

    Each step selects the item with the largest remaining marginal gain and
    then updates the gains of all items with an incremental Cholesky-style
    update, so the full determinant is never recomputed.

    Args:
        kernel_matrix: Square 2-D NumPy array; entry (i, j) is the kernel
            value between items i and j.
        max_length: Maximum number of items to select. Values larger than
            the number of items are clamped; values <= 0 select nothing.
        epsilon: Smallest marginal gain that still justifies adding another
            item. Selection stops early once the best remaining gain drops
            below it.

    Returns:
        List of selected item indices in selection order. It holds at most
        ``max_length`` entries and never contains the same index twice.
    """
    item_size = kernel_matrix.shape[0]
    max_length = min(int(max_length), item_size)
    if max_length <= 0:
        return []

    cis = np.zeros((max_length, item_size))
    # Float copy: the gains are updated in place and later masked with -inf,
    # neither of which works on an integer array.
    di2s = np.array(np.diag(kernel_matrix), dtype=float)

    selected_item = np.argmax(di2s)
    selected_items = [selected_item]
    if di2s[selected_item] < epsilon:
        # Dividing by sqrt(gain) below would be meaningless (or fail).
        return selected_items

    while len(selected_items) < max_length:
        k = len(selected_items) - 1
        ci_optimal = cis[:k, selected_item]
        di_optimal = math.sqrt(di2s[selected_item])
        elements = kernel_matrix[selected_item, :]
        eis = (elements - np.dot(ci_optimal, cis[:k, :])) / di_optimal
        cis[k, :] = eis
        di2s -= np.square(eis)
        # The chosen item's gain is ~0 after the update; mask it so that
        # floating-point noise can never get it selected a second time.
        di2s[selected_item] = -np.inf
        selected_item = np.argmax(di2s)
        if di2s[selected_item] < epsilon:
            # Every remaining item is (numerically) redundant.
            break
        selected_items.append(selected_item)
    return selected_items

def getDppIndex(log_emb_list,
                item_size,    # log dataset size
                split_ratio):
    """Select a diverse subset of log embeddings with DPP.

    Args:
        log_emb_list: Sequence of embedding vectors, one per log message.
        item_size: Number of log messages the ratio is applied to.
        split_ratio: Fraction of ``item_size`` to select; the target count is
            ``int(item_size * split_ratio)``.

    Returns:
        Indices of the selected embeddings, sorted ascending.
    """
    max_length = int(item_size * split_ratio)
    # Force a float array: integer input would otherwise make the
    # normalisation below fail or truncate.
    feature_vectors = np.array(log_emb_list, dtype=float)
    if feature_vectors.size == 0:
        return []

    # Scale every embedding to unit length so the dot product below is the
    # cosine similarity. All-zero rows are left as zeros instead of becoming
    # NaN through a division by zero.
    norms = np.linalg.norm(feature_vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    feature_vectors = feature_vectors / norms

    # Pairwise cosine-similarity matrix, used as the DPP kernel.
    similarities = np.dot(feature_vectors, feature_vectors.T)

    t = time.time()
    result = dpp(similarities, max_length)
    result.sort()
    print('DPP algorithm running time: ' + '\t' + "{0:.4e}".format(time.time() - t))
    return result

def generateLuMap(test_embeddings, candidate_embeddings, logs, look_up_map_path):
    """Rank all candidates by cosine similarity for every test log.

    Args:
        test_embeddings: Sequence of embedding vectors, aligned with ``logs``.
        candidate_embeddings: Sequence of candidate embedding vectors.
        logs: Log messages; ``logs[i]`` belongs to ``test_embeddings[i]``.
        look_up_map_path: Path of the JSON file the map is written to.

    Returns:
        Dict mapping each log message to the list of candidate indices
        ordered from most to least similar. Candidates with equal similarity
        are all kept, in ascending index order. If the same log message
        occurs more than once, the entry of its last occurrence wins.
    """
    def _unit_rows(vectors):
        # Row-normalise so a plain dot product yields the cosine similarity;
        # all-zero rows stay zero instead of turning into NaN.
        matrix = np.asarray(vectors, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    lookUpMap = {}
    if len(candidate_embeddings) == 0:
        # Nothing to rank against: every log gets an empty ranking.
        lookUpMap = {log: [] for log, _ in zip(logs, test_embeddings)}
    elif len(test_embeddings) > 0:
        similarities = _unit_rows(test_embeddings) @ _unit_rows(candidate_embeddings).T
        # A stable sort on the negated scores gives descending similarity
        # and keeps tied candidates instead of collapsing them.
        ranking = np.argsort(-similarities, axis=1, kind="stable")
        # dict: {log_message : list of similar candidate indexes in order}
        lookUpMap = {log: row.tolist() for log, row in zip(logs, ranking)}

    # write the map into a json file
    with open(look_up_map_path, 'w') as file:
        file.write(json.dumps(lookUpMap))
    return lookUpMap


def getNearest(log, lookUpMap, N=5):
    """Return the first ``N`` candidate indices stored for ``log``.

    ``lookUpMap[log]`` is expected to be ordered best-first already, so a
    plain prefix slice is all that is needed. Raises ``KeyError`` when the
    log message is not in the map.
    """
    return lookUpMap[log][:N]

def generateDemonstrations(log, lookUpMap, candidate_logs, candidate_templates, nearest_num=5):
    """Build the few-shot chat messages for one log message.

    For each of the ``nearest_num`` nearest candidates of ``log`` two
    messages are emitted: a ``user`` message holding the candidate log and an
    ``assistant`` message holding its template wrapped in backticks.

    Returns:
        A flat list of ``{"role": ..., "content": ...}`` dicts, two per
        candidate, in nearest-first order.
    """
    demonstrations = []
    for cand_idx in getNearest(log, lookUpMap, nearest_num):
        demonstrations.extend((
            {"role": "user", "content": candidate_logs[cand_idx]},
            {"role": "assistant", "content": '`' + candidate_templates[cand_idx] + '`'},
        ))
    return demonstrations

from pathlib import Path

datasets = ["HDFS", "Spark", "BGL", "Windows", "Linux", "Android", "Mac", "Hadoop", "HealthApp", "OpenSSH", "Thunderbird", "Proxifier", "Apache", "HPC", "Zookeeper", "OpenStack"]

# Root folder of all inputs and outputs. Paths are built with pathlib so they
# work on every OS; the previous backslash literals were Windows-only and
# relied on invalid escape sequences.
DIVLOG_ROOT = Path("DivLog")
# Fraction of the pooled training logs kept as demonstration candidates.
CANDIDATE_RATIO = 0.01


def _load_dataset(name):
    """Return (logs, templates, embeddings) of one loghub_2k dataset."""
    with open(DIVLOG_ROOT / "embeddings" / f"{name}.json") as emb_file:
        emb_map = json.load(emb_file)
    frame = pd.read_csv(DIVLOG_ROOT / "loghub_2k" / name / f"{name}_2k.log_structured.csv")
    dataset_logs = frame["Content"].values.tolist()
    dataset_templates = frame["EventTemplate"].values.tolist()
    # The embedding file maps a log message to its embedding vector.
    dataset_embeddings = [emb_map[log] for log in dataset_logs]
    return dataset_logs, dataset_templates, dataset_embeddings


# Read every dataset once instead of re-reading all of them per target.
loaded_datasets = {name: _load_dataset(name) for name in datasets}

lookup_map_dir = DIVLOG_ROOT / "lookup_map"
jsondata_dir = DIVLOG_ROOT / "jsondata"
lookup_map_dir.mkdir(parents=True, exist_ok=True)
jsondata_dir.mkdir(parents=True, exist_ok=True)

for test_dataset in datasets:
    print(f"processing {test_dataset}...")

    # test logs and embeddings of the target dataset
    test_logs, _, test_embeddings = loaded_datasets[test_dataset]

    # pool logs, templates and embeddings of all other datasets
    # (the target dataset itself is discarded)
    logs = []
    templates = []
    embeddings = []
    for dataset in datasets:
        if dataset == test_dataset:
            continue
        log_list, template_list, embedding_list = loaded_datasets[dataset]
        logs.extend(log_list)
        templates.extend(template_list)
        embeddings.extend(embedding_list)

    # get candidate set using dpp
    candidate_index = getDppIndex(embeddings, len(logs), CANDIDATE_RATIO)
    candidate_logs = [logs[i] for i in candidate_index]
    candidate_templates = [templates[i] for i in candidate_index]
    candidate_embeddings = [embeddings[i] for i in candidate_index]

    # generate lookup map
    look_up_map_path = str(lookup_map_dir / f"{test_dataset}_lookup_map.json")
    lookUpMap = generateLuMap(test_embeddings, candidate_embeddings, test_logs, look_up_map_path)

    # one list of demonstration messages (five examples by default) per log
    json_data = [
        generateDemonstrations(log, lookUpMap, candidate_logs, candidate_templates)
        for log in test_logs
    ]
    with open(jsondata_dir / f"{test_dataset}.json", 'w') as f:
        json.dump(json_data, f)

In [None]:
from concurrent.futures import ThreadPoolExecutor
import os
import re
from openai import OpenAI
import httpx
import json
import pandas as pd
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_random_exponential

def post_process(response):
    """Extract the log template from a raw model reply.

    Only the first paragraph of the reply (text before the first blank line)
    is inspected. Every backtick-delimited span in it is a candidate; the
    longest one wins, the first on a tie. If there is no such span the result
    is the empty string. ``{{...}}`` placeholders in the chosen span are
    rewritten to ``<*>``.
    """
    first_paragraph = response.strip().split("\n\n")[0]
    candidates = [span.strip() for span in re.findall("`([^`]+)`", first_paragraph)]
    template = max(candidates, key=len) if candidates else ''
    return re.sub(r'\{\{.*?\}\}', '<*>', template)
class Parser:
    """Log parser that asks a chat-completion model for a log's template.

    Args:
        api_key: API key handed to the OpenAI client.
        model: Name of the chat model to query.
        using_proxy: When true, requests go through the local HTTP proxy at
            ``http://127.0.0.1:7890``; when false, the client connects
            directly.
        N: Number of demonstrations (user/assistant pairs) put in a prompt.
    """

    def __init__(self, api_key, model='gpt-3.5-turbo-0125', using_proxy=True, N=5):
        self.model = model
        self.api_key = api_key
        self.N = N
        self.instruction = '''You will be provided with a log message delimited by backticks. You must abstract variables with `<*>` to extract the corresponding template.\nPrint the input log's template delimited by backticks.'''
        client_kwargs = {
            # alternative relay noted for 3.5: https://4.0.996444.icu/v1
            "base_url": "https://oneapi.xty.app/v1",  # relay URL
            "api_key": api_key,
        }
        if using_proxy:
            # Install the proxy only on request; previously the flag was
            # ignored and the proxy was always used.
            client_kwargs["http_client"] = httpx.Client(
                proxies="http://127.0.0.1:7890"  # proxy address
            )
        self.client = OpenAI(**client_kwargs)

    @retry(wait=wait_random_exponential(min=1, max=30), stop=stop_after_attempt(5))
    def chat(self, messages):
        """Send ``messages`` to the model and return the reply text.

        The call is retried with random exponential back-off (1-30 s), for at
        most five attempts.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.0,
        )
        return response.choices[0].message.content.strip('\n')

    def get_responce(self, input):
        """Parse one log message and return its template.

        Args:
            input: Dict with key ``"log"`` (the log message) and key
                ``"demonstrations"`` (alternating user/assistant messages).
                Only the first ``2 * N`` messages, i.e. ``N`` pairs, are used.

        Returns:
            The template extracted from the model reply by ``post_process``.
        """
        messages = [{"role": "system", "content": self.instruction}]
        messages.extend(input["demonstrations"][:2*self.N])
        messages.append({"role": "user", "content": input["log"]})
        output = self.chat(messages)
        template = post_process(output)
        print(template)
        return template


# main
if __name__ == "__main__":

    # The API key comes from the environment so that no credential is stored
    # in the source.
    # NOTE(review): the key that used to be hard-coded here should be treated
    # as leaked and rotated.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("Set the OPENAI_API_KEY environment variable before running this script.")
    parser = Parser(api_key=api_key, N=2)

    # Every dataset that can be parsed; only the subset below is processed.
    all_datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
    datasets = ['BGL', 'HDFS']
    output_dir = 'outputs/divlog/Test/'
    os.makedirs(output_dir, exist_ok=True)
    for dataset in datasets:
        print(f"parsing {dataset}...")
        # os.path.join keeps the paths portable; the previous backslash
        # literals only resolved on Windows.
        with open(os.path.join("DivLog", "jsondata", f"{dataset}.json"), 'r') as f:
            loaded_data = json.load(f)
        df = pd.read_csv(os.path.join("DivLog", "loghub_2k", dataset, f"{dataset}_2k.log_structured.csv"))
        logs = df["Content"].values.tolist()
        if len(logs) != len(loaded_data):
            # zip() would silently drop the surplus and the column assignment
            # below would then fail with a far less helpful message.
            raise ValueError(
                f"{dataset}: {len(logs)} logs but {len(loaded_data)} demonstration lists"
            )
        inputs = [
            {"log": log, "demonstrations": demonstrations}
            for log, demonstrations in zip(logs, loaded_data)
        ]
        # The requests are I/O-bound, so threads overlap the waiting time;
        # executor.map keeps the results in input order.
        with ThreadPoolExecutor(max_workers=16) as executor:
            templates = list(
                tqdm(executor.map(parser.get_responce, inputs), total=len(inputs)))
        # write to file
        df['Output'] = templates
        df[['Content', 'EventTemplate', 'Output']].to_csv(os.path.join(output_dir, f'{dataset}.csv'), index=False)