In [None]:
import kagglehub
import json
from pathlib import Path

from transformers import BertTokenizer, BertModel
import torch
import re
import pickle

import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import ssl

import pandas as pd

In [None]:


# Directory that receives the NLTK data. Defined here so the cell also works
# after "Restart Kernel -> Run All" (the name was otherwise never assigned).
ROOT = Path.cwd()

# SECURITY NOTE: this replaces the ssl module's default HTTPS context factory
# with one that skips certificate verification. It is kept as a workaround for
# environments where the NLTK download fails certificate checks; remove it if
# the download works without it.
_create_unverified_https_context = ssl._create_unverified_context

ssl._create_default_https_context = _create_unverified_https_context

# nltk.download() writes below ROOT, but NLTK only searches the directories
# listed in nltk.data.path, so ROOT has to be registered there as well.
if str(ROOT) not in nltk.data.path:
    nltk.data.path.append(str(ROOT))

# 'punkt' is what this notebook originally fetched; recent NLTK releases look
# for 'punkt_tab' instead when word_tokenize() runs, so both are requested.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, download_dir=str(ROOT))
from nltk.tokenize import word_tokenize

In [None]:
# Read the raw log into memory: one list entry per line, with leading and
# trailing whitespace (including the newline) removed.
with open('HDFS_2000.log', 'r') as log_file:
    original_text = list(map(str.strip, log_file))

In [None]:
def absolute_to_relative_path(text):
    """Collapse every "/user/root/rand/" prefix in ``text`` to a single "/".

    Args:
        text: One log line.

    Returns:
        The line with all occurrences of the prefix replaced.
    """
    # The pattern contains no regex metacharacters, so a plain substring
    # replacement is equivalent.
    return text.replace("/user/root/rand/", "/")

# (placeholder, compiled pattern) pairs, ordered from most to least specific.
# The FIRST pattern that matches a token decides its placeholder, so the
# IP:port pattern must come before the plain-IP pattern (every IP:port token
# also matches the plain-IP pattern), and the generic trailing-digits pattern
# must come last.
_COMMON_PATTERNS = (
    ("CLUSTER_SYSTEM_IP_PORT", re.compile(r"/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})")),
    ("CLUSTER_SYSTEM_IP", re.compile(r"/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})")),
    # The decimal point is escaped; the unescaped "." accepted any character.
    ("CLUSTER_SYSTEM_TIME", re.compile(r"\d+(\.\d+){0,1}(s|ms|us|ps|fs)$")),
    ("CLUSTER_SYSTEM_HEX_VAL", re.compile(r"0x[\da-fA-F]+$")),
    ("CLUSTER_SYSTEM_SV_NUMBER", re.compile(r"\d+'[bodhBODH][\da-fA-Fxz]+$")),
    ("CLUSTER_SYSTEM_NUMBER", re.compile(r"\d+$")),
)


def _placeholder_for(token):
    """Return the placeholder of the first pattern found in ``token``, else ``token``."""
    for placeholder, pattern in _COMMON_PATTERNS:
        # search(), not fullmatch(): a pattern only needs to occur in the
        # token (the "$"-anchored ones only need to match its tail).
        if pattern.search(token):
            return placeholder
    return token


def remplace_common(text):
    """Replace variable tokens (numbers, hex values, durations, IPs) by placeholders.

    The text is split with NLTK's ``word_tokenize``, each token that matches
    one of the known patterns is swapped for its ``CLUSTER_SYSTEM_*``
    placeholder, and the tokens are joined back with the Treebank detokenizer.

    Args:
        text: One log line.

    Returns:
        The detokenized line with matching tokens replaced.
    """
    tokens = [_placeholder_for(token) for token in word_tokenize(text)]
    return TreebankWordDetokenizer().detokenize(tokens)

def clean_data(text):
    """Normalize one log line: shorten paths first, then mask variable tokens.

    Args:
        text: One raw log line.

    Returns:
        The result of ``remplace_common`` applied to the output of
        ``absolute_to_relative_path``.
    """
    return remplace_common(absolute_to_relative_path(text))


# Clean every log line; the result is index-aligned with original_text.
process_text = [clean_data(line) for line in original_text]

# Show only a short preview instead of dumping every cleaned line into the
# cell output.
process_text[:5]

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'google-bert/bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Tokenize all cleaned lines in one call so every row is padded to the same
# length; the rows are then fed to the model in slices to bound peak memory.
inputs = tokenizer(process_text, return_tensors='pt', padding=True, truncation=True)

BATCH_SIZE = 64  # rows per forward pass; lower it if memory is tight

model.eval()  # inference only: make sure dropout is disabled
cls_chunks = []
with torch.no_grad():
    for start in range(0, len(process_text), BATCH_SIZE):
        # Slice every tensor the tokenizer returned along the row dimension.
        batch = {name: tensor[start:start + BATCH_SIZE] for name, tensor in inputs.items()}
        batch_outputs = model(**batch)
        # Keep only the hidden state at position 0 of each row (the [CLS] token).
        cls_chunks.append(batch_outputs.last_hidden_state[:, 0, :])

# One embedding row per log line, in the same order as process_text.
cls_embeddings = torch.cat(cls_chunks, dim=0)
cls_embeddings.shape

In [None]:
import hdbscan
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

# Pairwise cosine distances between the CLS embeddings, cast to float64
# before being handed to HDBSCAN as a precomputed distance matrix.
cosine_distance_matrix = cosine_distances(cls_embeddings).astype(np.float64)

# fit() returns the estimator itself, so `clustering` is the fitted HDBSCAN object.
clustering = hdbscan.HDBSCAN(min_cluster_size=3, metric='precomputed')
clustering = clustering.fit(cosine_distance_matrix)


In [None]:
# One cluster label per log line, in input order.
labels = clustering.labels_

# Group the original (uncleaned) log lines by their cluster label.
# NOTE(review): presumably HDBSCAN's noise label (-1) ends up as its own
# group here -- confirm whether that is wanted.
clusters = {}
for cluster_id, raw_line in zip(labels, original_text):
    clusters.setdefault(cluster_id, []).append(raw_line)

# Sort the lines of every group in place.
for members in clusters.values():
    members.sort()

In [None]:
# Print each group's lines between two dashed separator lines.
separator = "-" * 20
for members in clusters.values():
    print(separator, "\n".join(members), separator, sep="\n")