In [3]:
from utils import log_to_token_seq, split_log
import pandas as pd


In [4]:
template_paths = [
    r"nulog\logs\Linux\Linux_2k.log_templates.csv",
    r"nulog\logs\Andriod\Andriod_2k.log_templates.csv",     
    r"nulog\logs\Apache\Apache_2k.log_templates.csv",
    r"nulog\logs\BGL\BGL_2k.log_templates.csv",
    r"nulog\logs\Hadoop\Hadoop_2k.log_templates.csv",
    r"nulog\logs\HDFS\HDFS_2k.log_templates.csv",
    r"nulog\logs\HealthApp\HealthApp_2k.log_templates.csv",
    r"nulog\logs\HPC\HPC_2k.log_templates.csv",
    r"nulog\logs\Mac\Mac_2k.log_templates.csv",
    r"nulog\logs\OpenSSH\OpenSSH_2k.log_templates.csv",
    r"nulog\logs\OpenStack\OpenStack_2k.log_templates.csv",
    r"nulog\logs\Proxifier\Proxifier_2k.log_templates.csv",
    r"nulog\logs\Spark\Spark_2k.log_templates.csv",
    r"nulog\logs\Thunderbird\Thunderbird_2k.log_templates.csv",
    r"nulog\logs\Windows\Windows_2k.log_templates.csv",
    r"nulog\logs\Zookeeper\Zookeeper_2k.log_templates.csv"
    ]

In [5]:
log_paths = [
    r"nulog\logs\Andriod\Andriod_2k.log",
    r"nulog\logs\Apache\Apache_2k.log",
    r"nulog\logs\BGL\BGL_2k.log",
    r"nulog\logs\Hadoop\Hadoop_2k.log",
    r"nulog\logs\HDFS\HDFS_2k.log",
    r"nulog\logs\HealthApp\HealthApp_2k.log",
    r"nulog\logs\HPC\HPC_2k.log",
    r"nulog\logs\Linux\Linux_2k.log",
    r"nulog\logs\Mac\Mac_2k.log",
    r"nulog\logs\OpenSSH\OpenSSH_2k.log",
    r"nulog\logs\OpenStack\OpenStack_2k.log",
    r"nulog\logs\Proxifier\Proxifier_2k.log",
    r"nulog\logs\Spark\Spark_2k.log",
    r"nulog\logs\Thunderbird\Thunderbird_2k.log",
    r"nulog\logs\Windows\Windows_2k.log",
    r"nulog\logs\Zookeeper\Zookeeper_2k.log"
]

In [6]:
dictionary = set()

for path in template_paths:

    df = pd.read_csv(path)

    for log in df["EventTemplate"]:
        tokens = split_log(log)
        
        for token in tokens:
            if token != '<*>':
                dictionary.add(token)

print(len(dictionary))

3193


In [7]:
data = []

In [8]:
max_length = 0
for log_path in log_paths:
    with open(log_path) as f:
        log = f.readline()
        while log:
            tok_seq = log_to_token_seq(log, 10)
            if len(tok_seq) > max_length:
                max_length = len(tok_seq)
            data.append(tok_seq)
            log = f.readline()

In [9]:
len(data)

32000

In [10]:
max_length

300

In [11]:
import torch
from torch.utils.data import Dataset
import random

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [13]:
torch.zeros(1).cuda()


tensor([0.], device='cuda:0')

In [14]:
class LogDataset(Dataset):
    def __init__(self, log_paths, vocab_size, max_length=300, mask_prob=0.15):
        self.log_paths = log_paths
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.mask_prob = mask_prob
        self.special_offset = 10
    
        self.logs = []
        
        for log_path in self.log_paths:
            with open(log_path) as f:
                log = f.readline()
                while log:
                    tok_seq = log_to_token_seq(log, self.special_offset)
                    self.logs.append(tok_seq)
                    log = f.readline()

    def __len__(self):
        return len(self.logs)
    
    def __getitem__(self, idx):
        tokens = self.logs[idx][:self.max_length]
        input_ids = tokens + [0] * (self.max_length - len(tokens))  # Pad
        labels = [-100] * self.max_length  # Ignore positions not masked

        for i in range(len(tokens)):
            if random.random() < self.mask_prob:
                prob = random.random()
                labels[i] = input_ids[i]  # Store the original value

                if prob < 0.8:
                    input_ids[i] = 1  # [MASK]
                elif prob < 0.9:
                    input_ids[i] = random.randint(self.special_offset, self.vocab_size - 1)
                else:
                    pass  # Keep original

        attention_mask = [1 if t != 0 else 0 for t in input_ids]

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }

In [15]:
dataset = LogDataset(log_paths, 30000)
from torch.utils.data import DataLoader 
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

batch = next(iter(dataloader))
print(batch)

{'input_ids': tensor([[ 1284, 18675, 24948,  ...,     0,     0,     0],
        [17038, 22929, 16775,  ...,     0,     0,     0],
        [ 1284,     1,     1,  ...,     0,     0,     0],
        ...,
        [24713,  1284,     1,  ...,     0,     0,     0],
        [ 1284, 17654, 24948,  ...,     0,     0,     0],
        [21334, 25292,  2155,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ 1284, 16972, 24948,  ...,  -100,  -100,  -100],
        ...,
        [ -100,  -100,  1956,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100]])}


In [16]:
from transformers import BertConfig, BertForMaskedLM, Trainer, TrainingArguments
from torch.utils.data import random_split

# Step 1: Define model config
config = BertConfig(
    vocab_size=30000,           # keep this as is
    max_position_embeddings=300,  # fine
    num_attention_heads=4,      # ↓ reduce attention heads
    num_hidden_layers=4,        # ↓ fewer transformer blocks
    type_vocab_size=1,
    hidden_size=256,            # ↓ smaller embedding and hidden dims
    intermediate_size=1024,     # ↓ smaller feed-forward layer
)
# Step 2: Create model
model = BertForMaskedLM(config)
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Assuming `dataset` is your preprocessed LogTokenDataset
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [18]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    load_best_model_at_end=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [None]:
trainer.train()

Step,Training Loss,Validation Loss
10,10.297,10.219909
20,10.1847,10.083426
30,10.0862,9.961554
40,9.9467,9.867284
50,9.7649,9.773114
60,9.7286,9.6673
70,9.6852,9.559312
80,9.5177,9.469193
90,9.4814,9.374647
100,9.3391,9.303562
