In [1]:
from utils import log_to_token_seq, split_log
import pandas as pd


In [2]:
template_paths = [
    r"nulog\logs\Linux\Linux_2k.log_templates.csv",
    r"nulog\logs\Andriod\Andriod_2k.log_templates.csv",     
    r"nulog\logs\Apache\Apache_2k.log_templates.csv",
    r"nulog\logs\BGL\BGL_2k.log_templates.csv",
    r"nulog\logs\Hadoop\Hadoop_2k.log_templates.csv",
    r"nulog\logs\HDFS\HDFS_2k.log_templates.csv",
    r"nulog\logs\HealthApp\HealthApp_2k.log_templates.csv",
    r"nulog\logs\HPC\HPC_2k.log_templates.csv",
    r"nulog\logs\Mac\Mac_2k.log_templates.csv",
    r"nulog\logs\OpenSSH\OpenSSH_2k.log_templates.csv",
    r"nulog\logs\OpenStack\OpenStack_2k.log_templates.csv",
    r"nulog\logs\Proxifier\Proxifier_2k.log_templates.csv",
    r"nulog\logs\Spark\Spark_2k.log_templates.csv",
    r"nulog\logs\Thunderbird\Thunderbird_2k.log_templates.csv",
    r"nulog\logs\Windows\Windows_2k.log_templates.csv",
    r"nulog\logs\Zookeeper\Zookeeper_2k.log_templates.csv"
    ]

In [3]:
log_paths = [
    r"nulog\logs\Andriod\Andriod_2k.log",
    r"nulog\logs\Apache\Apache_2k.log",
    r"nulog\logs\BGL\BGL_2k.log",
    r"nulog\logs\Hadoop\Hadoop_2k.log",
    r"nulog\logs\HDFS\HDFS_2k.log",
    r"nulog\logs\HealthApp\HealthApp_2k.log",
    r"nulog\logs\HPC\HPC_2k.log",
    r"nulog\logs\Linux\Linux_2k.log",
    r"nulog\logs\Mac\Mac_2k.log",
    r"nulog\logs\OpenSSH\OpenSSH_2k.log",
    r"nulog\logs\OpenStack\OpenStack_2k.log",
    r"nulog\logs\Proxifier\Proxifier_2k.log",
    r"nulog\logs\Spark\Spark_2k.log",
    r"nulog\logs\Thunderbird\Thunderbird_2k.log",
    r"nulog\logs\Windows\Windows_2k.log",
    r"nulog\logs\Zookeeper\Zookeeper_2k.log"
]

In [4]:
dictionary = set()

for path in template_paths:

    df = pd.read_csv(path)

    for log in df["EventTemplate"]:
        tokens = split_log(log)
        
        for token in tokens:
            if token != '<*>':
                dictionary.add(token)

print(len(dictionary))

3193


In [5]:
data = []

In [6]:
max_length = 0
for log_path in log_paths:
    with open(log_path) as f:
        log = f.readline()
        while log:
            tok_seq = log_to_token_seq(log, 10)
            if len(tok_seq) > max_length:
                max_length = len(tok_seq)
            data.append(tok_seq)
            log = f.readline()

In [7]:
len(data)

32000

In [8]:
max_length

300

In [1]:
import torch
from torch.utils.data import Dataset
import random

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
torch.zeros(1).cuda()


AssertionError: Torch not compiled with CUDA enabled

In [11]:
class LogDataset(Dataset):
    def __init__(self, log_paths, vocab_size, max_length=300, mask_prob=0.15):
        self.log_paths = log_paths
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.mask_prob = mask_prob
        self.special_offset = 10
    
        self.logs = []
        
        for log_path in self.log_paths:
            with open(log_path) as f:
                log = f.readline()
                while log:
                    tok_seq = log_to_token_seq(log, self.special_offset)
                    self.logs.append(tok_seq)
                    log = f.readline()

    def __len__(self):
        return len(self.logs)
    
    def __getitem__(self, idx):
        tokens = self.logs[idx][:self.max_length]
        input_ids = tokens + [0] * (self.max_length - len(tokens))  # Pad
        labels = [-100] * self.max_length  # Ignore positions not masked

        for i in range(len(tokens)):
            if random.random() < self.mask_prob:
                prob = random.random()
                labels[i] = input_ids[i]  # Store the original value

                if prob < 0.8:
                    input_ids[i] = 1  # [MASK]
                elif prob < 0.9:
                    input_ids[i] = random.randint(self.special_offset, self.vocab_size - 1)
                else:
                    pass  # Keep original

        attention_mask = [1 if t != 0 else 0 for t in input_ids]

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }

In [12]:
dataset = LogDataset(log_paths, 30000)
from torch.utils.data import DataLoader 
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

batch = next(iter(dataloader))
print(batch)

{'input_ids': tensor([[23053, 23834, 26216,  ...,     0,     0,     0],
        [23053, 23834, 26491,  ...,     0,     0,     0],
        [ 2266, 17428, 24948,  ...,     0,     0,     0],
        ...,
        [ 1216, 23834,  1145,  ...,     0,     0,     0],
        [24389,     1, 26030,  ...,     0,     0,     0],
        [ 2266, 16569,     1,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        ...,
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100, 22565,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100, 24948,  ...,  -100,  -100,  -100]])}


In [None]:
from transformers import BertConfig, BertForMaskedLM, Trainer, TrainingArguments
from torch.utils.data import random_split

# Step 1: Define model config
config = BertConfig(
    vocab_size=30000,           # keep this as is
    max_position_embeddings=300,  # fine
    num_attention_heads=4,      # ↓ reduce attention heads
    num_hidden_layers=4,        # ↓ fewer transformer blocks
    type_vocab_size=1,
    hidden_size=256,            # ↓ smaller embedding and hidden dims
    intermediate_size=1024,     # ↓ smaller feed-forward layer
)
# Step 2: Create model
model = BertForMaskedLM(config)
model = model.to(device)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [14]:
# Assuming `dataset` is your preprocessed LogTokenDataset
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=200,                 # add this if missing, evaluate less frequently
    learning_rate=5e-5,             # ↑ higher LR to compensate for smaller model
    per_device_train_batch_size=32, # ↑ larger batch size (if GPU allows)
    per_device_eval_batch_size=32,
    num_train_epochs=5,             # ↑ more epochs with smaller model is okay
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,               # ↓ log less frequently if needed
    save_steps=1000,                # ↓ save less often if you don’t need many checkpoints
    load_best_model_at_end=True,
)



In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [17]:
trainer.train()

  0%|          | 0/5400 [00:00<?, ?it/s]

{'loss': 10.1093, 'grad_norm': 6.3579912185668945, 'learning_rate': 1.9962962962962963e-05, 'epoch': 0.01}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 9.719738960266113, 'eval_runtime': 209.1562, 'eval_samples_per_second': 15.3, 'eval_steps_per_second': 0.956, 'epoch': 0.01}
{'loss': 9.515, 'grad_norm': 6.052066326141357, 'learning_rate': 1.9925925925925928e-05, 'epoch': 0.01}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 9.35147476196289, 'eval_runtime': 208.9466, 'eval_samples_per_second': 15.315, 'eval_steps_per_second': 0.957, 'epoch': 0.01}
{'loss': 9.3066, 'grad_norm': 5.670358657836914, 'learning_rate': 1.988888888888889e-05, 'epoch': 0.02}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 9.122315406799316, 'eval_runtime': 209.6056, 'eval_samples_per_second': 15.267, 'eval_steps_per_second': 0.954, 'epoch': 0.02}
{'loss': 9.0733, 'grad_norm': 5.349259853363037, 'learning_rate': 1.9851851851851855e-05, 'epoch': 0.02}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.919425964355469, 'eval_runtime': 209.1666, 'eval_samples_per_second': 15.299, 'eval_steps_per_second': 0.956, 'epoch': 0.02}
{'loss': 8.8845, 'grad_norm': 5.442677974700928, 'learning_rate': 1.9814814814814816e-05, 'epoch': 0.03}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.784394264221191, 'eval_runtime': 209.302, 'eval_samples_per_second': 15.289, 'eval_steps_per_second': 0.956, 'epoch': 0.03}
{'loss': 8.7877, 'grad_norm': 4.333401203155518, 'learning_rate': 1.977777777777778e-05, 'epoch': 0.03}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.64264965057373, 'eval_runtime': 208.6764, 'eval_samples_per_second': 15.335, 'eval_steps_per_second': 0.958, 'epoch': 0.03}
{'loss': 8.6255, 'grad_norm': 5.7926788330078125, 'learning_rate': 1.9740740740740743e-05, 'epoch': 0.04}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.460636138916016, 'eval_runtime': 209.3787, 'eval_samples_per_second': 15.283, 'eval_steps_per_second': 0.955, 'epoch': 0.04}
{'loss': 8.5266, 'grad_norm': 4.889074802398682, 'learning_rate': 1.9703703703703704e-05, 'epoch': 0.04}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.344767570495605, 'eval_runtime': 208.8602, 'eval_samples_per_second': 15.321, 'eval_steps_per_second': 0.958, 'epoch': 0.04}
{'loss': 8.3052, 'grad_norm': 5.2697577476501465, 'learning_rate': 1.9666666666666666e-05, 'epoch': 0.05}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.23314094543457, 'eval_runtime': 208.8248, 'eval_samples_per_second': 15.324, 'eval_steps_per_second': 0.958, 'epoch': 0.05}
{'loss': 8.214, 'grad_norm': 4.823780059814453, 'learning_rate': 1.962962962962963e-05, 'epoch': 0.06}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 8.097243309020996, 'eval_runtime': 209.0905, 'eval_samples_per_second': 15.304, 'eval_steps_per_second': 0.957, 'epoch': 0.06}
{'loss': 8.0441, 'grad_norm': 5.505671501159668, 'learning_rate': 1.9592592592592596e-05, 'epoch': 0.06}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.9543609619140625, 'eval_runtime': 209.7521, 'eval_samples_per_second': 15.256, 'eval_steps_per_second': 0.954, 'epoch': 0.06}
{'loss': 7.9839, 'grad_norm': 4.9122114181518555, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.07}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.879695892333984, 'eval_runtime': 210.3715, 'eval_samples_per_second': 15.211, 'eval_steps_per_second': 0.951, 'epoch': 0.07}
{'loss': 7.8137, 'grad_norm': 5.17730712890625, 'learning_rate': 1.9518518518518522e-05, 'epoch': 0.07}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.773776054382324, 'eval_runtime': 209.3083, 'eval_samples_per_second': 15.288, 'eval_steps_per_second': 0.956, 'epoch': 0.07}
{'loss': 7.8123, 'grad_norm': 4.697854995727539, 'learning_rate': 1.9481481481481484e-05, 'epoch': 0.08}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.673365592956543, 'eval_runtime': 211.9529, 'eval_samples_per_second': 15.098, 'eval_steps_per_second': 0.944, 'epoch': 0.08}
{'loss': 7.7348, 'grad_norm': 4.535989284515381, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.08}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.54664421081543, 'eval_runtime': 308.8286, 'eval_samples_per_second': 10.362, 'eval_steps_per_second': 0.648, 'epoch': 0.08}
{'loss': 7.6498, 'grad_norm': 5.099508285522461, 'learning_rate': 1.9407407407407407e-05, 'epoch': 0.09}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.453378677368164, 'eval_runtime': 469.918, 'eval_samples_per_second': 6.81, 'eval_steps_per_second': 0.426, 'epoch': 0.09}
{'loss': 7.5495, 'grad_norm': 4.845084190368652, 'learning_rate': 1.9370370370370372e-05, 'epoch': 0.09}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.351250648498535, 'eval_runtime': 209.1846, 'eval_samples_per_second': 15.297, 'eval_steps_per_second': 0.956, 'epoch': 0.09}
{'loss': 7.2756, 'grad_norm': 5.4411821365356445, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.1}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.289959907531738, 'eval_runtime': 471.6952, 'eval_samples_per_second': 6.784, 'eval_steps_per_second': 0.424, 'epoch': 0.1}
{'loss': 7.3028, 'grad_norm': 4.576045989990234, 'learning_rate': 1.92962962962963e-05, 'epoch': 0.11}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.193802356719971, 'eval_runtime': 1665.5719, 'eval_samples_per_second': 1.921, 'eval_steps_per_second': 0.12, 'epoch': 0.11}
{'loss': 7.1942, 'grad_norm': 5.959846496582031, 'learning_rate': 1.925925925925926e-05, 'epoch': 0.11}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.106534481048584, 'eval_runtime': 5872.1737, 'eval_samples_per_second': 0.545, 'eval_steps_per_second': 0.034, 'epoch': 0.11}
{'loss': 7.0213, 'grad_norm': 4.395904064178467, 'learning_rate': 1.9222222222222225e-05, 'epoch': 0.12}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 7.051356792449951, 'eval_runtime': 2864.1403, 'eval_samples_per_second': 1.117, 'eval_steps_per_second': 0.07, 'epoch': 0.12}
{'loss': 7.0454, 'grad_norm': 4.421105861663818, 'learning_rate': 1.9185185185185186e-05, 'epoch': 0.12}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.976436614990234, 'eval_runtime': 196.371, 'eval_samples_per_second': 16.296, 'eval_steps_per_second': 1.018, 'epoch': 0.12}
{'loss': 6.8819, 'grad_norm': 7.044583797454834, 'learning_rate': 1.9148148148148148e-05, 'epoch': 0.13}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.878376483917236, 'eval_runtime': 197.9383, 'eval_samples_per_second': 16.167, 'eval_steps_per_second': 1.01, 'epoch': 0.13}
{'loss': 6.9069, 'grad_norm': 4.6630120277404785, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.13}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.818413257598877, 'eval_runtime': 196.5467, 'eval_samples_per_second': 16.281, 'eval_steps_per_second': 1.018, 'epoch': 0.13}
{'loss': 6.9454, 'grad_norm': 4.345062732696533, 'learning_rate': 1.9074074074074075e-05, 'epoch': 0.14}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.760271072387695, 'eval_runtime': 441.9392, 'eval_samples_per_second': 7.241, 'eval_steps_per_second': 0.453, 'epoch': 0.14}
{'loss': 6.8321, 'grad_norm': 4.1998443603515625, 'learning_rate': 1.903703703703704e-05, 'epoch': 0.14}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.648655414581299, 'eval_runtime': 552.5497, 'eval_samples_per_second': 5.791, 'eval_steps_per_second': 0.362, 'epoch': 0.14}
{'loss': 6.606, 'grad_norm': 4.331967830657959, 'learning_rate': 1.9e-05, 'epoch': 0.15}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.638491153717041, 'eval_runtime': 564.3811, 'eval_samples_per_second': 5.67, 'eval_steps_per_second': 0.354, 'epoch': 0.15}
{'loss': 6.6296, 'grad_norm': 5.312910556793213, 'learning_rate': 1.8962962962962966e-05, 'epoch': 0.16}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.576657772064209, 'eval_runtime': 557.4411, 'eval_samples_per_second': 5.741, 'eval_steps_per_second': 0.359, 'epoch': 0.16}
{'loss': 6.521, 'grad_norm': 6.404658317565918, 'learning_rate': 1.8925925925925928e-05, 'epoch': 0.16}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.494619369506836, 'eval_runtime': 7748.8011, 'eval_samples_per_second': 0.413, 'eval_steps_per_second': 0.026, 'epoch': 0.16}
{'loss': 6.6162, 'grad_norm': 4.380571365356445, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.17}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.494793891906738, 'eval_runtime': 533.2251, 'eval_samples_per_second': 6.001, 'eval_steps_per_second': 0.375, 'epoch': 0.17}
{'loss': 6.389, 'grad_norm': 4.538882255554199, 'learning_rate': 1.885185185185185e-05, 'epoch': 0.17}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.396506309509277, 'eval_runtime': 524.7896, 'eval_samples_per_second': 6.098, 'eval_steps_per_second': 0.381, 'epoch': 0.17}
{'loss': 6.3949, 'grad_norm': 5.141176700592041, 'learning_rate': 1.8814814814814816e-05, 'epoch': 0.18}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.387725353240967, 'eval_runtime': 534.7729, 'eval_samples_per_second': 5.984, 'eval_steps_per_second': 0.374, 'epoch': 0.18}
{'loss': 6.5364, 'grad_norm': 4.087141513824463, 'learning_rate': 1.877777777777778e-05, 'epoch': 0.18}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.3306355476379395, 'eval_runtime': 529.7756, 'eval_samples_per_second': 6.04, 'eval_steps_per_second': 0.378, 'epoch': 0.18}
{'loss': 6.3411, 'grad_norm': 4.870862007141113, 'learning_rate': 1.8740740740740742e-05, 'epoch': 0.19}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.30800724029541, 'eval_runtime': 529.9201, 'eval_samples_per_second': 6.039, 'eval_steps_per_second': 0.377, 'epoch': 0.19}
{'loss': 6.4295, 'grad_norm': 4.756584644317627, 'learning_rate': 1.8703703703703707e-05, 'epoch': 0.19}


  0%|          | 0/200 [00:00<?, ?it/s]

{'eval_loss': 6.228446006774902, 'eval_runtime': 796.3108, 'eval_samples_per_second': 4.019, 'eval_steps_per_second': 0.251, 'epoch': 0.19}
{'loss': 6.1601, 'grad_norm': 4.699578762054443, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}


  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 