In [1]:
import pandas as pd

df = pd.read_csv('datasets/linux/Linux_2k.log_structured.csv')
df.head()


Unnamed: 0,LineId,Month,Date,Time,Level,Component,PID,Content,EventId,EventTemplate
0,1,Jun,14,15:16:01,combo,sshd(pam_unix),19939.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
1,2,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,check pass; user unknown,E27,check pass; user unknown
2,3,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
3,4,Jun,15,02:04:59,combo,sshd(pam_unix),20882.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...
4,5,Jun,15,02:04:59,combo,sshd(pam_unix),20884.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...


In [2]:
# Tokenize Content
df["Tokens"] = df["Content"].apply(lambda x: x.split())
print("Tokenized Logs:")
print(df["Tokens"])
df.head()


Tokenized Logs:
0       [authentication, failure;, logname=, uid=0, eu...
1                           [check, pass;, user, unknown]
2       [authentication, failure;, logname=, uid=0, eu...
3       [authentication, failure;, logname=, uid=0, eu...
4       [authentication, failure;, logname=, uid=0, eu...
                              ...                        
1995    [pci_hotplug:, PCI, Hot, Plug, PCI, Core, vers...
1996              [isapnp:, Scanning, for, PnP, cards...]
1997          [isapnp:, No, Plug, &, Play, device, found]
1998                   [Real, Time, Clock, Driver, v1.12]
1999    [Linux, agpgart, interface, v0.100, (c), Dave,...
Name: Tokens, Length: 2000, dtype: object


Unnamed: 0,LineId,Month,Date,Time,Level,Component,PID,Content,EventId,EventTemplate,Tokens
0,1,Jun,14,15:16:01,combo,sshd(pam_unix),19939.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure;, logname=, uid=0, eu..."
1,2,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,check pass; user unknown,E27,check pass; user unknown,"[check, pass;, user, unknown]"
2,3,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure;, logname=, uid=0, eu..."
3,4,Jun,15,02:04:59,combo,sshd(pam_unix),20882.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure;, logname=, uid=0, eu..."
4,5,Jun,15,02:04:59,combo,sshd(pam_unix),20884.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure;, logname=, uid=0, eu..."


In [3]:
from datetime import datetime

def time_to_seconds(time_str):
    h, m, s = map(int, time_str.split(":"))
    return h * 3600 + m * 60 + s

# Convert time to seconds
df["TimeInSeconds"] = df["Time"].apply(time_to_seconds)

# Combine day, month, and time into a single identifier for grouping
df["DateKey"] = df["Month"] + " " + df["Date"].astype(str)

# Group logs into sequences
time_window = 5  # Define the time window in seconds
sequences = []
current_sequence = []
current_date = df["DateKey"].iloc[0]
start_time = df["TimeInSeconds"].iloc[0]

for _, row in df.iterrows():
    if row["DateKey"] == current_date and (row["TimeInSeconds"] - start_time <= time_window):
        current_sequence.append(row["Tokens"])
    else:
        sequences.append(current_sequence)  # Append the completed sequence
        current_sequence = [row["Tokens"]]  # Start a new sequence
        current_date = row["DateKey"]  # Update to the new day
        start_time = row["TimeInSeconds"]

if current_sequence:  # Append the last sequence if any
    sequences.append(current_sequence)

# Add [SEP] tokens to separate logs within a sequence
SEP = "[SEP]"  # Special token for separating logs
separated_sequences = []
for sequence in sequences:
    flat_sequence = []
    for log in sequence:
        flat_sequence.extend(log + [SEP])  # Add [SEP] after each log
    separated_sequences.append(flat_sequence[:-1])  # Remove the last [SEP]

# Print the sequences
print("Sequences grouped by day and within 5 seconds:")
for i, seq in enumerate(separated_sequences):
    print(f"Sequence {i + 1}: {seq}")


Sequences grouped by day and within 5 seconds:
Sequence 1: ['authentication', 'failure;', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=218.188.2.4', '[SEP]', 'check', 'pass;', 'user', 'unknown', '[SEP]', 'authentication', 'failure;', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=218.188.2.4']
Sequence 2: ['authentication', 'failure;', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root', '[SEP]', 'authentication', 'failure;', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root', '[SEP]', 'authentication', 'failure;', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root', '[SEP]', 'authentication', 'failure;', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root', '[SEP]', 'authentication', 'failure;', 'logname=', 'uid=0', '

In [102]:
# Build a vocabulary from tokens
vocab = {}
for seq in sequences:
    for log in seq:
        for token in log:
            if token not in vocab:
                vocab[token] = len(vocab) + 1  # Assign a unique ID to each token

# Convert tokens to numerical tokens
numerical_sequences = [[[vocab[token] for token in log] for log in seq] for seq in sequences]
print("Numerical Sequences:")
print(numerical_sequences)

Numerical Sequences:
[[[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14], [1, 2, 3, 4, 5, 6, 7, 13, 14]], [[15, 16, 17, 11, 18, 19, 20], [15, 21, 17, 11, 18], [22, 23, 24, 25, 26]], [[15, 16, 17, 11, 27, 19, 20], [15, 21, 17, 11, 27]], [[9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [9, 10, 11, 12], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8]], [[9, 10, 11, 12], [1, 2, 3, 4, 5,

In [103]:
def pad_logs_in_sequences(sequences, log_max_length, sequence_max_length, sep_token=0, pad_token=-1):
    """
    Pads each log in the sequence and then the overall sequence to a fixed length.
    
    Args:
    - sequences (list of list of list of int): Nested numerical sequences (group of logs).
    - log_max_length (int): The desired length of each individual log.
    - sequence_max_length (int): The desired length of the entire sequence.
    - sep_token (int): The token used to separate logs ([SEP]).
    - pad_token (int): The token used for padding ([PAD]).
    
    Returns:
    - padded_sequences (list of list of int): Padded and flattened sequences.
    """
    padded_sequences = []
    for sequence_group in sequences:
        padded_logs = []
        
        # Pad each log within the sequence group
        for log in sequence_group:
            if len(log) > log_max_length:
                padded_logs.append(log[:log_max_length])
            else:
                padded_logs.append(log + [pad_token] * (log_max_length - len(log)))
        
        # Flatten the sequence with [SEP] tokens separating logs
        flattened_sequence = []
        for log in padded_logs:
            flattened_sequence.extend(log + [sep_token])  # Add [SEP] after each log
        flattened_sequence = flattened_sequence[:-1]  # Remove the last [SEP]

        # Pad the overall sequence to the specified sequence_max_length
        if len(flattened_sequence) > sequence_max_length:
            truncated_sequence = flattened_sequence[:sequence_max_length]
            # Ensure the last token is [SEP] if truncating between logs
            if truncated_sequence[-1] != sep_token:
                truncated_sequence[-1] = sep_token
            padded_sequences.append(truncated_sequence)
        else:
            # Pad if the sequence is shorter than sequence_max_length
            padded_sequence = flattened_sequence + [pad_token] * (sequence_max_length - len(flattened_sequence))
            padded_sequences.append(padded_sequence)
    
    return padded_sequences

# Define parameters
log_max_length = 10  # Length for each log
sequence_max_length = 50  # Total sequence length
#sep_token = 0  # Assuming [SEP] token is represented by 0
pad_token = -1  # Assuming [PAD] token is represented by -1

# Apply the function
padded_sequences = pad_logs_in_sequences(numerical_sequences, log_max_length, sequence_max_length, sep_token, pad_token)

# Print padded sequences
print("Padded Sequences:")
for i, seq in enumerate(padded_sequences):
    print(f"Sequence {i + 1}: {seq}")



Padded Sequences:
Sequence 1: [1, 2, 3, 4, 5, 6, 7, 8, -1, -1, 0, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Sequence 2: [1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 0]
Sequence 3: [15, 16, 17, 11, 18, 19, 20, -1, -1, -1, 0, 15, 21, 17, 11, 18, -1, -1, -1, -1, -1, 0, 22, 23, 24, 25, 26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Sequence 4: [15, 16, 17, 11, 27, 19, 20, -1, -1, -1, 0, 15, 21, 17, 11, 27, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Sequence 5: [9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, 0, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, 0, 9, 10, 11, 12, -1, 0]
S

In [104]:
# Add [CLS] at the beginning and [SEP] between logs
vocab["SEP"] = 0
vocab["PAD"] = -1
last = len(vocab) + 1
vocab["CLS"] = last

CLS_ID = last
SEP_ID = 0

final_sequences = []
for seq in padded_sequences:
    seq_with_special_tokens = [CLS_ID] + seq + [SEP_ID]
    final_sequences.append(seq_with_special_tokens)

print("Final Sequences with [CLS] and [SEP]:")
print(final_sequences)


Final Sequences with [CLS] and [SEP]:
[[706, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, 0, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0], [706, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 13, 14, -1, 0, 1, 2, 3, 4, 5, 0, 0], [706, 15, 16, 17, 11, 18, 19, 20, -1, -1, -1, 0, 15, 21, 17, 11, 18, -1, -1, -1, -1, -1, 0, 22, 23, 24, 25, 26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0], [706, 15, 16, 17, 11, 27, 19, 20, -1, -1, -1, 0, 15, 21, 17, 11, 27, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0], [706, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, 0, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, 0, 9, 10, 11, 12, -1, 0,

In [105]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [114]:
from transformers import BertConfig, BertForMaskedLM

# Configure BERT
max_length = 52
config = BertConfig(
    vocab_size=len(vocab),  # +3 for [CLS], [SEP], and [PAD]
    hidden_size=128,
    num_hidden_layers=4,
    num_attention_heads=4,
    max_position_embeddings=max_length,
)

# Define LogBERT model
model = BertForMaskedLM(config)
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(706, 128, padding_idx=0)
      (position_embeddings): Embedding(52, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_af

In [115]:
import torch

# Convert to tensor
inputs = torch.tensor(final_sequences, dtype=torch.long)
labels = torch.tensor([1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0]
)
# Create attention masks (1 for valid tokens, 0 for padding)
attention_mask = (inputs != 0).int()

# Print shapes for debugging
print(f"Inputs Shape: {inputs.shape}")
print(f"Attention Mask Shape: {attention_mask.shape}")
print(f"Labels Shape: {labels.shape}")


Inputs Shape: torch.Size([297, 52])
Attention Mask Shape: torch.Size([297, 52])
Labels Shape: torch.Size([297])


In [118]:
# Print the maximum token index
print(f"Max token index: {inputs.max().item()}")
print(input_ids.max())  # Should be less than vocab_size

# Print the vocab size
print(f"Vocab size: {len(vocab)}")

Max token index: 706
tensor(705)
Vocab size: 706


In [117]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW

# Dataset and DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    for batch in dataloader:
        b_input_ids, b_attention_masks, b_labels = batch
        outputs = model(
            input_ids=b_input_ids, 
            attention_mask=b_attention_masks, 
            labels=b_labels
        )
        loss = outputs.loss
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


IndexError: index out of range in self