In [1]:
!pip install pandas

[0m

In [2]:
!pip install nltk

[0m

In [3]:
!pip install gensim

[0m

In [4]:
!pip install scikit-learn

[0m

In [5]:
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 1


In [6]:
import pandas as pd
from gensim.models import Word2Vec
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
df = pd.read_csv('datasets/linux/Linux_2k.log_structured.csv')
df.head()

Unnamed: 0,LineId,Month,Date,Time,Level,Component,PID,Content,EventId,EventTemplate
0,1,Jun,14,15:16:01,combo,sshd(pam_unix),19939.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
1,2,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,check pass; user unknown,E27,check pass; user unknown
2,3,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
3,4,Jun,15,02:04:59,combo,sshd(pam_unix),20882.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...
4,5,Jun,15,02:04:59,combo,sshd(pam_unix),20884.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...


In [8]:
null_data = df.isnull().sum()

In [9]:
print(null_data[null_data > 0])

PID    151
dtype: int64


In [10]:
df['Tokens'] = df['Content'].apply(word_tokenize)
df.head()

Unnamed: 0,LineId,Month,Date,Time,Level,Component,PID,Content,EventId,EventTemplate,Tokens
0,1,Jun,14,15:16:01,combo,sshd(pam_unix),19939.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure, ;, logname=, uid=0, ..."
1,2,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,check pass; user unknown,E27,check pass; user unknown,"[check, pass, ;, user, unknown]"
2,3,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure, ;, logname=, uid=0, ..."
3,4,Jun,15,02:04:59,combo,sshd(pam_unix),20882.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure, ;, logname=, uid=0, ..."
4,5,Jun,15,02:04:59,combo,sshd(pam_unix),20884.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...,"[authentication, failure, ;, logname=, uid=0, ..."


In [11]:
from sklearn.preprocessing import LabelEncoder

# Prepare your tokenized sentences
sentences = df['Tokens'].values.tolist()

# Flatten the list of tokenized words
words = [word for sentence in sentences for word in sentence]

# Convert words to numeric labels
label_encoder = LabelEncoder()
X = label_encoder.fit_transform(words)

# Reshape X for the HMM
X = X.reshape(-1, 1)
X = X.tolist()
##print(X)
lab = dict(zip(words, X))
print(lab)





In [12]:
# Convert Time to seconds for easier grouping
def time_to_seconds(time_str):
    h, m, s = map(int, time_str.split(":"))
    return h * 3600 + m * 60 + s

# Helper function to convert seconds to hh:mm:ss
def seconds_to_hms(seconds):
    h = seconds // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{h:02}:{m:02}:{s:02}"

df['TimeInSeconds'] = df['Time'].apply(time_to_seconds)

# Step 4: Group Logs into Sequences by Time Window and Day
time_window = 5  # Define the time window in seconds
sequences = []
time_frames = []  # To store time frames for each sequence
current_sequence = []
current_start_time = None  # Track the start time of the current sequence
prev_time = None
prev_date = None
prev_month = None

for _, row in df.iterrows():
    is_same_day = (
        prev_date == row["Date"] and prev_month == row["Month"]
    )  # Check if the log is on the same day
    
    if (
        prev_time is None
        or (row["TimeInSeconds"] - prev_time <= time_window and is_same_day)
    ):
        if current_start_time is None:
            current_start_time = f"{row['Month']} {row['Date']} {row['Time']}"  # Start time of the new sequence
        current_sequence.append(row["Tokens"])
        prev_time = row["TimeInSeconds"]  # Update prev_time within the sequence
    else:
        sequences.append(current_sequence)
        end_time_hms = seconds_to_hms(prev_time)  # Convert end time to hh:mm:ss
        time_frames.append((current_start_time, f"{prev_month} {prev_date} {end_time_hms}"))  # Add time frame
        current_sequence = [row["Tokens"]]
        current_start_time = f"{row['Month']} {row['Date']} {row['Time']}"  # Reset the start time
        prev_time = row["TimeInSeconds"]  # Reset prev_time for the new sequence
    prev_date = row["Date"]
    prev_month = row["Month"]

# Add the last sequence and its time frame
if current_sequence:
    end_time_hms = seconds_to_hms(prev_time)
    time_frames.append((current_start_time, f"{prev_month} {prev_date} {end_time_hms}"))
    sequences.append(current_sequence)

# Visualization: Structured display with time frames
for i, (sequence, time_frame) in enumerate(zip(sequences, time_frames)):
    print(f"Sequence {i + 1} (Time Frame: {time_frame[0]} to {time_frame[1]}):")
    for log in sequence:
        print(f"  - {log}")
    print("\n")
    

Sequence 1 (Time Frame: Jun 14 15:16:01 to Jun 14 15:16:02):
  - ['authentication', 'failure', ';', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=218.188.2.4']
  - ['check', 'pass', ';', 'user', 'unknown']
  - ['authentication', 'failure', ';', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=218.188.2.4']


Sequence 2 (Time Frame: Jun 15 02:04:59 to Jun 15 02:04:59):
  - ['authentication', 'failure', ';', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root']
  - ['authentication', 'failure', ';', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root']
  - ['authentication', 'failure', ';', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user=root']
  - ['authentication', 'failure', ';', 'logname=', 'uid=0', 'euid=0', 'tty=NODEVssh', 'ruser=', 'rhost=220-135-151-1.hinet-ip.hinet.net', 'user

In [13]:
!pip install transformers

[0m

In [14]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Found existing installation: torch 2.4.1+cpu
Uninstalling torch-2.4.1+cpu:
  Successfully uninstalled torch-2.4.1+cpu
Found existing installation: torchvision 0.19.1+cpu
Uninstalling torchvision-0.19.1+cpu:
  Successfully uninstalled torchvision-0.19.1+cpu
Found existing installation: torchaudio 2.4.1+cpu
Uninstalling torchaudio-2.4.1+cpu:
  Successfully uninstalled torchaudio-2.4.1+cpu
[0mLooking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Using cached https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl (194.9 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cpu/torchvision-0.19.1%2Bcpu-cp38-cp38-linux_x86_64.whl (1.6 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cpu/torchaudio-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl (1.7 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.4.1+cpu torchaudio-2.4.1+cpu torchvision-0.19.1+cpu
[0m

In [15]:
!python3 --version

Python 3.8.10


In [16]:
from transformers import BertTokenizer

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
def prepare_inputs(sequence):
    # Convert the tokens into a single string for each event (to match the format BERT expects)
    tokens = [" ".join(event) for event in sequence]
    
    # Tokenize with padding and truncation to the model's max length
    inputs = tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors="pt")
    return inputs

# Prepare inputs for all sequences
sequences_inputs = [prepare_inputs(sequence) for sequence in sequences]

# Check if the sequences are padded correctly
print(sequences_inputs[1])  # Example of one batch after padding


{'input_ids': tensor([[  101, 27280,  4945,  1025,  8833, 18442,  1027, 21318,  2094,  1027,
          1014,  7327,  3593,  1027,  1014, 23746,  2100,  1027, 13045, 15088,
          4095, 26307,  2099,  1027,  1054, 15006,  2102,  1027, 10545,  1011,
         11502,  1011, 16528,  1011,  1015,  1012,  7632,  7159,  1011, 12997,
          1012,  7632,  7159,  1012,  5658,  5310,  1027,  7117,   102],
        [  101, 27280,  4945,  1025,  8833, 18442,  1027, 21318,  2094,  1027,
          1014,  7327,  3593,  1027,  1014, 23746,  2100,  1027, 13045, 15088,
          4095, 26307,  2099,  1027,  1054, 15006,  2102,  1027, 10545,  1011,
         11502,  1011, 16528,  1011,  1015,  1012,  7632,  7159,  1011, 12997,
          1012,  7632,  7159,  1012,  5658,  5310,  1027,  7117,   102],
        [  101, 27280,  4945,  1025,  8833, 18442,  1027, 21318,  2094,  1027,
          1014,  7327,  3593,  1027,  1014, 23746,  2100,  1027, 13045, 15088,
          4095, 26307,  2099,  1027,  1054, 15006,

In [17]:
from transformers import DataCollatorWithPadding

# Create a data collator for padding sequences dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Create a DataLoader for the tokenized sequences
from torch.utils.data import DataLoader

# Flatten inputs to a single list for DataLoader
flattened_sequences_inputs = [
    {"input_ids": inputs["input_ids"][i], "attention_mask": inputs["attention_mask"][i]}
    for inputs in sequences_inputs
    for i in range(inputs["input_ids"].size(0))
]

normal_sequences = DataLoader(flattened_sequences_inputs, batch_size=32, shuffle=True, collate_fn=data_collator)

print("DataLoader initialized with dynamic padding.")


DataLoader initialized with dynamic padding.


In [18]:
from transformers import BertForMaskedLM

# Load pretrained BERT model for masked language modeling
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Move the model to GPU if available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

### Model Training, IG

In [19]:
from torch.optim import AdamW

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Set the model to training mode
model.train()

# Number of epochs
epochs = 5

# Training loop
for epoch in range(epochs):
    epoch_loss = 0.0  # Track total loss for the epoch
    
    for batch_idx, batch in enumerate(normal_sequences):
        # Move batch to GPU if available
        inputs = {key: val.to(device) for key, val in batch.items()}
        
        # Forward pass
        outputs = model(**inputs, labels=inputs['input_ids'])
        
        # Compute loss
        loss = outputs.loss
        epoch_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        # Print progress for the current batch
        print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx + 1}/{len(normal_sequences)}, Loss: {loss.item():.4f}")
    
    # Print average loss for the epoch
    print(f"Epoch {epoch + 1} completed. Average Loss: {epoch_loss / len(normal_sequences):.4f}")


Epoch 1/5, Batch 1/63, Loss: 11.1450
Epoch 1/5, Batch 2/63, Loss: 10.3049
Epoch 1/5, Batch 3/63, Loss: 9.9644
Epoch 1/5, Batch 4/63, Loss: 8.3420
Epoch 1/5, Batch 5/63, Loss: 8.2380
Epoch 1/5, Batch 6/63, Loss: 7.9525
Epoch 1/5, Batch 7/63, Loss: 7.2533
Epoch 1/5, Batch 8/63, Loss: 7.6555
Epoch 1/5, Batch 9/63, Loss: 6.0696
Epoch 1/5, Batch 10/63, Loss: 7.3040
Epoch 1/5, Batch 11/63, Loss: 6.9121
Epoch 1/5, Batch 12/63, Loss: 6.0696
Epoch 1/5, Batch 13/63, Loss: 5.6007
Epoch 1/5, Batch 14/63, Loss: 5.4615
Epoch 1/5, Batch 15/63, Loss: 5.0936
Epoch 1/5, Batch 16/63, Loss: 4.7275
Epoch 1/5, Batch 17/63, Loss: 4.7972
Epoch 1/5, Batch 18/63, Loss: 4.6140
Epoch 1/5, Batch 19/63, Loss: 3.8254
Epoch 1/5, Batch 20/63, Loss: 4.2937
Epoch 1/5, Batch 21/63, Loss: 3.0585
Epoch 1/5, Batch 22/63, Loss: 4.1967
Epoch 1/5, Batch 23/63, Loss: 3.9028
Epoch 1/5, Batch 24/63, Loss: 3.7781
Epoch 1/5, Batch 25/63, Loss: 3.7243
Epoch 1/5, Batch 26/63, Loss: 3.5996
Epoch 1/5, Batch 27/63, Loss: 3.0075
Epoch 1/

Epoch 4/5, Batch 32/63, Loss: 0.0643
Epoch 4/5, Batch 33/63, Loss: 0.0665
Epoch 4/5, Batch 34/63, Loss: 0.0622
Epoch 4/5, Batch 35/63, Loss: 0.0615
Epoch 4/5, Batch 36/63, Loss: 0.0645
Epoch 4/5, Batch 37/63, Loss: 0.0594
Epoch 4/5, Batch 38/63, Loss: 0.0519
Epoch 4/5, Batch 39/63, Loss: 0.0673
Epoch 4/5, Batch 40/63, Loss: 0.0505
Epoch 4/5, Batch 41/63, Loss: 0.0418
Epoch 4/5, Batch 42/63, Loss: 0.0701
Epoch 4/5, Batch 43/63, Loss: 0.0569
Epoch 4/5, Batch 44/63, Loss: 0.0452
Epoch 4/5, Batch 45/63, Loss: 0.0561
Epoch 4/5, Batch 46/63, Loss: 0.0559
Epoch 4/5, Batch 47/63, Loss: 0.0595
Epoch 4/5, Batch 48/63, Loss: 0.0497
Epoch 4/5, Batch 49/63, Loss: 0.0321
Epoch 4/5, Batch 50/63, Loss: 0.0507
Epoch 4/5, Batch 51/63, Loss: 0.0532
Epoch 4/5, Batch 52/63, Loss: 0.0581
Epoch 4/5, Batch 53/63, Loss: 0.0542
Epoch 4/5, Batch 54/63, Loss: 0.0478
Epoch 4/5, Batch 55/63, Loss: 0.0582
Epoch 4/5, Batch 56/63, Loss: 0.0585
Epoch 4/5, Batch 57/63, Loss: 0.0523
Epoch 4/5, Batch 58/63, Loss: 0.0517
E

In [20]:
# Save the model state and optimizer state
model_path = "model_latest.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'hyperparameters': {'learning_rate': 1e-5, 'epochs': 5},
}, model_path)

print(f"Model saved to {model_path}")

Model saved to model_latest.pth


### TESTING

In [20]:
# Inference
with torch.no_grad():
    predictions = model(input_ids=sequences_inputs, attention_mask=attention_masks).logits

# Convert logits to probabilities
probabilities = torch.softmax(predictions, dim=1)

print("Predictions:", probabilities)


NameError: name 'attention_masks' is not defined