In [76]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer
import json
import logging
import os
import joblib
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertModel
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch
from datetime import datetime


In [None]:
# Load the training events into a DataFrame.
# on_bad_lines='skip' makes pandas drop malformed CSV rows instead of raising,
# so the number of rows loaded can be smaller than the number of lines in the file.
df = pd.read_csv("train_data.csv", on_bad_lines='skip')
# Preview the raw log text that will be fed to the tokenizer.
df["event_text"].head()

0    An TLS 1.2 connection request was received fro...
1    Application popup: Idle timer expired : Sessio...
2    The storage optimizer successfully completed r...
3    A new process has been created.Creator Subject...
4    Key file operation.Subject: Security ID: S-1-5...
Name: event_text, dtype: object

In [None]:
# Force every event_text value to str before tokenization.
# Note: astype(str) turns missing values into the literal text "nan".
df["event_text"] = df["event_text"].astype(str)

In [None]:
# Encode categorical labels (for each field)
# Each categorical target gets its own LabelEncoder; the fitted encoders are kept
# in `encoders` so predicted class ids can be mapped back to label strings later.
# NOTE: the columns are overwritten in place, so re-running this cell would re-fit
# the encoders on the already-encoded integers — reload df before re-running.


encoders = {}

for col in ["eventClass", "eventDeviceCat", "eventOperation", "eventOutcome"]:
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col].astype(str))  # replace labels with integer class ids
    encoders[col] = enc  # keep the fitted encoder for inverse_transform

# Severity is not label-encoded; it is only cast to an integer dtype.
df["eventSeverity"] = df["eventSeverity"].astype(int)



# Pretrained DistilBERT tokenizer (uncased checkpoint).
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the ``event_text`` column of ``batch`` with the notebook-level tokenizer.

    Args:
        batch: Object whose ``"event_text"`` entry exposes ``.tolist()``
            (e.g. a DataFrame slice).

    Returns:
        The tokenizer output as PyTorch tensors, padded to the longest text in
        the batch and truncated to at most 128 tokens.
    """
    texts = batch["event_text"].tolist()
    options = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
    return tokenizer(texts, **options)

In [None]:
# Sanity check: after the encoding cell every target column should be integer-typed.
print(df.dtypes)

event_id           int64
event_text        object
eventClass         int64
eventDeviceCat     int64
eventOperation     int64
eventOutcome       int64
eventSeverity      int64
dtype: object


In [None]:
class LogDataset(Dataset):
    """Dataset pairing tokenized event text with its five target columns.

    All texts are tokenized once in the constructor (padded to a common length,
    truncated to 128 tokens), so every item has the same sequence length.
    """

    def __init__(self, df, tokenizer):
        """Tokenize ``df["event_text"]`` and capture the label columns as an array."""
        texts = list(df["event_text"])
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=128)

        label_columns = [
            "eventClass",
            "eventDeviceCat",
            "eventOperation",
            "eventOutcome",
            "eventSeverity",
        ]
        self.labels = df[label_columns].values

    def __len__(self):
        """Number of samples (one per label row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded inputs for row ``idx`` plus a float ``labels`` tensor."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        # Labels are stored as floats; consumers cast them back with .long().
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [None]:
# Hold out 20% of the rows for validation (fixed random_state for a reproducible
# split), then tokenize each split into its own LogDataset.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = LogDataset(train_df, tokenizer)
val_dataset = LogDataset(val_df, tokenizer)

In [None]:
class MultiOutputDistilBERT(nn.Module):
    """DistilBERT encoder with one linear classification head per target field.

    Args:
        model_name: Name or path passed to ``DistilBertModel.from_pretrained``.
        num_labels: Optional mapping from field name (``"eventClass"``,
            ``"eventDeviceCat"``, ``"eventOperation"``, ``"eventOutcome"``,
            ``"eventSeverity"``) to the number of classes of that head.
            Fields that are not listed keep the original sizing: the number of
            distinct values in the notebook-level ``df`` for the four encoded
            fields, and 6 for severity. Passing all five sizes lets the model
            be rebuilt (for example to load the saved state_dict) without the
            training DataFrame in memory.
    """

    def __init__(self, model_name, num_labels=None):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        sizes = dict(num_labels or {})

        def head(field):
            # An explicit size wins; otherwise fall back to the notebook-level df.
            if field in sizes:
                n_classes = sizes[field]
            else:
                n_classes = len(df[field].unique())
            return nn.Linear(hidden_size, n_classes)

        # Create one classifier head per field (attribute names are unchanged so
        # previously saved state_dicts still load).
        self.classifier_class = head("eventClass")
        self.classifier_device = head("eventDeviceCat")
        self.classifier_operation = head("eventOperation")
        self.classifier_outcome = head("eventOutcome")
        # Severity integers are used directly as class indices, so 6 logits cover
        # the values 0-5 (index 0 goes unused if severities really are 1-5).
        self.classifier_severity = nn.Linear(hidden_size, sizes.get("eventSeverity", 6))

    def forward(self, input_ids, attention_mask):
        """Return a dict of per-field logits for a batch of token ids.

        Every head reads the encoder's hidden state at the first token position.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]

        return {
            "eventClass": self.classifier_class(pooled_output),
            "eventDeviceCat": self.classifier_device(pooled_output),
            "eventOperation": self.classifier_operation(pooled_output),
            "eventOutcome": self.classifier_outcome(pooled_output),
            "eventSeverity": self.classifier_severity(pooled_output)
        }


In [None]:
def collate_fn(batch):
    """Stack a list of per-sample dicts into one dict of batched tensors.

    Args:
        batch: Sequence of items, each holding equally-shaped ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with the same three keys, each stacked along a new leading dimension.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {name: torch.stack([sample[name] for sample in batch]) for name in fields}

In [None]:
# Batches of 8 samples. Only the training loader shuffles; the validation loader
# keeps the dataset order, i.e. the row order of val_df.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#model init
# Build the multi-head model on top of pretrained DistilBERT, optimize all of its
# parameters (encoder and heads) with Adam at lr=2e-5, and move it to `device`.
model = MultiOutputDistilBERT("distilbert-base-uncased")
optimizer = optim.Adam(model.parameters(), lr=2e-5)
model.to(device)

MultiOutputDistilBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): 

In [None]:
# Fine-tune for 15 epochs. The objective is the unweighted sum of the five
# heads' cross-entropy losses; the mean batch loss is printed after each epoch.
model.train()
for epoch in range(15):
    total_loss = 0
    for batch in train_loader:
        # Move the whole batch onto the same device as the model.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)

        # Column k of `labels` holds the target for the k-th field listed here.
        loss = sum(
            F.cross_entropy(outputs[field], labels[:, column].long())
            for column, field in enumerate(
                ("eventClass", "eventDeviceCat", "eventOperation", "eventOutcome", "eventSeverity")
            )
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss / len(train_loader):.4f}")

Epoch 1 | Loss: 6.8198
Epoch 2 | Loss: 5.6404
Epoch 3 | Loss: 4.9196
Epoch 4 | Loss: 4.1138
Epoch 5 | Loss: 3.4601
Epoch 6 | Loss: 2.7350
Epoch 7 | Loss: 2.2702
Epoch 8 | Loss: 1.9231
Epoch 9 | Loss: 1.5582
Epoch 10 | Loss: 1.3902
Epoch 11 | Loss: 1.1912
Epoch 12 | Loss: 0.9551
Epoch 13 | Loss: 0.8061
Epoch 14 | Loss: 0.6942
Epoch 15 | Loss: 0.5996


In [None]:
# Create directory to store model
os.makedirs("Model", exist_ok=True)

# Save model weights (entire state_dict)
# Only the parameters are written, not the class itself: loading them back
# requires constructing MultiOutputDistilBERT again with the same head sizes.
torch.save(model.state_dict(), "Model/multioutput_distilbert.pt", _use_new_zipfile_serialization=True)

print("✅ Trained model saved as 'Model/multioutput_distilbert.pt'")

✅ Trained model saved as 'Model/multioutput_distilbert.pt'


In [None]:
# Saving tokenizer and label encoders
# The tokenizer files are written into the same "Model/" directory as the weights.
tokenizer.save_pretrained("Model/")


# Each fitted LabelEncoder is dumped with joblib to Model/<field>_encoder.pkl so
# predicted class ids can be decoded back to label strings outside this notebook.
for field, enc in encoders.items():
    joblib.dump(enc, f"Model/{field}_encoder.pkl")

print("✅ Tokenizer and label encoders saved too.")

✅ Tokenizer and label encoders saved too.


In [None]:

# Validating data and Logging prediction outputs
#
# Runs the model over the validation split without gradients, accumulates the
# summed cross-entropy loss, and records one result per sample (decoded labels
# plus the softmax confidence of each predicted class) to the log file and to
# output_with_confidence.json.

logging.basicConfig(
    filename="prediction_logs.txt",
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
)

# Fields whose predicted ids are decoded back to label strings; severity stays numeric.
categorical_fields = ["eventClass", "eventDeviceCat", "eventOperation", "eventOutcome"]

# val_loader is built with shuffle=False, so the n-th prediction belongs to the
# n-th row of val_df. Report the dataset's real event_id when the column exists,
# otherwise fall back to a 1-based running counter.
val_event_ids = val_df["event_id"].tolist() if "event_id" in val_df.columns else None

model.eval()
val_loss = 0
results = []

with torch.no_grad():
    for batch_idx, batch in enumerate(val_loader):
        # The model lives on `device`; inputs and labels must be moved there too,
        # otherwise the forward pass fails with a device mismatch on CUDA.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)

        # Calculate loss (same unweighted sum of per-field losses as in training)
        loss = (
            F.cross_entropy(outputs["eventClass"], labels[:, 0].long()) +
            F.cross_entropy(outputs["eventDeviceCat"], labels[:, 1].long()) +
            F.cross_entropy(outputs["eventOperation"], labels[:, 2].long()) +
            F.cross_entropy(outputs["eventOutcome"], labels[:, 3].long()) +
            F.cross_entropy(outputs["eventSeverity"], labels[:, 4].long())
        )
        val_loss += loss.item()

        # Get predictions & confidence (probability of the arg-max class)
        softmax_outputs = {
            k: F.softmax(v, dim=1) for k, v in outputs.items()
        }

        preds = {k: torch.argmax(v, dim=1).cpu().numpy() for k, v in outputs.items()}
        confs = {k: torch.max(softmax_outputs[k], dim=1).values.cpu().numpy() for k in outputs}

        # Decode each categorical field once per batch instead of once per row.
        decoded = {
            field: encoders[field].inverse_transform(preds[field])
            for field in categorical_fields
        }

        # Store results + log
        for i in range(len(preds["eventClass"])):
            position = batch_idx * val_loader.batch_size + i
            if val_event_ids is not None:
                event_id = val_event_ids[position]
            else:
                event_id = position + 1
            result = {
                "event_id": event_id,
                "eventClass": str(decoded["eventClass"][i]),
                "eventDeviceCat": str(decoded["eventDeviceCat"][i]),
                "eventOperation": str(decoded["eventOperation"][i]),
                "eventOutcome": str(decoded["eventOutcome"][i]),
                "eventSeverity": int(preds["eventSeverity"][i]),
                "confidence": {
                    "eventClass": float(confs["eventClass"][i]),
                    "eventDeviceCat": float(confs["eventDeviceCat"][i]),
                    "eventOperation": float(confs["eventOperation"][i]),
                    "eventOutcome": float(confs["eventOutcome"][i]),
                    "eventSeverity": float(confs["eventSeverity"][i]),
                }
            }
            results.append(result)

            # Log info
            logging.info(f"Event {event_id}: {json.dumps(result)}")


print("Validation Loss:", val_loss / len(val_loader))
with open("output_with_confidence.json", "w") as f:
    json.dump(results, f, indent=4)

print("✅ Predictions with confidence saved to output_with_confidence.json")
print("✅ Logs stored in prediction_logs.txt")


Validation Loss: 4.340741276741028
✅ Predictions with confidence saved to output_with_confidence.json
✅ Logs stored in prediction_logs.txt
