## data preprocessing

In [1]:
import pandas as pd
# Pre-processed splits, listed in the order they are unpacked below
# (train, test, validation).
split_paths = (
    'data/processed/train_data.parquet',
    'data/processed/test_data.parquet',
    'data/processed/validation_data.parquet',
)
train_data, test_data, validation_data = map(pd.read_parquet, split_paths)

In [2]:
# Preview the first rows of the training frame to sanity-check the loaded columns.
train_data.head()

Unnamed: 0,IncidentId,IncidentGrade,evidence_count,DetectorId_nunique,AlertTitle_nunique,DeviceId_nunique,Sha256_nunique,IpAddress_nunique,Url_nunique,AccountSid_nunique,...,Roles_Compromised,Roles_Contextual,Roles_Destination,Roles_Other,Roles_Source,Roles_Suspicious,org_rate_BenignPositive,org_rate_FalsePositive,org_rate_TruePositive,org_incident_count
0,0,TruePositive,29997,6,6,1,1,874,1,148,...,0.0,0.0,0.0,0.0,0.0,0.0,8.9e-05,0.009365,0.990546,264.0
1,2,BenignPositive,20525,113,934,11,1881,3,3,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.986812,0.013188,0.0,163.0
2,3,TruePositive,3,1,1,1,1,2,1,2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.003378,0.274071,0.722551,511.0
3,7,BenignPositive,12252,8,19,3,3,2474,3,1721,...,0.0,1.0,0.0,0.0,0.0,0.0,0.153124,0.0,0.846876,1177.0
4,8,TruePositive,6,2,2,1,1,1,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.217158,0.719717,0.063125,269.0


In [3]:
from sklearn.preprocessing import LabelEncoder
# Learn the IncidentGrade -> integer mapping from the training split only, then
# apply that same mapping to every split so label ids agree everywhere.
label_encoder = LabelEncoder().fit(train_data["IncidentGrade"])
for frame in (train_data, validation_data, test_data):
    # Adds a "label" column to the frame in place.
    frame["label"] = label_encoder.transform(frame["IncidentGrade"])

In [4]:
# Class names known to the fitted encoder; the position of a name in this array
# is the integer id stored in the "label" column.
label_encoder.classes_

array(['BenignPositive', 'FalsePositive', 'TruePositive'], dtype=object)

In [5]:
# Count training rows per encoded label to see how imbalanced the classes are.
train_data['label'].value_counts()

label
0    196317
1    121642
2     86051
Name: count, dtype: int64

In [6]:
# Columns that identify the incident or carry the target; every remaining
# column is used as a model input.
non_feature_columns = ['IncidentId', 'IncidentGrade', 'label']


def _features_and_labels(frame):
    """Split one DataFrame into (feature array, encoded-label array)."""
    return frame.drop(non_feature_columns, axis=1).values, frame["label"].values


X_train, y_train = _features_and_labels(train_data)
X_validation, y_validation = _features_and_labels(validation_data)
X_test, y_test = _features_and_labels(test_data)

In [7]:
# One line per split: (rows, feature columns) of each feature matrix.
print(
    f"training shape : {X_train.shape}",
    f"validation shape : {X_validation.shape}",
    f"testing shape : {X_test.shape}",
    sep="\n",
)

training shape : (404010, 176)
validation shape : (44891, 176)
testing shape : (236267, 176)


In [8]:
from sklearn.preprocessing import StandardScaler
# Fit the scaling statistics on the training split only, then apply the same
# transform to every split so no validation/test information leaks into them.
# NOTE: the X_* arrays are overwritten here, so re-running this cell without
# re-running the split cell above would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train, X_validation, X_test = (
    scaler.transform(split) for split in (X_train, X_validation, X_test)
)

In [9]:
import numpy as np
# Weight of class c = total_rows / (distinct_labels * rows_in_c), so classes
# with fewer training rows get a proportionally larger weight.
total_rows = len(y_train)
class_counts = np.bincount(y_train)
distinct_labels = len(class_counts)
class_weights = total_rows / (distinct_labels * class_counts)

## prepare the dataset

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [12]:
def _as_tensor_dataset(features, labels):
    """Wrap NumPy features/labels as a TensorDataset of float32 inputs and int64 targets."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )


train_dataset = _as_tensor_dataset(X_train, y_train)
validation_dataset = _as_tensor_dataset(X_validation, y_validation)
test_dataset = _as_tensor_dataset(X_test, y_test)

# Same batch size for every split; only the training loader reshuffles each epoch,
# so validation/test predictions stay in dataset order.
loader_batch_size = 4096
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=loader_batch_size)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)


## create the model

### create the residual blocks class

In [13]:
class ResidualBlock(nn.Module):
    """Two-layer MLP block whose output is added to a skip path.

    The main path is BatchNorm -> SiLU -> Linear(in_dim, out_dim) ->
    BatchNorm -> SiLU -> Dropout -> Linear(out_dim, out_dim).  The skip path
    is the identity when the width is unchanged and a learned linear
    projection otherwise, so both paths always produce ``out_dim`` features.

    Args:
        in_dim: Number of input features.
        out_dim: Number of output features.
        dropout: Dropout probability applied before the second linear layer.
    """

    def __init__(self, in_dim, out_dim, dropout=0.1):
        super().__init__()
        main_path = [
            nn.BatchNorm1d(in_dim),
            nn.SiLU(),
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(out_dim, out_dim),
        ]
        self.block = nn.Sequential(*main_path)
        # The attribute name (including its spelling) is part of the saved
        # state_dict keys, so it is kept as-is.
        if in_dim == out_dim:
            self.shourtcut = nn.Identity()
        else:
            self.shourtcut = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Return main-path output plus the (possibly projected) input."""
        transformed = self.block(x)
        skip = self.shourtcut(x)
        return transformed + skip

### create the model class

In [14]:
class ResidualModel(nn.Module):
    """Residual MLP classifier: input projection, five residual blocks, linear head.

    The input is projected to 512 features, narrowed through residual blocks
    of width 512 -> 512 -> 256 -> 256 -> 128 -> 128, and mapped to one logit
    per class by a BatchNorm -> SiLU -> Linear head.

    Args:
        in_dim: Number of input features per sample.
        n_classes: Number of output logits.
    """

    def __init__(self, in_dim, n_classes):
        super().__init__()
        self.input = nn.Linear(in_dim, 512)
        # (in_width, out_width) of each residual block, in forward order.
        block_widths = [(512, 512), (512, 256), (256, 256), (256, 128), (128, 128)]
        self.blocks = nn.Sequential(
            *[ResidualBlock(width_in, width_out) for width_in, width_out in block_widths]
        )
        self.head = nn.Sequential(
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.Linear(128, n_classes),
        )
        self._init_weights()

    def _init_weights(self):
        """Re-initialise every Linear layer: Kaiming-normal weights, zero biases."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for linear in linear_layers:
            # The gain is computed for "relu" even though the network's
            # activations are SiLU.
            nn.init.kaiming_normal_(linear.weight, nonlinearity="relu")
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Return raw class logits (no softmax applied) for a batch of inputs."""
        hidden = self.input(x)
        hidden = self.blocks(hidden)
        return self.head(hidden)

### create the model

In [15]:
# The feature count comes from the prepared training matrix and the class count
# from the fitted label encoder, so the output layer always matches the labels
# actually present instead of relying on a hard-coded 3.
input_dim = X_train.shape[1]
num_classes = len(label_encoder.classes_)

model = ResidualModel(input_dim, num_classes).to(device)

### print the number of learnable parameters

In [16]:
# Total number of scalar values across all tensors returned by model.parameters().
sum(parameter.numel() for parameter in model.parameters())

1197827

## create the loss, optimizer and the learning rate decay scheduler

In [17]:
# Class-weighted cross entropy with label smoothing; the weight tensor must
# live on the same device as the logits.
loss_class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
critertion = nn.CrossEntropyLoss(weight=loss_class_weights, label_smoothing=0.1)

# AdamW with a small weight decay; the learning rate is left at the
# optimizer's default.
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.0001)

# Halve the learning rate when the monitored metric (mode='max', i.e. higher is
# better) has not improved for 20 scheduler steps, never going below 1e-6.
plateau_settings = dict(mode='max', factor=0.5, patience=20, min_lr=1e-6)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **plateau_settings)

## train the model

In [18]:
import os

from sklearn.metrics import f1_score

epochs = 1000   # upper bound on epochs; early stopping usually ends training sooner
patience = 80   # epochs without a new best validation macro-F1 before stopping

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first "new best" save is attempted.
os.makedirs('artifacts', exist_ok=True)

best_f1 = 0.0
patience_now = 0
train_history = []        # per-epoch training loss
validation_history = []   # per-epoch validation macro-F1

for epoch in range(1, epochs+1):
    # ---- train for one pass over the training loader ----
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        pred = model(X_batch)
        loss = critertion(pred, y_batch)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Scale the batch loss by the batch size so the epoch figure is
        # normalised by the number of samples rather than the number of batches.
        train_loss += loss.item() * len(y_batch)
    train_loss /= len(train_dataset)
    train_history.append(train_loss)

    # ---- validate: loss plus macro-F1 over the whole validation split ----
    model.eval()
    all_preds = []
    validation_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in validation_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            pred = model(X_batch)
            loss = critertion(pred, y_batch)
            validation_loss += loss.item() * len(y_batch)
            pred_out = pred.argmax(dim=1).cpu().numpy()
            all_preds.extend(pred_out)
    validation_loss /= len(validation_dataset)
    # The validation loader is not shuffled, so all_preds lines up with y_validation.
    valildation_f1_score = f1_score(y_validation, all_preds, average="macro")
    # The scheduler monitors validation macro-F1.
    scheduler.step(valildation_f1_score)
    validation_history.append(valildation_f1_score)
    print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {validation_loss:.4f} | Val Macro-F1: {valildation_f1_score:.4f}")

    # ---- checkpoint on improvement, otherwise count towards early stopping ----
    if valildation_f1_score > best_f1:
        best_f1 = valildation_f1_score
        patience_now = 0
        torch.save(model.state_dict(), 'artifacts/best_model.pth')
        print(f">>>> New best saved (val macro-F1: {best_f1:.4f})")
    else:
        patience_now += 1
        if patience_now >= patience:
            print(f"\nEarly stopping at epoch {epoch} — "f"no improvement for {patience} epochs.")
            break

# Report the best validation score reached during training.
print(f"\nBest Val Macro-F1: {best_f1:.4f}")

Epoch 001 | Train Loss: 0.5888 | Val Loss: 0.5678 | Val Macro-F1: 0.8470
>>>> New best saved (val macro-F1: 0.8470)
Epoch 002 | Train Loss: 0.5568 | Val Loss: 0.5599 | Val Macro-F1: 0.8477
>>>> New best saved (val macro-F1: 0.8477)
Epoch 003 | Train Loss: 0.5434 | Val Loss: 0.5483 | Val Macro-F1: 0.8550
>>>> New best saved (val macro-F1: 0.8550)
Epoch 004 | Train Loss: 0.5319 | Val Loss: 0.5402 | Val Macro-F1: 0.8647
>>>> New best saved (val macro-F1: 0.8647)
Epoch 005 | Train Loss: 0.5256 | Val Loss: 0.5340 | Val Macro-F1: 0.8699
>>>> New best saved (val macro-F1: 0.8699)
Epoch 006 | Train Loss: 0.5209 | Val Loss: 0.5264 | Val Macro-F1: 0.8714
>>>> New best saved (val macro-F1: 0.8714)
Epoch 007 | Train Loss: 0.5173 | Val Loss: 0.5216 | Val Macro-F1: 0.8762
>>>> New best saved (val macro-F1: 0.8762)
Epoch 008 | Train Loss: 0.5122 | Val Loss: 0.5174 | Val Macro-F1: 0.8779
>>>> New best saved (val macro-F1: 0.8779)
Epoch 009 | Train Loss: 0.5089 | Val Loss: 0.5198 | Val Macro-F1: 0.8692

## evaluation

In [19]:
# Restore the best checkpoint written during training.  map_location remaps the
# saved tensors onto the current device, so a checkpoint saved on a GPU can
# still be loaded when this cell runs on a CPU-only kernel.
best_state = torch.load(
    'artifacts/best_model.pth',
    map_location=device,
    weights_only=True,
)
model.load_state_dict(best_state)
model.eval()

# Collect the predicted class for every test row; the test loader is not
# shuffled, so the predictions line up with y_test.
all_preds = []
with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        logits = model(X_batch)
        preds  = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)

test_f1 = f1_score(y_test, all_preds, average='macro')
print(f"Test Macro-F1: {test_f1:.4f}  (paper baseline: 0.87)")

Test Macro-F1: 0.9082  (paper baseline: 0.87)


## save the model configuration

In [20]:
import json
# Persist what is needed to rebuild the network and interpret its outputs,
# together with the scores reached in this run.
with open('artifacts/model_config.json', 'w') as config_file:
    model_config = {
        'input_dim':   input_dim,
        'num_classes': num_classes,
        'classes':     list(label_encoder.classes_),
        'best_val_f1': best_f1,
        'test_f1':     test_f1,
    }
    json.dump(model_config, config_file)

print("Config saved.")


Config saved.
