## data preprocessing

In [1]:
import pandas as pd
# Load the processed train and test tables from their parquet files.
train_path = 'data/processed/train_data.parquet'
test_path = 'data/processed/test_data.parquet'
train_data, test_data = pd.read_parquet(train_path), pd.read_parquet(test_path)

In [2]:
# Preview the first rows to sanity-check the loaded columns and dtypes.
train_data.head()

Unnamed: 0,IncidentId,IncidentGrade,evidence_count,DetectorId_nunique,AlertTitle_nunique,DeviceId_nunique,Sha256_nunique,IpAddress_nunique,Url_nunique,AccountSid_nunique,...,EntityType_Url,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,SuspicionLevel_Incriminated,SuspicionLevel_Suspicious,LastVerdict_Malicious,LastVerdict_NoThreatsFound,LastVerdict_Other,LastVerdict_Suspicious
0,0,TruePositive,29997,6,6,1,1,874,1,148,...,0,9081,9084,20913,0.0,0.0,0.0,0.0,0.0,0.0
1,2,BenignPositive,20525,113,934,11,1881,3,3,7,...,4,320,5484,15041,0.0,20453.0,6645.0,4.0,0.0,13814.0
2,3,TruePositive,3,1,1,1,1,2,1,2,...,0,1,2,1,0.0,0.0,0.0,0.0,0.0,0.0
3,7,BenignPositive,12252,8,19,3,3,2474,3,1721,...,4,1737,3477,8775,0.0,10.0,1.0,10.0,0.0,13.0
4,8,TruePositive,6,2,2,1,1,1,1,3,...,0,2,4,2,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.preprocessing import LabelEncoder
# Learn the grade -> integer mapping from the training split only, then apply
# that same mapping to both splits so their label ids agree.
label_encoder = LabelEncoder()
label_encoder.fit(train_data["IncidentGrade"])
train_data["label"] = label_encoder.transform(train_data["IncidentGrade"])
test_data["label"] = label_encoder.transform(test_data["IncidentGrade"])

In [4]:
# Position i in this array is the grade that was encoded as integer label i.
label_encoder.classes_

array(['BenignPositive', 'FalsePositive', 'TruePositive'], dtype=object)

In [5]:
# Inspect the class balance of the training labels.
train_data['label'].value_counts()

label
0    218131
1    135158
2     95612
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Identifier / target columns that must not be fed to the model.
non_feature_columns = ['IncidentId', 'IncidentGrade', 'label']

# The training frame defines the feature set and its order. Selecting the test
# features by this same list guarantees that column i means the same thing in
# every split, and raises a KeyError if the test frame lacks a training column
# instead of silently misaligning features.
feature_columns = train_data.columns.drop(non_feature_columns)

X = train_data[feature_columns].values
y = train_data["label"].values

# Hold out 10% of the training data for validation, stratified on the label so
# each split keeps the same class proportions; fixed seed for repeatability.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.1, stratify=y, random_state=67)

X_test = test_data[feature_columns].values
y_test = test_data["label"].values

In [7]:
# Report (rows, features) for every split.
for split_name, split_array in (
    ("training", X_train),
    ("validation", X_validation),
    ("testing", X_test),
):
    print(f"{split_name} shape : {split_array.shape}")

training shape : (404010, 154)
validation shape : (44891, 154)
testing shape : (236267, 154)


In [8]:
from sklearn.preprocessing import StandardScaler
# Fit the scaling statistics on the training split only, then apply the same
# transform to every split so validation/test data never influence the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_validation, X_test = (
    scaler.transform(split) for split in (X_train, X_validation, X_test)
)

In [9]:
import numpy as np
# Inverse-frequency class weights: weight_c = N / (K * count_c), where N is the
# number of training samples and K the number of classes seen by bincount.
class_counts = np.bincount(y_train)
class_weights = y_train.size / (class_counts.size * class_counts)

## prepare the dataset

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [12]:
def _to_dataset(features, labels):
    """Wrap a (features, labels) pair as float32 / int64 tensors in a TensorDataset."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )


train_dataset = _to_dataset(X_train, y_train)
validation_dataset = _to_dataset(X_validation, y_validation)
test_dataset = _to_dataset(X_test, y_test)

# Only the training loader shuffles; validation and test keep their original
# order so predictions line up with y_validation / y_test.
batch_size = 2048
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


## create the model

### create the residual block class

In [13]:
class ResidualBlock(nn.Module):
    """Residual block: two (BatchNorm -> SiLU -> Linear) stages plus a skip path.

    The skip path is the identity when ``in_dim == out_dim`` and a linear
    projection otherwise, so both paths produce ``out_dim`` features.

    Args:
        in_dim: Number of input features.
        out_dim: Number of output features.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        residual_layers = [
            nn.BatchNorm1d(in_dim),
            nn.SiLU(),
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.SiLU(),
            nn.Linear(out_dim, out_dim),
        ]
        self.block = nn.Sequential(*residual_layers)
        # The attribute name (spelling included) is part of the state_dict
        # keys, so it is kept as-is to stay compatible with saved checkpoints.
        if in_dim == out_dim:
            self.shourtcut = nn.Identity()
        else:
            self.shourtcut = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Return the residual branch output added to the skip path output."""
        residual = self.block(x)
        skip = self.shourtcut(x)
        return residual + skip

### create the model class

In [14]:
class ResidualModel(nn.Module):
    """Residual MLP classifier.

    A linear stem projects the input to 512 features, five residual blocks
    narrow it 512 -> 512 -> 256 -> 256 -> 128 -> 128, and a
    BatchNorm -> SiLU -> Linear head produces ``n_classes`` logits.

    Args:
        in_dim: Number of input features.
        n_classes: Number of output logits.
    """

    def __init__(self, in_dim, n_classes):
        super().__init__()
        # Consecutive pairs of these widths are the (in, out) sizes of the blocks.
        widths = (512, 512, 256, 256, 128, 128)
        self.input = nn.Linear(in_dim, widths[0])
        self.blocks = nn.Sequential(
            *(ResidualBlock(w_in, w_out) for w_in, w_out in zip(widths, widths[1:]))
        )
        self.head = nn.Sequential(
            nn.BatchNorm1d(widths[-1]),
            nn.SiLU(),
            nn.Linear(widths[-1], n_classes),
        )
        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal init for every Linear weight; Linear biases set to zero."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for linear in linear_layers:
            nn.init.kaiming_normal_(linear.weight, nonlinearity="relu")
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Map a batch of feature vectors to class logits."""
        hidden = self.blocks(self.input(x))
        return self.head(hidden)

### create the model

In [15]:
# Input width follows the feature matrix; output width follows the fitted label
# encoder, so the classifier head cannot drift from the actual set of grades.
input_dim = X_train.shape[1]
num_classes = len(label_encoder.classes_)

model = ResidualModel(input_dim, num_classes).to(device)

### print the number of learnable parameters

In [16]:
# Count only parameters that receive gradients, i.e. the learnable ones.
sum(parameter.numel() for parameter in model.parameters() if parameter.requires_grad)

1186563

## create the loss, optimizer and the learning rate decay scheduler

In [17]:
# Cross-entropy loss weighted per class with the weights computed from the
# training label counts.
loss_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)
critertion = nn.CrossEntropyLoss(weight=loss_weights)

# AdamW at its default learning rate with a light weight decay.
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.0001)

# Cosine annealing with warm restarts: the first cycle lasts 10 epochs
# (T_0 = 10) and every following cycle is twice as long as the one before it
# (T_mult = 2).
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,
    T_mult=2,
)

## train the model

In [18]:
from sklearn.metrics import f1_score

import os

epochs = 200    # upper bound on the number of training epochs
patience = 20   # stop after this many consecutive epochs without a new best F1

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first save (e.g. on a fresh checkout).
checkpoint_path = 'artifacts/best_model.pth'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

best_f1 = 0.0
patience_now = 0
train_history = []        # per-epoch mean training loss
validation_history = []   # per-epoch validation macro-F1

for epoch in range(1, epochs+1):
    # train
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        pred = model(X_batch)
        loss = critertion(pred, y_batch)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight by batch size so the epoch value is a per-sample mean even
        # though the last batch can be smaller.
        train_loss += loss.item() * len(y_batch)
    # The scheduler is stepped once per epoch, not once per batch.
    scheduler.step()
    train_loss /= len(train_dataset)
    train_history.append(train_loss)

    # validate
    model.eval()
    all_preds = []
    validation_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in validation_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            pred = model(X_batch)
            loss = critertion(pred, y_batch)
            validation_loss += loss.item() * len(y_batch)
            pred_out = pred.argmax(dim=1).cpu().numpy()
            all_preds.extend(pred_out)
    validation_loss /= len(validation_dataset)
    # validation_loader is not shuffled, so all_preds is aligned with y_validation.
    valildation_f1_score = f1_score(y_validation, all_preds, average="macro")
    validation_history.append(valildation_f1_score)
    print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {validation_loss:.4f} | Val Macro-F1: {valildation_f1_score:.4f}")

    # early stopping: checkpoint on every new best macro-F1, otherwise count
    # the stale epoch and stop once `patience` of them have accumulated.
    if valildation_f1_score > best_f1:
        best_f1 = valildation_f1_score
        patience_now = 0
        torch.save(model.state_dict(), checkpoint_path)
        print(f">>>> New best saved (val macro-F1: {best_f1:.4f})")
    else:
        patience_now += 1
        if patience_now >= patience:
            print(f"\nEarly stopping at epoch {epoch} — "f"no improvement for {patience} epochs.")
            break

# print best score after training
print(f"\nBest Val Macro-F1: {best_f1:.4f}")

Epoch 001 | Train Loss: 0.9332 | Val Loss: 0.8875 | Val Macro-F1: 0.5868
>>>> New best saved (val macro-F1: 0.5868)
Epoch 002 | Train Loss: 0.8512 | Val Loss: 0.8756 | Val Macro-F1: 0.6116
>>>> New best saved (val macro-F1: 0.6116)
Epoch 003 | Train Loss: 0.8242 | Val Loss: 0.8521 | Val Macro-F1: 0.6311
>>>> New best saved (val macro-F1: 0.6311)
Epoch 004 | Train Loss: 0.8059 | Val Loss: 0.8768 | Val Macro-F1: 0.6168
Epoch 005 | Train Loss: 0.7939 | Val Loss: 0.8376 | Val Macro-F1: 0.6449
>>>> New best saved (val macro-F1: 0.6449)
Epoch 006 | Train Loss: 0.7811 | Val Loss: 0.8709 | Val Macro-F1: 0.6319
Epoch 007 | Train Loss: 0.7726 | Val Loss: 0.8700 | Val Macro-F1: 0.5875
Epoch 008 | Train Loss: 0.7661 | Val Loss: 0.8362 | Val Macro-F1: 0.5905
Epoch 009 | Train Loss: 0.7602 | Val Loss: 0.8747 | Val Macro-F1: 0.5794
Epoch 010 | Train Loss: 0.7581 | Val Loss: 0.8473 | Val Macro-F1: 0.5924
Epoch 011 | Train Loss: 0.7790 | Val Loss: 0.8593 | Val Macro-F1: 0.5623
Epoch 012 | Train Loss: 0

## evaluation

In [19]:
# Restore the best checkpoint written during training.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, and avoids the torch.load FutureWarning.
# map_location lets a checkpoint saved on GPU load on whichever device is in use.
best_state = torch.load('artifacts/best_model.pth', map_location=device, weights_only=True)
model.load_state_dict(best_state)
model.eval()

all_preds = []
with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        logits = model(X_batch)
        preds  = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)

# test_loader is not shuffled, so all_preds is aligned with y_test.
test_f1 = f1_score(y_test, all_preds, average='macro')
print(f"Test Macro-F1: {test_f1:.4f}  (paper baseline: 0.87)")

  model.load_state_dict(torch.load('artifacts/best_model.pth'))


Test Macro-F1: 0.6572  (paper baseline: 0.87)


## save the model configuration

In [20]:
import json
# Collect what is needed to rebuild the model and interpret its outputs,
# together with the headline validation and test scores.
model_config = {
    'input_dim':   input_dim,
    'num_classes': num_classes,
    'classes':     list(label_encoder.classes_),
    'best_val_f1': best_f1,
    'test_f1':     test_f1,
}
with open('artifacts/model_config.json', 'w') as config_file:
    json.dump(model_config, config_file)

print("Config saved.")


Config saved.
