## data preprocessing

In [13]:
import pandas as pd
# Load the pre-processed train and test splits from their parquet files.
train_data, test_data = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/processed/train_data.parquet',
        'data/processed/test_data.parquet',
    )
)

In [14]:
# Preview the first rows of the training frame to sanity-check the loaded columns.
train_data.head()

Unnamed: 0,IncidentId,IncidentGrade,evidence_count,DetectorId_nunique,AlertTitle_nunique,DeviceId_nunique,Sha256_nunique,IpAddress_nunique,Url_nunique,AccountSid_nunique,...,EntityType_Url,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,SuspicionLevel_Incriminated,SuspicionLevel_Suspicious,LastVerdict_Malicious,LastVerdict_NoThreatsFound,LastVerdict_Other,LastVerdict_Suspicious
0,0,TruePositive,29997,6,6,1,1,874,1,148,...,0,9081,9084,20913,0.0,0.0,0.0,0.0,0.0,0.0
1,2,BenignPositive,20525,113,934,11,1881,3,3,7,...,4,320,5484,15041,0.0,20453.0,6645.0,4.0,0.0,13814.0
2,3,TruePositive,3,1,1,1,1,2,1,2,...,0,1,2,1,0.0,0.0,0.0,0.0,0.0,0.0
3,7,BenignPositive,12252,8,19,3,3,2474,3,1721,...,4,1737,3477,8775,0.0,10.0,1.0,10.0,0.0,13.0
4,8,TruePositive,6,2,2,1,1,1,1,3,...,0,2,4,2,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the grade -> integer mapping from the training split only, then reuse
# that same mapping for the test split so both share identical class ids.
label_encoder = LabelEncoder()
train_grades, test_grades = train_data["IncidentGrade"], test_data["IncidentGrade"]
train_data["label"] = label_encoder.fit_transform(train_grades)
test_data["label"] = label_encoder.transform(test_grades)

In [None]:
# Show the learned classes; the position of each name is the integer stored in "label".
label_encoder.classes_

array(['BenignPositive', 'FalsePositive', 'TruePositive'], dtype=object)

In [17]:
# Inspect how many training rows fall into each encoded class (class balance check).
train_data['label'].value_counts()

label
0    218131
1    135158
2     95612
Name: count, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

# Columns that must never reach the model: the incident key and both forms of the target.
NON_FEATURE_COLUMNS = ['IncidentId', 'IncidentGrade', 'label']

# Fix the feature set (and its order) once, from the training frame.
# Index.drop raises KeyError if any of the listed columns is missing.
feature_columns = train_data.columns.drop(NON_FEATURE_COLUMNS)

X = train_data[feature_columns].values
y = train_data["label"].values

# Hold out 10% of the training rows for validation, keeping class proportions.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=67
)

# Select the test features by name so they line up column-for-column with the
# training matrix. A missing column raises KeyError here instead of silently
# shifting features when the two parquet files differ in column order or content.
X_test = test_data[feature_columns].values
y_test = test_data["label"].values

In [19]:
# Report the (rows, features) shape of each split, one per line.
print(
    f"training shape : {X_train.shape}",
    f"validation shape : {X_validation.shape}",
    f"testing shape : {X_test.shape}",
    sep="\n",
)

training shape : (404010, 154)
validation shape : (44891, 154)
testing shape : (236267, 154)


In [20]:
from sklearn.preprocessing import StandardScaler
# Estimate mean/variance on the training split only, then apply that same
# transform to every split so no validation/test statistics leak into training.
# NOTE: the arrays are overwritten in place, so re-running this cell would
# re-fit the scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train, X_validation, X_test = (
    scaler.transform(split) for split in (X_train, X_validation, X_test)
)

In [21]:
import numpy as np
# Inverse-frequency class weights: n_samples / (n_classes * count_c).
# Classes with fewer training rows receive proportionally larger weights.
class_counts = np.bincount(y_train)
class_weights = y_train.size / (class_counts.size * class_counts)

## prepare the dataset

In [22]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [23]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [24]:
def _as_dataset(features, labels):
    """Wrap a feature matrix and a label vector as a TensorDataset.

    Features are copied into float32 tensors and labels into int64 (long)
    tensors.
    """
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )


train_dataset = _as_dataset(X_train, y_train)
validation_dataset = _as_dataset(X_validation, y_validation)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=2048)


## create the model

### create the residual blocks class

In [33]:
class ResidualBlock(nn.Module):
    """Residual block built from two BatchNorm -> SiLU -> Linear stages.

    The output is the sum of the transformed input and a skip path. The skip
    path is an identity when ``in_dim == out_dim`` and a linear projection to
    ``out_dim`` otherwise, so both summands always have ``out_dim`` features.

    Args:
        in_dim: number of input features.
        out_dim: number of output features.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        # Main path: normalise/activate before each linear layer.
        main_path = [
            nn.BatchNorm1d(in_dim),
            nn.SiLU(),
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.SiLU(),
            nn.Linear(out_dim, out_dim),
        ]
        self.block = nn.Sequential(*main_path)

        # The attribute spelling ("shourtcut") is kept on purpose: it is part
        # of the module's parameter / state_dict key names.
        if in_dim == out_dim:
            self.shourtcut = nn.Identity()
        else:
            self.shourtcut = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        """Return ``block(x) + shourtcut(x)`` for a batch ``x`` with ``in_dim`` features."""
        transformed = self.block(x)
        skip = self.shourtcut(x)
        return transformed + skip

### create the model class

In [34]:
class ResidualModel(nn.Module):
    """Residual MLP classifier: linear stem, five residual blocks, linear head.

    The stem projects the input to 512 features, the residual trunk narrows
    512 -> 256 -> 128, and the head maps the final 128 features to
    ``n_classes`` raw scores (no softmax is applied here).

    Args:
        in_dim: number of input features.
        n_classes: number of output classes.
    """

    def __init__(self, in_dim, n_classes):
        super().__init__()
        self.input = nn.Linear(in_dim, 512)

        # (in_features, out_features) of each residual block, in order.
        trunk_dims = [(512, 512), (512, 256), (256, 256), (256, 128), (128, 128)]
        self.blocks = nn.Sequential(
            *(ResidualBlock(width_in, width_out) for width_in, width_out in trunk_dims)
        )

        self.head = nn.Sequential(
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.Linear(128, n_classes),
        )
        self._init_weights()

    def _init_weights(self):
        """Re-initialise every Linear layer: Kaiming-normal weights, zero biases."""
        linear_layers = (
            module for module in self.modules() if isinstance(module, nn.Linear)
        )
        for linear in linear_layers:
            nn.init.kaiming_normal_(linear.weight, nonlinearity="relu")
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Run stem -> residual trunk -> head and return the class scores."""
        return self.head(self.blocks(self.input(x)))

### create the model

In [35]:
# Size the network from the data instead of hard-coding it:
# the input width is the number of feature columns, and the output width is
# the number of classes the fitted label encoder produced.
input_dim = X_train.shape[1]
num_classes = len(label_encoder.classes_)

model = ResidualModel(input_dim, num_classes).to(device)

### print the number of learnable parameters

In [37]:
# Count only parameters that receive gradients, i.e. the learnable ones;
# frozen parameters (requires_grad=False) are excluded.
sum(parameter.numel() for parameter in model.parameters() if parameter.requires_grad)

1186563