## Import libraries and download dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
import os
import numpy as np
from itertools import combinations, product

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhoogla/cicidscollection")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/dhoogla/cicidscollection?dataset_version_number=2...


100%|██████████| 825M/825M [00:30<00:00, 28.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/dhoogla/cicidscollection/versions/2


In [3]:

files = os.listdir(path)

for file in files:
  print(file)

cic-collection.parquet


In [4]:
path = "/root/.cache/kagglehub/datasets/dhoogla/cicidscollection/versions/2/cic-collection.parquet"
print(path)

/root/.cache/kagglehub/datasets/dhoogla/cicidscollection/versions/2/cic-collection.parquet


In [5]:
df = pd.read_parquet(path)

In [6]:
print(df.head())

   Flow Duration  Total Fwd Packets  Total Backward Packets  \
0              4                  2                       0   
1              1                  2                       0   
2              3                  2                       0   
3              1                  2                       0   
4            609                  7                       4   

   Fwd Packets Length Total  Bwd Packets Length Total  Fwd Packet Length Max  \
0                      12.0                       0.0                    6.0   
1                      12.0                       0.0                    6.0   
2                      12.0                       0.0                    6.0   
3                      12.0                       0.0                    6.0   
4                     484.0                     414.0                  233.0   

   Fwd Packet Length Mean  Fwd Packet Length Std  Bwd Packet Length Max  \
0                 6.00000               0.000000                 

## Cleaning dataset

In [7]:
def clean_df(df):
    # Remove the space before each feature names
    df.columns = df.columns.str.strip()
    print('dataset shape', df.shape)

    num = df._get_numeric_data()
    num[num < 0] = 0

    zero_variance_cols = []
    for col in df.columns:
        if len(df[col].unique()) == 1:
            zero_variance_cols.append(col)
    df.drop(zero_variance_cols, axis = 1, inplace = True)
    print('zero variance columns', zero_variance_cols, 'dropped')
    print('shape after removing zero variance columns:', df.shape)

    df.replace([np.inf, -np.inf], np.nan, inplace = True)
    print(df.isna().any(axis = 1).sum(), 'rows dropped')
    df.dropna(inplace = True)
    print('shape after removing nan:', df.shape)

    # Drop duplicate rows
    df.drop_duplicates(inplace = True)
    print('shape after dropping duplicates:', df.shape)

    column_pairs = [(i, j) for i, j in combinations(df, 2) if df[i].equals(df[j])]
    ide_cols = []
    for column_pair in column_pairs:
        ide_cols.append(column_pair[1])
    df.drop(ide_cols, axis = 1, inplace = True)
    print('columns which have identical values', column_pairs, 'dropped')
    print('shape after removing identical value columns:', df.shape)
    return df
df = clean_df(df)

dataset shape (9167581, 59)
zero variance columns [] dropped
shape after removing zero variance columns: (9167581, 59)
0 rows dropped
shape after removing nan: (9167581, 59)
shape after dropping duplicates: (9162310, 59)
columns which have identical values [('Total Fwd Packets', 'Subflow Fwd Packets'), ('Total Backward Packets', 'Subflow Bwd Packets'), ('Fwd Packet Length Mean', 'Avg Fwd Segment Size'), ('Bwd Packet Length Mean', 'Avg Bwd Segment Size')] dropped
shape after removing identical value columns: (9162310, 55)


## Explore and preprocess data

In [8]:
print(df.columns)

Index(['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
       'Fwd Packets Length Total', 'Bwd Packets Length Total',
       'Fwd Packet Length Max', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count',
       'URG Flag Count', 'Avg Packet Size', 'Subflow Fwd Bytes',
       'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes',
       'Fwd Act Data Packets', 'Fwd Seg Size Min', 

In [9]:
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Benign,7185877
DDoS-LOIC-HTTP,575364
DoS-Hulk,318740
DDoS-HOIC,198861
Botnet,145968
DDoS,128062
DDoS-NTP,121102
Bruteforce-SSH,97260
DDoS-TFTP,96488
Infiltration,94857


In [13]:
# Separate features and labels from the original DataFrame
features = df.drop(['Label', 'ClassLabel'], axis=1)
labels = df['Label']

# Split data into training and testing sets using raw data
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42)

# Further split the raw test set into local test and demo sets
X_test, demo_test, y_test, demo_label = train_test_split(
    X_test, y_test, test_size=0.001, random_state=42)

# Download demo data
demo_dataframe = pd.DataFrame(demo_test)

# Calculate the midpoint of the demo dataframe
mid_index = len(demo_dataframe) // 2

# Split the dataframe into two halves using iloc
demo_dataframe_1 = demo_dataframe.iloc[:mid_index]
demo_dataframe_2 = demo_dataframe.iloc[mid_index:]

# Save each half to separate CSV files
demo_dataframe_1.to_csv("demo_data_1.csv", index=False)
demo_dataframe_2.to_csv("demo_data_2.csv", index=False)

In [11]:
# Scale and normalize the training and local test data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One-hot encode the labels for training and local test data
encoder = OneHotEncoder(sparse_output=False)
y_train = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test = encoder.fit_transform(y_test.values.reshape(-1, 1))

# Get the order of label encodings to map out later
print(encoder.get_feature_names_out())

['x0_Benign' 'x0_Botnet' 'x0_Bruteforce-FTP' 'x0_Bruteforce-SSH' 'x0_DDoS'
 'x0_DDoS-DNS' 'x0_DDoS-Ddossim' 'x0_DDoS-HOIC' 'x0_DDoS-LDAP'
 'x0_DDoS-LOIC-HTTP' 'x0_DDoS-MSSQL' 'x0_DDoS-NTP' 'x0_DDoS-NetBIOS'
 'x0_DDoS-SNMP' 'x0_DDoS-Slowloris' 'x0_DDoS-Syn' 'x0_DDoS-TFTP'
 'x0_DDoS-UDP' 'x0_DDoS-UDPLag' 'x0_DoS-Goldeneye' 'x0_DoS-Heartbleed'
 'x0_DoS-Hulk' 'x0_DoS-Rudy' 'x0_DoS-Slowbody' 'x0_DoS-Slowheaders'
 'x0_DoS-Slowhttptest' 'x0_DoS-Slowloris' 'x0_DoS-Slowread'
 'x0_Infiltration' 'x0_Portscan' 'x0_Webattack-SQLi' 'x0_Webattack-XSS'
 'x0_Webattack-bruteforce']


In [12]:
len(X_train), len(y_train), len(X_test), len(y_test), len(features), len(labels)

(7329848, 7329848, 1830629, 1830629, 9162310, 9162310)

## Train the LSTM model and make predictions

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout1 = nn.Dropout(0.2)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        x = self.fc(x[:, -1, :])  # Using the last time-step output
        return x

# Reshape tensors on CPU
X_train_tensor = X_train_tensor.reshape((X_train_tensor.shape[0], 1, X_train_tensor.shape[1]))
X_test_tensor = X_test_tensor.reshape((X_test_tensor.shape[0], 1, X_test_tensor.shape[1]))

# Create DataLoader for CPU tensors
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

# Initialize the model and move it to GPU
input_size = X_train_tensor.shape[2]
hidden_size = 128
output_size = y_train_tensor.shape[1]
model = LSTMModel(input_size, hidden_size, output_size).to(device)

# Use BCEWithLogitsLoss that is safe for autocasting
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scaler = torch.amp.GradScaler()

In [None]:
print(f"Hidden size: {hidden_size}, input size: {input_size}, output size: {output_size}")

Hidden size: 128, input size: 53, output size: 33


In [None]:
# Training loop with mixed precision and DataLoader batch transfer to GPU
epochs = 6
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    for batch_x, batch_y in train_loader:
        # Move batch data to GPU
        batch_x = batch_x.to(device, non_blocking=True)
        batch_y = batch_y.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type=device.type):
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        epoch_loss += loss.item()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss / len(train_loader):.4f}')

In [None]:
# Create DataLoader for test dataset
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)

model.eval()
all_logits = []
with torch.no_grad():
    for batch_x, _ in test_loader:
        batch_x = batch_x.to(device, non_blocking=True)
        logits = model(batch_x)
        all_logits.append(logits.cpu())
# Concatenate results from all batches
all_logits = torch.cat(all_logits, dim=0)
# Convert logits to probabilities and then binary predictions
probs = torch.sigmoid(all_logits)
y_pred = (probs > 0.5).float()
accuracy = accuracy_score(y_test_tensor.numpy(), y_pred.numpy())
print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.9835


In [None]:
# Save model
torch.save(model.state_dict(), 'model.pth')

In [None]:
from google.colab import files
files.download("model.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>