In [1]:
import matplotlib.pyplot as plt
import joblib
from model import LSTMModel
from torch.utils.tensorboard import SummaryWriter 
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset as TorchDataset
from tqdm import tqdm
from datasets import Dataset
from datetime import datetime

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend)
print(f"Using device: {device}")

Using device: cuda


In [3]:
def merge_and_stat_label(folder_path, to_exclude: list[str] = [], to_include: list[str] = []):
    """
    Load the CSV files of a folder into one DataFrame and count its labels.

    :param folder_path: Directory whose ``.csv`` files are read with ``pd.read_csv``.
    :param to_exclude: Exact file names to skip; an empty list skips nothing.
    :param to_include: Exact file names to keep; an empty list keeps everything.
                       It is applied after ``to_exclude``.
    :return: ``(merged_df, label_stats)``. ``merged_df`` is the row-wise
             concatenation of the selected files with a fresh 0..n-1 index and
             ``label_stats`` is ``merged_df['Label'].value_counts()``.
    :raises ValueError: If no CSV file is left after filtering.
    """
    # The list defaults are shared between calls; that is harmless here only
    # because they are never mutated.

    # os.listdir() returns entries in arbitrary order. Sorting makes the row
    # order of the merged frame (and any seeded split built on it) reproducible.
    # Filtering on the extension first keeps the progress bar total honest.
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    if to_exclude:
        csv_names = [name for name in csv_names if name not in to_exclude]
    if to_include:
        csv_names = [name for name in csv_names if name in to_include]
    if not csv_names:
        # pd.concat([]) would raise a ValueError that does not mention the folder.
        raise ValueError(
            f"No CSV files to merge in {folder_path!r} after applying to_exclude/to_include"
        )

    all_dfs = [pd.read_csv(os.path.join(folder_path, name)) for name in tqdm(csv_names)]
    merged_df = pd.concat(all_dfs, ignore_index=True)
    label_stats = merged_df['Label'].value_counts()
    return merged_df, label_stats


def filter_by_min_count(df, label_column, min_count):
    """
    Drop the rows whose label occurs fewer than ``min_count`` times.

    :param df: DataFrame to filter; it is not modified.
    :param label_column: Name of the column holding the class labels.
    :param min_count: Minimum number of rows a label needs in order to be kept.
    :return: The rows of ``df`` whose label appears at least ``min_count``
             times, in their original order and with their original index.
    """
    occurrences = df[label_column].value_counts()
    frequent_enough = occurrences.index[(occurrences >= min_count).to_numpy()]
    keep_row = df[label_column].isin(frequent_enough)
    return df[keep_row]

def process_input_and_generate_result(scaled_X, X_matrix, column_matrix):
    """
    Expand an integer-encoded feature matrix into per-cell embedding vectors.

    For every cell ``(i, j)`` the output vector is the row of ``X_matrix``
    selected by ``scaled_X[i, j]`` followed by row ``j`` of ``column_matrix``.

    :param scaled_X: 2-D integer array; each entry is a row index into ``X_matrix``.
    :param X_matrix: 2-D lookup table indexed by the values of ``scaled_X``.
    :param column_matrix: 2-D table with (at least) one row per column of ``scaled_X``.
    :return: float64 array of shape
             ``(scaled_X.shape[0], scaled_X.shape[1], X_matrix.shape[1] + column_matrix.shape[1])``.
    :raises IndexError: If an index is outside ``X_matrix`` or ``column_matrix``
                        has fewer rows than ``scaled_X`` has columns.
    """
    n_rows, n_cols = scaled_X.shape[0], scaled_X.shape[1]
    value_dim = X_matrix.shape[1]
    result = np.zeros((n_rows, n_cols, value_dim + column_matrix.shape[1]))
    if n_rows == 0 or n_cols == 0:
        return result

    if column_matrix.shape[0] < n_cols:
        # Without this check a single-row column_matrix would silently broadcast.
        raise IndexError(
            f"column_matrix has {column_matrix.shape[0]} rows but scaled_X has {n_cols} columns"
        )

    # Two vectorised writes replace rows * cols Python-level concatenations.
    result[..., :value_dim] = X_matrix[scaled_X]        # (rows, cols, value_dim) lookup
    result[..., value_dim:] = column_matrix[:n_cols]    # broadcast over the rows axis
    return result

def process_input_and_generate_result_tensor(scaled_X, X_matrix, column_matrix):
    """
    Torch counterpart of ``process_input_and_generate_result``.

    For every cell ``(i, j)`` the output vector is the row of ``X_matrix``
    selected by ``scaled_X[i, j]`` followed by row ``j`` of ``column_matrix``.

    :param scaled_X: 2-D tensor (or array) of integer row indices into ``X_matrix``.
    :param X_matrix: 2-D lookup tensor indexed by the values of ``scaled_X``.
    :param column_matrix: 2-D tensor with (at least) one row per column of ``scaled_X``.
    :return: Tensor created by ``torch.zeros`` (default dtype, CPU) of shape
             ``(scaled_X.shape[0], scaled_X.shape[1], X_matrix.shape[1] + column_matrix.shape[1])``,
             e.g. ``(195720, 76, 128)``.
    :raises IndexError: If ``scaled_X`` is not integer typed, an index is out of
                        range, or ``column_matrix`` has fewer rows than
                        ``scaled_X`` has columns.
    """
    n_rows, n_cols = scaled_X.shape[0], scaled_X.shape[1]
    value_dim = X_matrix.shape[1]
    result = torch.zeros((n_rows, n_cols, value_dim + column_matrix.shape[1]))
    if n_rows == 0 or n_cols == 0:
        return result

    if column_matrix.shape[0] < n_cols:
        # Without this check a single-row column_matrix would silently broadcast.
        raise IndexError(
            f"column_matrix has {column_matrix.shape[0]} rows but scaled_X has {n_cols} columns"
        )

    indices = torch.as_tensor(scaled_X)
    if indices.is_floating_point():
        raise IndexError("scaled_X must contain integer indices")
    indices = indices.long().to(X_matrix.device)

    # Two vectorised writes replace rows * cols Python-level concatenations
    # (and the dependency on torch.concatenate, which older PyTorch lacks).
    result[..., :value_dim] = X_matrix[indices]         # (rows, cols, value_dim) lookup
    result[..., value_dim:] = column_matrix[:n_cols]    # broadcast over the rows axis
    return result

def collate_fn(batch):
    """
    Collate ``(inputs, target)`` samples into one embedded batch.

    The per-sample index tensors are stacked into a 2-D batch, expanded with
    ``process_input_and_generate_result`` and returned as a float32 tensor,
    together with the targets as an int64 tensor.

    ``X_matrix`` and ``column_matrix`` are read from module-level globals, so
    they must be defined before this function is handed to a DataLoader.

    :param batch: A batch of ``(inputs, target)`` pairs.
    :return: ``(processed_inputs, targets)``.
    """
    # Split the samples into their index tensors and their labels.
    raw_inputs, raw_targets = zip(*batch)

    # One row of indices per sample: (batch_size, seq_len).
    index_batch = torch.stack(raw_inputs, dim=0)
    label_batch = torch.tensor(raw_targets, dtype=torch.long)

    # The lookup helper works on NumPy arrays; convert its result back to a tensor.
    embedded = process_input_and_generate_result(index_batch.numpy(), X_matrix, column_matrix)
    return torch.tensor(embedded, dtype=torch.float32), label_batch


In [4]:
# Folder that holds the dataset files, relative to the notebook's working directory.
folder_path = 'Datasets/TabularIoTAttacks-2024'

In [5]:
# Load every CSV, then keep only the attack classes with at least `min_count` rows.
merged_df, label_stats = merge_and_stat_label(folder_path)
min_count = 32620
filtered_df = filter_by_min_count(merged_df, 'Attack Name', min_count)

y = filtered_df['Attack Name']
# Numeric columns only, without the label and the port/protocol columns.
X = (
    filtered_df
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['Label', 'Src Port', 'Dst Port', 'Protocol'])
)

# Rescale every feature to [0, 50000] and truncate to integers so that each
# value can be used as a row index into a lookup table.
idx_range = (0, 50000)
scaler = MinMaxScaler(feature_range=idx_range)
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/validation/test split performed later, so validation and test
# statistics leak into the scaling. Consider fitting on the training rows only.
# np.int64 is spelled out because plain `int` maps to a platform-dependent width.
scaled_X = scaler.fit_transform(X).astype(np.int64)

# X_matrix = np.random.rand(50000+1, 64)  
# column_matrix = np.random.rand(76, 64)

# Out-of-range indices (e.g. produced by NaN inputs) would only surface much
# later as an indexing error inside the model, so fail here instead.
assert scaled_X.min() >= idx_range[0] and scaled_X.max() <= idx_range[1], \
    "scaled indices fall outside idx_range"

print("Data's dtype:", scaled_X.dtype)



100%|██████████| 15/15 [00:30<00:00,  2.01s/it]


Data's shape: int64


In [6]:
# Encode the class labels in `y` as integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [7]:
# Show the distinct encoded class ids.
print(np.unique(y_encoded))

[0 1 2 3 4 5]


In [8]:
# Integer feature indices and class ids as int64 tensors.
X_tensor = torch.tensor(scaled_X, dtype=torch.long)
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

# First hold out 40 % of the rows, then cut that hold-out in half:
# 60 % train / 20 % validation / 20 % test, each split stratified by class.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X_tensor, y_tensor,
    test_size=0.4, stratify=y_tensor, random_state=split_seed,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, stratify=y_temp, random_state=split_seed,
)

In [9]:
# Model hyperparameters.
input_size = 128  # width of each vector fed to the LSTM
hidden_size = 64  # LSTM hidden units per direction
num_layers = 3  # stacked LSTM layers
output_size = len(np.unique(y_encoded))  # one output per encoded class
bidirectional = True  # forwarded to nn.LSTM by LSTMModel

In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomEmbedding(nn.Module):
    """
    Fixed lookup embedding for index-encoded features.

    Every integer of the input selects a row of ``X_matrix``; the row of
    ``column_matrix`` that belongs to the integer's position in the sequence
    is appended to it. Both tables are registered as parameters with
    ``requires_grad=False``, so they move with the module and are part of its
    ``state_dict`` but receive no gradient.
    """

    def __init__(self, X_matrix: torch.Tensor, column_matrix: torch.Tensor):
        super().__init__()
        self.X_matrix = nn.Parameter(X_matrix, requires_grad=False)
        self.column_matrix = nn.Parameter(column_matrix, requires_grad=False)

    def forward(self, indices: torch.Tensor):
        """
        Embed a batch of index sequences.

        :param indices: Integer tensor of shape ``(batch_size, seq_len)``.
        :return: Tensor of shape
                 ``(batch_size, seq_len, X_matrix.shape[1] + column_matrix.shape[1])``.
        """
        n_samples, n_positions = indices.shape

        # Value part: one X_matrix row per index -> (n_samples, n_positions, value_dim).
        value_vectors = self.X_matrix[indices]

        # Position part: the same column_matrix rows repeated (as a view) for every sample.
        position_vectors = self.column_matrix[None].expand(
            n_samples, n_positions, self.column_matrix.shape[1]
        )

        # Join both parts along the feature axis.
        return torch.cat((value_vectors, position_vectors), dim=-1)

class LSTMModel(nn.Module):
    """
    LSTM classifier over index-encoded feature sequences.

    The integer input is turned into vectors by ``CustomEmbedding`` (random,
    non-trainable tables created here), run through an ``nn.LSTM`` and the
    output of the last time step is mapped to class scores by a linear layer.

    :param input_size: Width of the vectors fed to the LSTM; must equal
                       ``X_embedding_dims + column_embedding_dim``.
    :param hidden_size: Hidden units per LSTM direction.
    :param num_classes: Number of output scores.
    :param n_features: Sequence length, i.e. number of rows of the column table.
    :param X_embedding_dims: Width of the value lookup table.
    :param column_embedding_dim: Width of the column (position) table.
    :param X_range: ``(min, max)`` of the input indices. Only ``max`` is used:
                    the value table gets ``max + 1`` rows.
    :param num_layers: Number of stacked LSTM layers.
    :param dropout: Dropout between LSTM layers.
    :param kwargs: Extra keyword arguments forwarded to ``nn.LSTM``
                   (``bidirectional`` is also read here to size the head).
    :raises ValueError: If ``input_size`` does not match the embedding width.
    """

    def __init__(self, input_size, hidden_size, num_classes, 
                 n_features, X_embedding_dims, column_embedding_dim,
                 X_range: tuple,
                 num_layers=2, dropout=0.2, **kwargs):
        super().__init__()

        # Fail at construction time instead of with an LSTM shape error on
        # the first forward pass.
        embedding_width = X_embedding_dims + column_embedding_dim
        if input_size != embedding_width:
            raise ValueError(
                f"input_size ({input_size}) must equal X_embedding_dims + "
                f"column_embedding_dim ({embedding_width})"
            )

        # The lower bound is accepted for symmetry but does not affect the table size.
        _, X_max = X_range
        X_matrix = torch.rand(X_max + 1, X_embedding_dims)
        column_matrix = torch.rand(n_features, column_embedding_dim)
        self.embedding = CustomEmbedding(X_matrix, column_matrix)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if kwargs.get('bidirectional') else 1

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout, **kwargs)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size * self.num_directions, num_classes)

    def forward(self, indices):
        """
        :param indices: Integer tensor of shape ``(batch_size, n_features)``.
        :return: Class scores of shape ``(batch_size, num_classes)``.
        """
        # Process inputs through the embedding layer
        x = self.embedding(indices)

        # Zero initial states, created directly with x's dtype and device so
        # the model keeps working after .double()/.half() and no CPU tensor
        # has to be copied to the GPU on every call.
        state_shape = (self.num_layers * self.num_directions, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))
        # NOTE(review): with bidirectional=True the backward half of the last
        # time step has only processed one element of the sequence. Left
        # unchanged so existing checkpoints keep producing the same outputs.
        out = out[:, -1, :]  # Last time-step output

        # Pass through fully connected layer
        return self.fc(out)

In [61]:
# Build the classifier. The embedding output is what the LSTM consumes, so
# X_embedding_dims + column_embedding_dim (64 + 64) has to equal input_size.
# X_range mirrors the (0, 50000) range the features were scaled to.
model = LSTMModel(input_size, hidden_size, num_classes=output_size,
                  num_layers=num_layers, bidirectional=bidirectional,
                  n_features=X.shape[1], X_embedding_dims=64,
                  column_embedding_dim=64, X_range=(0, 50000))

# Move all parameters to the selected device; as the last expression this also displays the module tree.
model.to(device)


LSTMModel(
  (embedding): CustomEmbedding()
  (lstm): LSTM(128, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
)

In [62]:
# Loss, optimiser and learning-rate schedule used by the training loop.
criterion = nn.CrossEntropyLoss()  # applied to the model's raw outputs and integer class ids
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
# Lowers the learning rate once the value passed to step() has stopped decreasing for `patience` epochs.
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases — confirm against the installed version.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)




In [63]:
batch_size = 512 * 2

# One TensorDataset/DataLoader pair per split. Only the training split is
# reshuffled every epoch; validation and test keep a fixed order.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [64]:
# Create a timestamped run directory and point the TensorBoard writer at it.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = f'./logs/{timestamp}'
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)

# Store the preprocessed inputs and encoded labels next to the logs.
# np.save(os.path.join(log_dir, 'X_matrix.npy'), X_matrix)
# np.save(os.path.join(log_dir, 'column_matrix.npy'), column_matrix)
np.save(os.path.join(log_dir, 'scaled_X.npy'), scaled_X)
np.save(os.path.join(log_dir, 'y_encoded.npy'), y_encoded)

# Persist the fitted label encoder and scaler so the same preprocessing can be reloaded later.
joblib.dump(le, os.path.join(log_dir, 'label_encoder.joblib'))
joblib.dump(scaler, os.path.join(log_dir, 'min_max_scaler.joblib'))


['./logs/20241215_175915\\min_max_scaler.joblib']

In [65]:
# Print the module tree to check the layer sizes.
print(model)


LSTMModel(
  (embedding): CustomEmbedding()
  (lstm): LSTM(128, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
)


In [66]:
# Debug cell: fetch a single training batch, move it to the device, print the
# index tensor and stop. No optimisation step is taken.
# `inputs` and `targets` stay bound after the loop and are reused by the
# inspection cells that follow, so those cells depend on this one having run.
best_val_accuracy = 0.0
num_epochs = 1
patience = 5
no_improve_epochs = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct_train = 0
    total_train = 0

    with tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for inputs, targets in pbar:
            inputs, targets = inputs.to(device), targets.to(device)
            print(inputs)
            break  # one batch is enough for inspection

Epoch 1/1:   0%|          | 0/1971 [00:00<?, ?batch/s]

tensor([[ 8341,     9,     0,  ...,     0,     0,     0],
        [   76,     6,    21,  ...,     0,     0,     0],
        [  121,     0,     7,  ...,     0,     0,     0],
        ...,
        [49343,    45,     0,  ...,     0,     0,     0],
        [30890,    21,     0,  ...,     0, 15590, 15590],
        [  123,     0,     7,  ...,     0,     0,     0]], device='cuda:0')





In [67]:
# Shapes of the two lookup tables held by the embedding module.
model.embedding.X_matrix.shape, model.embedding.column_matrix.shape

(torch.Size([50001, 64]), torch.Size([76, 64]))

In [68]:
# Embed the batch that the single-batch debug loop left in `inputs`.
embeddings = model.embedding(inputs)

In [69]:
# Shape of the embedded batch.
embeddings.shape

torch.Size([1024, 76, 128])

In [70]:
# Embedding vector of the first position of the first sample.
embeddings[0][0]

tensor([0.4681, 0.8220, 0.9640, 0.6090, 0.3164, 0.2209, 0.3045, 0.0389, 0.5975,
        0.0978, 0.9521, 0.5828, 0.5141, 0.6188, 0.5128, 0.7266, 0.0681, 0.0049,
        0.8374, 0.9143, 0.6010, 0.2866, 0.2574, 0.3361, 0.7193, 0.8117, 0.8840,
        0.5659, 0.4122, 0.1353, 0.2468, 0.9108, 0.7168, 0.0628, 0.2901, 0.6523,
        0.4462, 0.7096, 0.0878, 0.7842, 0.8914, 0.7976, 0.2512, 0.2123, 0.0476,
        0.2544, 0.3157, 0.7965, 0.3359, 0.5434, 0.4454, 0.1122, 0.5906, 0.1926,
        0.3499, 0.6639, 0.9609, 0.4541, 0.0525, 0.3377, 0.0264, 0.0048, 0.4925,
        0.1963, 0.4275, 0.8422, 0.8437, 0.5399, 0.0748, 0.4936, 0.7729, 0.5370,
        0.6422, 0.1855, 0.4158, 0.0935, 0.0178, 0.9473, 0.5038, 0.4628, 0.2278,
        0.2973, 0.8222, 0.7147, 0.3509, 0.8324, 0.5428, 0.5249, 0.1439, 0.2614,
        0.5175, 0.0971, 0.5839, 0.3881, 0.3024, 0.6201, 0.9725, 0.4122, 0.4815,
        0.4364, 0.9191, 0.6735, 0.8446, 0.0321, 0.8330, 0.7324, 0.5746, 0.4621,
        0.0199, 0.3796, 0.7274, 0.8403, 

In [71]:
# Rebuild that vector by hand: the X_matrix row selected by the sample's first
# index, followed by row 0 of column_matrix.
tmp = torch.cat((model.embedding.X_matrix[inputs[0][0]], model.embedding.column_matrix[0]))

In [72]:
# Element-wise comparison of the hand-built vector with the module output.
tmp == embeddings[0][0]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True], device='cuda:0')

In [None]:
# Train with early stopping on validation accuracy; the best weights are saved to the run directory.
best_val_accuracy = 0.0
num_epochs = 20
patience = 5  # epochs without a new best validation accuracy before stopping
no_improve_epochs = 0

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    correct_train = 0
    total_train = 0

    with tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as pbar:
        for batches_seen, (inputs, targets) in enumerate(pbar, start=1):
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)

            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_train += targets.size(0)
            correct_train += (predicted == targets).sum().item()

            # Running mean over the batches processed so far. Dividing by
            # len(train_loader) here would under-report the loss until the
            # very last batch of the epoch.
            pbar.set_postfix(loss=epoch_loss / batches_seen, accuracy=100 * correct_train / total_train)

    train_loss = epoch_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Accuracy/train', train_accuracy, epoch)

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    correct_val = 0
    total_val = 0
    val_loss = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_val += targets.size(0)
            correct_val += (predicted == targets).sum().item()

    val_accuracy = 100 * correct_val / total_val
    # The scheduler monitors the summed validation loss, as before.
    lr_scheduler.step(val_loss)

    mean_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {mean_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    writer.add_scalar('Loss/val', mean_val_loss, epoch)
    writer.add_scalar('Accuracy/val', val_accuracy, epoch)

    # ---- checkpointing and early stopping ----
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        no_improve_epochs = 0
        model_save_path = os.path.join(log_dir, 'best_model.pth')
        torch.save(model.state_dict(), model_save_path)
        print(f"New best model saved at {model_save_path}")
    else:
        no_improve_epochs += 1

    if no_improve_epochs >= patience:
        print("Early stopping due to no improvement in validation accuracy.")
        break

# Make sure every logged scalar reaches disk even if the kernel is stopped afterwards.
writer.flush()


In [24]:
import matplotlib.pyplot as plt
import joblib
from model import LSTMModel
from torch.utils.tensorboard import SummaryWriter 
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from datetime import datetime
from logger import setup_logger

# File logger for this training script.
# NOTE(review): in the recorded output every message appears five times —
# presumably setup_logger attaches a new handler each time this cell is
# re-run; verify in logger.py.
logger = setup_logger(log_file="logs/training_lstm.log")

# Timestamped run directory that also receives the TensorBoard event files.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = f'./logs/{timestamp}'
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)

logger.info(f"Log directory: {log_dir}")    

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")


def merge_and_stat_label(folder_path, to_exclude: list[str] = [], to_include: list[str] = []):
    """
    Load the CSV files of a folder into one DataFrame and count its labels.

    File selection works on substrings: a file is affected by a filter when
    any of the filter's strings occurs in its file name.

    :param folder_path: Directory whose ``.csv`` files are read with ``pd.read_csv``.
    :param to_exclude: Substrings of file names to skip; an empty list skips nothing.
    :param to_include: Substrings of file names to keep; an empty list keeps
                       everything. It is applied after ``to_exclude``.
    :return: ``(merged_df, label_stats)``. ``merged_df`` is the row-wise
             concatenation of the selected files with a fresh 0..n-1 index and
             ``label_stats`` is ``merged_df['Label'].value_counts()``.
    :raises ValueError: If no CSV file is left after filtering.
    """
    # The list defaults are shared between calls; that is harmless here only
    # because they are never mutated.

    # os.listdir() returns entries in arbitrary order. Sorting makes the row
    # order of the merged frame reproducible across machines.
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    if to_exclude:
        # Keep the files that match NONE of the excluded substrings. The
        # previous condition lacked the `not` and kept only the excluded files.
        csv_names = [name for name in csv_names if not any(att in name for att in to_exclude)]
    if to_include:
        csv_names = [name for name in csv_names if any(att in name for att in to_include)]
    if not csv_names:
        # pd.concat([]) would raise a ValueError that does not mention the folder.
        raise ValueError(
            f"No CSV files to merge in {folder_path!r} after applying to_exclude/to_include"
        )

    all_dfs = [pd.read_csv(os.path.join(folder_path, name)) for name in tqdm(csv_names)]
    merged_df = pd.concat(all_dfs, ignore_index=True)
    label_stats = merged_df['Label'].value_counts()
    return merged_df, label_stats


def filter_by_min_count(df, label_column, min_count):
    """
    Keep only the rows whose label is frequent enough.

    :param df: DataFrame to filter; it is left untouched.
    :param label_column: Column containing the class labels.
    :param min_count: A label is kept when it occurs at least this many times.
    :return: Rows of ``df`` with a kept label, in original order and with
             their original index values.
    """
    counts_per_label = df[label_column].value_counts()
    is_frequent = (counts_per_label >= min_count).to_numpy()
    kept_labels = counts_per_label.index[is_frequent]
    row_mask = df[label_column].isin(kept_labels)
    return df[row_mask]

def scale_df(df, scalers):
    """
    Scale every column of ``df`` with its own scaler.

    The scaler at position ``i`` is fitted on column ``i`` (passed as an
    ``(n_rows, 1)`` array) via ``fit_transform``, so the scalers are modified
    as a side effect and can be reused afterwards.

    :param df: DataFrame whose columns are scaled; it is not modified.
    :param scalers: Sequence with one scaler per column, indexed by column position.
    :return: Copy of ``df`` in which every column holds its scaled values.
    """
    scaled_df = df.copy()
    column_names = list(df.columns)
    for position in range(len(column_names)):
        name = column_names[position]
        column_as_2d = df[name].values.reshape(-1, 1)
        scaled_df[name] = scalers[position].fit_transform(column_as_2d)
    return scaled_df

# Folder with the dataset files and the attack names used to select files from it.
folder_path = 'Datasets/TabularIoTAttacks-2024'
selected_attacks = ['DoS TCP Flood', 'Recon Port Scan', 'MQTT DDoS Publish Flood', 'MQTT DoS Connect Flood', 'Benign Traffic']

# Load only the selected files, then drop classes with fewer than `min_count` rows.
merged_df, label_stats = merge_and_stat_label(folder_path, to_include=selected_attacks)
min_count = 32620
filtered_df = filter_by_min_count(merged_df, 'Attack Name', min_count)

# Target column and numeric feature columns (label and port/protocol columns removed).
y = filtered_df['Attack Name']
X = filtered_df.select_dtypes(include=['int64', 'float64'])
X = X.drop(columns=['Label', 'Src Port', 'Dst Port', 'Protocol'], axis=1)

# One MinMaxScaler per feature column, each mapping its column to idx_range.
# NOTE(review): the scalers are fitted on all rows, before any train/test split.
# NOTE(review): the result is still floating point (see the displayed frame);
# it needs an integer cast before it can be used as lookup indices — confirm
# where that happens.
idx_range = (0, 50000)
scalers = [MinMaxScaler(feature_range=idx_range) for _ in range(X.shape[1])]
scaled_X = scale_df(X, scalers)

2024-12-16 14:29:04,934 - INFO - Log directory: ./logs/20241216_142904
2024-12-16 14:29:04,934 - INFO - Log directory: ./logs/20241216_142904
2024-12-16 14:29:04,934 - INFO - Log directory: ./logs/20241216_142904
2024-12-16 14:29:04,934 - INFO - Log directory: ./logs/20241216_142904
2024-12-16 14:29:04,934 - INFO - Log directory: ./logs/20241216_142904
2024-12-16 14:29:04,938 - INFO - Using device: cuda
2024-12-16 14:29:04,938 - INFO - Using device: cuda
2024-12-16 14:29:04,938 - INFO - Using device: cuda
2024-12-16 14:29:04,938 - INFO - Using device: cuda
2024-12-16 14:29:04,938 - INFO - Using device: cuda
100%|██████████| 5/5 [00:31<00:00,  6.35s/it]


In [29]:
# Display the scaled feature frame.
# NOTE(review): the rendered header starts with "Unnamed: 0", which suggests a
# saved index column from the CSV files is being treated as a feature —
# confirm, and drop it before scaling if so.
scaled_X

Unnamed: 0,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,6144.318333,7.844122,0.000000,0.068270,0.000000,1335.616438,0.0,359.589041,827.433379,0.000000,...,3.137747,18750.0,0.000000,0.0,0.000000,0.000000,6118.187105,0.000000,6118.187105,6118.187105
1,25042.545833,26.670013,0.000000,4.480914,0.000000,48219.178082,0.0,7867.199391,23950.825185,0.000000,...,12.550988,18750.0,96.683836,0.0,96.683836,96.683836,6233.452823,127.088758,6276.884065,6114.409370
2,37544.487083,12.550595,0.000000,1.261375,0.000000,6643.835616,0.0,4429.223744,4736.834100,0.000000,...,7.844368,18750.0,257.029788,0.0,257.029788,257.029788,12436.066992,212.827380,12522.604420,12282.950995
3,49999.575000,191.396567,879.702475,2.337444,0.020381,684.931507,0.0,600.567992,112.768725,68.493151,...,188.264826,18750.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,49999.391250,191.396567,879.702475,5.863658,0.020381,1575.342466,0.0,1506.570888,278.322371,68.493151,...,188.264826,18750.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3276997,122.409583,0.000000,7.152053,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3276998,121.410417,0.000000,7.152053,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3276999,121.102500,0.000000,7.152053,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3277000,116.226667,0.000000,7.152053,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
