In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import string
import pickle

from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader, ConcatDataset

In [3]:
num_epochs = 10
batch_size = 32
learning_rate = 0.00001
momentum = 0.9

In [4]:
ALPHABET = string.ascii_lowercase + string.digits + "."
char2idx = {c: i + 1 for i, c in enumerate(ALPHABET)}  # padding=0
idx2char = {i: c for c, i in char2idx.items()}  # Reverse mapping index -> character
vocab_size = len(char2idx) + 1

MAX_LEN = 50


def domain_to_tensor(domain):
    arr = [char2idx.get(c, 0) for c in domain.lower()][:MAX_LEN]
    arr += [0] * (MAX_LEN - len(arr))
    return torch.tensor(arr, dtype=torch.long)


def tensor_to_domain(tensor):
    domain = "".join(idx2char.get(idx, "") for idx in tensor.tolist() if idx > 0)  # Ignore padding (0)
    return domain


def load_dataset(file_path):
    with open(file_path, 'rb') as file:
        dataloader = pickle.load(file)
    print(f"DataLoader loaded from {file_path}.")
    return dataloader


class DomainDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        dom, lbl = self.samples[idx]
        x = domain_to_tensor(dom)
        return x, lbl

In [6]:
import pandas as pd 
data = pd.read_pickle('/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/benign_test.pkl')
print(dir(data) ) 
print(type(data) )
print(len(data))

obj = data 
print(type(obj.samples))
print(len(obj.samples))

# xem 5 phần tử đầu tiên
for i in range(5):
    print("\n--- sample", i, "---")
    print(obj.samples[i])
    print("type:", type(obj.samples[i]))




['__add__', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_is_protocol', 'samples']
<class '__main__.DomainDataset'>
200000
<class 'list'>
200000

--- sample 0 ---
('callcontroller.swce-02.ic3-calling-callcontroller.swedencentral-prod.cosmic.office.net', 0)
type: <class 'tuple'>

--- sample 1 ---
('a407.d.akamai.net', 0)
type: <class 'tuple'>

--- sample 2 ---
('n4i6g1.akamai.net', 0)
type: <class 'tuple'>

--- sample 3 ---
('mxb-0005f601.gslb.pphosted.com', 0)
type: <class 'tuple'>

--- sample 4 ---
('log-upload.mihoyo.com', 0)
type: <class 'tuple'>


In [7]:
print(type(obj.samples))
print(len(obj.samples))

# xem 5 phần tử đầu tiên
for i in range(5):
    print("\n--- sample", i, "---")
    print(obj.samples[i])
    print("type:", type(obj.samples[i]))


<class 'list'>
200000

--- sample 0 ---
('callcontroller.swce-02.ic3-calling-callcontroller.swedencentral-prod.cosmic.office.net', 0)
type: <class 'tuple'>

--- sample 1 ---
('a407.d.akamai.net', 0)
type: <class 'tuple'>

--- sample 2 ---
('n4i6g1.akamai.net', 0)
type: <class 'tuple'>

--- sample 3 ---
('mxb-0005f601.gslb.pphosted.com', 0)
type: <class 'tuple'>

--- sample 4 ---
('log-upload.mihoyo.com', 0)
type: <class 'tuple'>


In [8]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size=vocab_size, embed_dim=32, hidden_dim=64, num_classes=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(dropout),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        emb = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        bilstm_out, _ = self.bilstm(emb)  # (batch_size, seq_len, hidden_dim*2)

        # Attention Mechanism
        attn_weights = F.softmax(self.attention(bilstm_out).squeeze(2), dim=1)  # (batch_size, seq_len)
        attn_output = torch.bmm(attn_weights.unsqueeze(1), bilstm_out).squeeze(1)  # (batch_size, hidden_dim*2)

        attn_output = self.dropout(attn_output)
        logits = self.fc(attn_output)
        return logits



In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
    benign_train_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/benign_train.pkl")
    benign_test_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/benign_test.pkl")
    dga_1_train_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_1_train.pkl")
    dga_1_test_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_1_test.pkl")
    dga_2_train_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_2_train.pkl")
    dga_2_test_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_2_test.pkl")
    dga_3_train_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_3_train.pkl")
    dga_3_test_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_3_test.pkl")
    dga_4_train_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_4_train.pkl")
    dga_4_test_ds = load_dataset("/home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_4_test.pkl")

    train_ds = ConcatDataset([benign_train_ds, dga_1_train_ds, dga_2_train_ds, dga_3_train_ds, dga_4_train_ds])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    test_ds = ConcatDataset([benign_test_ds, dga_1_test_ds, dga_2_test_ds, dga_3_test_ds, dga_4_test_ds])
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    model = BiLSTMClassifier().to(device)
    criterion = nn.CrossEntropyLoss()

    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/benign_train.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/benign_test.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_1_train.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_1_test.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_2_train.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_2_test.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_3_train.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_3_test.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2/dga_4_train.pkl.
DataLoader loaded from /home/dung/Downloads/test-20251120T153114Z-1-001/test/domain2

In [11]:



def train_client(model, client_loader, criterion, optimizer):
    model.train()
    for x_batch, y_batch in tqdm(client_loader):
        if x_batch.size(0) == 1:
            continue
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        logits = model(x_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()


def test_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            logits = model(x_batch)
            preds = logits.argmax(dim=1)

            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(y_batch.cpu().numpy().flatten())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    return f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}"

In [12]:
for _ in range(num_epochs):
        train_client(model, train_loader, criterion, optimizer)
        result = test_model(model, test_loader)
        print(result)


100%|██████████| 49660/49660 [01:56<00:00, 426.73it/s]


Accuracy: 0.8061, Precision: 0.7822, Recall: 0.8447, F1-score: 0.8122


100%|██████████| 49660/49660 [01:51<00:00, 445.88it/s]


Accuracy: 0.8793, Precision: 0.8525, Recall: 0.9152, F1-score: 0.8828


100%|██████████| 49660/49660 [01:55<00:00, 429.98it/s]


Accuracy: 0.9025, Precision: 0.8662, Recall: 0.9505, F1-score: 0.9064


100%|██████████| 49660/49660 [02:01<00:00, 408.66it/s]


Accuracy: 0.9134, Precision: 0.8859, Recall: 0.9478, F1-score: 0.9158


100%|██████████| 49660/49660 [01:45<00:00, 472.32it/s]


Accuracy: 0.9190, Precision: 0.8779, Recall: 0.9721, F1-score: 0.9226


100%|██████████| 49660/49660 [01:51<00:00, 445.94it/s]


Accuracy: 0.9263, Precision: 0.8927, Recall: 0.9678, F1-score: 0.9287


100%|██████████| 49660/49660 [01:53<00:00, 438.05it/s]


Accuracy: 0.9312, Precision: 0.9016, Recall: 0.9670, F1-score: 0.9331


100%|██████████| 49660/49660 [01:56<00:00, 424.91it/s]


Accuracy: 0.9325, Precision: 0.8992, Recall: 0.9731, F1-score: 0.9347


100%|██████████| 49660/49660 [01:53<00:00, 436.21it/s]


Accuracy: 0.9364, Precision: 0.9084, Recall: 0.9698, F1-score: 0.9381


100%|██████████| 49660/49660 [02:02<00:00, 406.04it/s]


Accuracy: 0.9401, Precision: 0.9222, Recall: 0.9604, F1-score: 0.9409
