In [1]:

# ===== INCRE SGDOCSVM =====
import os, sys
from pathlib import Path

import pandas as pd 
import dask.dataframe as dd

# ----- THƯ VIỆN XỬ LÝ CHÍNH
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score,
    recall_score, confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, log_loss
)

import torch
import torch.nn as nn


# ----- MODEL -----
from xgboost import XGBClassifier
import xgboost as xgb
device = "cuda"

# ----- CÁC THƯ VIỆN HỖ TRỢ -----
import hashlib  
import ipaddress
import json
import glob
import gc
import pickle
import tempfile


# ----- THƯ VIỆN PLT -----
import matplotlib.pyplot as plt

# ---------------- CONFIG -----------------
dir_in = [f"C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/Incremental_1.3/session{i}.parquet" for i in range(0, 3)]
dir_in_train = [f"C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/Incremental_1.3/session{i}_train.parquet" for i in range(0, 3)]
dir_in_test = [f"C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/Incremental_1.3/session{i}_test.parquet" for i in range(0, 3)]


max_epochs_per_round = 50
tol = 1e-4  # độ thay đổi loss để dừng


# ====== FUNCTION =====
dtypes = {}    
with open('features.json') as json_file:
    data = json.load(json_file)
    for key, type in data.items():
        if type == "int8":
            dtypes[key]= np.int8
        elif type == "float32":
            dtypes[key] = np.float32
    json_file.close()

print(dtypes)

def astype(df):
    for key, type in df.dtypes.items():
        # print(f"Key: {key} \t {type}")
        if type == "int8":
            df[key] = df[key].astype(np.int8)
        elif type == "float32":
            df[key] = df[key].astype(np.float32)
            
    return df

def torch_tensor(df):
    for key, type in df.dtypes.items():
        # print(f"Key: {key} \t {type}")
        if type == "int8":
            df[key] = torch.tensor(df[key].to_numpy()).int()
        elif type == "float32":
            df[key] = torch.tensor(df[key].to_numpy()).float()
            
    return df

# ----- DUMMY SAMPLE  -----
def add_dummy_unknown_classes(X, y, seen_classes):
    existing_classes = np.unique(y)
    missing_classes = [c for c in seen_classes if c not in existing_classes]
    if not missing_classes:
        return X, y
    X_dummy = np.zeros((len(missing_classes), X.shape[1]), dtype=X.dtype)
    y_dummy = np.array(missing_classes)
    X_new = np.vstack([X, X_dummy])
    y_new = np.hstack([y, y_dummy])
    return X_new, y_new

# ===== INCRE XGB =====
def incre_xgb(train_files, test_files):
    # ----- INIT -----
    
    models = []
    # Lưu lịch sử
    
    history = {
        "train_acc": [],
        "val_acc": [],
        "train_loss": [],
        "val_loss": []
    }
    
    seen_classes = []
    
    
    # ----- INCRE -----
    for index, filepath in enumerate(train_files):
        df_train =  pd.read_parquet(filepath)
        df_train = astype(df_train)
        
        # numClasses = len(df_train["Label"].value_counts())
    
        trainX = df_train.drop(["Label", "Binary Label"], axis =1)
        trainy = df_train["Label"] 
        
        # trainD = xgb.DMatrix(trainX, label=trainy)
        
        del df_train
        gc.collect()
        
        df_test = pd.read_parquet(test_files[index])
        df_test =astype(df_test)
        
        testX = df_test.drop(["Label", "Binary Label"], axis =1)
        testy = df_test["Label"]
        
        # testD = xgb.DMatrix(testX)

        del df_test
        
        gc.collect()
        
        # ----- Add dummy samples for missing classes -----
        trainX, trainy = add_dummy_unknown_classes(trainX, trainy, seen_classes)
        testX, testy = add_dummy_unknown_classes(testX, testy, seen_classes)
        
        # ----- UPDATE SEEN CLASSES -----
        for c in np.unique(trainy):
            if c not in seen_classes:
                seen_classes.append(c)
        seen_classes = sorted(seen_classes)
        
        # ----- Move to GPU -----
        testX = torch.tensor(testX.to_numpy()).float()
        # testX = torch_tensor(testX)
        testX = testX.to('cuda')
        
        # ----- TRAIN + VAL DATA SPLIT -----
        trainX, valX, trainy, valy = train_test_split(trainX, trainy, test_size=0.1, random_state=42, stratify=trainy)
        
        # ----- Move to GPU -----
        trainX = torch.tensor(trainX.to_numpy()).float()
        trainX = trainX.to('cuda')
        
        trainy = torch.tensor(trainy.to_numpy()).float()
        trainy = trainy.to('cuda')
        
        
        # ----- NUMCLASSES -----
        numClasses = max(seen_classes) + 1
        # numClasses = len(trainy.value_counts())
        
                

        model = XGBClassifier(
            n_estimators=5,
            learning_rate=0.05,
            
            max_depth=3, # giới hạn độ sâu -> tránh overfit
            min_child_weight = 3,
            
            subsample=0.7,
            colsample_bytree=0.8,
            
            reg_alpha=1.0,  # Học tăng dần sẽ tích lũy lỗi → nên tăng regularization. 
                            #   reg_alpha = 0.1 – 1  reg_lambda = 1 – 5
            reg_lambda=3.0, 
            gamma = 0.1,
            
            tree_method='hist', # Tăng tốc đáng kể nếu dữ liệu lớn.
            device='cuda',
            
            objective='multi:softprob', #binary:logistic
            num_class = numClasses,
            random_state=42
        )
        
        print(f"\nIncremental Learning XGB | Round {index+1} | Seen classes: {seen_classes}")

        # ----- INIT FIT -----
        if index ==0:
            # ===== INIT MODEL =====
            model.fit(trainX, trainy)
        else:
            if models[index -1]:
                model.fit(trainX, trainy, xgb_model = models[index - 1].get_booster())
                
        # ----- INCRE LEARNING -----
        prev_val_loss = np.inf
        for epoch in range(max_epochs_per_round):
            model.fit(trainX, trainy, xgb_model=model.get_booster())

            # ----- Metrics -----
            train_prob = model.predict_proba(trainX)
            val_prob   = model.predict_proba(valX)

            train_loss = log_loss(trainy, train_prob, labels=seen_classes)
            val_loss   = log_loss(valy, val_prob, labels=seen_classes)

            train_acc = accuracy_score(trainy, np.argmax(train_prob, axis=1))
            val_acc   = accuracy_score(valy, np.argmax(val_prob, axis=1))

            # ----- Save history -----
            history["train_loss"].append(train_loss)
            history["val_loss"].append(val_loss)
            history["train_acc"].append(train_acc)
            history["val_acc"].append(val_acc)

            print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

            # ----- Early stopping based on validation loss -----
            if abs(prev_val_loss - val_loss) < tol:
                print(f"Converged at epoch {epoch+1}")
                break
            prev_val_loss = val_loss
            
        models.append(model)
        
    return models, history
if __name__ == '__main__':
    incre_xgb(train_files= dir_in_train, test_files = dir_in_test)
        

{'Src IP': <class 'numpy.float32'>, 'Src Port': <class 'numpy.int8'>, 'Dst IP': <class 'numpy.float32'>, 'Dst Port': <class 'numpy.int8'>, 'Protocol': <class 'numpy.int8'>, 'Flow Duration': <class 'numpy.float32'>, 'Total Fwd Packet': <class 'numpy.float32'>, 'Total Bwd packets': <class 'numpy.float32'>, 'Total Length of Fwd Packet': <class 'numpy.float32'>, 'Total Length of Bwd Packet': <class 'numpy.float32'>, 'Fwd Packet Length Max': <class 'numpy.float32'>, 'Fwd Packet Length Min': <class 'numpy.float32'>, 'Fwd Packet Length Mean': <class 'numpy.float32'>, 'Fwd Packet Length Std': <class 'numpy.float32'>, 'Bwd Packet Length Max': <class 'numpy.float32'>, 'Bwd Packet Length Min': <class 'numpy.float32'>, 'Bwd Packet Length Mean': <class 'numpy.float32'>, 'Bwd Packet Length Std': <class 'numpy.float32'>, 'Flow Bytes/s': <class 'numpy.float32'>, 'Flow Packets/s': <class 'numpy.float32'>, 'Flow IAT Mean': <class 'numpy.float32'>, 'Flow IAT Std': <class 'numpy.float32'>, 'Flow IAT Max':

ModuleNotFoundError: No module named 'cupy'