In [1]:
# Class balancing with per-class weights (cost-sensitive re-weighting; no rows are resampled)
print("Calculating class weights for class balancing...")

import cupy as cp
import numpy as np
import cudf
from sklearn.utils.class_weight import compute_class_weight
from cuml.preprocessing import LabelEncoder

# Load the training features and the target (first column of y_train.csv).
X_train = cudf.read_csv("../data/train_test/X_train.csv")
y_train = cudf.read_csv("../data/train_test/y_train.csv").iloc[:, 0]

# Read only the label column from the cleaned file to recover the original class names.
y_labels = cudf.read_csv("../data/processed_file_cleaned.csv", usecols=['mapped_label'])['mapped_label']

# Fit the encoder on the original names; the transformed values are discarded,
# only `le.classes_` is used (for display here, and later when saving artifacts).
# NOTE(review): indexing `class_names` with the values found in y_train assumes
# y_train was encoded with this same class ordering -- confirm against the
# preprocessing step that produced y_train.csv.
le = LabelEncoder()
_ = le.fit_transform(y_labels)
class_names = le.classes_.to_pandas().tolist()
print(f"Class names: {class_names}")

# Count samples per class in the training set (host-side copy of the labels).
y_train_numpy = y_train.to_pandas().values
unique, counts = np.unique(y_train_numpy, return_counts=True)
target_counts = {}

print("Distribution in training set:")
for label, count in zip(unique, counts):
    target_counts[label] = count
    print(f"  {class_names[label]}: {count}")

# 'balanced' weights, one per label actually present in y_train.
# Reuse `unique` instead of recomputing np.unique over the full label array.
class_weights = compute_class_weight(
    'balanced',
    classes=unique,
    y=y_train_numpy
)

# Key the weights by the real label values, not by their position in the array:
# compute_class_weight returns weights in the order of `classes`, so a positional
# (enumerate) key is only correct when the labels are exactly 0..n-1 with no gaps.
class_weight_dict = dict(zip(unique.tolist(), class_weights))

print("\nClass weights (balanced):")
for label, weight in class_weight_dict.items():
    print(f"  {class_names[label]}: {weight:.3f}")

# Device-side arrays for training (features and labels as CuPy arrays).
X_train_cp = X_train.to_cupy()
y_train_cp = y_train.to_cupy()

print("\nData ready for GPU training with class balancing!")

Calculating class weights for class balancing...
Class names: ['BENIGN', 'Bot', 'DOS/DDOS', 'Patator', 'PortScan', 'WebAttack']
Distribution in training set:
  BENIGN: 1817056
  Bot: 1565
  DOS/DDOS: 303798
  Patator: 11066
  PortScan: 127043
  WebAttack: 1744

Class weights (balanced):
  BENIGN: 0.208
  Bot: 240.924
  DOS/DDOS: 1.241
  Patator: 34.072
  PortScan: 2.968
  WebAttack: 216.196

Data ready for GPU training with class balancing!


In [2]:
print("Training the XGBoost model with class balancing...")

import xgboost as xgb

# Per-sample weights: look each distinct label up once in class_weight_dict and
# broadcast through the inverse index, instead of one Python-level dictionary
# lookup per training row. A label missing from class_weight_dict still raises
# KeyError, exactly as a per-row lookup would.
labels_present, label_positions = np.unique(y_train_numpy, return_inverse=True)
weights_present = np.array([class_weight_dict[label] for label in labels_present.tolist()])
sample_weights = weights_present[label_positions]

# `use_label_encoder` is intentionally not passed: it is a deprecated option that
# has no effect on XGBoost releases that accept `device=` and only produces an
# "unused parameter" warning there.
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='mlogloss',
    tree_method='hist',
    device='cuda',
    random_state=42
)

# Training with weights
model.fit(
    X_train_cp,
    y_train_cp,
    sample_weight=cp.asarray(sample_weights)  # weights on the GPU
)

print("Model trained successfully!")

Training the XGBoost model with class balancing...
Model trained successfully!


In [3]:
import json
from pathlib import Path

import numpy as np

# Single location for every artifact written by this cell.
models_dir = Path("../models")
# Create the directory up front so a fresh checkout does not fail at save time,
# after the training cell has already run.
models_dir.mkdir(parents=True, exist_ok=True)

model.save_model(str(models_dir / "xgb_model_v2.json"))
print("Model saved as xgb_model_v2.json")

# Convert cudf column -> pandas -> numpy array of strings
classes_np = le.classes_.to_pandas().astype(str).to_numpy()

# Fixed-width Unicode array so that allow_pickle=False works. Casting to np.str_
# lets NumPy pick the width from the longest entry, and also copes with an empty
# class list (a hand-rolled max() over the lengths would raise there).
classes_np_fixed = classes_np.astype(np.str_)

# Save as .npy -> fast to reload from Python/NumPy
np.save(models_dir / "label_encoder_classes_v2.npy", classes_np_fixed, allow_pickle=False)
print("Label Encoder classes saved as label_encoder_classes_v2.npy")

# Save as .json -> human-readable, portable, easy to inspect/share
with open(models_dir / "label_encoder_classes_v2.json", "w", encoding="utf-8") as f:
    json.dump(classes_np_fixed.tolist(), f, ensure_ascii=False)
print("Label Encoder classes saved as label_encoder_classes_v2.json")

Model saved as xgb_model_v2.json
Label Encoder classes saved as label_encoder_classes_v2.npy
Label Encoder classes saved as label_encoder_classes_v2.json
