Importing all necessary dependencies.

In [13]:
import pandas as pd
from glob import glob
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
import joblib
import numpy as np

Loading the CICIDS2017 CSV files, encoding the labels, cleaning the features, splitting into train/test sets, and identifying the numerical/categorical columns.

In [14]:
# Collect every CSV of the dataset. glob() returns paths in arbitrary order,
# so sort them: the concatenated row order feeds train_test_split, and without
# a stable order the seeded split would not be reproducible across machines.
dataset = sorted(glob("CICIDS2017/*.csv"))
if not dataset:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found matching 'CICIDS2017/*.csv'")
df = pd.concat(map(pd.read_csv, dataset), ignore_index=True)

# Strip surrounding whitespace from the headers so "Label" can be looked up.
df.columns = df.columns.str.strip()

# Encode the string labels as integers; the fitted encoder is saved below so
# predictions can be mapped back to class names later.
le = LabelEncoder()
df["Label"] = df["Label"].astype(str).str.strip()  # clean up label column just in case
y = le.fit_transform(df["Label"])
X = df.drop(columns=["Label"])

# Turn textual and numeric infinities into NaN so the imputer can handle them.
X = X.replace(["Infinity", "-Infinity", np.inf, -np.inf], np.nan)

# Coerce any remaining object columns to numeric; unparseable values become NaN.
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = pd.to_numeric(X[col], errors="coerce")

# The coercion above can itself produce +/-inf (e.g. from "inf" strings), so
# scrub once more here, before the split, rather than on each split separately.
X = X.replace([np.inf, -np.inf], np.nan)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Per-row weights that counter the class imbalance during training.
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

# Drop columns that carry no information in the training set. The list is
# derived from X_train only and then applied to both splits. Reassigning
# (instead of inplace=True) avoids mutating the frames returned by
# train_test_split and keeps the cell safe to re-run.
train_nunique = X_train.nunique()
constant_cols = train_nunique[train_nunique <= 1].index.tolist()
X_train = X_train.drop(columns=constant_cols)
X_test = X_test.drop(columns=constant_cols)

# "number" matches every numeric dtype, not just int64/float64, so columns of
# other widths are not silently discarded by the transformer's remainder="drop".
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

joblib.dump(le, "label_encoder.pkl")

print(f"X_train shape: {X_train.shape}")
print(f"Numerical columns: {len(num_cols)}, Categorical columns: {len(cat_cols)}")
print(f"Class distribution in y_train: {np.bincount(y_train)}")

X_train shape: (2264594, 70)
Numerical columns: 70, Categorical columns: 0
Class distribution in y_train: [1818477    1573  102421    8234  184858    4399    4637    6350       9
      29  127144    4718    1206      17     522]


This step does the following:
- Creating pipelines for numerical and categorical values.
- Creating a column transformation for the pipelines.
- Creating a model pipeline.

In [15]:
# Numerical branch: mean-impute missing values, then rescale with MinMaxScaler.
numeric_steps = [
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", MinMaxScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: impute with the most frequent value, then one-hot encode
# (categories unseen at fit time are ignored at transform time).
categorical_steps = [
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its branch; columns in neither list are dropped.
col_trans = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols),
    ],
    remainder="drop",
    n_jobs=-1,
)

# Baseline classifier settings; several are overridden by the hyperparameter search.
xgb_params = {
    "tree_method": "hist",
    "device": "cuda",
    "max_bin": 256,
    "max_depth": 10,
    "subsample": 0.8,
    "n_estimators": 100,
    "learning_rate": 0.1,
    "eval_metric": "mlogloss",
    "random_state": 42,
}
xgb_classifier = XGBClassifier(**xgb_params)

# Full pipeline: preprocessing -> univariate feature selection -> classifier.
# The step names are referenced by the search grid ("select__k", "model__...").
model_pipeline = Pipeline(
    steps=[
        ("col_trans", col_trans),
        ("select", SelectKBest(score_func=f_classif)),
        ("model", xgb_classifier),
    ]
)

In [16]:
# Score candidates by support-weighted F1 across all classes.
f1_weighted = make_scorer(f1_score, average="weighted")

# Search space; keys address pipeline steps as "<step>__<parameter>".
param_distribution = dict(
    select__k=[10, 20, 30, "all"],
    model__n_estimators=[100, 200],
    model__max_depth=[6, 10, 15],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__subsample=[0.8, 1.0],
)

# 10 sampled candidates x 3 folds = 30 fits, run 4 at a time.
search_settings = {
    "cv": 3,
    "n_iter": 10,
    "scoring": f1_weighted,
    "n_jobs": 4,
    "verbose": 2,
    "random_state": 42,
}
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distribution,
    **search_settings,
)

Fitting on the training data with a randomized hyperparameter search (RandomizedSearchCV), then saving the best model.

In [17]:
# Forward the per-row class-balancing weights to the pipeline's "model" step.
fit_params = {"model__sample_weight": sample_weights}
random_search.fit(X_train, y_train, **fit_params)

# Persist the refitted best pipeline (preprocessing + selection + classifier).
best_model = random_search.best_estimator_
joblib.dump(best_model, "network_intrusion_model.pkl")

# best_score_ is the mean cross-validated score of the best candidate.
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__learning_rate=0.2, model__max_depth=10, model__n_estimators=100, model__subsample=1.0, select__k=20; total time=  53.1s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__learning_rate=0.2, model__max_depth=10, model__n_estimators=100, model__subsample=1.0, select__k=20; total time=  53.5s
[CV] END model__learning_rate=0.2, model__max_depth=10, model__n_estimators=100, model__subsample=1.0, select__k=20; total time=  53.7s


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__learning_rate=0.01, model__max_depth=10, model__n_estimators=100, model__subsample=0.8, select__k=all; total time=  55.6s
[CV] END model__learning_rate=0.01, model__max_depth=10, model__n_estimators=100, model__subsample=0.8, select__k=all; total time= 1.1min
[CV] END model__learning_rate=0.01, model__max_depth=10, model__n_estimators=100, model__subsample=0.8, select__k=all; total time= 1.1min
[CV] END model__learning_rate=0.1, model__max_depth=15, model__n_estimators=100, model__subsample=0.8, select__k=30; total time= 1.3min
[CV] END model__learning_rate=0.1, model__max_depth=15, model__n_estimators=100, model__subsample=0.8, select__k=30; total time= 1.3min
[CV] END model__learning_rate=0.2, model__max_depth=6, model__n_estimators=100, model__subsample=0.8, select__k=20; total time=  37.0s
[CV] END model__learning_rate=0.2, model__max_depth=6, model__n_estimators=100, model__subsample=0.8, select__k=20; total time=  38.9s
[CV] END model__learning_rate=0.2, model__ma

In [18]:
# Reload the persisted pipeline from disk and predict on the held-out split.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
model_path = "network_intrusion_model.pkl"
loaded_model = joblib.load(model_path)
y_pred = loaded_model.predict(X_test)

Printing a report of the classification.

In [19]:
# Per-class precision/recall/F1 on the test split, labelled with the original class names.
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

                            precision    recall  f1-score   support

                    BENIGN       1.00      0.99      0.99    454620
                       Bot       0.15      1.00      0.26       393
                      DDoS       1.00      1.00      1.00     25606
             DoS GoldenEye       0.90      1.00      0.95      2059
                  DoS Hulk       0.99      1.00      1.00     46215
          DoS Slowhttptest       0.90      0.99      0.94      1100
             DoS slowloris       0.80      1.00      0.89      1159
               FTP-Patator       0.97      1.00      0.98      1588
                Heartbleed       0.50      1.00      0.67         2
              Infiltration       0.47      1.00      0.64         7
                  PortScan       0.99      1.00      1.00     31786
               SSH-Patator       0.68      1.00      0.81      1179
  Web Attack � Brute Force       0.21      0.67      0.32       301
Web Attack � Sql Injection       0.11      0.50