Importing all necessary dependencies.

In [28]:
import pandas as pd 
from glob import glob
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
import joblib
import matplotlib.pyplot as plt
import numpy as np

Loading the datasets, extracting the X and y values, splitting them into train/test sets, and identifying the numerical/categorical columns.

In [None]:
# Load every CSV under CICIDS2017/ into one frame. glob() returns paths in
# arbitrary filesystem order, so sort them: a stable row order keeps the
# seeded train/test split below reproducible across machines.
dataset = sorted(glob("CICIDS2017/*.csv"))
if not dataset:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found matching 'CICIDS2017/*.csv'")
df = pd.concat(map(pd.read_csv, dataset), ignore_index=True)

# Strip stray whitespace from the header names before looking up "Label".
df.columns = df.columns.str.strip()

# Rows without a label cannot be used for supervised training. Drop them
# before the string cast; otherwise astype(str) turns NaN into a literal
# "nan" class that the encoder would learn as a real category.
n_rows_loaded = len(df)
df = df.dropna(subset=["Label"]).reset_index(drop=True)
if len(df) < n_rows_loaded:
    print(f"Dropped {n_rows_loaded - len(df)} rows with a missing Label")

le = LabelEncoder()
df["Label"] = df["Label"].astype(str).str.strip()  # clean up label column just in case
y = le.fit_transform(df["Label"])
X = df.drop(columns=["Label"])

# Treat textual and numeric infinities as missing values.
X = X.replace(["Infinity", "-Infinity", np.inf, -np.inf], np.nan)

# Coerce text columns to numbers; entries that cannot be parsed become NaN.
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = pd.to_numeric(X[col], errors="coerce")

# to_numeric may itself parse strings such as "inf" into infinities, so clean
# once more *before* splitting; constant-column detection then sees the same
# values the model will be trained on.
X = X.replace([np.inf, -np.inf], np.nan)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Columns with at most one distinct non-null value in the training split carry
# no signal. The list is derived from X_train only and then applied to both
# splits, so nothing is learned from the test data.
constant_cols = [col for col in X_train.columns if X_train[col].nunique() <= 1]
# Reassign rather than mutate in place: avoids SettingWithCopyWarning on the
# frames returned by train_test_split and keeps the cell safe to re-run.
X_train = X_train.drop(columns=constant_cols)
X_test = X_test.drop(columns=constant_cols)

# "number" matches every numeric dtype (not only int64/float64), so no numeric
# column is silently discarded by the ColumnTransformer's remainder="drop".
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Persist the encoder so predictions can be mapped back to label names later.
joblib.dump(le, "label_encoder.pkl")

print(f"X_train shape: {X_train.shape}")
print(f"Numerical columns: {len(num_cols)}, Categorical columns: {len(cat_cols)}")
print(f"Class distribution in y_train: {np.bincount(y_train)}")

This step does the following:
- Creating pipelines for numerical and categorical values.
- Creating a column transformation for the pipelines.
- Creating a model pipeline.

In [30]:
# Numeric branch: fill gaps with the column mean, then min-max scale.
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. Categories not seen during fit are ignored.
cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group through its own branch; any column listed in
# neither num_cols nor cat_cols is dropped.
col_trans = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols),
    ],
    remainder="drop",
    n_jobs=-1,
)

random_forest_classifier = RandomForestClassifier(random_state=42)

# Full model: preprocessing -> univariate feature selection (ANOVA F-test)
# -> random forest. The step names are what the search parameters refer to.
model_pipeline = Pipeline([
    ("col_trans", col_trans),
    ("select", SelectKBest(score_func=f_classif)),
    ("model", random_forest_classifier),
])

In [31]:
# Search space. Keys follow the "<pipeline step>__<parameter>" convention,
# so they address the "select" and "model" steps of model_pipeline.
param_distribution = dict(
    select__k=[10, 20, 30, "all"],
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# Weighted F1: per-class F1 averaged with class support as the weight.
f1_weighted = make_scorer(f1_score, average="weighted")

# Sample 20 candidates from the space above and score each with 5-fold CV;
# the fixed random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distribution,
    n_iter=20,
    scoring=f1_weighted,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

Fitting against the X and y training values to find the best model using randomized search.

In [None]:
# Run the randomized hyper-parameter search on the training split only.
random_search.fit(X_train, y_train)

# Keep a handle on the winning pipeline and persist that same object to disk.
best_model = random_search.best_estimator_
joblib.dump(best_model, "network_intrusion_model.pkl")

# best_score_ is the search's cross-validated score for the best candidate
# (weighted F1 here), not a score on the held-out test set.
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

In [None]:
# Reload the pipeline from disk rather than reusing the in-memory object, so
# the evaluation exercises exactly the artifact that was saved.
# NOTE: joblib.load unpickles the file; only load model files you trust.
loaded_model = joblib.load("network_intrusion_model.pkl")
# Predict encoded class ids for the held-out test split.
y_pred = loaded_model.predict(X_test)

Printing a report of the classification.

In [None]:
# Pass the full set of encoded class ids explicitly so the report rows always
# line up with le.classes_. Without `labels`, classification_report infers the
# classes from y_test/y_pred and raises if a class is absent from both,
# because the count no longer matches target_names.
print(
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    )
)