Importing all necessary dependencies.

In [82]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
import joblib
import matplotlib.pyplot as plt
import numpy as np

Loading the training and test datasets, separating the features (X) from the target (y), and identifying the numerical and categorical feature columns.

In [None]:
# Load the labelled training data.
df = pd.read_csv("./dataset/Train_data.csv")

# Encode the target as 0 (normal) / 1 (anomaly).
# Series.map() turns every label that is missing from the mapping (including
# missing values) into NaN without complaint, so validate the labels up front
# and fail here with a clear message instead of deep inside model fitting.
label_map = {"normal": 0, "anomaly": 1}
unexpected_labels = set(df["class"].unique()) - set(label_map)
if unexpected_labels:
    raise ValueError(
        "Unexpected values in 'class' column: "
        f"{sorted(unexpected_labels, key=str)}; expected only {list(label_map)}"
    )
y_train = df["class"].map(label_map)
X_train = df.drop(columns=["class"])

# Split the feature names by dtype so each group can get its own preprocessing.
# Columns of any other dtype end up in neither list.
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Load the test data and keep exactly the feature columns selected above
# (numerical first, then categorical). A column missing from the test file
# raises a KeyError here.
test_df = pd.read_csv("./dataset/Test_data.csv")
X_test = test_df[num_cols + cat_cols]

This step does the following:
- Creates preprocessing pipelines for the numerical and the categorical columns.
- Combines both pipelines in a column transformer.
- Builds the model pipeline: preprocessing, feature selection, and a random forest classifier.

In [84]:
# Classifier at the end of the pipeline; the fixed seed keeps runs repeatable.
random_forest_classifier = RandomForestClassifier(random_state=42)

# Numerical columns: fill missing values with the column mean, then rescale
# with MinMaxScaler.
num_steps = [
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", MinMaxScaler()),
]
num_pipeline = Pipeline(num_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories unseen during fitting
# are ignored rather than raising an error.
cat_steps = [
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("one-hot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
cat_pipeline = Pipeline(cat_steps)

# Route each column group to its pipeline; columns in neither group are dropped.
column_routes = [
    ("num_pipeline", num_pipeline, num_cols),
    ("cat_pipeline", cat_pipeline, cat_cols),
]
col_trans = ColumnTransformer(column_routes, remainder="drop", n_jobs=-1)

# Full model: preprocess -> univariate feature selection (ANOVA F-test) -> classifier.
# The step names are referenced by the grid-search parameter keys.
model_steps = [
    ("col_trans", col_trans),
    ("select", SelectKBest(score_func=f_classif)),
    ("model", random_forest_classifier),
]
model_pipeline = Pipeline(model_steps)

In [85]:
# Hyperparameter grid, keyed as "<pipeline step>__<parameter>".
# 4 * 2 * 3 * 2 = 48 candidate combinations.
param_grid = dict(
    select__k=[10, 20, 30, "all"],
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# Exhaustive search over the grid, scoring each candidate by F1 with
# 5-fold cross-validation, run in a single job.
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    scoring="f1",
    cv=5,
    n_jobs=1,
)

Fitting the grid search on the training features and labels to find the best model, then saving that model to disk.

In [None]:
# Run the full grid search on the training data.
grid_search.fit(X_train, y_train)

# Keep a handle on the winning pipeline and report how it was chosen.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Persist the winning pipeline so it can be reloaded later.
joblib.dump(best_model, "network_intrusion_model.pkl")

In [None]:
# Reload the persisted pipeline from disk and predict on the test features.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model_path = "network_intrusion_model.pkl"
loaded_model = joblib.load(model_path)
y_pred = loaded_model.predict(X_test)

Plotting a bar chart of how many test samples were predicted as each class.

In [None]:
# Class ids produced by the model, with their display names.
class_names = {0: "Normal", 1: "Attack"}

# Count predictions for every known class id explicitly. Deriving the bars
# from np.unique(y_pred) would omit a class that was never predicted, and the
# fixed tick labels would then be attached to the wrong bar (e.g. a lone
# class-1 bar labelled "Normal"). Counting per id keeps bar i tied to class i
# and shows an absent class as a zero-height bar.
unique = np.array(list(class_names))
counts = np.array([np.count_nonzero(y_pred == class_id) for class_id in unique])

plt.bar(unique, counts)
plt.xticks(unique, labels=[class_names[class_id] for class_id in unique])
plt.title("Prediction Class Distribution")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()