In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import joblib

In [2]:
data_path = "../../data/KDD_reduced.csv"

data_df = pd.read_csv(data_path)


In [3]:
target_col = "attack_class"

# Drop rows where the target column is NaN
data_df_cleaned = data_df.dropna(subset=[target_col])

X = data_df_cleaned.drop(columns=[target_col, "attack"])
y = data_df_cleaned[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
categorical_features = ["protocol_type", "service", "flag"]
numerical_features = [col for col in X_train.columns if col not in categorical_features]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", StandardScaler(), numerical_features)
])

In [5]:
logreg_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(
        max_iter=1000,
        solver="lbfgs",
        class_weight="balanced"
    ))
])

logreg_pipeline.fit(X_train, y_train)
y_pred = logreg_pipeline.predict(X_test)

In [6]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

report = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))


Confusion Matrix:
 [[ 9009   199    13     0     3]
 [   65 12462   310   371   178]
 [    8    29  2327     3     6]
 [    0     6     0   186     7]
 [    0     2     0     4     7]]
              precision    recall  f1-score   support

         DoS       0.99      0.98      0.98      9224
      Normal       0.98      0.93      0.96     13386
       Probe       0.88      0.98      0.93      2373
         R2L       0.33      0.93      0.49       199
         U2R       0.03      0.54      0.07        13

    accuracy                           0.95     25195
   macro avg       0.64      0.87      0.68     25195
weighted avg       0.97      0.95      0.96     25195



In [7]:
artifacts_dir = Path("../../models/artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(logreg_pipeline, artifacts_dir / "logistic.joblib")

# Metadata
metadata_path = Path("../../models/metadata.json")
metadata = {}
if metadata_path.exists():
    with open(metadata_path, "r") as f:
        metadata = json.load(f)

metadata["LogisticRegression"] = {
    "model_name": "Logistic Regression",
    "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "task": "Intrusion Detection",
    "metrics": report,
    "hyperparameters": logreg_pipeline.named_steps["classifier"].get_params()
}

with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.pipeline import Pipeline

# Create a pipeline for RandomizedSearchCV that includes preprocessing
# The preprocessor was defined in cell MhI9Lo4C1IDK

log_reg_pipeline_for_search = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Hyperparameter distributions for the classifier step within the pipeline
param_dist = {
    'classifier__C': np.logspace(-4, 4, 20),      # regularization strength
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}

# Random Search
random_search_lr = RandomizedSearchCV(
    estimator=log_reg_pipeline_for_search,
    param_distributions=param_dist,
    n_iter=10,              # number of combinations
    scoring='f1_macro',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Train
random_search_lr.fit(X_train, y_train)

# Best parameters
print("Best Parameters:")
print(random_search_lr.best_params_)

print("\nBest CV Score:")
print(random_search_lr.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


45 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\medaz\Documents\IntrusionDetectionSystem\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\medaz\Documents\IntrusionDetectionSystem\venv\Lib\site-packages\sklearn\base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\medaz\Documents\IntrusionDetectionSystem\venv\Lib\site-packages\sklearn\pipeline.py", line 621, in fit
    self._final_estimator.fit(Xt, y, **last_s

Best Parameters:
{'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': np.float64(0.08858667904100823)}

Best CV Score:
0.7744885546553166




In [9]:
best_lr = random_search_lr.best_estimator_

y_pred = best_lr.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

         DoS       0.99      0.97      0.98      9224
      Normal       0.97      0.98      0.98     13386
       Probe       0.96      0.94      0.95      2373
         R2L       0.59      0.58      0.59       199
         U2R       0.50      0.31      0.38        13

    accuracy                           0.97     25195
   macro avg       0.80      0.76      0.78     25195
weighted avg       0.97      0.97      0.97     25195

Confusion Matrix:
 [[ 8983   236     5     0     0]
 [   44 13179    82    77     4]
 [   22   117  2233     1     0]
 [    1    82     0   116     0]
 [    1     5     0     3     4]]


In [12]:
model_path = "../../models/logreg_model.joblib"

In [13]:
joblib.dump(best_lr, model_path)

['../../models/logreg_model.joblib']