In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

import joblib

In [2]:
# Locations of the train and test CSV splits, relative to this notebook.
train_path, test_path = "../../data/KDDTrain+.csv", "../../data/KDDTest+.csv"

# Read both splits into DataFrames with the default CSV parsing options.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))


In [3]:
# The label to predict; "attack" is dropped from the features together with it.
target_col = "attack_class"
label_columns = [target_col, "attack"]

# Features are every column except the two label columns; the target is the
# `target_col` series taken from the same frame.
X_train, y_train = train_df.drop(columns=label_columns), train_df[target_col]
X_test, y_test = test_df.drop(columns=label_columns), test_df[target_col]

In [4]:
# These three columns are one-hot encoded; every other column of X_train is
# routed to the standard scaler, in its original column order.
categorical_features = ["protocol_type", "service", "flag"]
numerical_features = list(
    filter(lambda name: name not in categorical_features, X_train.columns)
)

# handle_unknown="ignore" lets the encoder accept categories at transform time
# that were absent when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numerical_features),
    ]
)

In [5]:
# Preprocessing and classifier are chained in one pipeline so the same fitted
# transformations are applied to the test features at prediction time.
logreg_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "classifier",
            LogisticRegression(
                class_weight="balanced",
                solver="lbfgs",
                max_iter=1000,
            ),
        ),
    ]
)

# Fit on the training split, then predict labels for the test split.
logreg_pipeline.fit(X_train, y_train)
y_pred = logreg_pipeline.predict(X_test)

In [6]:
# Confusion matrix of true test labels against the pipeline's predictions.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# The report is built twice: as text for display, and as a dict (`report`)
# that is stored in the model metadata later in the notebook.
report_text = classification_report(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
print(report_text)


Confusion Matrix:
 [[6205 1191   61    0    2]
 [ 416 8878  346   34   37]
 [ 123  191 2060   41    6]
 [   3 1801   19  681  381]
 [   0   17    0   16   34]]
              precision    recall  f1-score   support

         DoS       0.92      0.83      0.87      7459
      Normal       0.74      0.91      0.81      9711
       Probe       0.83      0.85      0.84      2421
         R2L       0.88      0.24      0.37      2885
         U2R       0.07      0.51      0.13        67

    accuracy                           0.79     22543
   macro avg       0.69      0.67      0.61     22543
weighted avg       0.82      0.79      0.78     22543



In [None]:
# Persist the fitted pipeline (preprocessing + classifier) as a joblib artifact.
artifacts_dir = Path("../../models/artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(logreg_pipeline, artifacts_dir / "logistic.joblib")

# Metadata
# metadata.json is read, extended with this model's entry and written back,
# so entries stored under other keys are preserved.
metadata_path = Path("../../models/metadata.json")
metadata = {}
if metadata_path.exists():
    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)

metadata["LogisticRegression"] = {
    "model_name": "Logistic Regression",
    "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "task": "Intrusion Detection",
    "metrics": report,
    "hyperparameters": logreg_pipeline.named_steps["classifier"].get_params()
}

# Serialise before opening the file for writing: mode "w" truncates the file
# immediately, so a failure during encoding would otherwise leave a partial
# metadata.json and lose the entries that were already stored in it.
metadata_json = json.dumps(metadata, indent=4)
with open(metadata_path, "w", encoding="utf-8") as f:
    f.write(metadata_json)