In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB


# Load the dataset

The dataset from Kaggle doesn't have column headers, so we'll add them.

You will need to download 'KDDTrain+.txt' and 'KDDTest+.txt' from Kaggle and place them in the './archive (2)/' folder next to this notebook (or adjust the paths in the loading cell below).


In [11]:

# Header names to attach to the header-less data files, in file order.
# The list is assembled from smaller chunks purely for readability; the
# resulting order is what matters because names are assigned positionally.
columns = (
    [
        'duration', 'protocol_type', 'service', 'flag',
        'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    ]
    + [
        'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login',
    ]
    + [
        'count', 'srv_count',
        'serror_rate', 'srv_serror_rate',
        'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    ]
    + [
        # the dst_host_* family
        'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    ]
    # the last two columns are not features: the label and a difficulty field
    + ['attack', 'difficulty']
)

In [12]:
# Read both header-less files with the same options; `columns` supplies the
# names. The first path becomes train_df, the second test_df.
train_df, test_df = [
    pd.read_csv(path, header=None, names=columns)
    for path in ('./archive (2)/KDDTrain+.txt', './archive (2)/KDDTest+.txt')
]


# --- 1. Data Preprocessing ---


In [13]:
# Stack the two frames row-wise (train rows first) so every preprocessing
# step below sees the same set of category values; the index is renumbered
# 0..n-1.
combined_df = pd.concat((train_df, test_df), axis=0, ignore_index=True)


In [14]:
# Identify categorical and numerical columns
categorical_cols = ['protocol_type', 'service', 'flag']

# Every column that is neither categorical nor a label/target column is
# treated as numeric. 'attack_class' is excluded too: it does not exist yet
# on a fresh top-to-bottom run, but if this cell is re-executed after the
# target column has been added it must not be picked up as a feature
# (it would later be missing from X and break the scaling step).
non_numeric = set(categorical_cols) | {'attack', 'difficulty', 'attack_class'}
numerical_cols = [col for col in combined_df.columns if col not in non_numeric]


In [15]:
# Replace each categorical column with integer codes. A fresh encoder is
# fitted per column; after the loop `le` holds the encoder of the last column.
# NOTE(review): integer codes impose an arbitrary ordering on nominal values,
# which linear and distance-based models will interpret as magnitude —
# consider one-hot encoding if that matters for the comparison.
for col in categorical_cols:
    le = LabelEncoder().fit(combined_df[col])
    combined_df[col] = le.transform(combined_df[col])


In [16]:
# Binary target: 0 where the label is exactly 'normal', 1 for anything else.
# Vectorised comparison instead of a per-row Python lambda.
combined_df['attack_class'] = combined_df['attack'].ne('normal').astype('int64')


In [17]:
# The raw label and the difficulty field are no longer needed once
# 'attack_class' exists; remove both so they cannot end up among the features.
combined_df = combined_df.drop(['attack', 'difficulty'], axis='columns')


In [18]:
# Target vector first, then the feature matrix (everything except the target).
y = combined_df['attack_class']
X = combined_df.drop('attack_class', axis=1)


In [19]:
# Scaling numerical features
#
# The scaler's mean/variance must be learned from training rows only;
# fitting it on every row lets statistics of the held-out rows leak into the
# features the models are trained on. The row positions of the training
# partition are therefore computed first, using the SAME arguments
# (test_size, random_state, stratify) as the train_test_split call that
# later splits X and y — with identical arguments and the same number of
# rows that call selects the same rows. Keep the two calls in sync.
row_positions = list(range(len(X)))
train_positions, _ = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X[numerical_cols].iloc[train_positions])

# Apply the training-set statistics to every row (train and held-out alike).
X[numerical_cols] = scaler.transform(X[numerical_cols])


In [20]:
# Random 80/20 split of the combined rows, stratified on the target so both
# parts keep the same class balance. Note that this is a fresh random
# partition of the combined data — it does not reproduce the original
# KDDTrain+/KDDTest+ file boundary.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# --- 2. Model Training and Evaluation ---

In [21]:
# Define models and their hyperparameter grids.
#
# Each entry maps a display name to:
#   "model"  - an unfitted estimator instance
#   "params" - the grid handed to GridSearchCV; an empty dict means
#              "fit once with the defaults, no search".
# Every estimator that accepts a seed is given random_state=42 so a
# Restart & Run All reproduces the same numbers.
models_and_params = {
    "Logistic Regression": {
        # 'saga' supports both l1 and l2 penalties; it is a stochastic
        # solver, so it needs a seed like the other models.
        "model": LogisticRegression(max_iter=1000, solver='saga', random_state=42),
        "params": {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2']
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'n_estimators': [50, 100],
            'max_depth': [10, 20]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            'n_estimators': [50, 100],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5]
        }
    },
    "Support Vector Machine (SVM)": {
        # NOTE(review): probability=True adds an internal cross-validated
        # calibration step that makes fitting noticeably slower; drop it if
        # predict_proba is not used anywhere downstream.
        "model": SVC(probability=True, random_state=42),
        "params": {
            'C': [0.1, 1],
            'kernel': ['rbf', 'linear']
        }
    },
    "K-Nearest Neighbors (KNN)": {
        "model": KNeighborsClassifier(),
        "params": {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20, None]
        }
    },
    "Multi-layer Perceptron": {
        "model": MLPClassifier(max_iter=500, random_state=42, early_stopping=True),
        "params": {
            'hidden_layer_sizes': [(50,), (100,)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001]
        }
    },
    # Naive Bayes has no significant hyperparameters to tune, so we run it directly.
    "Gaussian Naive Bayes": {
        "model": GaussianNB(),
        "params": {}
    }
}

# Filled in by the training loop: per-model metrics and the fitted estimators.
results = {}
best_estimators = {}


In [22]:
# Training inputs used by the model loop. Here they are simply the full
# training split; point them at a subset to shorten the search.
X_TRAIN_DATA, Y_TRAIN_DATA = X_train, y_train


In [23]:
# For every configured model: tune it with a 3-fold grid search when a grid
# is given (otherwise fit it once as-is), keep the fitted estimator, and
# score it on the held-out split.
for name, mp in models_and_params.items():
    print(f"--- Processing: {name} ---")

    if not mp['params']:
        # Empty grid: nothing to search, fit the configured instance directly.
        best_model = mp['model']
        best_model.fit(X_TRAIN_DATA, Y_TRAIN_DATA)
        results[name] = {"Best CV Score": None, "Best Params": "N/A"}
    else:
        grid_search = GridSearchCV(
            estimator=mp['model'],
            param_grid=mp['params'],
            scoring='accuracy',
            cv=3,
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_TRAIN_DATA, Y_TRAIN_DATA)
        # GridSearchCV refits the winning configuration on all training data.
        best_model = grid_search.best_estimator_

        results[name] = {
            "Best CV Score": grid_search.best_score_,
            "Best Params": grid_search.best_params_,
        }
        print(f"Best CV Score: {grid_search.best_score_:.4f}")
        print(f"Best Params: {grid_search.best_params_}")

    best_estimators[name] = best_model
    y_pred = best_model.predict(X_test)

    # Held-out metrics; precision/recall/F1 are support-weighted averages
    # over the two classes. Key order here fixes the column order of the
    # summary table built later.
    results[name].update({
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1-Score": f1_score(y_test, y_pred, average='weighted'),
    })

    print(f"\n--- Test Set Evaluation for {name} ---")
    print(classification_report(y_test, y_pred, target_names=['Normal', 'Attack']))
    print("-" * 70)


--- Processing: Logistic Regression ---
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best CV Score: 0.9364
Best Params: {'C': 1, 'penalty': 'l2'}

--- Test Set Evaluation for Logistic Regression ---
              precision    recall  f1-score   support

      Normal       0.92      0.96      0.94     15411
      Attack       0.95      0.92      0.93     14293

    accuracy                           0.94     29704
   macro avg       0.94      0.94      0.94     29704
weighted avg       0.94      0.94      0.94     29704

----------------------------------------------------------------------
--- Processing: Random Forest ---
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best CV Score: 0.9955
Best Params: {'max_depth': 20, 'n_estimators': 50}

--- Test Set Evaluation for Random Forest ---
              precision    recall  f1-score   support

      Normal       0.99      1.00      1.00     15411
      Attack       1.00      0.99      1.00     14293

    accuracy                           1.00     29704
   macro avg       1.00      1.00      1.00     29704
we

# --- 3. Comparison of Results ---

In [24]:
# Build the comparison table: one row per model, one column per numeric
# result. 'Best Params' is left out (it is a dict, reported separately).
# Rows are ordered best-first by F1-Score.
results_summary = {
    model_name: {
        metric: value
        for metric, value in model_results.items()
        if metric != 'Best Params'
    }
    for model_name, model_results in results.items()
}
results_df = (
    pd.DataFrame(results_summary)
    .transpose()
    .sort_values('F1-Score', ascending=False)
)


In [25]:
# Banner followed by the plain-text rendering of the comparison table.
print("\n--- 🚀 Final Model Performance Comparison 🚀 ---", results_df, sep="\n")



--- 🚀 Final Model Performance Comparison 🚀 ---
                              Best CV Score  Accuracy  Precision    Recall  \
Random Forest                      0.995539  0.995859   0.995866  0.995859   
Decision Tree                      0.994066  0.995623   0.995627  0.995623   
Gradient Boosting                  0.993932  0.994176   0.994178  0.994176   
K-Nearest Neighbors (KNN)          0.992324  0.992459   0.992460  0.992459   
Multi-layer Perceptron             0.989656  0.990069   0.990069  0.990069   
Support Vector Machine (SVM)       0.949947  0.952936   0.954236  0.952936   
Logistic Regression                0.936354  0.936642   0.937081  0.936642   
Gaussian Naive Bayes                    NaN  0.871734   0.872102  0.871734   

                              F1-Score  
Random Forest                 0.995859  
Decision Tree                 0.995623  
Gradient Boosting             0.994176  
K-Nearest Neighbors (KNN)     0.992459  
Multi-layer Perceptron        0.990069  
Sup

In [26]:
# List the winning grid-search settings; models fitted without a search
# carry the "N/A" placeholder and are skipped.
print("\n--- ⚙️ Best Hyperparameters Found ⚙️ ---")
for name, res in results.items():
    if res['Best Params'] == "N/A":
        continue
    print(f"\n{name}:")
    print(f"  {res['Best Params']}")


--- ⚙️ Best Hyperparameters Found ⚙️ ---

Logistic Regression:
  {'C': 1, 'penalty': 'l2'}

Random Forest:
  {'max_depth': 20, 'n_estimators': 50}

Gradient Boosting:
  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

Support Vector Machine (SVM):
  {'C': 1, 'kernel': 'rbf'}

K-Nearest Neighbors (KNN):
  {'n_neighbors': 3, 'weights': 'distance'}

Decision Tree:
  {'criterion': 'entropy', 'max_depth': 20}

Multi-layer Perceptron:
  {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,)}
