In [1]:
# ----------------------------------------------------------------------
# 0. SETUP AND LIBRARY IMPORTS
# ----------------------------------------------------------------------
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ----------------------------------------------------------------------
# 1. DATA LOADING AND CONCATENATION
# ----------------------------------------------------------------------

# CRITICAL STEP: Ensure this path is 100% correct.
# If your folder name is different, change it here:
base_path = '/content/drive/MyDrive/CICIDS2017/'

# --- AUTOMATIC FILE DISCOVERY ---
# This finds ALL files ending in .csv (or .CSV) inside the directory.
file_paths = glob.glob(base_path + '*.csv')
# Add search for common capital extension if necessary (optional but safe)
file_paths.extend(glob.glob(base_path + '*.CSV'))

if not file_paths:
    raise ValueError(f"CRITICAL ERROR: No CSV files found in the directory. Path: {base_path}")

data_frames = []
print(f"Starting to load {len(file_paths)} CSV files...")
for file_path in file_paths:
    try:
        df_temp = pd.read_csv(file_path, low_memory=False)
        data_frames.append(df_temp)
        print(f"Loaded successfully: {file_path.split('/')[-1]}")
    except Exception as e:
        print(f"FATAL ERROR during reading: {file_path.split('/')[-1]}. Error: {e}")

if not data_frames:
    raise ValueError("Cannot proceed. Data loading failed after discovery. Check file integrity or permissions.")

df = pd.concat(data_frames, ignore_index=True)
print(f"\nAll files merged. Total dataset shape: {df.shape}")

# ----------------------------------------------------------------------
# 2. CLEANING AND SCALING (DATA PREPARATION PIPELINE)
# ----------------------------------------------------------------------

# 2.1 Clean Column Names
df.columns = df.columns.str.strip()

# 2.2 Drop Irrelevant/ID Columns
cols_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Unnamed: 0']
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns], errors='ignore')

# 2.3 Handle NaN and Infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# 2.4 Handle Target Label (Binary Classification: 0=Normal, 1=Attack)
df['Label'] = df['Label'].apply(lambda x: 0 if str(x).strip() == 'BENIGN' else 1)
Y = df['Label']
X = df.drop(columns=['Label'])

# 2.5 Feature Scaling (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# 2.6 Data Split (80% Train, 20% Test)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42, stratify=Y)

# 2.7 Handle Imbalance with SMOTE on Training Data
print("\nStarting SMOTE for imbalance handling...")

from imblearn.over_sampling import SMOTE

# Balance ONLY minority classes → avoid RAM crash
sm = SMOTE(random_state=42, sampling_strategy='not majority')

X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)

print("SMOTE complete successfully!")
print(f"New training shape: {X_train_res.shape}")



# ----------------------------------------------------------------------
# 3. MODEL 1: RANDOM FOREST IMPLEMENTATION
# ----------------------------------------------------------------------

print("\n\n--- Training Random Forest Model ---")

# A. Initialization and Training
# n_jobs=-1 uses all available cores for faster training
rf_model = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=42, n_jobs=-1)
rf_model.fit(X_train_res, Y_train_res)


# B. Testing and Prediction
Y_pred_rf = rf_model.predict(X_test)
# Save probabilities for the Hybrid Model (Task 3)
Y_proba_rf = rf_model.predict_proba(X_test)

# C. Evaluation
print("\nRandom Forest Classification Report:")
print(classification_report(Y_test, Y_pred_rf))
print(f"Random Forest Accuracy: {accuracy_score(Y_test, Y_pred_rf):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(Y_test, Y_pred_rf))

rf_metrics = classification_report(Y_test, Y_pred_rf, output_dict=True)

Starting to load 2 CSV files...
Loaded successfully: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loaded successfully: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv

All files merged. Total dataset shape: (396111, 79)

Starting SMOTE for imbalance handling...
SMOTE complete successfully!
New training shape: (425178, 78)


--- Training Random Forest Model ---

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     53148
           1       1.00      1.00      1.00     26041

    accuracy                           1.00     79189
   macro avg       1.00      1.00      1.00     79189
weighted avg       1.00      1.00      1.00     79189

Random Forest Accuracy: 0.9997

Confusion Matrix:
[[53148     0]
 [   24 26017]]


In [2]:
# ----------------------------------------------------------------------
# 4. MODEL 2: DENSE NEURAL NETWORK (DL Implementation)
# ----------------------------------------------------------------------

# NOTE: This code assumes X_train and Y_train are defined from Sections 1, 2, 3.

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.utils import class_weight
import numpy as np

print("\n\n--- 4. Training Dense Neural Network Model ---")

# We use the original training variables (X_train, Y_train) to avoid memory issues
# that SMOTE caused, and apply class weighting for imbalance handling.
X_train_data = X_train
Y_train_data = Y_train

# 1. Calculate Class Weights for Imbalance
# This tells the model to pay more attention to the minority (Attack) class.
classes = np.unique(Y_train_data)
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=Y_train_data
)
class_weights_dict = dict(zip(classes, weights))
print(f"Calculated Class Weights: {class_weights_dict}")


# 2. Build the DNN Architecture
input_dim = X_train_data.shape[1]
num_classes = 1 # Binary classification output

dnn_model = Sequential([
    Dense(512, activation='relu', input_shape=(input_dim,)),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='sigmoid')
])


# 3. Compile the Model
dnn_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# 4. Training (This may take a few minutes)
print("Starting DNN training (using class weights for balance)...")
history = dnn_model.fit(X_train_data, Y_train_data,
                        epochs=10, # Number of training iterations
                        batch_size=32,
                        validation_split=0.1,
                        class_weight=class_weights_dict, # Apply class weights here
                        verbose=1)

# 5. Testing and Prediction
Y_pred_proba_dnn = dnn_model.predict(X_test)

# Convert probabilities to classes (0 or 1)
Y_pred_dnn = (Y_pred_proba_dnn > 0.5).astype(int)

# Get probabilities for Hybrid Model (Task 3)
# Y_proba_dnn is the confidence for both classes (Normal and Attack)
Y_proba_dnn = np.concatenate([1 - Y_pred_proba_dnn, Y_pred_proba_dnn], axis=1)

# 6. Evaluation
print("\nDNN Classification Report:")
print(classification_report(Y_test, Y_pred_dnn))
print(f"DNN Accuracy: {accuracy_score(Y_test, Y_pred_dnn):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(Y_test, Y_pred_dnn))

dnn_metrics = classification_report(Y_test, Y_pred_dnn, output_dict=True)



--- 4. Training Dense Neural Network Model ---
Calculated Class Weights: {np.int64(0): np.float64(0.744989157482278), np.int64(1): np.float64(1.5204533236050843)}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Starting DNN training (using class weights for balance)...
Epoch 1/10
[1m8909/8909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 7ms/step - accuracy: 0.9856 - loss: 0.0421 - precision: 0.9710 - recall: 0.9853 - val_accuracy: 0.9904 - val_loss: 0.0210 - val_precision: 0.9749 - val_recall: 0.9964
Epoch 2/10
[1m8909/8909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 7ms/step - accuracy: 0.9930 - loss: 0.0197 - precision: 0.9847 - recall: 0.9943 - val_accuracy: 0.9970 - val_loss: 0.0138 - val_precision: 0.9928 - val_recall: 0.9982
Epoch 3/10
[1m8909/8909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 7ms/step - accuracy: 0.9936 - loss: 0.0176 - precision: 0.9854 - recall: 0.9953 - val_accuracy: 0.9817 - val_loss: 0.0634 - val_precision: 0.9489 - val_recall: 0.9981
Epoch 4/10
[1m8909/8909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 7ms/step - accuracy: 0.9944 - loss: 0.0156 - precision: 0.9865 - recall: 0.9966 - val_accuracy: 0.9973 - val_loss: 0.013

# **Hybrid Model**

In [3]:
# ----------------------------------------------------------------------
# 5. HYBRID MODEL DEVELOPMENT (Stacked Generalization)
# ----------------------------------------------------------------------

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np

print("\n\n--- 5. Training Hybrid (Stacked) Model ---")

# --- Step 1: Create the new training feature set for the Meta-Classifier ---
# Get training predictions from Random Forest
Y_proba_rf_train = rf_model.predict_proba(X_train)
# Get training predictions from DNN
Y_proba_dnn_train = dnn_model.predict(X_train)
Y_proba_dnn_train = np.concatenate([1 - Y_proba_dnn_train, Y_proba_dnn_train], axis=1)

# Create the Hybrid training feature set
X_hybrid_train = np.concatenate((Y_proba_rf_train, Y_proba_dnn_train), axis=1)
Y_hybrid_train = Y_train
print(f"Hybrid Train Feature Shape: {X_hybrid_train.shape}")


# --- Step 2: Train the Meta-Classifier (Logistic Regression) ---
meta_classifier = LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000, n_jobs=-1)

print("\nTraining Meta-Classifier (Logistic Regression)...")
meta_classifier.fit(X_hybrid_train, Y_hybrid_train)


# --- Step 3: Create the testing feature set for the Meta-Classifier ---
# Concatenate the test prediction probabilities created in the RF and DNN steps
X_hybrid_test = np.concatenate((Y_proba_rf, Y_proba_dnn), axis=1)
print(f"Hybrid Test Feature Shape: {X_hybrid_test.shape}")


# --- Step 4: Evaluate the Hybrid Model ---
Y_pred_hybrid = meta_classifier.predict(X_hybrid_test)

print("\nHybrid Model Classification Report:")
print(classification_report(Y_test, Y_pred_hybrid))
print(f"Hybrid Model Accuracy: {accuracy_score(Y_test, Y_pred_hybrid):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(Y_test, Y_pred_hybrid))



--- 5. Training Hybrid (Stacked) Model ---
[1m9899/9899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step
Hybrid Train Feature Shape: (316753, 4)

Training Meta-Classifier (Logistic Regression)...
Hybrid Test Feature Shape: (79189, 4)

Hybrid Model Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     53148
           1       1.00      1.00      1.00     26041

    accuracy                           1.00     79189
   macro avg       1.00      1.00      1.00     79189
weighted avg       1.00      1.00      1.00     79189

Hybrid Model Accuracy: 0.9997

Confusion Matrix:
[[53146     2]
 [   21 26020]]
