In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score

# --- Define Simulated Data Structure ---
# These are key feature names from the real CICIDS2017 dataset
feature_names = [
    'Destination Port', 'Flow Duration', 'Total Fwd Packets', 
    'Total Backward Packets', 'Total Length of Fwd Packets', 
    'Total Length of Bwd Packets', 'Flow IAT Mean', 'Fwd IAT Std', 
    'Fwd Header Length', 'Subflow Fwd Packets', 'Subflow Bwd Bytes'
]
N = 10000  # Number of simulated rows

# Generate random numerical data for the features
data = {}
for col in feature_names:
    data[col] = np.random.rand(N) * 1000

# Create the target variable (Binary_Label)
# We assume about 90% is Normal (0) and 10% is Attack (1)
labels = np.random.choice([0, 1], size=N, p=[0.9, 0.1])
data['Binary_Label'] = labels

# Create the DataFrame and define X (Features), Y (Target)
df_simulated = pd.DataFrame(data)
X = df_simulated.drop(columns=['Binary_Label']).astype('float32')
Y = df_simulated['Binary_Label']

# Save the simulated data. The deployment script will read this file.
df_simulated.to_csv('simulated_traffic_data.csv', index=False)

print(f"Simulated Data Created and Saved: {len(df_simulated)} rows.")
print(f"Features (X) shape: {X.shape}")
print("Target Distribution (0=Normal, 1=Attack):")
print(Y.value_counts())

Simulated Data Created and Saved: 10000 rows.
Features (X) shape: (10000, 11)
Target Distribution (0=Normal, 1=Attack):
Binary_Label
0    9000
1    1000
Name: count, dtype: int64


In [3]:
# 1. Split Data into Training and Testing Sets
# We reserve 30% (3000 rows) of the data to test the model on unseen traffic
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# 2. Filter for Normal Traffic (Crucial for Anomaly Detection)
# The model only learns the patterns of GOOD traffic (Label 0)
X_train_normal = X_train[Y_train == 0]
print(f"Training on {len(X_train_normal)} 'Normal' samples...")

# 3. Model Initialization and Training
model_if = IsolationForest(
    n_estimators=100, 
    contamination=0.1, # Matches our simulated 10% anomaly rate
    random_state=42, 
    n_jobs=-1 # Uses all CPU cores for speed
)

print("Starting Isolation Forest training...")
model_if.fit(X_train_normal)

# 4. Model Evaluation
# Predict on the full test set
Y_pred_if = model_if.predict(X_test)

# Convert Isolation Forest output (1=Normal, -1=Anomaly) to Binary (0=Normal, 1=Attack)
Y_pred_binary = np.where(Y_pred_if == -1, 1, 0)

# Display Performance Metrics
print("\n--- Model Performance (Anomaly Detector) ---")
print(f"Overall Accuracy: {accuracy_score(Y_test, Y_pred_binary):.4f}")
print("Classification Report (Focus on Recall for 'Attack'):")
print(classification_report(Y_test, Y_pred_binary, target_names=['Normal (0)', 'Attack (1)']))

# 5. Save the Trained Model
# This creates the deployable file the 'realtime_detector.py' will load
joblib.dump(model_if, 'nids_anomaly_detector_model.pkl')
print("\nModel saved successfully as 'nids_anomaly_detector_model.pkl'")

Training on 6273 'Normal' samples...
Starting Isolation Forest training...

--- Model Performance (Anomaly Detector) ---
Overall Accuracy: 0.8203
Classification Report (Focus on Recall for 'Attack'):
              precision    recall  f1-score   support

  Normal (0)       0.90      0.90      0.90      2711
  Attack (1)       0.10      0.10      0.10       289

    accuracy                           0.82      3000
   macro avg       0.50      0.50      0.50      3000
weighted avg       0.83      0.82      0.82      3000


Model saved successfully as 'nids_anomaly_detector_model.pkl'
