In [None]:
import pandas as pd
import glob
import os
import numpy as np
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Load and Merge Real Data ---
# Reads every CSV under data_path, merges them into one frame, cleans it and
# exports the numeric feature matrix. Leaves X (features) and Y (binary label)
# defined for the training cell.
data_path = './MachineLearningCVE/'

# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps the
# merged row order stable, which matters for drop_duplicates(keep='first') and
# for any seeded split performed on the result.
all_files = sorted(glob.glob(os.path.join(data_path, "*.csv")))
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found in '{data_path}'. Check that the dataset has been unzipped there."
    )

li = []
print("Starting to load and merge actual CICIDS2017 files. This will take a few minutes...")

for filename in all_files:
    try:
        df = pd.read_csv(filename, low_memory=False)
        df.columns = df.columns.str.strip()  # Clean column names
    except Exception as e:
        # Best-effort load: one unreadable file should not abort the merge,
        # but report why it was skipped so the problem can be diagnosed.
        print(f"Skipping problematic file: {os.path.basename(filename)} ({type(e).__name__}: {e})")
    else:
        li.append(df)

if not li:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(f"None of the {len(all_files)} CSV files in '{data_path}' could be loaded.")

df_raw = pd.concat(li, axis=0, ignore_index=True, join='outer')
df_raw.rename(columns={'Label': 'Attack_Type'}, inplace=True)
print(f"Total rows loaded: {len(df_raw)}")

# --- 2. Cleaning and Saving Features ---
# Missing values and +/-infinity are both replaced with 0, then exact
# duplicate rows are removed (first occurrence kept).
df_raw.fillna(0, inplace=True)
df_raw.replace([np.inf, -np.inf], 0, inplace=True)
df_raw.drop_duplicates(inplace=True, keep='first')

# Binary target: 0 for rows labelled exactly 'BENIGN', 1 for everything else.
# Vectorised comparison instead of a per-row Python lambda.
df_raw['Binary_Label'] = (df_raw['Attack_Type'] != 'BENIGN').astype('int64')

# Identifier / non-feature columns; errors='ignore' because not every one of
# these columns is necessarily present in the loaded files.
features_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Attack_Type']
df_clean = df_raw.drop(columns=features_to_drop, errors='ignore')

# Separate Features (X) and Target (Y)
X = df_clean.drop(columns=['Binary_Label']).astype('float32')
Y = df_clean['Binary_Label']

# Save the final processed data for the deployment script
X.to_csv('combined_real_traffic_features.csv', index=False)
print(f"\nReal data processing complete. Total rows: {len(X)}. Features (X) shape: {X.shape}. **Now run the training cell!**")

In [None]:
# 1. Split Data into Training and Testing Sets
# Hold out 30% of the rows to evaluate the model on unseen traffic.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# 2. Filter for Normal Traffic (Crucial for Anomaly Detection)
# The model is fitted only on rows labelled 0 (normal traffic).
X_train_normal = X_train[Y_train == 0]
if len(X_train_normal) == 0:
    # Fail early with a clear message instead of an error from inside fit().
    raise ValueError("No 'Normal' (label 0) samples in the training split; cannot fit the anomaly detector.")
print(f"Training on {len(X_train_normal)} 'Normal' samples...")

# 3. Model Initialization and Training
# NOTE(review): contamination sets the score threshold from the training data.
# Because training uses normal-only rows, 0.1 means roughly 10% of those normal
# rows will themselves be scored as anomalies -- confirm this false-positive
# budget is intended for the real dataset.
model_if = IsolationForest(
    n_estimators=100,
    contamination=0.1,
    random_state=42,
    n_jobs=-1  # Uses all CPU cores for speed
)

print("Starting Isolation Forest training...")
model_if.fit(X_train_normal)

# 4. Model Evaluation
# Predict on the full test set (normal and attack rows together)
Y_pred_if = model_if.predict(X_test)

# Convert Isolation Forest output (1=Normal, -1=Anomaly) to Binary (0=Normal, 1=Attack)
Y_pred_binary = np.where(Y_pred_if == -1, 1, 0)

# Display Performance Metrics
print("\n--- Model Performance (Anomaly Detector) ---")
print(f"Overall Accuracy: {accuracy_score(Y_test, Y_pred_binary):.4f}")
print("Classification Report (Focus on Recall for 'Attack'):")
# labels is given explicitly so the two target_names always line up with the
# classes, even if only one class shows up in Y_test and the predictions.
print(classification_report(
    Y_test,
    Y_pred_binary,
    labels=[0, 1],
    target_names=['Normal (0)', 'Attack (1)'],
))

# 5. Save the Trained Model
# Persist the fitted model to disk so it can be loaded outside this notebook
# (the original note names 'realtime_detector.py' as the consumer).
joblib.dump(model_if, 'nids_anomaly_detector_model.pkl')
print("\nModel saved successfully as 'nids_anomaly_detector_model.pkl'")