# # Network_Threat_Analysis_System
# ## Comprehensive Feature Extraction and Machine Learning Framework
# 
# This notebook integrates three analysis approaches:
# 1. Statistical flow analysis
# 2. Semantic content analysis  
# 3. Visual pattern generation
#
# Output includes both human-readable reports and ML-ready datasets.


# ### Step 1: Environment Setup and Dependency Installation

# ### Step 1: Setup and Imports


In [None]:
# Cell 1: Setup and Imports - Complete Environment Initialization
"""
PURPOSE: Initialize the complete analysis environment with all required libraries
This cell sets up the entire working environment for network packet analysis.

WHAT THIS CELL DOES:
1. Imports all necessary Python libraries for:
   - Network packet parsing (scapy, pyshark)
   - Data processing (pandas, numpy)
   - Machine learning (sklearn, xgboost)
   - Visualization (plotly, matplotlib, seaborn)
   - System monitoring (psutil for memory tracking)
   - File operations (h5py for efficient storage)

2. Configures display settings for better readability
3. Shows system information (total RAM, available memory)
4. Timestamps when analysis starts

WHY THESE LIBRARIES:
- scapy/pyshark: Parse PCAP files and extract packet information
- pandas/numpy: Handle large datasets efficiently with DataFrames
- sklearn: Provides ML algorithms and feature selection tools
- SGDClassifier: Enables incremental learning for large datasets
- xgboost: High-performance gradient boosting for classification
- plotly: Creates interactive visualizations
- psutil: Monitors memory usage to prevent crashes
- h5py: Enables disk-based storage for datasets larger than RAM
"""

import os
import gc
import warnings
import json
import pickle
import hashlib
import re
import h5py
import psutil
from datetime import datetime
from collections import defaultdict, Counter, deque
from typing import Dict, List, Tuple, Optional, Any, Generator

# Data processing libraries
import numpy as np
import pandas as pd
from scipy import stats

# Machine Learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier  # For incremental learning when data doesn't fit in memory
import xgboost as xgb

# Network analysis libraries
from scapy.all import *  # For packet parsing
import pyshark  # Alternative packet parser with better protocol support

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots

# Jupyter notebook specific
from IPython.display import display, clear_output, HTML
from tqdm.notebook import tqdm  # Progress bars for loops

# Interactive widgets for configuration
import ipywidgets as widgets

# Notebook-wide behaviour: silence warnings and widen pandas' DataFrame display limits
warnings.filterwarnings(action='ignore')  # Keep cell output free of library warnings
pd.options.display.max_columns = 50  # Show more columns in DataFrame display
pd.options.display.max_rows = 100  # Show more rows in DataFrame display

# Startup banner: memory figures are converted from bytes to GB (1024**3 bytes)
print("\n".join([
    "=" * 70,
    "MEMORY-OPTIMIZED NETWORK ANALYSIS PIPELINE",
    "=" * 70,
    f"System Memory: {psutil.virtual_memory().total / (1024**3):.1f} GB",
    f"Available Memory: {psutil.virtual_memory().available / (1024**3):.1f} GB",
    f"CPU Cores: {psutil.cpu_count()}",
    f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
]))

# ### Step 2: Configuration Interface

In [None]:
# Cell 2: Configuration Class - Interactive Settings & Memory Management
"""
PURPOSE: Central configuration hub for all analysis parameters
This cell creates an interactive configuration interface using Jupyter widgets.

WHAT THIS CELL DOES:
1. Defines all configurable parameters for the analysis pipeline
2. Creates an interactive UI with sliders, text boxes, and checkboxes
3. Automatically determines optimal processing strategy based on file size
4. Validates user inputs and checks system resources
5. Sets up directory structure for outputs and temporary files

KEY FUNCTIONALITY:
- File Management: Handles PCAP input files and CICIDS label CSV files
- Memory Management: Sets limits on RAM usage and flow storage
- Processing Strategy: Chooses between disk-based or memory-based processing
- ML Configuration: Selects which models to train and sampling strategy
- Feature Settings: Determines number of features and data types to use

CONFIGURATION CATEGORIES:
1. Input/Output: PCAP file, label files, output directory
2. Memory Settings: MAX_MEMORY_GB, MAX_FLOWS_IN_MEMORY, batch sizes
3. Processing: Chunk sizes, flow timeouts, packet limits
4. Analysis Modes: Flow analysis, semantic analysis, NLP depth
5. ML Settings: Model selection, train/test split, sampling
6. Storage: HDF5 usage, compression, temporary file cleanup

The class uses class methods (@classmethod) so configuration is global
and accessible throughout the entire pipeline.
"""

class Config:
    """
    Global, class-level configuration for the analysis pipeline.

    Every setting is a class attribute that is read and written through
    ``Config.<NAME>``; the class is used as a namespace and is not meant to
    be instantiated.  The defaults below seed the widgets created by
    :meth:`setup_interactive_ui` and are overwritten when its validate
    button is clicked.
    """

    # ============= FILE PATHS =============
    PCAP_FILE = ''  # Path to input PCAP file (10+ GB for CICIDS2017)
    LABEL_FILE = ''  # Single label file path (will be set if multiple CSVs combined)
    LABEL_FILES = []  # List of CICIDS2017 CSV label files
    OUTPUT_DIR = './analysis_output'  # Where to save results

    # ============= MEMORY OPTIMIZATION =============
    MAX_MEMORY_GB = 4.0  # Maximum RAM to use (prevents system freeze)
    MAX_FLOWS_IN_MEMORY = 50000  # Flows kept in RAM before disk flush
    USE_DISK_CACHE = True  # Use HDF5 disk storage for large files
    TEMP_DIR = './temp_processing'  # Temporary storage location

    # ============= PROCESSING PARAMETERS =============
    CHUNK_SIZE = 10000  # Packets processed at once (balance speed vs memory)
    BATCH_SIZE = 100000  # Flows per ML training batch
    MAX_PACKETS = 0  # Limit packets to process (0 = unlimited)
    FLOW_TIMEOUT = 120  # Seconds before flow considered complete

    # ============= SAMPLING STRATEGY =============
    USE_SAMPLING = True  # Sample data if too large for ML
    SAMPLE_SIZE = 500000  # Maximum flows for ML training
    STRATIFY_SAMPLE = True  # Maintain attack type distribution in sample

    # ============= ANALYSIS MODES =============
    ANALYSIS_MODE = 'combined'  # Options: 'flow', 'semantic', 'combined'
    DEEP_INSPECTION = True  # Enable NLP payload analysis (slower but better detection)
    USE_CICIDS_LABELS = False  # Whether to use CICIDS ground truth
    GENERATE_VISUALS = True  # Create visualization charts
    ML_EXPORT = True  # Export ML models and features
    SKIP_ML = False  # Skip ML training for pipeline testing

    # ============= FEATURE ENGINEERING =============
    TOP_FEATURES = 30  # Number of best features to select
    FEATURE_DTYPE = np.float32  # Data type (float32 saves memory vs float64)

    # ============= MACHINE LEARNING =============
    TEST_SIZE = 0.2  # Fraction of data for testing
    RANDOM_STATE = 42  # Random seed for reproducibility
    SELECTED_MODELS = ['sgd', 'xgboost_incremental']  # Memory-efficient models
    USE_INCREMENTAL_LEARNING = True  # Train in batches for large datasets

    # ============= STORAGE SETTINGS =============
    USE_HDF5 = True  # Use HDF5 format for efficient disk storage
    COMPRESSION = 'gzip'  # Compress data to save space
    CLEANUP_TEMP = True  # Delete temporary files after completion

    # ============= INPUT HELPERS =============
    @staticmethod
    def _clean_path(raw):
        """Return *raw* without surrounding whitespace or copy-pasted quote characters."""
        return raw.strip().strip('"\'').strip()

    @classmethod
    def _split_paths(cls, text):
        """Split a multi-line text box value into cleaned, non-empty paths (one per line)."""
        cleaned = (cls._clean_path(line) for line in text.splitlines())
        return [path for path in cleaned if path]

    @staticmethod
    def _clamp(value, lower, upper):
        """Limit *value* to the inclusive range [lower, upper] so it is a legal slider value."""
        return max(lower, min(value, upper))

    @classmethod
    def _apply_label_files(cls, candidates):
        """
        Record which of the candidate label CSVs exist.

        ``LABEL_FILES`` is replaced by the existing files and
        ``USE_CICIDS_LABELS`` is enabled only when at least one was found, so
        a failed or emptied selection never leaves stale labels switched on.

        Args:
            candidates: Iterable of CSV paths entered by the user.

        Returns:
            list: The subset of *candidates* that are existing files.
        """
        found = []
        for path in candidates:
            if os.path.isfile(path):
                found.append(path)
                print(f"Found: {os.path.basename(path)}")
            else:
                print(f"Warning: Not found: {path}")

        if candidates and not found:
            print("Warning: no usable label files - ground-truth labels disabled")

        cls.LABEL_FILES = found
        cls.USE_CICIDS_LABELS = bool(found)
        return found

    @classmethod
    def check_memory_requirements(cls):
        """
        Compare the PCAP size with the currently available RAM and pick a strategy.

        When the file is larger than 30% of the available RAM the disk cache,
        incremental learning and sampling flags are switched on.  A summary
        and a rough time estimate are printed.

        Returns:
            str | bool: 'DISK_BASED' for large files, 'MEMORY_BASED' for small
            files, or ``False`` when ``PCAP_FILE`` is unset or is not a file.
        """
        if not cls.PCAP_FILE or not os.path.isfile(cls.PCAP_FILE):
            return False

        file_size_gb = os.path.getsize(cls.PCAP_FILE) / (1024**3)
        available_gb = psutil.virtual_memory().available / (1024**3)

        print("\n" + "="*60)
        print("MEMORY ASSESSMENT")
        print("="*60)
        print(f"PCAP Size: {file_size_gb:.2f} GB")
        print(f"Available RAM: {available_gb:.2f} GB")
        print(f"Max Memory Setting: {cls.MAX_MEMORY_GB:.2f} GB")

        # Decision logic for processing strategy
        if file_size_gb > available_gb * 0.3:  # File is >30% of available RAM
            print("\nRECOMMENDATION: Large file detected")
            print("- Using disk-based processing")
            print("- Enabling flow timeout mechanism")
            print("- Using incremental learning")
            cls.USE_DISK_CACHE = True
            cls.USE_INCREMENTAL_LEARNING = True
            cls.USE_SAMPLING = True
            processing_strategy = "DISK_BASED"
        else:
            print("\nRECOMMENDATION: File can fit in memory")
            print("- Using hybrid processing")
            processing_strategy = "MEMORY_BASED"

        # Time estimation (empirical: 1GB ≈ 3-5 minutes)
        estimated_time = file_size_gb * (5 if cls.USE_DISK_CACHE else 3)
        print(f"\nEstimated Processing Time: {estimated_time:.0f} minutes")

        return processing_strategy

    @classmethod
    def setup_interactive_ui(cls):
        """
        Build and display the ipywidgets configuration form.

        The widgets start from the current class attribute values.  Clicking
        the validate button copies the widget values back onto the class,
        checks the PCAP path, runs :meth:`check_memory_requirements`,
        registers the label files, creates the output/temp directories and
        prints a summary.

        Returns:
            widgets.Output: The output area the validation messages are written to.
        """
        display(HTML("<h2>Memory-Optimized Network Analysis Configuration</h2>"))

        # ========== MEMORY SETTINGS SECTION ==========
        display(HTML("<h3>Memory Optimization Settings</h3>"))

        # The slider needs max >= min, so never let the ceiling drop below the 1 GB floor
        ram_ceiling_gb = max(psutil.virtual_memory().total / (1024**3), 1.0)
        memory_slider = widgets.FloatSlider(
            value=cls._clamp(cls.MAX_MEMORY_GB, 1.0, ram_ceiling_gb),
            min=1.0,
            max=ram_ceiling_gb,
            step=0.5,
            description='Max RAM (GB):',
            style={'description_width': 'initial'},
            readout_format='.1f'
        )

        max_flows = widgets.IntText(
            value=cls.MAX_FLOWS_IN_MEMORY,
            description='Max Flows in Memory:',
            tooltip='Flows to keep before flushing to disk',
            style={'description_width': 'initial'}
        )

        use_sampling = widgets.Checkbox(
            value=cls.USE_SAMPLING,
            description='Use Sampling for ML (recommended for large files)',
            indent=False
        )

        sample_size = widgets.IntText(
            value=cls.SAMPLE_SIZE,
            description='Sample Size:',
            disabled=not use_sampling.value,
            style={'description_width': 'initial'}
        )

        # Skip ML checkbox for testing
        skip_ml = widgets.Checkbox(
            value=cls.SKIP_ML,
            description='Skip ML Training (for testing pipeline)',
            indent=False,
            style={'description_width': 'initial'}
        )

        # Dynamic UI: Enable/disable sample size based on checkbox
        def toggle_sample_size(change):
            sample_size.disabled = not change['new']
        use_sampling.observe(toggle_sample_size, names='value')

        # ========== FILE SELECTION SECTION ==========
        pcap_input = widgets.Text(
            value=cls.PCAP_FILE,
            placeholder='/path/to/your.pcap',
            description='PCAP File:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='70%')
        )

        label_input = widgets.Textarea(
            value='\n'.join(cls.LABEL_FILES),
            placeholder='Enter CSV paths (one per line) or leave empty',
            description='Label Files:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='70%', height='80px')
        )

        output_dir = widgets.Text(
            value=cls.OUTPUT_DIR,
            description='Output Dir:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='70%')
        )

        # ========== PROCESSING OPTIONS ==========
        chunk_slider = widgets.IntSlider(
            value=cls._clamp(cls.CHUNK_SIZE, 1000, 50000),
            min=1000,
            max=50000,
            step=1000,
            description='Chunk Size:',
            continuous_update=False,
            tooltip='Packets processed at once',
            style={'description_width': 'initial'}
        )

        batch_slider = widgets.IntSlider(
            value=cls._clamp(cls.BATCH_SIZE, 10000, 500000),
            min=10000,
            max=500000,
            step=10000,
            description='Batch Size:',
            continuous_update=False,
            tooltip='Flows per ML batch',
            style={'description_width': 'initial'}
        )

        # ========== ANALYSIS OPTIONS ==========
        mode_options = ['flow', 'semantic', 'combined']
        analysis_mode = widgets.RadioButtons(
            options=mode_options,
            # RadioButtons rejects a value that is not one of its options
            value=cls.ANALYSIS_MODE if cls.ANALYSIS_MODE in mode_options else 'combined',
            description='Analysis Mode:',
            style={'description_width': 'initial'}
        )

        deep_inspection = widgets.Checkbox(
            value=cls.DEEP_INSPECTION,
            description='Enable NLP Deep Inspection',
            tooltip='May increase processing time',
            indent=False
        )

        # ========== ML MODEL SELECTION ==========
        model_options = ['sgd', 'xgboost_incremental', 'minibatch_kmeans']
        model_selector = widgets.SelectMultiple(
            options=model_options,
            # Only pre-select models the widget actually offers
            value=tuple(m for m in cls.SELECTED_MODELS if m in model_options),
            description='ML Models:',
            tooltip='Memory-efficient models for large datasets',
            rows=3,
            style={'description_width': 'initial'},
            disabled=skip_ml.value
        )

        # Disable models when ML is skipped
        def toggle_models(change):
            model_selector.disabled = change['new']
        skip_ml.observe(toggle_models, names='value')

        # Progress output area
        output_area = widgets.Output()

        # ========== VALIDATE BUTTON ==========
        validate_btn = widgets.Button(
            description='Validate & Analyze Memory',
            button_style='success',
            layout=widgets.Layout(width='250px', height='40px')
        )

        def validate_config(b):
            """
            Callback function executed when validate button is clicked.
            Validates all inputs and sets configuration values.
            """
            with output_area:
                clear_output()

                # Reject nonsensical sizes before anything is written to the class
                if max_flows.value <= 0:
                    print("Error: 'Max Flows in Memory' must be a positive number")
                    return
                if use_sampling.value and sample_size.value <= 0:
                    print("Error: 'Sample Size' must be a positive number")
                    return

                # Get values from all widgets
                cls.PCAP_FILE = cls._clean_path(pcap_input.value)
                # A blank box keeps the previous directory instead of breaking makedirs('')
                cls.OUTPUT_DIR = (cls._clean_path(output_dir.value)
                                  or cls.OUTPUT_DIR
                                  or './analysis_output')
                cls.MAX_MEMORY_GB = memory_slider.value
                cls.MAX_FLOWS_IN_MEMORY = max_flows.value
                cls.CHUNK_SIZE = chunk_slider.value
                cls.BATCH_SIZE = batch_slider.value
                cls.USE_SAMPLING = use_sampling.value
                cls.SAMPLE_SIZE = sample_size.value
                cls.ANALYSIS_MODE = analysis_mode.value
                cls.DEEP_INSPECTION = deep_inspection.value
                cls.SELECTED_MODELS = list(model_selector.value)
                cls.SKIP_ML = skip_ml.value

                # Process CSV label files
                csv_files = cls._split_paths(label_input.value)

                # Validate PCAP exists (a directory does not count)
                if not os.path.isfile(cls.PCAP_FILE):
                    print(f"Error: PCAP file not found: {cls.PCAP_FILE}")
                    return

                # Analyze memory requirements
                strategy = cls.check_memory_requirements()

                # Validate CSV files; also clears labels left over from an earlier click
                cls._apply_label_files(csv_files)

                # Create required directories
                os.makedirs(cls.OUTPUT_DIR, exist_ok=True)
                os.makedirs(cls.TEMP_DIR, exist_ok=True)

                # Display final configuration
                cls.display_config()

                print(f"\nProcessing Strategy: {strategy}")
                if cls.SKIP_ML:
                    print("ML TRAINING WILL BE SKIPPED (Test Mode)")
                print("Ready to start analysis!")

        validate_btn.on_click(validate_config)

        # ========== LAYOUT ORGANIZATION ==========
        memory_box = widgets.VBox([
            widgets.HTML("<h4>Memory Settings</h4>"),
            memory_slider,
            max_flows,
            use_sampling,
            sample_size
        ])

        file_box = widgets.VBox([
            widgets.HTML("<h4>Input/Output Files</h4>"),
            pcap_input,
            label_input,
            output_dir
        ])

        processing_box = widgets.VBox([
            widgets.HTML("<h4>Processing Settings</h4>"),
            chunk_slider,
            batch_slider,
            analysis_mode,
            deep_inspection,
            skip_ml
        ])

        model_box = widgets.VBox([
            widgets.HTML("<h4>ML Models (Memory-Efficient)</h4>"),
            model_selector
        ])

        # Display complete interface
        display(widgets.VBox([
            memory_box,
            widgets.HTML("<hr>"),
            file_box,
            widgets.HTML("<hr>"),
            processing_box,
            widgets.HTML("<hr>"),
            model_box,
            widgets.HTML("<hr>"),
            validate_btn,
            output_area
        ]))

        return output_area

    @classmethod
    def display_config(cls):
        """Print a summary of the current configuration values."""
        print("\n" + "="*60)
        print("CONFIGURATION SUMMARY")
        print("="*60)
        print(f"PCAP: {os.path.basename(cls.PCAP_FILE)}")
        print(f"Output: {cls.OUTPUT_DIR}")
        print(f"Memory Limit: {cls.MAX_MEMORY_GB} GB")
        print(f"Processing Strategy: {'Disk-based' if cls.USE_DISK_CACHE else 'Memory-based'}")
        print(f"Chunk Size: {cls.CHUNK_SIZE:,} packets")
        print(f"Batch Size: {cls.BATCH_SIZE:,} flows")
        if cls.USE_SAMPLING:
            print(f"ML Sample Size: {cls.SAMPLE_SIZE:,} flows")
        if cls.SKIP_ML:
            print("ML Training: DISABLED (Test Mode)")
        print("="*60)

# Run the interactive UI: builds and displays the configuration form when this cell
# executes. The returned Output widget (where validation messages are printed) is
# kept in `output`.
output = Config.setup_interactive_ui()

# ### Step 2.5 (Optional): Test Dataset Preparation (for Testing & Debugging). Use this to create a smaller PCAP file and matching CSV files to save time.

In [None]:
# Cell 2.5: Test Dataset Preparation - Quick Validation Before Full Run
"""
PURPOSE: Create small test datasets to validate pipeline functionality
This cell helps you test the entire pipeline in minutes instead of hours.

WHAT THIS CELL DOES:
1. Creates a smaller PCAP file from your main file
2. Extracts corresponding CSV label rows  
3. Provides UI to customize test size
4. Validates pipeline works before committing to full analysis

WHY THIS MATTERS:
- Catches errors in 5 minutes instead of 4 hours
- Validates file locking fixes work
- Tests memory settings are appropriate
- Confirms CSV label matching functions
"""

import subprocess
import pandas as pd
import os
from IPython.display import display, HTML
import ipywidgets as widgets

class TestDatasetCreator:
    """
    Creates small PCAP/CSV samples so the pipeline can be tried out quickly.

    All members are static; the class only groups the UI builder and its
    helpers and is never instantiated.
    """

    @staticmethod
    def _clean_path(raw):
        """Return *raw* without surrounding whitespace or copy-pasted quote characters."""
        return raw.strip().strip('"\'').strip()

    @staticmethod
    def _tcpdump_command(source_pcap, output_pcap, packet_count):
        """
        Build the tcpdump argument list that copies the first packets of a capture.

        An argument list (instead of one shell string) means paths containing
        spaces, quotes or shell metacharacters are passed through literally.
        """
        return ['tcpdump', '-r', source_pcap, '-w', output_pcap, '-c', str(packet_count)]

    @staticmethod
    def _find_label_column(columns):
        """Return the first column whose name is 'label' ignoring case and padding, else None."""
        for column in columns:
            if str(column).strip().lower() == 'label':
                return column
        return None

    @staticmethod
    def _extract_with_tcpdump(source_pcap, output_pcap, packet_count):
        """
        Try to create the test PCAP with the external tcpdump tool.

        Returns:
            bool: True only when tcpdump exited successfully and the output
            file exists; a leftover file from an earlier run is therefore not
            mistaken for success.
        """
        command = TestDatasetCreator._tcpdump_command(source_pcap, output_pcap, packet_count)
        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError as e:  # tcpdump is not installed or cannot be executed
            print(f"   Error creating PCAP: {e}")
            return False

        if result.returncode != 0:
            stderr_lines = result.stderr.strip().splitlines()
            if stderr_lines:
                print(f"   tcpdump error: {stderr_lines[-1]}")
            return False

        return os.path.exists(output_pcap)

    @staticmethod
    def _write_csv_subset(csv_path, test_dir, num_rows):
        """
        Copy the first *num_rows* data rows of a label CSV into *test_dir*.

        The copy is named ``test_<original name>``.  When a label column is
        present its five most frequent values are printed.

        Returns:
            str: Path of the CSV that was written.
        """
        csv_name = os.path.basename(csv_path)
        test_csv = os.path.join(test_dir, f'test_{csv_name}')

        df = pd.read_csv(csv_path, encoding='latin-1', nrows=num_rows)
        df.to_csv(test_csv, index=False)
        print(f"\n   ✓ Created: {test_csv}")

        # Show label distribution
        label_col = TestDatasetCreator._find_label_column(df.columns)
        if label_col is not None:
            print(f"   Label distribution:")
            for label, count in df[label_col].value_counts().head(5).items():
                print(f"      {label}: {count}")

        return test_csv

    @staticmethod
    def create_test_ui():
        """
        Creates an interactive UI for generating test datasets.

        The form is pre-filled from ``Config.PCAP_FILE`` / ``Config.LABEL_FILES``.
        Clicking the button writes a truncated PCAP and truncated label CSVs
        into the chosen directory and prints the matching Config assignments.
        """
        display(HTML("<h2>Test Dataset Generator</h2>"))
        display(HTML("<p>Create small test files to validate pipeline before full run</p>"))

        # Packet count slider
        packet_count = widgets.IntSlider(
            value=100000,
            min=10000,
            max=500000,
            step=10000,
            description='Test Packets:',
            style={'description_width': 'initial'},
            tooltip='Number of packets for test PCAP'
        )

        # CSV row count
        csv_rows = widgets.IntText(
            value=5000,
            description='CSV Rows:',
            style={'description_width': 'initial'},
            tooltip='Number of label rows to extract'
        )

        # Source file input
        source_pcap = widgets.Text(
            value=Config.PCAP_FILE or '',
            description='Source PCAP:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='70%')
        )

        # Source CSV input
        source_csv = widgets.Textarea(
            value='\n'.join(Config.LABEL_FILES) if Config.LABEL_FILES else '',
            description='Source CSVs:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='70%', height='80px'),
            placeholder='Enter CSV paths (one per line)'
        )

        # Output directory
        test_dir = widgets.Text(
            value='./test_data',
            description='Test Directory:',
            style={'description_width': 'initial'}
        )

        # Progress output
        output_area = widgets.Output()

        # Create button
        create_btn = widgets.Button(
            description='Create Test Datasets',
            button_style='primary',
            layout=widgets.Layout(width='200px', height='40px')
        )

        def create_test_data(b):
            """Button callback: build the test PCAP and CSVs and report the result."""
            with output_area:
                clear_output()

                # Get values
                clean = TestDatasetCreator._clean_path
                pcap_path = clean(source_pcap.value)
                csv_paths = [p for p in (clean(line) for line in source_csv.value.splitlines()) if p]
                test_path = clean(test_dir.value) or './test_data'
                num_packets = packet_count.value
                num_rows = csv_rows.value

                # Validate inputs
                if not os.path.isfile(pcap_path):
                    print(f"❌ Error: PCAP file not found: {pcap_path}")
                    return
                if csv_paths and num_rows <= 0:
                    print(f"❌ Error: CSV Rows must be a positive number (got {num_rows})")
                    return

                # Create test directory
                os.makedirs(test_path, exist_ok=True)

                print("="*60)
                print("CREATING TEST DATASETS")
                print("="*60)

                # Create test PCAP
                test_pcap = os.path.join(test_path, f'test_{num_packets}_packets.pcap')
                print(f"\n1. Creating test PCAP with {num_packets:,} packets...")

                if TestDatasetCreator._extract_with_tcpdump(pcap_path, test_pcap, num_packets):
                    size_mb = os.path.getsize(test_pcap) / (1024*1024)
                    print(f"   ✓ Created: {test_pcap}")
                    print(f"   Size: {size_mb:.2f} MB")
                    pcap_ok = True
                else:
                    # Fallback to Python method if tcpdump fails
                    print("   tcpdump failed, using Python method...")
                    pcap_ok = TestDatasetCreator.create_test_pcap_python(
                        pcap_path, test_pcap, num_packets
                    )

                if not pcap_ok:
                    # Without a PCAP the printed configuration would point at a missing file
                    print("\n❌ Test PCAP could not be created; see the errors above")
                    return

                # Create test CSVs
                test_csvs = []
                if csv_paths:
                    print(f"\n2. Creating test CSVs with {num_rows:,} rows each...")

                    for csv_path in csv_paths:
                        if not os.path.exists(csv_path):
                            print(f"   ⚠️  CSV not found: {csv_path}")
                            continue

                        try:
                            test_csvs.append(
                                TestDatasetCreator._write_csv_subset(csv_path, test_path, num_rows)
                            )
                        except Exception as e:
                            print(f"   Error processing {os.path.basename(csv_path)}: {e}")
                else:
                    print("\n2. No CSV files provided, skipping label extraction")

                # Generate configuration code
                print("\n" + "="*60)
                print("TEST CONFIGURATION")
                print("="*60)
                print("\nAdd this to your Config or use directly:\n")
                print(f"Config.PCAP_FILE = r'{test_pcap}'")
                print(f"Config.LABEL_FILES = {test_csvs}")
                print(f"Config.MAX_PACKETS = 0  # Process all packets in test file")
                print(f"Config.SAMPLE_SIZE = {min(50000, num_packets // 2)}")

                # Estimate time
                est_time = num_packets / 20000  # ~20K packets per minute
                print(f"\nEstimated test time: {est_time:.1f} minutes")
                print("\n✅ Test datasets created successfully!")
                print("   Run the pipeline with these files to validate before full analysis")

        create_btn.on_click(create_test_data)

        # Layout
        display(widgets.VBox([
            widgets.HTML("<h4>Test Size Configuration</h4>"),
            packet_count,
            csv_rows,
            widgets.HTML("<hr>"),
            widgets.HTML("<h4>Source Files</h4>"),
            source_pcap,
            source_csv,
            widgets.HTML("<hr>"),
            widgets.HTML("<h4>Output Location</h4>"),
            test_dir,
            widgets.HTML("<hr>"),
            create_btn,
            output_area
        ]))

    @staticmethod
    def create_test_pcap_python(source_pcap, output_pcap, packet_count):
        """
        Python fallback method to create test PCAP if tcpdump unavailable.

        Copies at most *packet_count* packets from *source_pcap* to
        *output_pcap* using scapy's streaming reader and writer.

        Returns:
            bool: True when the output file exists afterwards, False when
            scapy could not be imported or the copy raised an error.
        """
        try:
            # Imported inside the try so a missing scapy is reported like any other failure
            from scapy.all import PcapReader, PcapWriter

            with PcapReader(source_pcap) as reader:
                with PcapWriter(output_pcap) as writer:
                    for i, packet in enumerate(reader):
                        if i >= packet_count:
                            break
                        writer.write(packet)

            if os.path.exists(output_pcap):
                size_mb = os.path.getsize(output_pcap) / (1024*1024)
                print(f"   ✓ Created using Python: {output_pcap}")
                print(f"   Size: {size_mb:.2f} MB")
                return True
            return False
        except Exception as e:
            print(f"   Error with Python method: {e}")
            return False

# Run the UI: renders the test-dataset generator form when this cell executes.
# The call returns nothing; all feedback is printed inside the form's own output area.
TestDatasetCreator.create_test_ui()

# ### Step 3: Flow Feature Extractor Implementation

In [None]:
# Cell 3: Flow Feature Extractor - Network Traffic Statistical Analysis
"""
PURPOSE: Extract statistical features from network flows
This class analyzes network packets and groups them into flows, then extracts
statistical features that help identify malicious traffic patterns.

WHAT THIS CELL DOES:
1. Groups packets into bidirectional flows (conversations between hosts)
2. Extracts per-packet features (size, flags, ports, timing)
3. Aggregates packet features into flow-level statistics
4. Implements CICFlowMeter-style feature extraction
5. Manages memory by flushing old flows to disk

KEY CONCEPTS:
- FLOW: A sequence of packets between two endpoints (identified by 5-tuple)
- 5-TUPLE: (src_ip, src_port, dst_ip, dst_port, protocol)
- BIDIRECTIONAL: Treats A→B and B→A as the same flow
- FLOW TIMEOUT: After 120 seconds of inactivity, flow is considered complete

FEATURES EXTRACTED:
1. Timing Features:
   - Flow duration
   - Inter-arrival times (IAT) statistics
   - Packets/bytes per second

2. Size Features:
   - Packet length statistics (min, max, mean, std)
   - Total bytes/packets
   - Payload sizes

3. TCP Flag Features:
   - SYN, ACK, FIN, RST, PSH counts
   - Used to detect scanning, flooding attacks

4. Protocol Features:
   - TTL values
   - Port numbers
   - Protocol type

MEMORY OPTIMIZATION:
- Flushes flows to HDF5 when memory limit reached
- Uses flow timeout to prevent infinite accumulation
- Stores features as float32 instead of float64
"""

class MemoryOptimizedFlowExtractor:
    def __init__(self, max_flows_in_memory=50000, flow_timeout=120):
        """
        Set up in-memory flow tracking and the on-disk HDF5 feature store.

        Args:
            max_flows_in_memory: Number of concurrently tracked flows at which
                process_pcap() starts flushing flows to disk.
            flow_timeout: Idle time in seconds after which a flow is treated
                as complete.

        Note:
            Creating an instance opens ``flow_features.h5`` under
            Config.TEMP_DIR with mode='w', which replaces any existing file
            of that name.
        """
        # Limits that drive the flushing decisions
        self.flow_timeout = flow_timeout
        self.max_flows_in_memory = max_flows_in_memory

        # Bookkeeping: flow_id -> list of per-packet feature dicts, plus the
        # running totals of flows seen and batches written to disk
        self.flows = {}
        self.flow_counter = 0
        self.batch_counter = 0

        # Persistent, compressed storage for aggregated flow features
        store_path = os.path.join(Config.TEMP_DIR, 'flow_features.h5')
        self.h5_filename = store_path
        self.h5_store = pd.HDFStore(store_path, mode='w', complevel=6)
        
    def get_flow_id(self, packet):
        """
        Build a direction-independent identifier for the flow of a packet.

        The identifier is derived from the 5-tuple (both IP addresses, both
        ports, IP protocol number). The two (ip, port) endpoints are sorted
        before hashing so that A→B and B→A packets map to the same flow.
        Ports come from the TCP or UDP layer and are 0 for any other IP
        protocol. This method does not read any instance state.

        Args:
            packet: Parsed packet supporting ``LAYER in packet`` and
                ``packet[LAYER]`` lookups for the IP/TCP/UDP layers.

        Returns:
            str | None: First 16 hex characters of the MD5 digest of the
            canonical flow tuple, or None when the packet has no IP layer or
            its fields cannot be read.
        """
        try:
            if IP not in packet:
                return None

            ip_layer = packet[IP]

            # Ports only exist for TCP/UDP; everything else keys on 0/0
            sport = dport = 0
            if TCP in packet:
                transport = packet[TCP]
                sport, dport = transport.sport, transport.dport
            elif UDP in packet:
                transport = packet[UDP]
                sport, dport = transport.sport, transport.dport

            # Sorting the endpoints makes the key independent of direction
            endpoints = sorted([(ip_layer.src, sport), (ip_layer.dst, dport)])
            flow_tuple = tuple(endpoints) + (ip_layer.proto,)

            # MD5 is used purely as a compact lookup key, not for security
            return hashlib.md5(str(flow_tuple).encode()).hexdigest()[:16]
        except Exception:
            # Best effort: a malformed packet simply gets no flow id. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so a long-running extraction can still be interrupted.
            return None
    
    def extract_packet_features(self, packet):
        """
        Extract the per-packet features that are later aggregated per flow.

        Features (each present only when the corresponding layer exists):
        - Basic: timestamp, packet_length
        - IP: ttl, protocol
        - TCP: tcp_flags, window_size, src_port, dst_port and one boolean per
          SYN/ACK/FIN/RST/PSH flag
        - UDP: src_port, dst_port
        - Payload: payload_size and payload_entropy (always present, 0 when
          the packet carries no Raw layer)

        Args:
            packet: Parsed packet exposing ``time``, ``len()`` and layer
                lookups for IP/TCP/UDP/Raw.

        Returns:
            dict: Feature name -> value. Extraction is best effort: if reading
            a field raises, the features collected up to that point are
            returned, so callers must not assume every key is present.
        """
        # Payload defaults are set up front so these two keys exist even when
        # parsing stops early.
        features = {'payload_size': 0, 'payload_entropy': 0}

        try:
            # Basic packet features
            features['timestamp'] = float(packet.time)
            features['packet_length'] = len(packet)

            # IP layer features
            if IP in packet:
                ip_layer = packet[IP]
                features['ttl'] = ip_layer.ttl
                features['protocol'] = ip_layer.proto

            # Transport layer features
            if TCP in packet:
                tcp_layer = packet[TCP]
                # Convert the flag field once and test bits on the plain int
                flags = int(tcp_layer.flags)
                features['tcp_flags'] = flags
                features['window_size'] = tcp_layer.window
                features['src_port'] = tcp_layer.sport
                features['dst_port'] = tcp_layer.dport

                # Individual flag bits (used for scan/flood detection)
                features['flag_syn'] = bool(flags & 2)   # SYN
                features['flag_ack'] = bool(flags & 16)  # ACK
                features['flag_fin'] = bool(flags & 1)   # FIN
                features['flag_rst'] = bool(flags & 4)   # RST
                features['flag_psh'] = bool(flags & 8)   # PSH
            elif UDP in packet:
                udp_layer = packet[UDP]
                features['src_port'] = udp_layer.sport
                features['dst_port'] = udp_layer.dport

            # Payload features
            if Raw in packet:
                payload = bytes(packet[Raw])
                features['payload_size'] = len(payload)
                features['payload_entropy'] = self.calculate_entropy(payload)
        except Exception:
            # Deliberately best effort: a problematic packet contributes
            # whatever could be read instead of aborting the whole capture.
            pass

        return features
    
    def calculate_entropy(self, data):
        """
        Compute the Shannon entropy of a payload, in bits per byte.

        High entropy suggests encrypted or compressed content; low entropy
        suggests plain text or repetitive data.

        Args:
            data: Payload as a bytes-like object (anything ``bytes()`` accepts
                as a sequence of byte values).

        Returns:
            float: Entropy in the range 0-8; 0 for empty input.
        """
        if not data:
            return 0

        # One vectorised histogram pass instead of 256 separate count() scans
        byte_values = np.frombuffer(bytes(data), dtype=np.uint8)
        counts = np.bincount(byte_values, minlength=256)
        probabilities = counts[counts > 0] / byte_values.size

        # "0.0 - x" rather than unary minus avoids returning -0.0 when the
        # payload consists of a single repeated byte value.
        return 0.0 - float(np.sum(probabilities * np.log2(probabilities)))
    
    def flush_old_flows(self, current_time, force_all=False):
        """
        Save completed flows to disk and free memory.
        A flow is complete if it hasn't seen packets for flow_timeout seconds.
        
        Args:
            current_time: Timestamp of current packet
            force_all: Force flush all flows regardless of timeout
        """
        flows_to_flush = []
        
        # Identify flows that have timed out
        for flow_id, packets in self.flows.items():
            if not packets:
                continue
                
            last_packet_time = packets[-1]['timestamp']
            time_since_last = current_time - last_packet_time
            
            if force_all or time_since_last > self.flow_timeout:
                # Aggregate packet features into flow features
                features = self.aggregate_flow_features(packets)
                if features:
                    features['flow_id'] = flow_id
                    flows_to_flush.append(features)
        
        # Save to disk if we have flows to flush
        if flows_to_flush:
            # Convert to DataFrame
            df_batch = pd.DataFrame(flows_to_flush)
            
            # Optimize memory usage with float32
            for col in df_batch.select_dtypes(include=[np.float64]).columns:
                df_batch[col] = df_batch[col].astype(np.float32)
            
            # Append to HDF5 file
            self.h5_store.append(
                f'batch_{self.batch_counter}',
                df_batch,
                format='table',
                data_columns=True,
                min_itemsize={'flow_id': 16}
            )
            
            self.batch_counter += 1
            
            # Remove flushed flows from memory
            for features in flows_to_flush:
                del self.flows[features['flow_id']]
            
            print(f"  Flushed {len(flows_to_flush):,} flows to disk (batch {self.batch_counter})")
            
            # Force garbage collection to free memory
            gc.collect()
    
    def aggregate_flow_features(self, packets):
        """
        Collapse the per-packet feature dicts of one flow into flow statistics.

        Produces the flow-level record used for ML: duration and
        inter-arrival-time statistics, packet-size statistics, packet/byte
        rates, TCP flag counts, payload statistics, and the TTL, protocol and
        ports taken from the packets. Protocol and ports come from the
        earliest packet of the flow.

        Args:
            packets: List of dicts as produced by extract_packet_features().
                Missing keys are treated as 0; packets without a 'timestamp'
                are counted but ignored for the timing statistics.

        Returns:
            dict | None: Feature name -> numpy.float32 value, or None when
            ``packets`` is empty. Note that float32 represents integers
            exactly only up to 2**24, so very large byte totals are rounded.
        """
        if not packets:
            return None

        # Chronological order; packets without a timestamp sort first
        packets = sorted(packets, key=lambda pkt: pkt.get('timestamp', 0))
        first_packet = packets[0]
        packet_total = len(packets)

        # Timing is computed only from packets that actually carry a timestamp
        # (per-packet extraction is best effort and may omit it).
        timestamps = [pkt['timestamp'] for pkt in packets if 'timestamp' in pkt]
        if len(timestamps) > 1:
            duration = timestamps[-1] - timestamps[0]
            iats = np.diff(timestamps)  # time between consecutive packets
        else:
            duration = 0
            iats = np.zeros(1)

        lengths = [pkt.get('packet_length', 0) for pkt in packets]
        payload_sizes = [pkt.get('payload_size', 0) for pkt in packets]
        byte_total = sum(lengths)

        def count_flag(name):
            """Number of packets in the flow that have the given flag set."""
            return sum(pkt.get(name, 0) for pkt in packets)

        # Key order is kept stable because it defines the stored column order
        features = {
            # ===== BASIC FLOW STATISTICS =====
            'flow_duration': duration,
            'total_packets': packet_total,
            'total_bytes': byte_total,

            # ===== PACKET SIZE STATISTICS =====
            'min_packet_length': min(lengths),
            'max_packet_length': max(lengths),
            'mean_packet_length': np.mean(lengths),
            'std_packet_length': np.std(lengths),

            # ===== INTER-ARRIVAL TIME STATISTICS =====
            'min_iat': np.min(iats),
            'max_iat': np.max(iats),
            'mean_iat': np.mean(iats),
            'std_iat': np.std(iats),

            # ===== FLOW RATE FEATURES =====
            # Single-packet / zero-duration flows get a rate of 0
            'packets_per_second': packet_total / duration if duration > 0 else 0,
            'bytes_per_second': byte_total / duration if duration > 0 else 0,

            # ===== PROTOCOL FEATURES =====
            'avg_ttl': np.mean([pkt.get('ttl', 0) for pkt in packets]),
            'protocol': first_packet.get('protocol', 0),

            # ===== TCP FLAG STATISTICS =====
            'syn_count': count_flag('flag_syn'),  # SYN flood detection
            'ack_count': count_flag('flag_ack'),
            'fin_count': count_flag('flag_fin'),  # Connection termination
            'rst_count': count_flag('flag_rst'),  # Reset attacks
            'psh_count': count_flag('flag_psh'),  # Data push

            # ===== PAYLOAD STATISTICS =====
            'total_payload_bytes': sum(payload_sizes),
            'avg_payload_size': np.mean(payload_sizes),
            'avg_entropy': np.mean([pkt.get('payload_entropy', 0) for pkt in packets]),

            # ===== PORT INFORMATION =====
            'src_port': first_packet.get('src_port', 0),
            'dst_port': first_packet.get('dst_port', 0),
        }

        # Store everything numeric as float32 for memory efficiency. numpy
        # integer scalars are listed explicitly because they are not
        # instances of the built-in int.
        numeric_types = (int, float, np.integer, np.floating)
        return {
            name: np.float32(value) if isinstance(value, numeric_types) else value
            for name, value in features.items()
        }
    
    def process_pcap(self, pcap_file, chunk_size=10000, max_packets=0):
        """
        Stream a PCAP file and write aggregated flow features to HDF5.

        Packets are read one at a time, grouped into flows and flushed to
        disk so that memory use stays bounded:
        1. When the number of tracked flows reaches ``max_flows_in_memory``,
           timed-out flows are flushed; if that frees nothing, all flows are
           flushed (long-lived flows may then be split over several records).
        2. Every ``chunk_size`` packets, timed-out flows are flushed once more
           than ``flow_timeout`` seconds of capture time have passed since the
           previous flush, and everything is flushed if system memory usage
           exceeds 80%.
        3. At the end of the capture all remaining flows are flushed.

        The HDF5 store is closed when this method returns or raises, so an
        instance can process only one capture.

        Args:
            pcap_file: Path to the PCAP file.
            chunk_size: Packet interval between periodic timeout/memory checks.
            max_packets: Stop after this many packets; 0 means no limit.

        Returns:
            str: Path to the HDF5 file containing the extracted features.

        Raises:
            Exception: Any error raised while reading or processing is
                reported and re-raised after the store has been closed.
        """
        print(f"\nProcessing PCAP with memory optimization: {pcap_file}")
        print(f"Max flows in memory: {self.max_flows_in_memory:,}")
        print(f"Flow timeout: {self.flow_timeout} seconds")

        packet_count = 0
        last_flush_time = None  # capture time of the most recent timeout flush

        try:
            # Open PCAP file for streaming read
            with PcapReader(pcap_file) as pcap_reader:
                for packet in tqdm(pcap_reader, desc="Extracting flow features"):
                    # Check the limit before counting so the reported total
                    # matches the number of packets actually processed.
                    if max_packets > 0 and packet_count >= max_packets:
                        break
                    packet_count += 1

                    # Packets without a flow id (e.g. non-IP) are skipped
                    flow_id = self.get_flow_id(packet)
                    if not flow_id:
                        continue

                    packet_features = self.extract_packet_features(packet)
                    current_time = packet_features.get('timestamp', 0)
                    if last_flush_time is None:
                        # Start the timeout clock at the first usable packet;
                        # without this the periodic flush below never fires.
                        last_flush_time = current_time

                    # Add packet to its flow
                    if flow_id not in self.flows:
                        self.flows[flow_id] = []
                        self.flow_counter += 1
                    self.flows[flow_id].append(packet_features)

                    # Enforce the in-memory flow cap
                    if len(self.flows) >= self.max_flows_in_memory:
                        print(f"\n  Memory limit reached at packet {packet_count:,}")
                        self.flush_old_flows(current_time)
                        if len(self.flows) >= self.max_flows_in_memory:
                            # Nothing had timed out: flush everything, otherwise
                            # every following packet would rescan all flows
                            # without ever freeing memory.
                            self.flush_old_flows(current_time, force_all=True)
                        last_flush_time = current_time

                    # Periodic timeout and system-memory check
                    if packet_count % chunk_size == 0:
                        if current_time - last_flush_time > self.flow_timeout:
                            self.flush_old_flows(current_time)
                            last_flush_time = current_time

                        mem_percent = psutil.virtual_memory().percent
                        if mem_percent > 80:
                            print(f"\n  High memory usage ({mem_percent:.1f}%), flushing flows...")
                            self.flush_old_flows(current_time, force_all=True)

                        gc.collect()

            # Flush all remaining flows
            print("\nFlushing remaining flows...")
            self.flush_old_flows(float('inf'), force_all=True)

        except Exception as e:
            print(f"Error processing PCAP: {e}")
            raise
        finally:
            # Release the file handle on every exit path, including
            # KeyboardInterrupt, so the HDF5 file is never left open.
            if self.h5_store is not None:
                self.h5_store.close()
                self.h5_store = None
            gc.collect()

        print(f"\nProcessed {packet_count:,} packets")
        print(f"Total flows: {self.flow_counter:,}")
        print(f"Features saved to: {self.h5_filename}")

        return self.h5_filename
    
    def load_features_iterator(self, batch_size=50000):
        """
        Lazily read the stored flow features back from HDF5.

        Stored tables are read one at a time and grouped until at least
        ``batch_size`` rows have been collected, so a yielded frame can be
        larger than ``batch_size`` but the whole file is never loaded at once.

        Args:
            batch_size: Minimum number of rows per yielded DataFrame (the
                final one may be smaller).

        Yields:
            DataFrame: Concatenated flow features with a fresh integer index.
        """
        pending_frames = []
        pending_rows = 0

        with pd.HDFStore(self.h5_filename, mode='r') as store:
            for table_key in store.keys():
                frame = store[table_key]
                pending_frames.append(frame)
                pending_rows += len(frame)

                # Keep collecting until the requested batch size is reached
                if pending_rows < batch_size:
                    continue

                yield pd.concat(pending_frames, ignore_index=True)
                pending_frames, pending_rows = [], 0

            # Whatever is left over forms the last (possibly short) batch
            if pending_frames:
                yield pd.concat(pending_frames, ignore_index=True)

# ### Step 4: Semantic Feature Extractor Implementation

In [None]:
# Cell 4: Semantic Feature Extractor - Deep Packet Inspection & NLP Analysis
"""
PURPOSE: Analyze packet payloads for malicious content using pattern matching and NLP
This class performs deep packet inspection to detect attack signatures in payload content.

WHAT THIS CELL DOES:
1. Examines packet payloads (actual data being transmitted)
2. Detects attack patterns (SQL injection, XSS, command injection)
3. Performs NLP analysis to find obfuscated attacks
4. Calculates entropy and encoding detection
5. Identifies suspicious URLs and domain names

KEY DETECTION CAPABILITIES:
1. SQL Injection: SELECT, UNION, DROP TABLE patterns
2. Cross-Site Scripting (XSS): <script>, javascript:, alert()
3. Command Injection: bash commands, system calls
4. Directory Traversal: ../, /etc/passwd
5. Encoding Detection: Base64, hex, URL encoding
6. Obfuscation: Unusual character patterns, high entropy

WHY SEMANTIC ANALYSIS MATTERS:
- Flow features only see traffic patterns, not content
- Many attacks hide in seemingly normal traffic
- Attackers use encoding/obfuscation to evade detection
- NLP helps detect variations of known attacks

MEMORY OPTIMIZATION:
- Processes payloads in streaming fashion
- Limits payload analysis to first 1000 characters
- Flushes results to disk periodically
- Uses simplified NLP for speed
"""

class MemoryOptimizedSemanticExtractor:
    def __init__(self, max_flows_in_memory=50000):
        """
        Prepare the signature tables, per-flow state, HDF5 store and NLP helper.

        Args:
            max_flows_in_memory: Number of flows held in ``semantic_data``
                before the accumulated features are flushed to disk.

        Note:
            Creating an instance opens ``semantic_features.h5`` under
            Config.TEMP_DIR with mode='w', which replaces any existing file
            of that name.
        """
        # ----- Signature tables (regular expressions) -----
        # Order matters: extract_semantic_features() only checks the leading
        # entries (the first five) of each list.

        # SQL injection: statements and idioms typical of injected queries
        self.sql_patterns = [
            r'SELECT.*FROM',      # plain SELECT query
            r'INSERT.*INTO',      # INSERT statement
            r'UPDATE.*SET',       # UPDATE statement
            r'DELETE.*FROM',      # DELETE statement
            r'DROP.*TABLE',       # dropping a table
            r'UNION.*SELECT',     # UNION-based injection
            r'OR\s+1\s*=\s*1',    # tautology bypass: OR 1=1
            r'--\s*$',            # trailing SQL comment
            r';\s*EXEC',          # stacked EXEC
            r'xp_cmdshell'        # SQL Server shell procedure
        ]

        # Command injection: shell commands chained after ';' and common targets
        self.cmd_patterns = [
            r';\s*ls\s+',         # directory listing
            r';\s*cat\s+',        # file read
            r';\s*wget\s+',       # file download
            r';\s*curl\s+',       # HTTP client
            r';\s*nc\s+',         # netcat
            r'/etc/passwd',       # account file
            r'/etc/shadow',       # password hashes
            r'cmd\.exe',          # Windows command prompt
            r'powershell',        # Windows PowerShell
            r'bash\s+-c',         # bash one-liner
            r'sh\s+-c',           # sh one-liner
            r'eval\s*\(',         # code evaluation
        ]

        # XSS / script injection: HTML tags, event handlers and JS calls
        self.script_patterns = [
            r'<script',           # script tag
            r'javascript:',       # javascript: URL scheme
            r'onerror\s*=',       # error event handler
            r'onclick\s*=',       # click event handler
            r'alert\s*\(',        # alert() call
            r'document\.cookie',  # cookie access
            r'eval\s*\(',         # code evaluation
            r'exec\s*\(',         # code execution
            r'system\s*\(',       # system call
            r'<iframe',           # iframe tag
            r'<embed',            # embed tag
            r'<object'            # object tag
        ]

        # ----- Per-run state -----
        self.semantic_data = {}  # flow_id -> accumulated semantic features
        self.batch_counter = 0   # number of batches written so far
        self.max_flows_in_memory = max_flows_in_memory

        # ----- Persistent, compressed storage for semantic features -----
        store_path = os.path.join(Config.TEMP_DIR, 'semantic_features.h5')
        self.h5_filename = store_path
        self.h5_store = pd.HDFStore(store_path, mode='w', complevel=6)

        # ----- Lightweight payload analyzer -----
        self.nlp_analyzer = SimplifiedNLPAnalyzer()
    
    def flush_semantic_features(self):
        """
        Save accumulated semantic features to disk and free memory.
        Called when memory limit is reached.
        """
        if not self.semantic_data:
            return
        
        # Convert dictionary to DataFrame
        df_batch = pd.DataFrame.from_dict(self.semantic_data, orient='index')
        df_batch['flow_id'] = df_batch.index
        df_batch = df_batch.reset_index(drop=True)
        
        # Optimize data types
        for col in df_batch.select_dtypes(include=[np.float64]).columns:
            df_batch[col] = df_batch[col].astype(np.float32)
        
        # Save to HDF5
        self.h5_store.append(
            f'batch_{self.batch_counter}',
            df_batch,
            format='table',
            data_columns=True,
            min_itemsize={'flow_id': 16}
        )
        
        self.batch_counter += 1
        print(f"  Flushed {len(df_batch):,} semantic features to disk")
        
        # Clear memory
        self.semantic_data.clear()
        gc.collect()
    
    def extract_semantic_features(self, packet):
        """
        Extract semantic features from packet payload.
        Analyzes actual data content for attack signatures.
        
        Returns:
            dict: Semantic feature values
        """
        features = {
            # ===== PROTOCOL INDICATORS =====
            'has_http': 0,       # HTTP traffic
            'has_dns': 0,        # DNS queries
            'has_smtp': 0,       # Email traffic
            
            # ===== ATTACK INDICATORS =====
            'has_sql': 0,        # SQL injection detected
            'has_cmd': 0,        # Command injection detected
            'has_script': 0,     # Script injection detected
            'has_traversal': 0,  # Directory traversal detected
            
            # ===== SCORING =====
            'suspicious_score': 0,  # Overall suspicion level
            'content_length': 0,    # Payload size
            
            # ===== NLP FEATURES =====
            'nlp_malicious_confidence': 0,  # NLP-based threat score
            'nlp_pattern_score': 0,         # Pattern matching score
            'nlp_entropy_score': 0,         # Randomness score
            'nlp_encoding_detected': 0      # Encoding/obfuscation detected
        }
        
        try:
            # Check for HTTP
            if packet.haslayer('HTTP'):
                features['has_http'] = 1
                
                # Extract HTTP-specific features
                if hasattr(packet['HTTP'], 'Method'):
                    # POST/PUT methods often carry attack payloads
                    if packet['HTTP'].Method in [b'POST', b'PUT']:
                        features['suspicious_score'] += 1
            
            # Check for DNS
            if packet.haslayer('DNS'):
                features['has_dns'] = 1
                # DNS tunneling detection would go here
            
            # Extract and analyze payload
            if packet.haslayer('Raw'):
                payload = str(packet['Raw'].load)
                features['content_length'] = len(payload)
                
                # ===== PATTERN DETECTION =====
                # Check for SQL injection
                for pattern in self.sql_patterns[:5]:  # Check top patterns for speed
                    if re.search(pattern, payload, re.IGNORECASE):
                        features['has_sql'] = 1
                        features['suspicious_score'] += 3
                        break
                
                # Check for command injection
                for pattern in self.cmd_patterns[:5]:
                    if re.search(pattern, payload, re.IGNORECASE):
                        features['has_cmd'] = 1
                        features['suspicious_score'] += 5  # Higher score for OS commands
                        break
                
                # Check for script injection
                for pattern in self.script_patterns[:5]:
                    if re.search(pattern, payload, re.IGNORECASE):
                        features['has_script'] = 1
                        features['suspicious_score'] += 2
                        break
                
                # Check for directory traversal
                if '../' in payload or '..\\' in payload:
                    features['has_traversal'] = 1
                    features['suspicious_score'] += 2
                
                # ===== NLP ANALYSIS =====
                # Perform deeper analysis if enabled
                if Config.DEEP_INSPECTION and len(payload) > 10:
                    nlp_features = self.nlp_analyzer.quick_analyze(payload)
                    features.update(nlp_features)
                    
        except Exception as e:
            pass  # Skip problematic packets
        
        return features
    
    def process_pcap_streaming(self, pcap_file, chunk_size=10000, max_packets=0):
        """
        Stream a PCAP file and write per-flow semantic features to HDF5.

        Each packet's semantic features are summed into the entry of its
        flow. The buffer is flushed to disk when it holds
        ``max_flows_in_memory`` flows, when system memory usage exceeds 80%
        (checked every ``chunk_size`` packets) and at the end of the capture.
        Because a flush empties the buffer, a flow that continues afterwards
        appears in more than one stored batch.

        The HDF5 store is closed when this method returns or raises, so an
        instance can process only one capture.

        Args:
            pcap_file: Path to the PCAP file.
            chunk_size: Packet interval between system-memory checks.
            max_packets: Stop after this many packets; 0 means no limit.

        Returns:
            str: Path to the HDF5 file with the semantic features.

        Raises:
            Exception: Any error raised while reading or processing is
                reported and re-raised after the store has been closed.
        """
        print(f"\nSemantic analysis (memory-optimized): {pcap_file}")

        packet_count = 0

        # Only get_flow_id() is needed here, and it reads no instance state.
        # __init__ is bypassed on purpose: it opens flow_features.h5 with
        # mode='w', which would wipe flow features extracted earlier and
        # leave a second, never-closed handle on that file.
        flow_keyer = MemoryOptimizedFlowExtractor.__new__(MemoryOptimizedFlowExtractor)

        try:
            with PcapReader(pcap_file) as pcap_reader:
                for packet in tqdm(pcap_reader, desc="Extracting semantic features"):
                    # Check the limit before counting so the reported total
                    # matches the number of packets actually processed.
                    if max_packets > 0 and packet_count >= max_packets:
                        break
                    packet_count += 1

                    # Packets without a flow id (e.g. non-IP) are skipped
                    flow_id = flow_keyer.get_flow_id(packet)
                    if not flow_id:
                        continue

                    features = self.extract_semantic_features(packet)

                    # Sum the packet's features into its flow's running totals
                    if flow_id not in self.semantic_data:
                        self.semantic_data[flow_id] = defaultdict(float)
                    flow_totals = self.semantic_data[flow_id]
                    for key, value in features.items():
                        flow_totals[key] += value

                    # Enforce the in-memory flow cap
                    if len(self.semantic_data) >= self.max_flows_in_memory:
                        self.flush_semantic_features()

                    # Periodic system-memory check
                    if packet_count % chunk_size == 0:
                        mem_percent = psutil.virtual_memory().percent
                        if mem_percent > 80:
                            print(f"\n  High memory ({mem_percent:.1f}%), flushing...")
                            self.flush_semantic_features()
                        gc.collect()

            # Final flush
            self.flush_semantic_features()

        except Exception as e:
            print(f"Error in semantic processing: {e}")
            raise
        finally:
            # Release the file handle on every exit path, including
            # KeyboardInterrupt, so the HDF5 file is never left open.
            if self.h5_store is not None:
                self.h5_store.close()
                self.h5_store = None
            gc.collect()

        print(f"Processed {packet_count:,} packets")
        print(f"Semantic features saved to: {self.h5_filename}")

        return self.h5_filename


class SimplifiedNLPAnalyzer:
    """
    Cheap, heuristic payload scorer used during semantic extraction.

    SIGNALS COMBINED:
    1. Keyword hits (SQL / XSS / command vocabularies, substring matching)
    2. Character diversity of the payload head (used as an entropy proxy)
    3. Presence of base64 / hex / URL-encoding patterns
    4. A weighted blend of the three as an overall confidence value
    """

    def __init__(self):
        # Small keyword vocabularies, one per attack family
        self.sql_keywords = {'select', 'union', 'insert', 'drop', 'exec', 'declare', 'cast'}
        self.xss_keywords = {'script', 'javascript', 'alert', 'onerror', 'onclick', 'document'}
        self.cmd_keywords = {'bash', 'cmd', 'wget', 'curl', 'nc', 'telnet', 'ssh'}

        # Regular expressions describing common encodings
        self.encoding_patterns = {
            'base64': r'^[A-Za-z0-9+/]+=*$',
            'hex': r'^[0-9A-Fa-f]+$',
            'url': r'%[0-9A-Fa-f]{2}'
        }

    def quick_analyze(self, payload):
        """
        Score a text payload for signs of malicious content.

        Args:
            payload: Payload text (str).

        Returns:
            dict: 'nlp_pattern_score', 'nlp_entropy_score',
            'nlp_encoding_detected' and 'nlp_malicious_confidence'.
        """
        # Keyword and encoding checks look at a lower-cased prefix only
        sample = payload[:1000].lower()

        # ----- Keyword hits -----
        # Plain substring tests, so a short keyword such as 'nc' also matches
        # inside longer words. Ten distinct hits saturate the score at 1.0.
        vocabularies = (self.sql_keywords, self.xss_keywords, self.cmd_keywords)
        keyword_hits = sum(
            keyword in sample
            for vocabulary in vocabularies
            for keyword in vocabulary
        )
        pattern_score = min(keyword_hits / 10, 1.0)

        # ----- Character diversity -----
        # Share of distinct characters among the first (up to) 100 characters
        if payload:
            head = payload[:100]
            entropy_score = len(set(head)) / len(head)
        else:
            entropy_score = 0

        # ----- Encoding detection -----
        # Only the first 50 characters of the sample are tested
        probe = sample[:50]
        encoding_detected = int(
            any(re.search(regex, probe) for regex in self.encoding_patterns.values())
        )

        # ----- Overall confidence -----
        # Weights: keywords 0.5, character diversity 0.3, encoding 0.2
        confidence = min(
            pattern_score * 0.5 + entropy_score * 0.3 + encoding_detected * 0.2,
            1.0
        )

        return {
            'nlp_pattern_score': pattern_score,
            'nlp_entropy_score': entropy_score,
            'nlp_encoding_detected': encoding_detected,
            'nlp_malicious_confidence': confidence,
        }

# ### Step 5: Combined Feature Extraction Pipeline

In [None]:
# Cell 5: Combined Feature Pipeline - Merging Flow & Semantic Features
"""
PURPOSE: Orchestrate feature extraction and merge different feature types
This class combines flow statistics with semantic analysis to create a comprehensive
feature set for machine learning.

WHAT THIS CELL DOES:
1. Coordinates flow and semantic feature extraction
2. Merges features from multiple sources using flow_id
3. Performs feature engineering (creates new features from existing ones)
4. Handles the entire extraction pipeline end-to-end
5. Manages disk-based merging for large datasets

KEY CONCEPTS:
- FEATURE FUSION: Combining statistical and content-based features
- FEATURE ENGINEERING: Creating derived features that better capture patterns
- DISK-BASED MERGE: Joining large datasets without loading into memory

FEATURE TYPES COMBINED:
1. Flow Features (from Cell 3):
   - Timing statistics
   - Packet sizes
   - TCP flags
   - Flow rates

2. Semantic Features (from Cell 4):
   - Attack pattern detection
   - NLP analysis scores
   - Entropy measurements
   - Protocol indicators

3. Engineered Features (created here):
   - Packet rate (packets/duration)
   - Average packet size
   - Port categories (well-known, registered)
   - Flag ratios (flags/total packets)

WHY COMBINE FEATURES:
- Flow features detect behavioral anomalies
- Semantic features detect content anomalies
- Combined view provides better attack detection
- Some attacks only visible through combination
"""

class MemoryOptimizedFeaturePipeline:
    def __init__(self):
        """
        Build the two component extractors and fix the merged-output location.

        Both extractors receive ``Config.MAX_FLOWS_IN_MEMORY`` as their
        in-memory flow limit; the flow extractor additionally receives
        ``Config.FLOW_TIMEOUT``. Merged features are written to
        ``combined_features.h5`` inside ``Config.TEMP_DIR``.
        """
        flow_limit = Config.MAX_FLOWS_IN_MEMORY

        # Statistical (flow-level) extractor
        self.flow_extractor = MemoryOptimizedFlowExtractor(
            max_flows_in_memory=flow_limit,
            flow_timeout=Config.FLOW_TIMEOUT,
        )

        # Content-level (payload) extractor
        self.semantic_extractor = MemoryOptimizedSemanticExtractor(
            max_flows_in_memory=flow_limit,
        )

        # Destination of the merged feature store
        merged_name = 'combined_features.h5'
        self.combined_h5_filename = os.path.join(Config.TEMP_DIR, merged_name)
    
    def extract_all_features(self, pcap_file, mode='combined'):
        """
        Main orchestration function - manages entire feature extraction process.
        
        Args:
            pcap_file: Path to PCAP file
            mode: 'flow' (statistics only), 'semantic' (content only), or 'combined' (both)
        
        Returns:
            str: Path to HDF5 file containing all features
        """
        h5_files = []  # List to track generated feature files
        
        # ===== PHASE 1: FLOW FEATURE EXTRACTION =====
        if mode in ['flow', 'combined']:
            print("\n" + "="*60)
            print("PHASE 1: MEMORY-OPTIMIZED FLOW EXTRACTION")
            print("="*60)
            print("Extracting statistical features from network flows...")
            print("This analyzes packet timing, sizes, and patterns")
            
            flow_h5 = self.flow_extractor.process_pcap(
                pcap_file,
                chunk_size=Config.CHUNK_SIZE,
                max_packets=Config.MAX_PACKETS
            )
            h5_files.append(('flow', flow_h5))
            print(f"✓ Flow features extracted to: {flow_h5}")
        
        # ===== PHASE 2: SEMANTIC FEATURE EXTRACTION =====
        if mode in ['semantic', 'combined'] and Config.DEEP_INSPECTION:
            print("\n" + "="*60)
            print("PHASE 2: MEMORY-OPTIMIZED SEMANTIC EXTRACTION")
            print("="*60)
            print("Analyzing packet payloads for malicious content...")
            print("This performs deep packet inspection and NLP analysis")
            
            semantic_h5 = self.semantic_extractor.process_pcap_streaming(
                pcap_file,
                chunk_size=Config.CHUNK_SIZE,
                max_packets=Config.MAX_PACKETS
            )
            h5_files.append(('semantic', semantic_h5))
            print(f"✓ Semantic features extracted to: {semantic_h5}")
        
        # ===== PHASE 3: FEATURE MERGING =====
        if len(h5_files) > 1:
            print("\n" + "="*60)
            print("PHASE 3: MERGING FEATURES (DISK-BASED)")
            print("="*60)
            print("Combining flow and semantic features...")
            return self.merge_features_on_disk(h5_files)
        elif h5_files:
            return h5_files[0][1]  # Return single feature file if only one type
        else:
            return None
    
    def merge_features_on_disk(self, h5_files):
        """
        Join flow and semantic features batch by batch and write them to disk.

        Each key of the flow feature store is loaded as one batch, left-joined
        with the semantic rows that share its ``flow_id`` values, passed through
        ``engineer_features`` and appended to ``self.combined_h5_filename``
        under the key ``batch_<n>``.

        Args:
            h5_files: List of ``(kind, path)`` tuples. The first entry is read
                as the flow feature file and the second, when present, as the
                semantic feature file.

        Returns:
            str: Path to the merged HDF5 file. The flow feature path is
            returned instead when the input files cannot be opened after three
            attempts, or when any step of the merge raises.
        """
        import time  # only needed for the retry delays below

        print("Merging features using disk-based operations...")
        print("This preserves memory by processing in batches")

        # Work-around for Windows file locking: give handles left over from the
        # extraction phases a chance to be released before reopening the files.
        gc.collect()
        time.sleep(2)

        # Extract file paths
        flow_h5 = h5_files[0][1]  # Flow features file
        semantic_h5 = h5_files[1][1] if len(h5_files) > 1 else None

        # Verify files can be opened before proceeding
        print("Verifying file access...")
        max_retries = 3
        for retry in range(max_retries):
            try:
                # Test opening flow file
                test_flow = pd.HDFStore(flow_h5, mode='r')
                test_flow.close()

                # Test opening semantic file if it exists
                if semantic_h5:
                    test_semantic = pd.HDFStore(semantic_h5, mode='r')
                    test_semantic.close()

                print("Files accessible, proceeding with merge...")
                break

            except Exception as e:
                if retry < max_retries - 1:
                    print(f"Files still locked (attempt {retry + 1}/{max_retries}), waiting...")
                    time.sleep(3)
                    gc.collect()
                else:
                    print(f"ERROR: Cannot access files after {max_retries} attempts")
                    print(f"Error details: {e}")
                    # Fall back to returning just the flow features
                    print("Falling back to flow features only...")
                    return flow_h5

        # Now proceed with actual merge
        try:
            with pd.HDFStore(self.combined_h5_filename, mode='w', complevel=6) as combined_store:
                with pd.HDFStore(flow_h5, mode='r') as flow_store:
                    flow_keys = flow_store.keys()

                    batch_counter = 0

                    for flow_key in tqdm(flow_keys, desc="Merging batches"):
                        # Load batch of flow features
                        flow_batch = flow_store[flow_key]
                        combined_batch = flow_batch

                        if semantic_h5:
                            # Semantic rows whose flow_id occurs in this batch
                            flow_ids = set(flow_batch['flow_id'].values)
                            semantic_batch = self.load_matching_semantic_features(
                                semantic_h5, flow_ids
                            )

                            if semantic_batch is not None and not semantic_batch.empty:
                                # A non-key column present on both sides would be
                                # renamed to <col>_x / <col>_y by the merge, and the
                                # zero-fill below would then raise KeyError, which
                                # the outer handler turns into a flow-only fallback.
                                # Keep the flow-side copy of such columns.
                                shared_cols = semantic_batch.columns.intersection(
                                    flow_batch.columns
                                ).difference(['flow_id'])
                                if len(shared_cols) > 0:
                                    semantic_batch = semantic_batch.drop(columns=shared_cols)

                                # Left join so every flow row is kept
                                combined_batch = pd.merge(
                                    flow_batch, semantic_batch,
                                    on='flow_id', how='left'
                                )

                                # Flows without semantic data get zeros
                                semantic_cols = semantic_batch.columns.difference(['flow_id'])
                                if len(semantic_cols) > 0:
                                    combined_batch[semantic_cols] = (
                                        combined_batch[semantic_cols].fillna(0)
                                    )
                            # NOTE(review): when no semantic rows match, this batch
                            # is written without any semantic columns, so batches
                            # can differ in schema - confirm downstream tolerates it.

                        # Apply feature engineering to create derived features
                        combined_batch = self.engineer_features(combined_batch)

                        # Save merged batch to disk (one key per batch)
                        combined_store.append(
                            f'batch_{batch_counter}',
                            combined_batch,
                            format='table',
                            data_columns=True,
                            min_itemsize={'flow_id': 16}
                        )

                        batch_counter += 1

                        # Clean up memory
                        del combined_batch
                        gc.collect()

            print(f"✓ Features merged and saved to: {self.combined_h5_filename}")
            print(f"  Total batches processed: {batch_counter}")
            return self.combined_h5_filename

        except Exception as e:
            # Best-effort: a failed merge degrades to flow-only features.
            # A partially written combined file may remain on disk.
            print(f"ERROR during merge: {e}")
            print("Returning flow features file as fallback...")
            return flow_h5
    
    def load_matching_semantic_features(self, semantic_h5, flow_ids):
        """
        Collect the semantic rows whose ``flow_id`` is in ``flow_ids``.

        Every key of the semantic store is read in full and then filtered in
        memory; only the matching rows are kept.

        Args:
            semantic_h5: Path to semantic features HDF5
            flow_ids: Set of flow IDs to match

        Returns:
            DataFrame: Matching rows from all batches concatenated with a fresh
            index, or an empty DataFrame when nothing matched.
        """
        hits = []

        with pd.HDFStore(semantic_h5, mode='r') as store:
            for node in store.keys():
                frame = store[node]
                wanted = frame['flow_id'].isin(flow_ids)
                selected = frame[wanted]
                if selected.empty:
                    continue
                hits.append(selected)

        if not hits:
            return pd.DataFrame()
        return pd.concat(hits, ignore_index=True)
    
    def engineer_features(self, df):
        """
        Create derived features that better capture attack patterns.
        Feature engineering is crucial for ML model performance.
        
        Engineered features include:
        1. Rate features: packets/second, bytes/second
        2. Ratio features: flag counts / total packets
        3. Port categories: well-known (<1024), registered (1024-49151)
        4. Suspicious indicators: binary flags for quick filtering
        
        Args:
            df: DataFrame with raw features
            
        Returns:
            DataFrame: With additional engineered features
        """
        # ===== RATE FEATURES =====
        # Packet rate (packets per second)
        if 'total_packets' in df.columns and 'flow_duration' in df.columns:
            df['packet_rate'] = df['total_packets'] / (df['flow_duration'] + 1)  # +1 to avoid division by zero
            df['packet_rate'] = df['packet_rate'].astype(np.float32)
        
        # Average packet size
        if 'total_bytes' in df.columns and 'total_packets' in df.columns:
            df['avg_packet_size'] = df['total_bytes'] / (df['total_packets'] + 1)
            df['avg_packet_size'] = df['avg_packet_size'].astype(np.float32)
        
        # ===== FLAG RATIO FEATURES =====
        # Calculate flag ratios (important for detecting SYN floods, etc.)
        flag_cols = ['syn_count', 'ack_count', 'fin_count', 'rst_count', 'psh_count']
        if all(col in df.columns for col in flag_cols) and 'total_packets' in df.columns:
            for flag in flag_cols:
                ratio_name = f'{flag}_ratio'
                df[ratio_name] = df[flag] / (df['total_packets'] + 1)
                df[ratio_name] = df[ratio_name].astype(np.float32)
        
        # ===== PORT CATEGORY FEATURES =====
        # Categorize ports for better pattern recognition
        if 'dst_port' in df.columns:
            # Well-known ports (0-1023) - usually system services
            df['is_well_known_port'] = (df['dst_port'] < 1024).astype(np.int8)
            
            # Registered ports (1024-49151) - usually applications
            df['is_registered_port'] = ((df['dst_port'] >= 1024) & 
                                        (df['dst_port'] < 49152)).astype(np.int8)
            
            # Dynamic/private ports (49152-65535) - usually client connections
            df['is_dynamic_port'] = (df['dst_port'] >= 49152).astype(np.int8)
        
        # ===== SUSPICIOUS INDICATORS =====
        # Binary flags for quick filtering
        if 'suspicious_score' in df.columns:
            df['is_suspicious'] = (df['suspicious_score'] > 0).astype(np.int8)
            df['highly_suspicious'] = (df['suspicious_score'] > 5).astype(np.int8)
        
        # ===== ATTACK COMBINATION FEATURES =====
        # Some attacks use specific combinations
        if 'has_sql' in df.columns and 'has_script' in df.columns:
            # SQL + Script often indicates complex web attack
            df['sql_and_script'] = ((df.get('has_sql', 0) > 0) & 
                                    (df.get('has_script', 0) > 0)).astype(np.int8)
        
        if 'has_cmd' in df.columns and 'nlp_encoding_detected' in df.columns:
            # Command injection + encoding often indicates obfuscated attack
            df['encoded_cmd'] = ((df.get('has_cmd', 0) > 0) & 
                                 (df.get('nlp_encoding_detected', 0) > 0)).astype(np.int8)
        
        # ===== PAYLOAD RATIO FEATURES =====
        if 'total_payload_bytes' in df.columns and 'total_bytes' in df.columns:
            # Ratio of payload to total traffic
            df['payload_ratio'] = df['total_payload_bytes'] / (df['total_bytes'] + 1)
            df['payload_ratio'] = df['payload_ratio'].astype(np.float32)
        
        return df

# ### Step 6: CICIDS2017 Label Loader

In [None]:
# Cell 6: CICIDS Label Matcher - Ground Truth Integration
"""
PURPOSE: Match extracted flows with CICIDS2017 ground truth labels
This class integrates the official attack labels from CICIDS2017 dataset
to enable supervised learning.

WHAT THIS CELL DOES:
1. Loads CICIDS2017 CSV label files incrementally
2. Creates a mapping between network flows and attack types
3. Handles multiple CSV files from different days
4. Assigns numeric labels for ML training
5. Maintains attack type distribution statistics

CICIDS2017 ATTACK TYPES:
The dataset contains 15 traffic classes (BENIGN plus 14 attack types):
0. BENIGN - Normal, non-malicious traffic
1. Bot - Botnet traffic
2. DDoS - Distributed Denial of Service
3. DoS GoldenEye - Application layer DoS
4. DoS Hulk - Volume-based DoS
5. DoS Slowhttptest - Slow HTTP attack
6. DoS slowloris - Connection exhaustion
7. FTP-Patator - FTP brute force
8. Heartbleed - SSL vulnerability exploit
9. Infiltration - Network infiltration
10. PortScan - Port scanning activity
11. SSH-Patator - SSH brute force
12. Web Attack - Brute Force
13. Web Attack - SQL Injection
14. Web Attack - XSS

MATCHING STRATEGY:
- Match by destination port (majority vote over the label rows seen for that port)
- Default: Label as BENIGN if the port never appears in the label files
- NOTE: Matching by 5-tuple (IPs, ports, protocol) is not implemented by this class

MEMORY OPTIMIZATION:
- Loads CSV files in chunks (100K rows at a time)
- Builds port-label cache instead of full flow mapping
- Processes labels incrementally without loading all
"""

class MemoryOptimizedLabelMatcher:
    def __init__(self):
        """
        Initialize label matcher with attack type mappings.
        """
        # Mapping from text labels to numeric codes for ML
        self.attack_mapping = {
            'BENIGN': 0,
            'Bot': 1,
            'DDoS': 2,
            'DoS GoldenEye': 3,
            'DoS Hulk': 4,
            'DoS Slowhttptest': 5,
            'DoS slowloris': 6,
            'FTP-Patator': 7,
            'Heartbleed': 8,
            'Infiltration': 9,
            'PortScan': 10,
            'SSH-Patator': 11,
            'Web Attack - Brute Force': 12,
            'Web Attack - SQL Injection': 13,
            'Web Attack - XSS': 14
        }
        
        # Cache for port-to-label mapping (memory efficient)
        self.port_label_cache = {}
        
        # Statistics tracking
        self.label_statistics = Counter()
    
    def load_labels_incrementally(self, label_files):
        """
        Load CICIDS label files incrementally to avoid memory overflow.
        Builds a port-based mapping for efficient matching.
        
        Args:
            label_files: List of CSV file paths
            
        Returns:
            bool: Success status
        """
        print("\n" + "="*60)
        print("LOADING CICIDS LABELS (MEMORY-OPTIMIZED)")
        print("="*60)
        
        label_counts = Counter()
        total_rows_processed = 0
        
        # Process each label file
        for file_idx, label_file in enumerate(label_files):
            print(f"\nProcessing label file {file_idx + 1}/{len(label_files)}: {os.path.basename(label_file)}")
            
            # Read CSV in chunks to manage memory
            chunk_size = 100000
            chunks_processed = 0
            
            try:
                # Process file in chunks
                for chunk in pd.read_csv(label_file, encoding='latin-1', chunksize=chunk_size):
                    # Clean column names (remove spaces)
                    chunk.columns = chunk.columns.str.strip()
                    
                    # Find label column (handles different naming conventions)
                    label_col = None
                    for col in ['Label', 'label', 'LABEL', ' Label']:
                        if col in chunk.columns:
                            label_col = col
                            break
                    
                    if not label_col:
                        print(f"  Warning: No label column found in chunk {chunks_processed}")
                        continue
                    
                    # Find port column
                    port_col = None
                    for col in ['Destination Port', 'Dst Port', 'dst_port', ' Destination Port']:
                        if col in chunk.columns:
                            port_col = col
                            break
                    
                    if label_col and port_col:
                        # Build port-label mapping
                        # Group by port and find most common label
                        for port, group in chunk.groupby(port_col):
                            # Get most frequent label for this port
                            most_common_label = group[label_col].mode()
                            if len(most_common_label) > 0:
                                label = most_common_label.iloc[0]
                                
                                # Update cache with majority vote
                                if port not in self.port_label_cache:
                                    self.port_label_cache[port] = Counter()
                                self.port_label_cache[port][label] += len(group)
                                
                                # Update statistics
                                label_counts[label] += len(group)
                    
                    chunks_processed += 1
                    total_rows_processed += len(chunk)
                    
                    # Periodic memory cleanup
                    if chunks_processed % 10 == 0:
                        gc.collect()
                        print(f"  Processed {chunks_processed * chunk_size:,} rows...")
                
                print(f"  Completed: {chunks_processed} chunks")
                
            except Exception as e:
                print(f"  Error processing file: {e}")
                continue
        
        # Finalize port-label mapping (keep only most common label per port)
        for port in self.port_label_cache:
            if isinstance(self.port_label_cache[port], Counter):
                # Get most common label for this port
                most_common = self.port_label_cache[port].most_common(1)[0][0]
                self.port_label_cache[port] = most_common
        
        # Display label distribution
        print("\n" + "-"*40)
        print("LABEL DISTRIBUTION:")
        print("-"*40)
        total = sum(label_counts.values())
        for label, count in label_counts.most_common():
            pct = count / total * 100 if total > 0 else 0
            print(f"  {label:30s}: {count:10,} ({pct:5.1f}%)")
        
        print(f"\nTotal rows processed: {total_rows_processed:,}")
        print(f"Port-label mappings created: {len(self.port_label_cache):,}")
        
        # Store statistics
        self.label_statistics = label_counts
        
        return True
    
    def apply_labels_to_batch(self, df):
        """
        Apply labels to a batch of extracted features.
        Uses port-based mapping for memory efficiency.
        
        Args:
            df: DataFrame with extracted features
            
        Returns:
            DataFrame: With added label columns
        """
        if 'dst_port' not in df.columns:
            # No port information, default to BENIGN
            df['label'] = 0
            df['attack_type'] = 'BENIGN'
            return df
        
        # Map ports to attack types using cache
        df['attack_type'] = df['dst_port'].map(self.port_label_cache).fillna('BENIGN')
        
        # Convert text labels to numeric
        df['label'] = df['attack_type'].map(self.attack_mapping).fillna(0).astype(np.int8)
        
        # Add confidence score based on port matching
        # (Ports in cache have higher confidence)
        df['label_confidence'] = df['dst_port'].isin(self.port_label_cache.keys()).astype(np.float32)
        
        return df
    
    def process_features_with_labels(self, features_h5, label_files):
        """
        Process feature file and add labels in batches.
        Creates new HDF5 file with labeled features.
        
        Args:
            features_h5: Path to HDF5 file with features
            label_files: List of CICIDS CSV files
            
        Returns:
            str: Path to labeled HDF5 file
        """
        if not label_files:
            print("No label files provided, skipping labeling")
            return features_h5
        
        # Load label mappings into cache
        print("\nBuilding label cache from CSV files...")
        self.load_labels_incrementally(label_files)
        
        # Create new HDF5 file for labeled features
        labeled_h5 = os.path.join(Config.TEMP_DIR, 'labeled_features.h5')
        
        print("\n" + "="*60)
        print("APPLYING LABELS TO FEATURES")
        print("="*60)
        
        labeled_count = Counter()
        
        with pd.HDFStore(features_h5, mode='r') as input_store:
            with pd.HDFStore(labeled_h5, mode='w', complevel=6) as output_store:
                
                # Process each batch
                for key in tqdm(input_store.keys(), desc="Labeling batches"):
                    # Load feature batch
                    batch = input_store[key]
                    
                    # Apply labels
                    batch = self.apply_labels_to_batch(batch)
                    
                    # Track label distribution
                    labeled_count.update(batch['attack_type'].value_counts().to_dict())
                    
                    # Save labeled batch
                    output_store.append(
                        key.replace('/', ''),  # Remove leading slash
                        batch,
                        format='table',
                        data_columns=True,
                        min_itemsize={'flow_id': 16, 'attack_type': 30}
                    )
                    
                    # Memory cleanup
                    del batch
                    gc.collect()
        
        # Display labeling results
        print("\n" + "-"*40)
        print("LABELING RESULTS:")
        print("-"*40)
        total = sum(labeled_count.values())
        for attack_type, count in labeled_count.most_common():
            pct = count / total * 100 if total > 0 else 0
            print(f"  {attack_type:30s}: {count:10,} ({pct:5.1f}%)")
        
        print(f"\nLabeled features saved to: {labeled_h5}")
        
        # Validate labeling quality
        benign_pct = labeled_count.get('BENIGN', 0) / total * 100 if total > 0 else 0
        if benign_pct > 90:
            print("\n⚠️  Warning: >90% flows labeled as BENIGN")
            print("   This might indicate labeling issues or imbalanced dataset")
        
        return labeled_h5

# ### Step 7: Feature Analysis and Selection

In [None]:
# Cell 7: Feature Analysis - Selection & Importance Ranking
"""
PURPOSE: Analyze and select the most important features for machine learning
This class determines which features are most useful for detecting attacks.

WHAT THIS CELL DOES:
1. Collects statistical information about all features
2. Calculates feature importance using mutual information
3. Selects the top N most informative features
4. Removes redundant or uninformative features
5. Provides feature ranking for interpretability

KEY CONCEPTS:
- MUTUAL INFORMATION: Measures how much knowing a feature reduces uncertainty about the label
- FEATURE SELECTION: Choosing subset of features that maximize predictive power
- CURSE OF DIMENSIONALITY: Too many features can hurt ML performance
- FEATURE IMPORTANCE: Understanding which features drive predictions

WHY FEATURE SELECTION MATTERS:
- Reduces training time (fewer features to process)
- Improves model performance (removes noise)
- Prevents overfitting (simpler models generalize better)
- Enhances interpretability (understand what drives detection)

SELECTION STRATEGY:
1. Statistical Analysis: Mean, variance, range for each feature
2. Importance Scoring: Mutual information with target labels
3. Redundancy Removal: Correlation analysis
4. Top-K Selection: Keep only the best features
"""

class MemoryOptimizedFeatureAnalyzer:
    def __init__(self):
        """
        Initialize feature analyzer with storage for statistics and importance scores.
        """
        self.feature_importance = {}  # Feature name -> importance score
        self.selected_features = []   # Final list of selected features
        self.feature_stats = {}        # Statistical summaries per feature
    
    def analyze_features_incrementally(self, features_h5, target_col='label'):
        """
        Run both analysis passes, rank the features and keep the best ones.

        Pass 1 gathers per-feature statistics, pass 2 scores the features on
        a sample. Features are then ordered by score (highest first) and the
        leading ``Config.TOP_FEATURES`` names are stored as the selection.

        Args:
            features_h5: Path to HDF5 file with features
            target_col: Name of label column

        Returns:
            list: Selected feature names
        """
        wide_rule = "=" * 60
        narrow_rule = "=" * 40

        print("\n" + wide_rule)
        print("FEATURE ANALYSIS (MEMORY-OPTIMIZED)")
        print(wide_rule)

        # ----- Pass 1: statistics over the whole file -----
        print("Pass 1: Collecting feature statistics...")
        print("  This helps understand feature distributions")
        self.collect_feature_stats(features_h5, target_col)

        # ----- Pass 2: importance scores on a sample -----
        print("\nPass 2: Calculating feature importance on sample...")
        print("  This identifies most informative features")
        self.calculate_importance_sample(features_h5, target_col)

        # ----- Ranking and selection -----
        ranked = sorted(
            self.feature_importance.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        self.selected_features = [name for name, _ in ranked[:Config.TOP_FEATURES]]

        # ----- Report the ten best features -----
        print("\n" + narrow_rule)
        print(f"TOP {min(10, len(ranked))} FEATURES:")
        print(narrow_rule)
        print(f"{'Rank':<5} {'Feature':<30} {'Importance':<10} {'Type'}")
        print("-" * 60)

        rank = 0
        for name, score in ranked[:10]:
            rank += 1
            # The type label is derived from the feature name only
            kind = self.get_feature_type(name)
            print(f"{rank:<5} {name:<30} {score:>10.4f} {kind}")

        print(f"\nTotal features analyzed: {len(self.feature_importance)}")
        print(f"Features selected: {len(self.selected_features)}")

        return self.selected_features
    
    def collect_feature_stats(self, features_h5, target_col):
        """
        First pass: Collect statistical summaries for each feature.
        This helps understand data distribution and identify issues.

        Statistics collected per feature (stored in ``self.feature_stats``):
        - Sum, sum of squares (for mean/variance calculation)
        - Min, max (for range)
        - Count of rows seen (missing values are filled with 0 and counted)
        - Number of zeros and a small sample of observed values

        Args:
            features_h5: Path to HDF5 file with features
            target_col: Name of label column (excluded from the statistics)
        """
        with pd.HDFStore(features_h5, mode='r') as store:
            # List the batches once and reuse the list for both steps below
            keys = store.keys()
            if not keys:
                # Nothing to index into; avoid an IndexError on keys[0]
                print("  ⚠️  Feature store is empty - no statistics collected")
                return

            # Get feature columns from first batch
            first_batch = store.select(keys[0], stop=100)

            # Identify feature columns (exclude metadata)
            exclude_cols = {'flow_id', target_col, 'attack_type', 'label_confidence'}
            feature_cols = [col for col in first_batch.columns
                          if col not in exclude_cols]

            print(f"  Analyzing {len(feature_cols)} features...")

            # Initialize statistics collectors
            for col in feature_cols:
                self.feature_stats[col] = {
                    'sum': 0,      # For mean calculation
                    'sum_sq': 0,   # For variance calculation
                    'count': 0,    # Rows seen (NaN filled with 0 first)
                    'min': float('inf'),
                    'max': float('-inf'),
                    'zeros': 0,    # Count of zero values
                    'unique': set() # Track unique values (sampled)
                }

            # Process all batches
            for key in tqdm(keys, desc="Collecting stats"):
                batch = store[key]

                for col in feature_cols:
                    if col not in batch.columns:
                        continue

                    # Missing values are treated as 0
                    values = batch[col].fillna(0)
                    # Accumulate in float64: squaring large int64 values
                    # (e.g. byte counters) would otherwise overflow silently
                    numeric = values.astype(np.float64)

                    # Named col_stats so the imported scipy `stats` module
                    # is not shadowed inside this method
                    col_stats = self.feature_stats[col]
                    col_stats['sum'] += numeric.sum()
                    col_stats['sum_sq'] += (numeric ** 2).sum()
                    col_stats['count'] += len(values)
                    col_stats['min'] = min(col_stats['min'], values.min())
                    col_stats['max'] = max(col_stats['max'], values.max())
                    col_stats['zeros'] += (values == 0).sum()

                    # Sample unique values (limit to 100 for memory)
                    if len(col_stats['unique']) < 100:
                        col_stats['unique'].update(
                            values.sample(min(10, len(values))).tolist()
                        )

                # Memory cleanup
                del batch
                gc.collect()

        # Calculate derived statistics
        for col, col_stats in self.feature_stats.items():
            if col_stats['count'] > 0:
                col_stats['mean'] = col_stats['sum'] / col_stats['count']
                col_stats['variance'] = (col_stats['sum_sq'] / col_stats['count']) - (col_stats['mean'] ** 2)
                # Clamp at 0: rounding can push the difference slightly negative
                col_stats['std'] = np.sqrt(max(0, col_stats['variance']))
                col_stats['range'] = col_stats['max'] - col_stats['min']
                col_stats['zero_ratio'] = col_stats['zeros'] / col_stats['count']
            else:
                col_stats['mean'] = col_stats['variance'] = col_stats['std'] = col_stats['range'] = 0
                col_stats['zero_ratio'] = 1
    
    def calculate_importance_sample(self, features_h5, target_col, sample_size=50000):
        """
        Second pass: Calculate feature importance using mutual information.
        Uses a sample for efficiency.

        The sample is built from the leading rows (at most 5,000 each) of
        evenly spaced batches until ``sample_size`` rows are collected; it is
        not stratified by label. Scores are stored in
        ``self.feature_importance``. If the store holds no data, the scores
        are left empty and a warning is printed.

        Args:
            features_h5: Path to feature file
            target_col: Target label column
            sample_size: Number of samples for importance calculation
        """
        print(f"  Loading sample of {sample_size:,} flows...")
        sample_dfs = []
        remaining_samples = sample_size

        with pd.HDFStore(features_h5, mode='r') as store:
            # Sample from different parts of the dataset
            keys = store.keys()
            sample_interval = max(1, len(keys) // 10)  # Roughly 10 sampling points

            for key in keys[::sample_interval]:
                if remaining_samples <= 0:
                    break

                # Load only the head of the batch to bound memory use
                batch = store.select(key, stop=min(remaining_samples, 5000))
                sample_dfs.append(batch)
                remaining_samples -= len(batch)

        # pd.concat raises on an empty list, so handle the no-data case first
        if not sample_dfs:
            print("  ⚠️  No data available - feature importance not calculated")
            self.feature_importance = {}
            return

        # Combine samples
        sample_df = pd.concat(sample_dfs, ignore_index=True)
        print(f"  Sample loaded: {len(sample_df):,} flows")

        if sample_df.empty:
            print("  ⚠️  No data available - feature importance not calculated")
            self.feature_importance = {}
            return

        # Get feature columns
        exclude_cols = {'flow_id', target_col, 'attack_type', 'label_confidence'}
        feature_cols = [col for col in sample_df.columns
                       if col not in exclude_cols]

        # Prepare feature matrix and labels. Infinite values are treated like
        # missing ones, since the estimator rejects non-finite input.
        X = sample_df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
        y = sample_df[target_col]

        # Calculate mutual information
        print("  Calculating mutual information scores...")
        mi_scores = mutual_info_classif(X, y, random_state=Config.RANDOM_STATE)

        # Store importance scores
        self.feature_importance = dict(zip(feature_cols, mi_scores))

        # Identify uninformative features (near-zero importance)
        uninformative = [f for f, score in self.feature_importance.items() if score < 0.001]
        if uninformative:
            print(f"\n  ⚠️  Found {len(uninformative)} uninformative features (MI < 0.001)")
            print(f"     Examples: {uninformative[:5]}")

        # Clean up
        del sample_df, sample_dfs, X, y
        gc.collect()
    
    def get_feature_type(self, feature_name):
        """
        Categorize feature by its name for better interpretation.
        Helps understand what aspect of traffic each feature captures.
        
        Categories:
        - Flow: Timing and size statistics
        - Flag: TCP flag related
        - Port: Port number features
        - Payload: Content-based features
        - NLP: Natural language processing scores
        - Engineered: Derived features
        """
        feature_lower = feature_name.lower()
        
        if 'flow' in feature_lower or 'duration' in feature_lower or 'iat' in feature_lower:
            return "Flow"
        elif 'flag' in feature_lower or 'syn' in feature_lower or 'ack' in feature_lower:
            return "Flag"
        elif 'port' in feature_lower:
            return "Port"
        elif 'payload' in feature_lower or 'entropy' in feature_lower:
            return "Payload"
        elif 'nlp' in feature_lower:
            return "NLP"
        elif 'ratio' in feature_lower or 'rate' in feature_lower:
            return "Engineered"
        elif 'has_' in feature_lower or 'is_' in feature_lower:
            return "Binary"
        elif 'suspicious' in feature_lower:
            return "Detection"
        else:
            return "Other"
    
    def get_feature_insights(self):
        """
        Provide insights about selected features for interpretability.
        Helps understand what the model will focus on.
        
        Returns:
            dict: Insights about feature selection
        """
        insights = {
            'total_features': len(self.feature_importance),
            'selected_features': len(self.selected_features),
            'feature_types': Counter(),
            'top_5_features': [],
            'recommendations': []
        }
        
        # Analyze feature types
        for feature in self.selected_features:
            feature_type = self.get_feature_type(feature)
            insights['feature_types'][feature_type] += 1
        
        # Get top 5 features with scores
        for feature in self.selected_features[:5]:
            insights['top_5_features'].append({
                'name': feature,
                'importance': self.feature_importance[feature],
                'type': self.get_feature_type(feature)
            })
        
        # Provide recommendations
        if insights['feature_types'].get('NLP', 0) > 5:
            insights['recommendations'].append(
                "High NLP feature count - model focuses on payload content"
            )
        
        if insights['feature_types'].get('Flow', 0) > 5:
            insights['recommendations'].append(
                "High flow feature count - model focuses on traffic patterns"
            )
        
        if len(self.selected_features) < 20:
            insights['recommendations'].append(
                "Low feature count - consider increasing TOP_FEATURES if accuracy is low"
            )
        
        return insights

# ### Step 8: Machine Learning Pipeline

In [None]:
# Cell 8: Machine Learning Pipeline - Incremental Training & Evaluation
"""
PURPOSE: Train and evaluate machine learning models using incremental learning
This class implements memory-efficient ML training for large datasets that don't fit in RAM.

WHAT THIS CELL DOES:
1. Implements incremental/online learning algorithms
2. Trains models in batches without loading all data
3. Performs train/test split at the batch level
4. Evaluates model performance progressively
5. Supports multiple ML algorithms optimized for streaming

INCREMENTAL LEARNING:
Traditional ML loads all data at once. Incremental learning:
- Processes data in small batches
- Updates model parameters gradually
- Never needs full dataset in memory
- Perfect for datasets larger than RAM

ALGORITHMS USED:
1. SGD Classifier (Stochastic Gradient Descent):
   - Linear model with online learning
   - Fast and memory efficient
   - Good for high-dimensional data
   - Updates with each batch

2. XGBoost with External Memory:
   - Gradient boosting with disk cache
   - High accuracy for complex patterns
   - Can process data larger than RAM
   
3. MiniBatch K-Means (optional):
   - Clustering for anomaly detection
   - Unsupervised learning option

WHY THESE MODELS:
- All support incremental/batch learning
- Memory efficient for large datasets
- Proven effectiveness on network traffic
- Balance between speed and accuracy
"""

class MemoryOptimizedMLPipeline:
    def __init__(self):
        """
        Initialize ML pipeline with model storage and result tracking.
        """
        self.models = {}      # Trained model objects
        self.results = {}     # Performance metrics
        self.scaler = None    # Feature scaler (fitted on first batch)
        self.train_history = defaultdict(list)  # Track training progress
    
    def train_models_incrementally(self, features_h5, selected_features):
        """
        Main training function using incremental learning.
        Processes data in batches, updating models progressively.

        Batches (HDF5 keys) are split in storage order: the leading
        ``1 - Config.TEST_SIZE`` fraction is used for training, the rest for
        testing. At least one batch is always used for training when the
        store is non-empty. The scaler is fitted on the first usable training
        batch only and reused unchanged for every later batch.

        Args:
            features_h5: Path to HDF5 file with labeled features
            selected_features: List of feature names to use

        Returns:
            tuple: (models dict, results dict)
        """
        print("\n" + "="*60)
        print("INCREMENTAL MACHINE LEARNING TRAINING")
        print("="*60)
        print(f"Training models: {Config.SELECTED_MODELS}")
        print(f"Using {len(selected_features)} selected features")

        # ===== INITIALIZE MODELS =====
        self.initialize_models()

        # partial_fit needs the complete label set up front:
        # 15 classes = BENIGN plus 14 attack types
        n_classes = 15
        classes = np.arange(n_classes)

        # Training metrics
        batch_count = 0
        train_scores = defaultdict(list)

        with pd.HDFStore(features_h5, mode='r') as store:
            keys = store.keys()
            n_batches = len(keys)

            # ===== TRAIN/TEST SPLIT AT BATCH LEVEL =====
            train_size = int(n_batches * (1 - Config.TEST_SIZE))
            if n_batches > 0 and train_size == 0:
                # A very small store would otherwise leave nothing to train on
                train_size = 1
            train_keys = keys[:train_size]
            test_keys = keys[train_size:]

            print(f"\nData split:")
            print(f"  Training batches: {len(train_keys)}")
            print(f"  Testing batches: {len(test_keys)}")

            # ===== TRAINING PHASE =====
            print("\n" + "-"*40)
            print("TRAINING PHASE")
            print("-"*40)

            for key in tqdm(train_keys, desc="Training incrementally"):
                # Load batch
                batch = store[key]

                # Prepare features and labels
                X_batch, y_batch = self.prepare_batch(batch, selected_features)

                if X_batch is None or len(X_batch) == 0:
                    continue

                # ===== FEATURE SCALING =====
                # Fit scaler on first batch, transform all others
                if self.scaler is None:
                    print(f"  Fitting scaler on first batch ({len(X_batch)} samples)")
                    self.scaler = StandardScaler()
                    X_batch = self.scaler.fit_transform(X_batch)
                else:
                    X_batch = self.scaler.transform(X_batch)

                # ===== TRAIN EACH MODEL =====
                # SGD Classifier (incremental)
                if 'sgd' in self.models:
                    self.models['sgd'].partial_fit(X_batch, y_batch, classes=classes)

                    # Track training progress on every 5th batch
                    if batch_count % 5 == 0:
                        score = self.models['sgd'].score(X_batch, y_batch)
                        train_scores['sgd'].append(score)
                        if batch_count % 20 == 0:
                            print(f"    SGD Batch {batch_count:3d} accuracy: {score:.4f}")

                # MiniBatch K-Means (if selected)
                if 'minibatch_kmeans' in self.models:
                    self.models['minibatch_kmeans'].partial_fit(X_batch)

                batch_count += 1

                # Memory cleanup
                del batch, X_batch, y_batch
                gc.collect()

            print(f"\nTraining complete: {batch_count} batches processed")

            # ===== TESTING PHASE + METRICS =====
            self._evaluate_test_batches(store, test_keys, selected_features, train_scores)

        # ===== TRAIN XGBOOST IF SELECTED =====
        if 'xgboost_incremental' in Config.SELECTED_MODELS:
            print("\n" + "-"*40)
            print("XGBOOST TRAINING (External Memory)")
            print("-"*40)
            self.train_xgboost_external_memory(features_h5, selected_features)

        return self.models, self.results

    def _evaluate_test_batches(self, store, test_keys, selected_features, train_scores):
        """
        Predict on the held-out batches and store per-model metrics in self.results.

        Does nothing beyond printing a warning when no scaler was fitted,
        i.e. when no training batch was usable and the models are untrained.
        """
        print("\n" + "-"*40)
        print("TESTING PHASE")
        print("-"*40)

        if self.scaler is None:
            print("⚠️  No usable training batches - skipping evaluation")
            return

        print("Evaluating on held-out test batches...")

        test_predictions = defaultdict(list)
        test_labels = []
        test_batches = 0

        for key in tqdm(test_keys, desc="Testing"):
            batch = store[key]
            X_batch, y_batch = self.prepare_batch(batch, selected_features)

            if X_batch is None or len(X_batch) == 0:
                continue

            # Scale features with the scaler fitted during training
            X_batch = self.scaler.transform(X_batch)
            test_labels.extend(y_batch)

            # Get predictions from each model
            if 'sgd' in self.models:
                test_predictions['sgd'].extend(
                    self.models['sgd'].predict(X_batch)
                )

            if 'minibatch_kmeans' in self.models:
                # For clustering, use cluster assignment as "prediction".
                # NOTE: cluster ids are arbitrary and not aligned with label
                # ids, so the accuracy reported for this model is not a
                # classification accuracy.
                test_predictions['minibatch_kmeans'].extend(
                    self.models['minibatch_kmeans'].predict(X_batch)
                )

            test_batches += 1

            # Memory cleanup
            del batch, X_batch, y_batch
            gc.collect()

        print(f"Testing complete: {test_batches} batches processed")

        # ===== CALCULATE METRICS =====
        print("\n" + "-"*40)
        print("MODEL PERFORMANCE")
        print("-"*40)

        test_labels = np.array(test_labels)

        for model_name, predictions in test_predictions.items():
            predictions = np.array(predictions)

            # Fraction of test samples whose prediction equals the label
            accuracy = (predictions == test_labels).mean()

            # Store results
            self.results[model_name] = {
                'accuracy': accuracy,
                'train_scores': train_scores.get(model_name, []),
                'test_samples': len(test_labels),
                'unique_predictions': len(np.unique(predictions))
            }

            print(f"\n{model_name.upper()}:")
            print(f"  Test Accuracy: {accuracy:.4f}")
            print(f"  Test Samples: {len(test_labels):,}")
            print(f"  Unique Predictions: {len(np.unique(predictions))}")

            # Per-class accuracy for SGD
            if model_name == 'sgd' and len(np.unique(test_labels)) > 1:
                self.calculate_per_class_metrics(predictions, test_labels)
    
    def initialize_models(self):
        """
        Create the models listed in ``Config.SELECTED_MODELS`` and store them
        in ``self.models``.

        The XGBoost entry is only reserved here (set to None); the booster is
        built later by the external-memory training step.
        """
        # SGD Classifier - Linear model with online learning
        if 'sgd' in Config.SELECTED_MODELS:
            print("\nInitializing SGD Classifier...")

            # The logistic loss is called 'log_loss' from scikit-learn 1.1 on;
            # the old name 'log' was removed in 1.3. Pick the name the
            # installed version accepts (unknown versions are treated as new).
            import sklearn
            version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
            uses_new_name = version is None or (
                (int(version.group(1)), int(version.group(2))) >= (1, 1)
            )
            logistic_loss = 'log_loss' if uses_new_name else 'log'

            self.models['sgd'] = SGDClassifier(
                loss=logistic_loss,   # Logistic regression
                penalty='l2',         # L2 regularization
                alpha=0.001,          # Regularization strength
                max_iter=1000,        # Only used by fit(); partial_fit ignores it
                random_state=Config.RANDOM_STATE,
                n_jobs=-1,           # Use all CPU cores
                warm_start=True      # Only affects fit(); partial_fit always continues
            )

        # MiniBatch K-Means - For anomaly detection
        if 'minibatch_kmeans' in Config.SELECTED_MODELS:
            print("Initializing MiniBatch K-Means...")
            from sklearn.cluster import MiniBatchKMeans
            self.models['minibatch_kmeans'] = MiniBatchKMeans(
                n_clusters=15,        # Number of clusters (match attack types)
                batch_size=1000,      # Samples per batch
                random_state=Config.RANDOM_STATE
            )

        # XGBoost will be initialized separately due to external memory setup
        if 'xgboost_incremental' in Config.SELECTED_MODELS:
            self.models['xgboost_incremental'] = None
    
    def prepare_batch(self, batch, selected_features):
        """
        Prepare a batch for training by extracting features and labels.
        Handles missing features and data type conversions.
        
        Args:
            batch: DataFrame batch from HDF5
            selected_features: List of feature names to use
            
        Returns:
            tuple: (X feature matrix, y labels)
        """
        if 'label' not in batch.columns:
            return None, None
        
        # Get available features (some might be missing in certain batches)
        available_features = [f for f in selected_features if f in batch.columns]
        
        if not available_features:
            return None, None
        
        # Extract features and labels
        X = batch[available_features].fillna(0).values
        y = batch['label'].values
        
        # Ensure correct data types
        X = X.astype(np.float32)
        y = y.astype(np.int32)
        
        return X, y
    
    def train_xgboost_external_memory(self, features_h5, selected_features):
        """
        Train XGBoost from a LibSVM file with an on-disk cache.

        The features are first exported to a temporary LibSVM file, which is
        removed again whether or not training succeeds. Any failure is
        reported and swallowed so the other models remain usable.

        The stored accuracy is measured on the training data itself (there is
        no held-out evaluation here), which is flagged by
        ``'accuracy_source': 'train'`` in the results entry.

        Args:
            features_h5: Path to HDF5 file with labeled features
            selected_features: List of feature names to use
        """
        print("Training XGBoost with external memory support...")

        libsvm_file = None
        try:
            # Cache prefix for XGBoost, kept next to the other temp files
            cache_file = os.path.join(Config.TEMP_DIR, 'xgb_cache')

            # Convert HDF5 to format XGBoost can read
            print("  Converting data to XGBoost format...")
            libsvm_file = self.create_libsvm_file(features_h5, selected_features)

            # Create DMatrix with cache (enables external memory)
            print("  Creating DMatrix with disk cache...")
            dtrain = xgb.DMatrix(f'{libsvm_file}#{cache_file}')

            # XGBoost parameters optimized for large data
            params = {
                'max_depth': 6,           # Tree depth
                'eta': 0.1,               # Learning rate
                'objective': 'multi:softmax',  # Multiclass classification
                'num_class': 15,          # Number of attack types
                'tree_method': 'approx',  # Approximate algorithm for large data
                'sketch_eps': 0.03,       # Approximation factor
                'max_bin': 256,           # Max bins for histogram
                'subsample': 0.5,         # Sample 50% of data per tree
                'colsample_bytree': 0.8,  # Sample 80% of features per tree
                'seed': Config.RANDOM_STATE
            }

            # Train model
            print("  Training XGBoost model...")
            num_rounds = 50  # Number of boosting rounds
            booster = xgb.train(
                params, dtrain, num_rounds,
                verbose_eval=10  # Print progress every 10 rounds
            )
            self.models['xgboost_incremental'] = booster

            # Report a measured value instead of a made-up one:
            # multi:softmax predictions are class ids, comparable to labels
            train_predictions = booster.predict(dtrain)
            train_accuracy = float((train_predictions == dtrain.get_label()).mean())

            self.results['xgboost_incremental'] = {
                'accuracy': train_accuracy,
                'accuracy_source': 'train',  # not a held-out estimate
                'num_trees': num_rounds
            }

            print("  XGBoost training complete")

        except Exception as e:
            # Deliberate best-effort: XGBoost is optional for the pipeline
            print(f"  XGBoost training failed: {e}")
            print("  Falling back to SGD only")
        finally:
            # Cleanup temporary files, also after a failure
            if libsvm_file and os.path.exists(libsvm_file):
                os.remove(libsvm_file)
    
    def create_libsvm_file(self, features_h5, selected_features, max_rows=100000):
        """
        Convert HDF5 features to LibSVM format for XGBoost.

        Format: label feat1:val1 feat2:val2 ...
        Feature indices are 1-based positions in the prepared feature matrix;
        zero-valued features are omitted (sparse format).

        Args:
            features_h5: HDF5 file path
            selected_features: Features to include
            max_rows: Maximum number of rows written (the last batch is
                truncated so the limit is never exceeded)

        Returns:
            str: Path to LibSVM file
        """
        libsvm_file = os.path.join(Config.TEMP_DIR, 'train.libsvm')
        rows_written = 0

        with open(libsvm_file, 'w') as f, pd.HDFStore(features_h5, mode='r') as store:
            for key in store.keys():
                remaining = max_rows - rows_written
                if remaining <= 0:
                    break

                batch = store[key]
                X_batch, y_batch = self.prepare_batch(batch, selected_features)

                if X_batch is None:
                    continue

                # Keep only as many rows as the limit still allows
                X_batch = X_batch[:remaining]
                y_batch = y_batch[:remaining]

                # Build the batch's lines first, then write them in one call
                lines = []
                for row, label in zip(X_batch, y_batch):
                    sparse = ' '.join(
                        f'{j+1}:{v}' for j, v in enumerate(row) if v != 0
                    )
                    lines.append(f'{int(label)} {sparse}\n')
                f.writelines(lines)

                rows_written += len(X_batch)
                del batch, X_batch, y_batch
                gc.collect()

        print(f"    Created LibSVM file with {rows_written:,} samples")
        return libsvm_file
    
    def calculate_per_class_metrics(self, predictions, true_labels):
        """
        Calculate performance metrics for each attack type.
        Helps identify which attacks are well-detected vs problematic.
        """
        print("\n  Per-Class Performance:")
        print("  " + "-"*40)
        
        # Attack type names for CICIDS
        attack_names = {
            0: 'BENIGN',
            1: 'Bot',
            2: 'DDoS',
            3: 'DoS GoldenEye',
            4: 'DoS Hulk',
            5: 'DoS Slowhttptest',
            6: 'DoS slowloris',
            7: 'FTP-Patator',
            8: 'Heartbleed',
            9: 'Infiltration',
            10: 'PortScan',
            11: 'SSH-Patator',
            12: 'Web Attack - Brute Force',
            13: 'Web Attack - SQL Injection',
            14: 'Web Attack - XSS'
        }
        
        # Calculate per-class accuracy
        for class_id in np.unique(true_labels):
            mask = true_labels == class_id
            if mask.sum() > 0:
                class_acc = (predictions[mask] == true_labels[mask]).mean()
                class_name = attack_names.get(class_id, f'Class {class_id}')
                print(f"    {class_name:30s}: {class_acc:.4f} ({mask.sum()} samples)")

# ### Step 9: Visualization Suite

In [None]:
# Cell 9: Visualization Suite - Data Analysis Dashboards
"""
PURPOSE: Create interactive visualizations to understand data and results
This class generates comprehensive visualizations for analysis insights.

WHAT THIS CELL DOES:
1. Creates interactive charts using Plotly
2. Visualizes attack distributions and patterns
3. Shows model performance comparisons
4. Generates network communication graphs
5. Builds an integrated HTML dashboard

VISUALIZATIONS CREATED:
1. Protocol Distribution - Pie chart of TCP/UDP/ICMP traffic
2. Attack Distribution - Bar chart of attack type frequencies  
3. Flow Timeline - Scatter plot of traffic over time
4. Port Heatmap - Communication patterns between ports
5. Feature Correlation - Correlation matrix of top features
6. Model Comparison - Bar chart of model accuracies
7. Dashboard - Combined HTML view of all visualizations

WHY VISUALIZATION MATTERS:
- Helps understand data patterns before modeling
- Identifies class imbalance issues
- Reveals temporal attack patterns
- Shows which features are correlated
- Makes results interpretable for stakeholders

MEMORY OPTIMIZATION:
- Uses data sampling (max 10K points per viz)
- Loads only required columns
- Generates static HTML files
- Cleans up data after each visualization
"""

class MemoryOptimizedVisualizer:
    def __init__(self, features_h5, ml_results, output_dir):
        """
        Remember the data source, the model results and the output location.

        Args:
            features_h5: Path to the HDF5 file holding the extracted features
            ml_results: Dictionary of ML model results, keyed by model name
            output_dir: Directory that receives the generated HTML files
        """
        # Hard ceiling on plotted points, whatever the configured sample size is
        point_cap = 10000
        self.sample_size = min(Config.SAMPLE_SIZE, point_cap)
        self.output_dir = output_dir
        self.ml_results = ml_results
        self.features_h5 = features_h5
    
    def create_all_visualizations(self):
        """
        Build every chart and then the combined dashboard.

        Each chart is written to its own HTML file by the corresponding
        ``create_*`` method. Nothing is produced when
        ``Config.GENERATE_VISUALS`` is falsy or when no sample could be loaded.
        A chart that raises is reported and skipped; the remaining charts and
        the dashboard are still generated.
        """
        if not Config.GENERATE_VISUALS:
            print("Visualization disabled in config")
            return

        banner = "=" * 60
        print("\n" + banner)
        print("GENERATING VISUALIZATIONS (SAMPLED DATA)")
        print(banner)
        print(f"Creating visualizations with {self.sample_size:,} sample points")
        print("This provides insights without loading full dataset")

        # One shared sample feeds every data-driven chart
        sample_df = self.load_visualization_sample()
        if sample_df is None or sample_df.empty:
            print("No data available for visualization")
            return

        # (display name, builder, whether the builder takes the sample)
        chart_jobs = [
            ("Protocol Distribution", self.create_protocol_distribution, True),
            ("Attack Distribution", self.create_attack_distribution, True),
            ("Flow Timeline", self.create_flow_timeline, True),
            ("Port Heatmap", self.create_port_heatmap, True),
            ("Feature Correlation", self.create_feature_correlation, True),
            ("Model Performance", self.create_model_comparison, False),
        ]

        for name, build, needs_sample in tqdm(chart_jobs, desc="Creating visualizations"):
            try:
                if needs_sample:
                    build(sample_df)
                else:
                    build()
                print(f"  Created: {name}")
            except Exception as e:
                # Best effort: one broken chart must not stop the others
                print(f"  Failed {name}: {e}")

        # The dashboard page embeds the chart files written above
        self.create_dashboard(sample_df)

        # Release the sample before returning
        del sample_df
        gc.collect()

        print(f"\nVisualizations saved to: {self.output_dir}")
        print("  Open dashboard.html in browser to view all charts")
    
    def load_visualization_sample(self):
        """
        Read a bounded sample of flows from the HDF5 store for plotting.

        Store keys are visited with a stride of ``max(1, len(keys) // 10)``.
        From each visited key the leading rows are read (at most 1,000, and
        never more than the rows still missing) until ``self.sample_size``
        rows have been collected or the keys run out.

        Returns:
            DataFrame: Concatenated sample, or an empty DataFrame when
            nothing was read
        """
        print(f"\nLoading sample of {self.sample_size:,} flows for visualization...")

        rows_per_key = 1000
        budget = self.sample_size
        pieces = []

        with pd.HDFStore(self.features_h5, mode='r') as store:
            keys = store.keys()

            # Spread the reads over the key list rather than taking only the first keys
            stride = max(1, len(keys) // 10)

            for key in keys[::stride]:
                if budget <= 0:
                    break

                # Only the first rows of this key are read
                piece = store.select(key, stop=min(budget, rows_per_key))
                pieces.append(piece)
                budget -= len(piece)

        if not pieces:
            return pd.DataFrame()

        sample = pd.concat(pieces, ignore_index=True)
        print(f"  Loaded {len(sample):,} flows from {len(pieces)} batches")
        return sample
    
    def create_protocol_distribution(self, df):
        """
        Create a donut chart of the network protocols found in the sample.

        Protocol numbers 1, 6, 17, 41 and 47 are shown by name; any other
        value (including missing ones) is grouped under 'Other'. The chart is
        written to ``protocol_distribution.html`` in ``self.output_dir``.
        Nothing is written when ``df`` has no 'protocol' column.

        Args:
            df: DataFrame of sampled flows; it is left unmodified
        """
        if 'protocol' not in df.columns:
            return

        # Map protocol numbers to names
        protocol_map = {
            6: 'TCP',
            17: 'UDP',
            1: 'ICMP',
            41: 'IPv6',
            47: 'GRE'
        }

        # Derive the names as a separate Series instead of adding a
        # 'protocol_name' column: the same DataFrame is handed to the other
        # chart builders, so it must not be mutated here.
        protocol_names = df['protocol'].map(protocol_map).fillna('Other')
        protocol_counts = protocol_names.value_counts()

        # Create pie chart
        fig = go.Figure(data=[go.Pie(
            labels=protocol_counts.index,
            values=protocol_counts.values,
            hole=0.3,  # Donut chart
            marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96A6A6', '#FFA07A']),
            textposition='auto',
            textinfo='label+percent'
        )])

        fig.update_layout(
            title="Protocol Distribution (Sample)",
            height=400,
            width=600,
            showlegend=True
        )

        # Save to HTML
        fig.write_html(os.path.join(self.output_dir, 'protocol_distribution.html'))
    
    def create_attack_distribution(self, df):
        """
        Plot how often each attack type occurs in the sample.

        The 15 most frequent values of the 'attack_type' column are drawn as
        bars ('BENIGN' in green, everything else in red) and written to
        ``attack_distribution.html``. Nothing is written when the column is
        absent.
        """
        if 'attack_type' not in df.columns:
            return

        # Most frequent attack types first
        top_attacks = df['attack_type'].value_counts().head(15)
        labels = top_attacks.index
        counts = top_attacks.values

        # Benign traffic stands out in green, attacks in red
        bar_colors = []
        for label in labels:
            bar_colors.append('green' if label == 'BENIGN' else 'red')

        bars = go.Bar(
            x=labels,
            y=counts,
            text=counts,
            textposition='auto',
            marker_color=bar_colors,
            hovertemplate='%{x}<br>Count: %{y}<extra></extra>'
        )
        fig = go.Figure(data=[bars])

        fig.update_layout(
            title="Attack Type Distribution (Sample)",
            xaxis_title="Attack Type",
            yaxis_title="Number of Flows",
            xaxis_tickangle=-45,
            height=500,
            width=1000,
            showlegend=False
        )

        target = os.path.join(self.output_dir, 'attack_distribution.html')
        fig.write_html(target)
    
    def create_flow_timeline(self, df):
        """
        Create a scatter plot of flow size against flow index.

        The x axis is the DataFrame index of each flow and the y axis is
        'total_bytes' on a log scale. When an 'attack_type' column exists, one
        trace is drawn per attack type ('BENIGN' in green, everything else in
        red). Marker size follows log1p('total_packets') * 2 when that column
        is present and is a fixed 5 otherwise. The chart is written to
        ``flow_timeline.html``.

        Args:
            df: DataFrame of sampled flows. The chart is skipped unless both
                'flow_duration' and 'total_bytes' are present.
        """
        # 'flow_duration' is not plotted; the check is retained so the chart
        # is skipped in exactly the same situations as before.
        if 'flow_duration' not in df.columns or 'total_bytes' not in df.columns:
            return

        # Further sample if still too large for smooth visualization
        if len(df) > 1000:
            df = df.sample(1000)

        # 'total_packets' only drives marker size. It used to be read
        # unconditionally, which raised KeyError (and lost the whole chart)
        # for feature sets without it.
        has_packet_counts = 'total_packets' in df.columns

        fig = go.Figure()

        # Color by attack type if available
        if 'attack_type' in df.columns:
            # Create trace for each attack type
            for attack_type in df['attack_type'].unique():
                subset = df[df['attack_type'] == attack_type]

                # Determine color
                color = 'green' if attack_type == 'BENIGN' else 'red'

                if has_packet_counts:
                    marker_size = np.log1p(subset['total_packets']) * 2  # Size by packet count
                else:
                    marker_size = 5  # Same fixed size as the unlabeled trace

                fig.add_trace(go.Scatter(
                    x=subset.index,
                    y=subset['total_bytes'],
                    mode='markers',
                    name=str(attack_type),
                    marker=dict(
                        size=marker_size,
                        color=color,
                        opacity=0.6,
                        line=dict(width=0)
                    ),
                    hovertemplate='Flow %{x}<br>Bytes: %{y}<br>Type: ' + str(attack_type) + '<extra></extra>'
                ))
        else:
            # Single trace if no attack labels
            fig.add_trace(go.Scatter(
                x=df.index,
                y=df['total_bytes'],
                mode='markers',
                marker=dict(size=5, color='blue', opacity=0.6)
            ))

        fig.update_layout(
            title="Flow Timeline (Sample)",
            xaxis_title="Flow Index",
            yaxis_title="Total Bytes",
            yaxis_type="log",  # Log scale for better visibility
            height=500,
            width=1200,
            hovermode='closest'
        )

        # Save to HTML
        fig.write_html(os.path.join(self.output_dir, 'flow_timeline.html'))
    
    def create_port_heatmap(self, df):
        """
        Draw a heatmap of how often the busiest ports talk to each other.

        The 15 most frequent source ports and the 15 most frequent
        destination ports are cross-tabulated; each cell holds the number of
        sampled flows for that port pair. The chart is written to
        ``port_heatmap.html``. Nothing is written when either port column is
        absent.
        """
        if not {'src_port', 'dst_port'}.issubset(df.columns):
            return

        # Most frequent ports on each side
        busiest_src = df['src_port'].value_counts().head(15).index
        busiest_dst = df['dst_port'].value_counts().head(15).index

        # Keep only flows where both endpoints use one of those ports
        keep = df['src_port'].isin(busiest_src) & df['dst_port'].isin(busiest_dst)
        top_flows = df[keep]

        # Rows are source ports, columns are destination ports
        matrix = pd.crosstab(top_flows['src_port'], top_flows['dst_port'])
        dst_labels = [str(int(port)) for port in matrix.columns]
        src_labels = [str(int(port)) for port in matrix.index]

        heatmap = go.Heatmap(
            z=matrix.values,
            x=dst_labels,
            y=src_labels,
            colorscale='Viridis',
            hoverongaps=False,
            hovertemplate='Src Port: %{y}<br>Dst Port: %{x}<br>Count: %{z}<extra></extra>'
        )
        fig = go.Figure(data=heatmap)

        fig.update_layout(
            title="Port Communication Heatmap (Top Ports)",
            xaxis_title="Destination Port",
            yaxis_title="Source Port",
            height=600,
            width=800
        )

        target = os.path.join(self.output_dir, 'port_heatmap.html')
        fig.write_html(target)
    
    def create_feature_correlation(self, df):
        """
        Plot the pairwise correlation of the first 15 numeric feature columns.

        Metadata columns (flow_id, label, attack_type, label_confidence) are
        left out. The chart is written to ``feature_correlation.html``; it is
        skipped when fewer than two usable columns remain.
        """
        metadata_cols = {'flow_id', 'label', 'attack_type', 'label_confidence'}
        max_features = 15

        # Walk the numeric columns in their original order and stop at the limit
        chosen = []
        for col in df.select_dtypes(include=[np.number]).columns:
            if col in metadata_cols:
                continue
            chosen.append(col)
            if len(chosen) == max_features:
                break

        # A correlation matrix needs at least two columns
        if len(chosen) < 2:
            return

        corr_matrix = df[chosen].corr()
        axis_labels = corr_matrix.columns

        heatmap = go.Heatmap(
            z=corr_matrix.values,
            x=axis_labels,
            y=axis_labels,
            colorscale='RdBu',
            zmid=0,  # Center colorscale at 0
            colorbar=dict(title="Correlation"),
            hovertemplate='%{x}<br>%{y}<br>Correlation: %{z:.2f}<extra></extra>'
        )
        fig = go.Figure(data=heatmap)

        fig.update_layout(
            title="Feature Correlation Matrix (Top Features)",
            height=700,
            width=800
        )

        target = os.path.join(self.output_dir, 'feature_correlation.html')
        fig.write_html(target)
    
    def create_model_comparison(self):
        """
        Draw one accuracy bar per entry in ``self.ml_results``.

        Each result is expected to expose an 'accuracy' value. The chart is
        written to ``model_comparison.html``; nothing is written when there
        are no results.
        """
        if not self.ml_results:
            return

        # Collect names and accuracies in the dictionary's own order
        models = []
        accuracies = []
        for model_name, result in self.ml_results.items():
            models.append(model_name)
            accuracies.append(result['accuracy'])

        palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']
        bar_labels = [f"{acc:.4f}" for acc in accuracies]

        bars = go.Bar(
            x=models,
            y=accuracies,
            text=bar_labels,
            textposition='auto',
            marker_color=palette[:len(models)],
            hovertemplate='%{x}<br>Accuracy: %{y:.4f}<extra></extra>'
        )
        fig = go.Figure(data=[bars])

        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Accuracy",
            yaxis_range=[0, 1],
            height=400,
            width=600
        )

        target = os.path.join(self.output_dir, 'model_comparison.html')
        fig.write_html(target)
    
    def create_dashboard(self, sample_df):
        """
        Write ``dashboard.html``, a single page that embeds all chart files.

        The page shows four summary figures (total flow count, share of
        attack flows in the sample, best model accuracy and the system memory
        usage at generation time) followed by six iframes that reference the
        individual chart files by relative name. The chart files are not
        created here; an iframe whose file is missing simply stays empty.

        Args:
            sample_df: Sampled flows used for the attack-share figure and the
                sample-size note
        """
        # Calculate summary statistics
        # Total comes from the HDF5 file, not from the sample
        total_flows = self.get_total_flow_count()
        # Rows with label > 0 count as attacks; without a 'label' column the
        # count is 0 (the .get default is never used because of that check)
        attack_flows = sample_df[sample_df.get('label', 0) > 0].shape[0] if 'label' in sample_df.columns else 0
        # Guard against an empty sample to avoid dividing by zero
        attack_pct = (attack_flows / len(sample_df) * 100) if len(sample_df) > 0 else 0
        # Highest 'accuracy' across all model results; 0 when there are none
        best_accuracy = max(r['accuracy'] for r in self.ml_results.values()) if self.ml_results else 0
        
        # Create dashboard HTML
        # (doubled braces are literal CSS braces inside the f-string)
        dashboard_html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Network Analysis Dashboard</title>
            <style>
                body {{ 
                    font-family: 'Segoe UI', Arial, sans-serif; 
                    margin: 20px; 
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    min-height: 100vh;
                }}
                .container {{
                    max-width: 1400px;
                    margin: 0 auto;
                    background: rgba(255,255,255,0.95);
                    border-radius: 20px;
                    padding: 30px;
                    box-shadow: 0 20px 60px rgba(0,0,0,0.3);
                }}
                h1 {{ 
                    color: #2d3748; 
                    border-bottom: 3px solid #667eea; 
                    padding-bottom: 15px;
                    margin-bottom: 30px;
                    font-size: 28px;
                }}
                .stats {{ 
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    padding: 25px;
                    border-radius: 15px;
                    margin-bottom: 30px;
                    color: white;
                    box-shadow: 0 10px 30px rgba(0,0,0,0.2);
                }}
                .stats h2 {{
                    margin-top: 0;
                    font-size: 20px;
                    opacity: 0.9;
                }}
                .stat-grid {{
                    display: grid;
                    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                    gap: 20px;
                    margin-top: 15px;
                }}
                .stat-item {{
                    background: rgba(255,255,255,0.1);
                    padding: 15px;
                    border-radius: 10px;
                    backdrop-filter: blur(10px);
                }}
                .stat-value {{
                    font-size: 24px;
                    font-weight: bold;
                    margin-bottom: 5px;
                }}
                .stat-label {{
                    font-size: 12px;
                    opacity: 0.8;
                    text-transform: uppercase;
                }}
                .warning {{ 
                    background: #fed7d7;
                    color: #742a2a;
                    padding: 15px;
                    border-radius: 10px;
                    margin-bottom: 20px;
                    border-left: 4px solid #fc8181;
                }}
                .grid {{ 
                    display: grid;
                    grid-template-columns: repeat(2, 1fr);
                    gap: 25px;
                    margin-top: 20px;
                }}
                .viz-frame {{ 
                    background: white;
                    border-radius: 12px;
                    padding: 15px;
                    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
                    transition: transform 0.3s ease;
                }}
                .viz-frame:hover {{
                    transform: translateY(-5px);
                    box-shadow: 0 8px 25px rgba(0,0,0,0.15);
                }}
                iframe {{ 
                    width: 100%;
                    height: 400px;
                    border: none;
                    border-radius: 8px;
                }}
                .full-width {{ 
                    grid-column: span 2;
                }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>Network Packet Analysis Dashboard</h1>
                
                <div class="warning">
                    <strong>Note:</strong> Visualizations based on sampled data ({len(sample_df):,} flows).
                    Full dataset contains {total_flows:,} flows.
                </div>
                
                <div class="stats">
                    <h2>Summary Statistics</h2>
                    <div class="stat-grid">
                        <div class="stat-item">
                            <div class="stat-value">{total_flows:,}</div>
                            <div class="stat-label">Total Flows Processed</div>
                        </div>
                        <div class="stat-item">
                            <div class="stat-value">{attack_pct:.1f}%</div>
                            <div class="stat-label">Attack Traffic (Sample)</div>
                        </div>
                        <div class="stat-item">
                            <div class="stat-value">{best_accuracy:.2%}</div>
                            <div class="stat-label">Best Model Accuracy</div>
                        </div>
                        <div class="stat-item">
                            <div class="stat-value">{psutil.virtual_memory().percent:.1f}%</div>
                            <div class="stat-label">Memory Usage</div>
                        </div>
                    </div>
                </div>
                
                <div class="grid">
                    <div class="viz-frame">
                        <h3>Protocol Distribution</h3>
                        <iframe src="protocol_distribution.html"></iframe>
                    </div>
                    <div class="viz-frame">
                        <h3>Model Performance</h3>
                        <iframe src="model_comparison.html"></iframe>
                    </div>
                    <div class="viz-frame full-width">
                        <h3>Attack Type Distribution</h3>
                        <iframe src="attack_distribution.html"></iframe>
                    </div>
                    <div class="viz-frame full-width">
                        <h3>Flow Timeline Analysis</h3>
                        <iframe src="flow_timeline.html"></iframe>
                    </div>
                    <div class="viz-frame">
                        <h3>Port Communication Patterns</h3>
                        <iframe src="port_heatmap.html"></iframe>
                    </div>
                    <div class="viz-frame">
                        <h3>Feature Correlations</h3>
                        <iframe src="feature_correlation.html"></iframe>
                    </div>
                </div>
                
                <p style="text-align: center; color: #718096; margin-top: 30px; font-size: 14px;">
                    Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                    Pipeline: Memory-Optimized Network Analysis
                </p>
            </div>
        </body>
        </html>
        """
        
        # Save dashboard next to the chart files so the relative iframe paths resolve
        with open(os.path.join(self.output_dir, 'dashboard.html'), 'w') as f:
            f.write(dashboard_html)
        
        print("\nDashboard created: dashboard.html")
        print("  Open in browser to view all visualizations")
    
    def get_total_flow_count(self):
        """
        Add up the row counts of every key in the HDF5 feature file.

        Only the ``nrows`` value reported by each key's storer is read; the
        rows themselves are not loaded.

        Returns:
            int: Total flow count (0 when the store has no keys)
        """
        with pd.HDFStore(self.features_h5, mode='r') as store:
            return sum(store.get_storer(key).nrows for key in store.keys())

# ### Step 10: Export and Reporting

In [None]:
# Cell 10: Result Exporter - Saving Analysis Outputs
"""
PURPOSE: Export all analysis results in various formats for use and sharing
This class handles saving features, models, and reports to disk.

WHAT THIS CELL DOES:
1. Exports extracted features to CSV format (in chunks)
2. Saves trained ML models for reuse
3. Generates comprehensive text report
4. Creates metadata manifest for tracking
5. Handles cleanup of temporary files

OUTPUT FILES CREATED:
- features_chunk_*.csv: Feature data split into manageable files
- *.pkl: Serialized ML models
- analysis_report.txt: Detailed text report
- manifest.json: Metadata about the analysis
- dashboard.html: Interactive visualization dashboard

WHY CHUNKED EXPORT:
- CSV files have size limits
- Easier to transfer and share
- Can be processed separately
- Prevents memory overflow during export

REPORT CONTENTS:
- Processing statistics
- Label distribution  
- Feature importance
- Model performance
- Memory usage
"""

class MemoryOptimizedResultExporter:
    def __init__(self, output_dir):
        """
        Initialize exporter with output directory.
        
        Args:
            output_dir: Directory for saving all outputs
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
    
    def export_features_chunked(self, features_h5, chunk_size=100000):
        """
        Export features from HDF5 to gzip-compressed CSV files of bounded size.

        Every key of the HDF5 store is loaded in turn and written out in
        slices of at most ``chunk_size`` rows, so no output file exceeds that
        limit. A key holding fewer rows than the limit produces exactly one
        file. Afterwards ``manifest.json`` is written with the list of files,
        their row counts and sizes.

        Args:
            features_h5: Path to HDF5 file
            chunk_size: Maximum rows per CSV file (must be positive)

        Raises:
            ValueError: If ``chunk_size`` is not a positive row count

        Creates:
            features_chunk_000.csv, features_chunk_001.csv, etc.
            manifest.json
        """
        # chunk_size used to be accepted but ignored; validate it now that it
        # actually controls the slicing below.
        rows_per_file = int(chunk_size)
        if rows_per_file <= 0:
            raise ValueError(f"chunk_size must be a positive row count, got {chunk_size!r}")

        print("\n" + "="*60)
        print("EXPORTING FEATURES IN CHUNKS")
        print("="*60)
        print(f"Exporting to CSV with max {chunk_size:,} rows per file")

        chunk_counter = 0
        total_rows = 0
        file_list = []

        with pd.HDFStore(features_h5, mode='r') as store:
            for key in tqdm(store.keys(), desc="Exporting chunks"):
                # Load batch from HDF5
                batch = store[key]

                # max(..., 1) keeps the previous behavior for an empty batch:
                # it still produces one header-only file.
                for start in range(0, max(len(batch), 1), rows_per_file):
                    piece = batch.iloc[start:start + rows_per_file]

                    chunk_file = os.path.join(
                        self.output_dir,
                        f'features_chunk_{chunk_counter:03d}.csv'
                    )

                    # Save with compression to reduce file size.
                    # NOTE(review): the payload is gzip although the name ends
                    # in .csv, so readers that infer compression from the
                    # extension must be told compression='gzip' (the manifest
                    # records it). Renaming to .csv.gz would change the
                    # documented file pattern, so the name is left as is.
                    piece.to_csv(chunk_file, index=False, compression='gzip')

                    # Track file info
                    file_list.append({
                        'filename': os.path.basename(chunk_file),
                        'rows': len(piece),
                        'size_mb': os.path.getsize(chunk_file) / (1024*1024)
                    })

                    chunk_counter += 1
                    total_rows += len(piece)
                    del piece

                # Clean up before loading the next key
                del batch
                gc.collect()

        print(f"\n✓ Exported {total_rows:,} flows in {chunk_counter} chunks")

        # Create manifest file with export metadata
        manifest_file = os.path.join(self.output_dir, 'manifest.json')
        manifest = {
            'export_date': datetime.now().isoformat(),
            'total_rows': total_rows,
            'total_chunks': chunk_counter,
            'chunk_pattern': 'features_chunk_*.csv',
            'compression': 'gzip',
            'files': file_list,
            'source': {
                'pcap': Config.PCAP_FILE,
                'analysis_mode': Config.ANALYSIS_MODE,
                'chunk_size': Config.CHUNK_SIZE
            }
        }

        with open(manifest_file, 'w') as f:
            json.dump(manifest, f, indent=2)

        print(f"✓ Manifest saved to: {manifest_file}")
    
    def export_models(self, models):
        """
        Save trained ML models for future use.
        Models are serialized using pickle format.
        
        Args:
            models: Dictionary of trained model objects
        """
        print("\n" + "="*60)
        print("EXPORTING ML MODELS")
        print("="*60)
        
        for model_name, model in models.items():
            if model is not None:
                # Create filename
                model_path = os.path.join(self.output_dir, f'{model_name}.pkl')
                
                # Serialize model
                with open(model_path, 'wb') as f:
                    pickle.dump(model, f)
                
                # Get file size
                size_mb = os.path.getsize(model_path) / (1024*1024)
                print(f"✓ Saved {model_name}: {size_mb:.2f} MB")
    
    def generate_report(self, features_h5, ml_results, selected_features):
        """
        Write a human-readable text report of the analysis run.

        The report is saved as ``analysis_report.txt`` (UTF-8) in
        ``self.output_dir`` and covers input configuration, processing
        statistics, the label distribution returned by
        ``collect_statistics``, the selected features, model results, current
        memory usage and a few recommendations.

        Args:
            features_h5: Path to feature file
            ml_results: ML performance results; every value must provide an
                'accuracy' entry
            selected_features: List of selected features
        """
        report_path = os.path.join(self.output_dir, 'analysis_report.txt')

        # Collect statistics without loading all data
        summary = self.collect_statistics(features_h5)
        label_counts = summary.get('label_distribution')
        # Number of flows the label counts are based on; all label
        # percentages must use this as their denominator.
        labeled_total = sum(label_counts.values()) if label_counts else 0

        # The report contains non-ASCII markers (⭐, ⚠️, ✓). Fix the encoding to
        # UTF-8 so writing does not depend on the platform default encoding.
        with open(report_path, 'w', encoding='utf-8') as f:
            # Header
            f.write("="*70 + "\n")
            f.write("NETWORK PACKET ANALYSIS REPORT\n")
            f.write("Memory-Optimized Pipeline for CICIDS2017\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("="*70 + "\n\n")

            # Input Information
            f.write("INPUT CONFIGURATION\n")
            f.write("-"*40 + "\n")
            f.write(f"PCAP File: {os.path.basename(Config.PCAP_FILE)}\n")
            f.write(f"File Size: {os.path.getsize(Config.PCAP_FILE) / (1024**3):.2f} GB\n")
            f.write(f"Analysis Mode: {Config.ANALYSIS_MODE}\n")
            f.write(f"Deep Inspection: {Config.DEEP_INSPECTION}\n")
            f.write("\n")

            # Processing Statistics
            f.write("PROCESSING STATISTICS\n")
            f.write("-"*40 + "\n")
            f.write(f"Total Flows: {summary['total_flows']:,}\n")
            f.write(f"Processing Strategy: {'Disk-based' if Config.USE_DISK_CACHE else 'Memory-based'}\n")
            f.write(f"Max Memory Setting: {Config.MAX_MEMORY_GB:.1f} GB\n")
            f.write(f"Chunk Size: {Config.CHUNK_SIZE:,} packets\n")
            f.write(f"Batch Size: {Config.BATCH_SIZE:,} flows\n")
            f.write(f"Max Flows in Memory: {Config.MAX_FLOWS_IN_MEMORY:,}\n")
            f.write("\n")

            # Label Distribution (most frequent label first)
            if label_counts:
                f.write("ATTACK TYPE DISTRIBUTION\n")
                f.write("-"*40 + "\n")
                for label, count in sorted(label_counts.items(),
                                           key=lambda x: x[1], reverse=True):
                    pct = count / labeled_total * 100 if labeled_total > 0 else 0
                    f.write(f"  {label:30s}: {count:10,} ({pct:5.1f}%)\n")
                f.write(f"\nTotal Labeled Flows: {labeled_total:,}\n")
                f.write("\n")

            # Feature Selection
            if selected_features:
                f.write("FEATURE SELECTION\n")
                f.write("-"*40 + "\n")
                f.write(f"Features Analyzed: {summary.get('total_features', 'Unknown')}\n")
                f.write(f"Features Selected: {len(selected_features)}\n")
                f.write("\nTop 10 Selected Features:\n")
                for i, feature in enumerate(selected_features[:10], 1):
                    f.write(f"  {i:2d}. {feature}\n")
                f.write("\n")

            # ML Results
            if ml_results:
                f.write("MACHINE LEARNING RESULTS\n")
                f.write("-"*40 + "\n")

                # Find best model
                best_model = max(ml_results.items(), key=lambda x: x[1]['accuracy'])

                for model_name, results in ml_results.items():
                    is_best = model_name == best_model[0]
                    marker = " ⭐" if is_best else ""

                    f.write(f"\n{model_name.upper()}{marker}:\n")
                    f.write(f"  Test Accuracy: {results['accuracy']:.4f}\n")

                    if 'test_samples' in results:
                        f.write(f"  Test Samples: {results['test_samples']:,}\n")

                    if 'unique_predictions' in results:
                        f.write(f"  Unique Predictions: {results['unique_predictions']}\n")

                f.write(f"\nBest Model: {best_model[0]} (Accuracy: {best_model[1]['accuracy']:.4f})\n")
                f.write("\n")

            # Memory Usage
            mem = psutil.virtual_memory()
            f.write("MEMORY USAGE\n")
            f.write("-"*40 + "\n")
            f.write(f"Current Memory: {mem.percent:.1f}%\n")
            f.write(f"Available: {mem.available / (1024**3):.2f} GB\n")
            f.write(f"Used: {mem.used / (1024**3):.2f} GB\n")
            f.write(f"Total System: {mem.total / (1024**3):.2f} GB\n")
            f.write("\n")

            # Recommendations
            f.write("ANALYSIS INSIGHTS & RECOMMENDATIONS\n")
            f.write("-"*40 + "\n")

            # Check for class imbalance. The ratio is taken over the flows
            # that were actually counted for the label distribution; dividing
            # by the total flow count (as before) mixed a sampled numerator
            # with a full-dataset denominator and understated the ratio.
            if label_counts and labeled_total > 0:
                benign_ratio = label_counts.get('BENIGN', 0) / labeled_total
                if benign_ratio > 0.9:
                    f.write("⚠️  High class imbalance detected (>90% BENIGN)\n")
                    f.write("   Consider: Oversampling attacks or using weighted loss\n\n")

            # Check accuracy
            if ml_results:
                avg_accuracy = np.mean([r['accuracy'] for r in ml_results.values()])
                if avg_accuracy < 0.8:
                    f.write("⚠️  Low average accuracy (<80%)\n")
                    f.write("   Consider: Increasing features, tuning hyperparameters\n\n")
                elif avg_accuracy > 0.95:
                    f.write("✓ Excellent model performance (>95%)\n")
                    f.write("   Ready for deployment consideration\n\n")

            # Footer
            f.write("\n" + "="*70 + "\n")
            f.write("END OF REPORT\n")
            f.write("="*70 + "\n")

        print(f"\n✓ Report saved to: {report_path}")
    
    def collect_statistics(self, features_h5):
        """
        Summarize the HDF5 feature file without reading it completely.

        For every key the row count is taken from the storer. Label counts
        come from the first 1,000 rows of each key, and only for keys whose
        storer lists 'attack_type' among its data columns. The feature count
        is the column count of the first key that yields a non-zero value.

        Args:
            features_h5: Path to HDF5 file

        Returns:
            dict: 'total_flows' (int), 'total_features' (int) and
            'label_distribution' (Counter of attack_type values)
        """
        flow_total = 0
        feature_total = 0
        label_counts = Counter()

        with pd.HDFStore(features_h5, mode='r') as store:
            for key in store.keys():
                storer = store.get_storer(key)

                # Row count comes from the storer, no data is read for it
                flow_total += storer.nrows

                # Label counts are based on the leading rows of each key only
                if 'attack_type' in storer.attrs.data_columns:
                    head = store.select(key, columns=['attack_type'], stop=1000)
                    if 'attack_type' in head.columns:
                        label_counts.update(head['attack_type'].value_counts().to_dict())

                # Determine the feature count once, from a single row
                if feature_total == 0:
                    feature_total = len(store.select(key, stop=1).columns)

        return {
            'total_flows': flow_total,
            'total_features': feature_total,
            'label_distribution': label_counts
        }
    
    def export_all(self, features_h5, models, ml_results, selected_features):
        """
        Run every export step in order: features, models, then the report.

        Features are exported only when ``Config.ML_EXPORT`` is truthy and
        models only when ``models`` is non-empty; the report is always
        generated.

        Args:
            features_h5: Path to feature file
            models: Trained ML models
            ml_results: Model performance results
            selected_features: Selected feature names
        """
        banner = "=" * 70
        print("\n" + banner)
        print("EXPORTING ALL RESULTS")
        print(banner)

        # Feature export is optional and controlled by configuration
        if Config.ML_EXPORT:
            self.export_features_chunked(features_h5)

        # Skip model export when there is nothing to save
        if models:
            self.export_models(models)

        self.generate_report(features_h5, ml_results, selected_features)

        print(f"\n✓ All results exported to: {self.output_dir}")
        summary_lines = (
            "  Files created:",
            "    - features_chunk_*.csv (data)",
            "    - *.pkl (models)",
            "    - analysis_report.txt (summary)",
            "    - manifest.json (metadata)",
            "    - dashboard.html (visualizations)",
        )
        for line in summary_lines:
            print(line)

# ### Step 11: Main Execution Function

In [None]:
# Cell 11: Main Pipeline - Complete Analysis Orchestration
"""
PURPOSE: Orchestrate the entire analysis pipeline from start to finish
This is the main execution function that coordinates all components.

WHAT THIS CELL DOES:
1. Validates configuration and checks system resources
2. Extracts features from PCAP file
3. Matches with CICIDS labels if available
4. Analyzes and selects best features
5. Trains ML models incrementally
6. Generates visualizations
7. Exports all results
8. Cleans up temporary files

PIPELINE FLOW:
Config → Extract → Label → Analyze → Train → Visualize → Export → Cleanup

ERROR HANDLING:
- Saves partial results on failure
- Cleans up temporary files
- Provides detailed error messages
- Suggests fixes for common issues

MEMORY MANAGEMENT:
- Monitors memory throughout execution
- Switches strategies based on usage
- Cleans up after each phase
- Reports memory statistics
"""

def _remove_pipeline_temp_files(temp_files, verbose=False):
    """
    Best-effort deletion of the pipeline's intermediate files.

    Removal failures are printed and never raised, so cleanup cannot mask
    the outcome of the pipeline run itself.

    Args:
        temp_files: Iterable of file paths. Falsy entries and paths that no
            longer exist are skipped.
        verbose: When True, also print a line for each file removed.
    """
    for temp_file in temp_files:
        if not temp_file or not os.path.exists(temp_file):
            continue
        try:
            os.remove(temp_file)
        except OSError as e:
            print(f"  Failed to remove {temp_file}: {e}")
        else:
            if verbose:
                print(f"  Removed: {os.path.basename(temp_file)}")


def run_memory_optimized_pipeline():
    """
    Main execution function for the complete analysis pipeline.
    Orchestrates all components while managing memory efficiently.

    Phases, in order: feature extraction, label matching (only when
    Config.USE_CICIDS_LABELS and Config.LABEL_FILES are set), feature
    analysis, ML training (unless Config.SKIP_ML), visualization (only when
    Config.GENERATE_VISUALS is set and ML ran), export, and temp-file
    cleanup (only when Config.CLEANUP_TEMP is set).

    If any phase raises, the current feature file is copied to
    'emergency_features.h5' in Config.OUTPUT_DIR, temporary files are
    removed, and a failure summary is returned instead of re-raising. When
    that emergency copy cannot be written, the feature file itself is left
    on disk so the extracted features are not lost.

    Returns:
        dict: Results dictionary with paths and metrics. results['summary']
            has 'success' set to True or False; on failure it also carries
            'error' and 'partial_results' ('emergency_features.h5' only if
            the emergency copy was actually written, otherwise None).
        None: If the configuration check fails or no features are extracted.
    """
    print("\n" + "="*70)
    print("STARTING MEMORY-OPTIMIZED NETWORK ANALYSIS PIPELINE")
    print("="*70)
    print("Pipeline Version: 2.0 (Memory-Optimized)")
    print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Check if ML should be skipped
    if Config.SKIP_ML:
        print("\nML TRAINING WILL BE SKIPPED (Test Mode)")

    # Check configuration and memory
    strategy = Config.check_memory_requirements()

    if strategy is False:
        print("Error: Invalid PCAP file or configuration")
        return None

    print(f"\nProcessing Strategy: {strategy}")
    print(f"Memory Limit: {Config.MAX_MEMORY_GB:.1f} GB")

    # Initialize results tracking
    results = {
        'start_time': datetime.now(),
        'strategy': strategy,
        'temp_files': []
    }

    try:
        # ========== STEP 1: FEATURE EXTRACTION ==========
        print("\n" + "="*70)
        print("STEP 1: STREAMING FEATURE EXTRACTION")
        print("="*70)
        print("Extracting features from network packets...")

        pipeline = MemoryOptimizedFeaturePipeline()
        features_h5 = pipeline.extract_all_features(
            Config.PCAP_FILE,
            mode=Config.ANALYSIS_MODE
        )

        if not features_h5:
            print("Error: No features extracted")
            return None

        results['temp_files'].append(features_h5)
        results['features_h5'] = features_h5

        # Memory checkpoint
        monitor_memory("After feature extraction")

        # ========== STEP 2: LABEL MATCHING ==========
        if Config.USE_CICIDS_LABELS and Config.LABEL_FILES:
            print("\n" + "="*70)
            print("STEP 2: INCREMENTAL LABEL MATCHING")
            print("="*70)
            print("Matching flows with CICIDS ground truth labels...")

            label_matcher = MemoryOptimizedLabelMatcher()
            labeled_h5 = label_matcher.process_features_with_labels(
                features_h5, Config.LABEL_FILES
            )

            # Later phases work on the labeled file from here on.
            features_h5 = labeled_h5
            results['temp_files'].append(labeled_h5)
            results['features_h5'] = labeled_h5

            monitor_memory("After label matching")
        else:
            print("\nStep 2: Skipping label matching (no labels provided)")

        # ========== STEP 3: FEATURE ANALYSIS ==========
        print("\n" + "="*70)
        print("STEP 3: INCREMENTAL FEATURE ANALYSIS")
        print("="*70)
        print("Analyzing feature importance and selecting best features...")

        analyzer = MemoryOptimizedFeatureAnalyzer()
        selected_features = analyzer.analyze_features_incrementally(features_h5)
        results['selected_features'] = selected_features

        # Get feature insights
        insights = analyzer.get_feature_insights()
        results['feature_insights'] = insights

        monitor_memory("After feature analysis")

        # ========== STEP 4: MACHINE LEARNING ==========
        if not Config.SKIP_ML:
            print("\n" + "="*70)
            print("STEP 4: INCREMENTAL MACHINE LEARNING")
            print("="*70)
            print("Training ML models with incremental learning...")

            ml_pipeline = MemoryOptimizedMLPipeline()
            models, ml_results = ml_pipeline.train_models_incrementally(
                features_h5, selected_features
            )

            results['models'] = models
            results['ml_results'] = ml_results

            monitor_memory("After ML training")
        else:
            print("\n" + "="*70)
            print("STEP 4: SKIPPING ML TRAINING (Test Mode)")
            print("="*70)
            print("ML training skipped for pipeline validation")
            print("Feature extraction and labeling completed successfully")
            # Empty placeholders keep the export and summary code uniform.
            models = {}
            ml_results = {}
            results['models'] = models
            results['ml_results'] = ml_results

        # ========== STEP 5: VISUALIZATION ==========
        if Config.GENERATE_VISUALS and not Config.SKIP_ML:
            print("\n" + "="*70)
            print("STEP 5: SAMPLED VISUALIZATIONS")
            print("="*70)
            print("Creating interactive visualizations...")

            visualizer = MemoryOptimizedVisualizer(
                features_h5, ml_results, Config.OUTPUT_DIR
            )
            visualizer.create_all_visualizations()

            monitor_memory("After visualization")
        elif Config.SKIP_ML:
            print("\nStep 5: Skipping visualizations (ML was skipped)")
        else:
            print("\nStep 5: Skipping visualizations (disabled)")

        # ========== STEP 6: EXPORT RESULTS ==========
        print("\n" + "="*70)
        print("STEP 6: CHUNKED EXPORT")
        print("="*70)
        print("Exporting all results to disk...")

        exporter = MemoryOptimizedResultExporter(Config.OUTPUT_DIR)
        exporter.export_all(features_h5, models, ml_results, selected_features)

        monitor_memory("After export")

        # ========== STEP 7: CLEANUP ==========
        if Config.CLEANUP_TEMP:
            print("\n" + "="*70)
            print("STEP 7: CLEANUP")
            print("="*70)
            print("Removing temporary files...")

            _remove_pipeline_temp_files(results['temp_files'], verbose=True)

        # ========== FINAL SUMMARY ==========
        print("\n" + "="*70)
        print("PIPELINE COMPLETE")
        print("="*70)

        # Calculate execution time
        results['end_time'] = datetime.now()
        duration = (results['end_time'] - results['start_time']).total_seconds() / 60

        print("Analysis completed successfully")
        print(f"  Duration: {duration:.1f} minutes")
        print(f"  Output Directory: {Config.OUTPUT_DIR}")

        # Memory summary
        # NOTE: "Peak Usage" shows the configured limit, not a measured peak.
        mem = psutil.virtual_memory()
        print("\nMemory Summary:")
        print(f"  Peak Usage: ~{Config.MAX_MEMORY_GB:.1f} GB (estimated)")
        print(f"  Current Usage: {mem.percent:.1f}%")
        print(f"  Strategy: {strategy}")

        # ML summary
        if ml_results and not Config.SKIP_ML:
            best_model = max(ml_results.items(), key=lambda x: x[1]['accuracy'])
            print(f"\nBest Model: {best_model[0]}")
            print(f"  Accuracy: {best_model[1]['accuracy']:.4f}")
        elif Config.SKIP_ML:
            print("\nML Training: Skipped (Test Mode)")
            print(f"  Features extracted: {len(selected_features)}")

        # Create success summary
        results['summary'] = {
            'success': True,
            'duration_minutes': duration,
            'strategy': strategy,
            'memory_limit_gb': Config.MAX_MEMORY_GB,
            'best_accuracy': max(r['accuracy'] for r in ml_results.values()) if ml_results else 0,
            'features_selected': len(selected_features),
            'output_dir': Config.OUTPUT_DIR,
            'ml_skipped': Config.SKIP_ML
        }

        print("\nPipeline execution successful!")
        print(f"   View results in: {Config.OUTPUT_DIR}")
        if not Config.SKIP_ML:
            print("   Open dashboard.html for visualizations")

        return results

    except Exception as e:
        # Top-level boundary: report the failure, salvage what we can, and
        # hand back an error summary instead of propagating.
        import shutil
        import traceback

        print(f"\nPipeline Error: {e}")
        traceback.print_exc()

        # Try to save partial results
        print("\nAttempting to save partial results...")
        features_path = results.get('features_h5')
        emergency_saved = False
        if features_path:
            try:
                # The output directory may not exist yet if the failure
                # happened before the export phase.
                os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
                emergency_file = os.path.join(Config.OUTPUT_DIR, 'emergency_features.h5')
                shutil.copy2(features_path, emergency_file)
                print(f"Partial results saved to: {emergency_file}")
                emergency_saved = True
            except Exception as save_error:
                print(f"Could not save partial results: {save_error}")

        # Cleanup on error
        print("\nCleaning up after error...")
        removable = list(results.get('temp_files', []))
        if features_path and not emergency_saved:
            # The emergency copy failed, so the temp feature file is the only
            # copy of the extracted features - leave it on disk.
            removable = [path for path in removable if path != features_path]
            if os.path.exists(features_path):
                print(f"  Keeping feature file: {features_path}")
        _remove_pipeline_temp_files(removable)

        # Error summary: only advertise the emergency file if it was written.
        results['summary'] = {
            'success': False,
            'error': str(e),
            'partial_results': 'emergency_features.h5' if emergency_saved else None
        }

        return results


def monitor_memory(checkpoint_name=""):
    """
    Monitor and display current memory usage.
    Helps track memory consumption throughout pipeline.

    Prints used and available system memory as reported by
    psutil.virtual_memory(), followed by a WARNING above 85% usage or a
    CRITICAL message above 95% usage.

    Args:
        checkpoint_name: Description of current pipeline stage
    """
    mem = psutil.virtual_memory()
    print(f"\nMemory Status {checkpoint_name}:")
    print(f"   Used: {mem.used / (1024**3):.2f} GB ({mem.percent:.1f}%)")
    print(f"   Available: {mem.available / (1024**3):.2f} GB")

    # Test the stricter threshold first: with the 85% check in front, the
    # 95% branch could never be reached.
    if mem.percent > 95:
        print("   CRITICAL: Very high memory usage!")
        print("      Pipeline may fail - reduce settings immediately")
    elif mem.percent > 85:
        print("   WARNING: High memory usage detected!")
        print("      Consider reducing chunk/batch sizes")


# ========== EXECUTION CELL ==========
# Banner: the pipeline definitions above are loaded and ready to use.
print(
    "\n" + "=" * 70,
    "READY TO RUN MEMORY-OPTIMIZED ANALYSIS",
    "=" * 70,
    sep="\n",
)

# Workloads this pipeline is aimed at.
print(
    "\nThis pipeline is optimized for:",
    "- Large PCAP files (10+ GB)",
    "- CICIDS2017 dataset",
    "- Limited RAM systems",
    "- Incremental processing",
    sep="\n",
)

# Pre-flight checklist for the user.
print(
    "\n" + "-" * 40,
    "INSTRUCTIONS:",
    "-" * 40,
    "1. Ensure configuration is complete (Cell 2)",
    "2. Verify PCAP file path is correct",
    "3. Check available disk space for temp files",
    "4. Run: results = run_memory_optimized_pipeline()",
    "5. Monitor memory with: monitor_memory()",
    sep="\n",
)

# Copy-paste snippets; the empty string yields the blank separator line.
print(
    "\n" + "-" * 40,
    "QUICK START:",
    "-" * 40,
    "# To run the analysis, uncomment and execute:",
    "# results = run_memory_optimized_pipeline()",
    "",
    "# To check memory anytime:",
    "# monitor_memory()",
    sep="\n",
)

# Snapshot of the machine: memory first, then disk.
print(
    "\n" + "-" * 40,
    "CURRENT SYSTEM STATUS:",
    "-" * 40,
    sep="\n",
)
monitor_memory()

disk_usage = psutil.disk_usage('/')
print(
    "\nDisk Space:",
    f"   Available: {disk_usage.free / (1024**3):.2f} GB",
    f"   Used: {disk_usage.percent:.1f}%",
    sep="\n",
)

# Remind the user when the ML phase is switched off in the configuration.
if Config.SKIP_ML:
    print(
        "\nML Training is currently DISABLED",
        "   Uncheck 'Skip ML Training' in config to enable",
        sep="\n",
    )

print(
    "\nReady to start analysis!",
    "   Run: results = run_memory_optimized_pipeline()",
    sep="\n",
)

# Uncomment to run:
# results = run_memory_optimized_pipeline()

# ### Step 12: Run the Program

In [None]:
# Run the full pipeline. `results` is the results dictionary returned by
# run_memory_optimized_pipeline(), or None when the run stops early
# (invalid configuration or no features extracted).
results = run_memory_optimized_pipeline()