# # Network Traffic Analysis Pipeline for CICIDS2017
# ## Comprehensive Feature Extraction and Machine Learning Framework
# 
# This notebook integrates three analysis approaches:
# 1. Statistical flow analysis
# 2. Semantic content analysis  
# 3. Visual pattern generation
#
# Output includes both human-readable reports and ML-ready datasets.


# ### Step 1: Environment Setup and Dependency Installation

# In [None]:
"""
Cell 1: Install Required Dependencies
Run this cell first to ensure all required packages are installed
"""

import subprocess
import sys

def install_packages(packages=None):
    """Install required packages that cannot be imported yet.

    Each package is probed with an import; only packages whose import fails
    are installed with ``pip`` using the running interpreter.

    Args:
        packages: Optional iterable of pip distribution names to check.
            When omitted, the notebook's default dependency list is used.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    if packages is None:
        packages = [
            'pandas', 'numpy', 'scapy', 'tqdm', 'matplotlib', 'seaborn',
            'plotly', 'scikit-learn', 'psutil', 'tldextract', 'networkx',
            'xgboost', 'lightgbm', 'h5py', 'ipywidgets'
        ]

    # The pip distribution name is not always the importable module name;
    # probing with the pip name would report the package as missing forever.
    import_names = {'scikit-learn': 'sklearn'}

    for package in packages:
        module_name = import_names.get(package, package)
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            print(f"✓ {package} installed successfully")
        else:
            print(f"✓ {package} already installed")

# Run the dependency check as soon as this cell executes
install_packages()


# ### Step 2: Import Libraries and Load Analysis Components

# In [None]:
"""
Cell 2: Import all necessary libraries and custom analysis modules
"""

import os
import time
import json
import warnings
import gc
from datetime import datetime
from pathlib import Path
from collections import defaultdict, Counter
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import h5py

from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif, SelectKBest
import xgboost as xgb
import lightgbm as lgb

# Install a global "ignore" filter so no warnings are shown in notebook output
warnings.filterwarnings('ignore')

# Set visualization style
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need Matplotlib >= 3.6 — confirm
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("All libraries imported successfully")



# ### Step 3: User Input Configuration

# In [None]:
"""
Cell 3: Interactive user input for file paths and configuration
This cell creates user-friendly widgets for configuration
"""

class ConfigurationManager:
    """Interactive ipywidgets form that collects and validates run settings.

    Constructing an instance renders the form immediately. Clicking
    "Start Analysis" runs :meth:`validate_and_save`, which replaces
    ``self.config`` when the inputs pass validation and leaves it unchanged
    otherwise.
    """

    def __init__(self):
        self.config = {}
        self.setup_ui()

    def setup_ui(self):
        """Create interactive UI elements for configuration and display them."""

        # File input widgets
        self.pcap_input = widgets.Text(
            placeholder='Enter PCAP file path',
            description='PCAP File:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='600px')
        )

        self.output_dir = widgets.Text(
            placeholder='Enter output directory path',
            description='Output Dir:',
            value='./network_analysis_output',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='600px')
        )

        # Analysis options
        self.deep_inspection = widgets.Checkbox(
            value=True,
            description='Enable Deep Packet Inspection',
            indent=False
        )

        self.generate_visuals = widgets.Checkbox(
            value=True,
            description='Generate Visualizations',
            indent=False
        )

        self.chunk_size = widgets.IntSlider(
            value=10000,
            min=1000,
            max=100000,
            step=1000,
            description='Chunk Size:',
            style={'description_width': 'initial'}
        )

        # ML options
        self.ml_approach = widgets.RadioButtons(
            options=['XGBoost (Fast)', 'LightGBM (Balanced)', 'Deep Learning (CNN+LSTM)', 'Ensemble (All)'],
            value='XGBoost (Fast)',
            description='ML Model:',
            style={'description_width': 'initial'}
        )

        # Process button
        self.process_btn = widgets.Button(
            description='Start Analysis',
            button_style='primary',
            layout=widgets.Layout(width='200px', height='40px')
        )
        self.process_btn.on_click(self.validate_and_save)

        # Status output
        self.status_output = widgets.Output()

        # Display UI
        display(HTML("<h3>Network Traffic Analysis Configuration</h3>"))
        display(widgets.VBox([
            widgets.HTML("<b>Input/Output Settings:</b>"),
            self.pcap_input,
            self.output_dir,
            widgets.HTML("<br><b>Analysis Options:</b>"),
            self.deep_inspection,
            self.generate_visuals,
            self.chunk_size,
            widgets.HTML("<br><b>Machine Learning Configuration:</b>"),
            self.ml_approach,
            widgets.HTML("<br>"),
            self.process_btn,
            self.status_output
        ]))

    @staticmethod
    def _check_pcap_path(pcap_path):
        """Return an error message for an unusable PCAP path, or None if usable."""
        if not pcap_path:
            return "Please enter a PCAP file path"
        if not os.path.exists(pcap_path):
            return f"PCAP file not found: {pcap_path}"
        # A directory exists but cannot be read as a capture file
        if not os.path.isfile(pcap_path):
            return f"PCAP path is not a file: {pcap_path}"
        return None

    def validate_and_save(self, b):
        """Validate the form inputs and store them in ``self.config``.

        All output is printed into the form's status area. On the first
        validation failure a message is printed and the method returns
        without touching ``self.config``.

        Args:
            b: The clicked button, as passed by the ``on_click`` callback;
                it is not used.
        """
        with self.status_output:
            clear_output()

            # Surrounding whitespace is a common copy/paste artefact
            pcap_path = self.pcap_input.value.strip()
            output_dir = self.output_dir.value.strip()

            # Validate PCAP file
            problem = self._check_pcap_path(pcap_path)
            if problem is not None:
                print(problem)
                return

            # os.makedirs('') raises, so reject an empty directory explicitly
            if not output_dir:
                print("Please enter an output directory path")
                return

            # Create output directory
            os.makedirs(output_dir, exist_ok=True)

            # Save configuration
            self.config = {
                'pcap_file': pcap_path,
                'output_dir': output_dir,
                'deep_inspection': self.deep_inspection.value,
                'generate_visuals': self.generate_visuals.value,
                'chunk_size': self.chunk_size.value,
                'ml_approach': self.ml_approach.value
            }

            # Display configuration summary
            print("✓ Configuration saved successfully!\n")
            print("Configuration Summary:")
            print("-" * 50)
            for key, value in self.config.items():
                print(f"{key}: {value}")
            print("-" * 50)

            # Calculate estimated processing time
            file_size_mb = os.path.getsize(self.config['pcap_file']) / (1024**2)
            estimated_time = file_size_mb * 0.5  # Rough estimate: 2MB/sec
            print(f"\nFile size: {file_size_mb:.2f} MB")
            print(f"Estimated processing time: {estimated_time:.1f} seconds")

# Instantiate the manager; its constructor builds and displays the configuration form
config_manager = ConfigurationManager()

# ### Step 4: Unified Feature Extraction Pipeline

# In [None]:
"""
Cell 4: Unified feature extraction combining all three approaches
This integrates flow analysis, semantic analysis, and visual generation
"""

class UnifiedFeatureExtractor:
    """
    Combines the flow, semantic and visual feature steps into one pipeline
    and reports progress through ipywidgets.

    NOTE: every ``_extract_*`` / ``_generate_*`` step in this class currently
    produces randomly generated placeholder data; the configured PCAP file is
    not read here.
    """

    # Number of placeholder rows produced by the flow and semantic steps
    N_SYNTHETIC_FLOWS = 1000
    # Upper bound on generated heatmaps and the (height, width, channels) of each
    MAX_VISUAL_SAMPLES = 100
    HEATMAP_SHAPE = (256, 256, 3)

    def __init__(self, config):
        """Store the configuration and create the progress widgets.

        Args:
            config: Mapping of run settings; ``config['generate_visuals']``
                is read by :meth:`extract_all_features`.
        """
        self.config = config
        self.features_data = []
        self.visual_data = []
        self.metadata = []

        # Initialize progress bar
        self.progress = widgets.IntProgress(
            value=0, min=0, max=100,
            description='Processing:',
            bar_style='info',
            style={'bar_color': 'maroon'},
            orientation='horizontal'
        )

        # Status text
        self.status = widgets.HTML(value='Initializing...')

    def extract_all_features(self):
        """
        Run every extraction step and return the consolidated dataset.

        Returns:
            pandas.DataFrame with the flow and semantic feature columns plus
            a ``label`` column. Visual data, when enabled, is kept on
            ``self.visual_data`` rather than returned.
        """
        display(widgets.VBox([self.progress, self.status]))

        # Import the custom extractors (assuming they're in the same directory)
        # In practice, you'd have these as separate .py files imported here

        self.status.value = "Loading PCAP file..."
        self.progress.value = 10

        # Simulated extraction (replace with actual extractor calls)
        flow_features = self._extract_flow_features()
        self.progress.value = 40

        semantic_features = self._extract_semantic_features()
        self.progress.value = 70

        if self.config['generate_visuals']:
            # The result is stored on self.visual_data by the call itself
            self._generate_visual_features()
            self.progress.value = 90

        # Combine all features
        combined_df = self._combine_features(flow_features, semantic_features)
        self.progress.value = 100
        self.status.value = "✓ Feature extraction complete!"

        return combined_df

    def _extract_flow_features(self):
        """Return a DataFrame of simulated statistical flow features."""
        # This would call your Network_TrafficFlow_Feature_Extractor
        # For demonstration, creating sample data

        self.status.value = "Extracting flow features..."
        n = self.N_SYNTHETIC_FLOWS

        # Simulated flow features
        flow_data = {
            'duration': np.random.exponential(5, n),
            'packets_per_second': np.random.gamma(2, 2, n),
            'bytes_per_second': np.random.gamma(3, 100, n),
            'avg_packet_size': np.random.normal(500, 200, n),
            'flow_direction_ratio': np.random.beta(2, 5, n),
            'syn_flag_count': np.random.poisson(3, n),
            'rst_flag_count': np.random.poisson(1, n),
            'packet_size_variance': np.random.exponential(1000, n)
        }

        return pd.DataFrame(flow_data)

    def _extract_semantic_features(self):
        """Return a DataFrame of simulated semantic content features."""
        # This would call your Network_Semantic_Feature_Extractor

        self.status.value = "Extracting semantic features..."
        n = self.N_SYNTHETIC_FLOWS

        # Simulated semantic features
        semantic_data = {
            'domain_entropy': np.random.normal(3, 1, n),
            'dga_probability': np.random.beta(2, 8, n),
            'suspicious_keywords': np.random.poisson(2, n),
            'command_keywords': np.random.poisson(0.5, n),
            'obfuscation_indicators': np.random.poisson(1, n),
            'query_frequency': np.random.exponential(10, n),
            'failed_queries': np.random.poisson(0.3, n),
            'user_agent_entropy': np.random.normal(4, 1, n),
            'encryption_indicators': np.random.poisson(0.2, n)
        }

        return pd.DataFrame(semantic_data)

    def _generate_visual_features(self):
        """Generate random placeholder heatmaps and store them on the instance.

        Returns:
            numpy array of shape ``(num_samples, *HEATMAP_SHAPE)``, also
            assigned to ``self.visual_data``. ``num_samples`` is capped at
            ``MAX_VISUAL_SAMPLES`` and falls back to that cap when
            ``self.features_data`` is empty.
        """
        # This would call your visual_nlp_system

        self.status.value = "Generating visual representations..."

        # len() instead of truthiness so an array-like features_data also works
        n_available = len(self.features_data)
        if n_available:
            num_samples = min(self.MAX_VISUAL_SAMPLES, n_available)
        else:
            num_samples = self.MAX_VISUAL_SAMPLES

        # One allocation instead of a Python list that is then copied into an array
        self.visual_data = np.random.rand(num_samples, *self.HEATMAP_SHAPE)
        return self.visual_data

    def _combine_features(self, flow_df, semantic_df):
        """Concatenate both feature frames column-wise and add a ``label`` column.

        Args:
            flow_df: DataFrame of flow features.
            semantic_df: DataFrame of semantic features.

        Returns:
            The combined DataFrame with randomly drawn placeholder labels.
        """

        self.status.value = "Combining features..."

        # Combine DataFrames
        combined_df = pd.concat([flow_df, semantic_df], axis=1)

        # Add synthetic labels for demonstration
        # In practice, these would come from CICIDS2017 labels
        label_distribution = [0.8, 0.05, 0.05, 0.03, 0.03, 0.02, 0.02]
        labels = ['Benign', 'DoS', 'PortScan', 'WebAttack', 'Botnet', 'Infiltration', 'BruteForce']

        combined_df['label'] = np.random.choice(
            labels,
            size=len(combined_df),
            p=label_distribution
        )

        return combined_df

# ### Step 5: Feature Analysis and Selection

# In [None]:
"""
Cell 5: Feature importance analysis and selection
Identifies the most relevant features for machine learning
"""

class FeatureAnalyzer:
    """
    Analyzes extracted features and ranks them by mutual information with
    the ``label`` column. Prints a text report and draws a dashboard.
    """

    def __init__(self, features_df):
        """
        Args:
            features_df: DataFrame of feature columns plus a ``label`` column.
        """
        self.features_df = features_df
        self.feature_importance = {}

    def analyze_features(self):
        """Run the full analysis and return the selected feature names.

        Returns:
            List of feature names as produced by :meth:`get_selected_features`
            with its default ``top_k``.
        """

        print("=" * 70)
        print("FEATURE ANALYSIS REPORT")
        print("=" * 70)

        # Basic statistics
        self._print_basic_stats()

        # Correlation analysis
        self._analyze_correlations()

        # Feature importance using mutual information
        self._calculate_feature_importance()

        # Generate visualizations
        self._create_feature_visualizations()

        return self.get_selected_features()

    def _print_basic_stats(self):
        """Print sample/feature counts, class distribution and missing values."""
        total_rows = len(self.features_df)
        print("\nDataset Statistics:")
        print(f"Total samples: {total_rows:,}")
        print(f"Total features: {len(self.features_df.columns) - 1}")  # Excluding label

        # Class distribution
        print("\n Class Distribution:")
        class_counts = self.features_df['label'].value_counts()
        for label, count in class_counts.items():
            percentage = (count / total_rows) * 100
            print(f"  {label}: {count:,} ({percentage:.1f}%)")

        # Check for missing values
        missing = self.features_df.isnull().sum()
        if missing.any():
            print("\n Missing values detected:")
            print(missing[missing > 0])

    def _analyze_correlations(self):
        """Print the five most strongly correlated numeric feature pairs (|r| > 0.9)."""

        # Calculate correlation matrix
        numeric_df = self.features_df.select_dtypes(include=[np.number])
        corr_matrix = numeric_df.corr()
        corr_values = corr_matrix.to_numpy()
        names = corr_matrix.columns

        # Strict upper triangle so each pair is visited once; NaN correlations
        # (e.g. constant columns) compare False and are skipped
        strong_pairs = np.triu(np.abs(corr_values) > 0.9, k=1)
        high_corr = [
            {
                'feature1': names[i],
                'feature2': names[j],
                'correlation': corr_values[i, j]
            }
            for i, j in zip(*np.nonzero(strong_pairs))
        ]

        # Strongest first, so the truncated listing really is the top five
        high_corr.sort(key=lambda pair: abs(pair['correlation']), reverse=True)

        if high_corr:
            print("\n Highly correlated features (>0.9):")
            for pair in high_corr[:5]:  # Show top 5
                print(f"  {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")

    def _calculate_feature_importance(self):
        """Score every feature by mutual information and print the top ten.

        The scores are stored in ``self.feature_importance`` keyed by column
        name.
        """

        print("\n Calculating feature importance...")

        # Prepare data; infinities and NaNs are zero-filled the same way
        # MLPipeline.prepare_data does, since the estimator rejects them
        X = self.features_df.drop('label', axis=1)
        X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
        y = LabelEncoder().fit_transform(self.features_df['label'])

        # Fixed seed: the estimator is randomized, and an unseeded run can
        # reorder (and therefore reselect) features between executions
        mi_scores = mutual_info_classif(X, y, random_state=42)

        # Store importance scores
        self.feature_importance = dict(zip(X.columns, mi_scores))

        # Print top features
        print("\n Top 10 Most Important Features:")
        sorted_features = sorted(self.feature_importance.items(),
                               key=lambda x: x[1], reverse=True)

        for i, (feature, score) in enumerate(sorted_features[:10], 1):
            print(f"  {i}. {feature}: {score:.4f}")

    def _create_feature_visualizations(self):
        """Draw the 2x2 dashboard: importance, class mix, correlations, distribution."""

        top_features = sorted(self.feature_importance.items(),
                            key=lambda x: x[1], reverse=True)[:10]
        if not top_features:
            # Nothing to unpack or plot until importance scores exist
            print("\n No feature importance scores available; skipping visualizations")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Feature Analysis Dashboard', fontsize=16)

        # 1. Feature importance bar chart
        ax1 = axes[0, 0]
        features, scores = zip(*top_features)

        ax1.barh(range(len(features)), scores, color='skyblue')
        ax1.set_yticks(range(len(features)))
        ax1.set_yticklabels(features)
        ax1.set_xlabel('Importance Score')
        ax1.set_title('Top 10 Feature Importance')
        ax1.invert_yaxis()

        # 2. Class distribution pie chart
        ax2 = axes[0, 1]
        class_counts = self.features_df['label'].value_counts()
        ax2.pie(class_counts.values, labels=class_counts.index,
                autopct='%1.1f%%', startangle=90)
        ax2.set_title('Class Distribution')

        # 3. Feature correlation heatmap (top features)
        ax3 = axes[1, 0]
        corr_subset = self.features_df[list(features)].corr()

        sns.heatmap(corr_subset, annot=True, fmt='.2f',
                   cmap='coolwarm', center=0, ax=ax3,
                   cbar_kws={'shrink': 0.8})
        ax3.set_title('Top Features Correlation Matrix')

        # 4. Feature distribution comparison
        ax4 = axes[1, 1]
        feature_to_plot = features[0]  # Most important feature

        for label in self.features_df['label'].unique():
            subset = self.features_df[self.features_df['label'] == label][feature_to_plot]
            ax4.hist(subset, alpha=0.5, label=label, bins=20)

        ax4.set_xlabel(feature_to_plot)
        ax4.set_ylabel('Frequency')
        ax4.set_title(f'{feature_to_plot} Distribution by Class')
        ax4.legend()

        plt.tight_layout()
        plt.show()

    def get_selected_features(self, top_k=20):
        """Return the names of the ``top_k`` highest-scoring features.

        Args:
            top_k: Maximum number of feature names to return.

        Returns:
            List of feature names ordered from most to least important;
            empty if no importance scores have been computed.
        """
        sorted_features = sorted(self.feature_importance.items(),
                               key=lambda x: x[1], reverse=True)
        return [f for f, _ in sorted_features[:top_k]]


# ### Step 6: Machine Learning Pipeline

# In [None]:
"""
Cell 6: Machine Learning model training and evaluation
Implements multiple ML approaches based on user selection
"""

class MLPipeline:
    """
    Machine learning pipeline covering data preparation, XGBoost / LightGBM
    training, and text and plot based evaluation.

    Trained models are kept in ``self.models`` and their metrics and
    predictions in ``self.results``, both keyed by model name.
    """

    def __init__(self, features_df, visual_data, config):
        """
        Args:
            features_df: DataFrame of feature columns plus a ``label`` column.
            visual_data: Visual representations; stored but not used by the
                methods of this class.
            config: Run configuration; stored for later use.
        """
        self.features_df = features_df
        self.visual_data = visual_data
        self.config = config
        self.models = {}
        self.results = {}
        # Filled by prepare_data so evaluation can name features and classes
        self.selected_features = None
        self.label_encoder = None

    def _label_names(self):
        """Return class names ordered like LabelEncoder's integer codes (sorted)."""
        return sorted(self.features_df['label'].unique())

    def prepare_data(self, selected_features):
        """Clean, split and scale the data for model training.

        Args:
            selected_features: Column names to use as model inputs, in the
                order the models will see them.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test, scaler)``; the feature
            arrays are scaled and ``scaler`` is the fitted ``RobustScaler``.
        """

        print("Preparing data for machine learning...")

        # Remember the column order so importances can be labelled later
        self.selected_features = list(selected_features)

        # Separate features and labels
        X = self.features_df[self.selected_features]
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(self.features_df['label'])

        # Handle missing values and infinities
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(0)

        # Split data
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Fit the scaler on the training rows only so no test-set statistics
        # leak into training
        scaler = RobustScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        print(f"✓ Training set: {X_train.shape}")
        print(f"✓ Test set: {X_test.shape}")

        return X_train, X_test, y_train, y_test, scaler

    def train_xgboost(self, X_train, X_test, y_train, y_test):
        """Train an XGBoost classifier with balanced sample weights.

        NOTE(review): the test split doubles as the early-stopping set, so
        the reported accuracy is optimistic; consider a separate validation
        split.

        Returns:
            The fitted ``xgb.XGBClassifier``; it is also stored under
            ``'xgboost'`` in ``self.models`` and ``self.results``.
        """

        print("\n Training XGBoost model...")

        # Calculate class weights for imbalanced data
        from sklearn.utils.class_weight import compute_class_weight
        classes = np.unique(y_train)
        class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

        # Look weights up by label value, not by position: if a class is
        # absent from y_train, positional indexing would misassign weights
        weight_by_class = dict(zip(classes, class_weights))
        sample_weights = np.array([weight_by_class[label] for label in y_train])

        # Train model. early_stopping_rounds is an estimator parameter
        # (XGBoost >= 1.6); current releases reject it as a fit() argument
        model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            objective='multi:softprob',
            random_state=42,
            early_stopping_rounds=10
        )

        model.fit(
            X_train, y_train,
            sample_weight=sample_weights,
            eval_set=[(X_test, y_test)],
            verbose=False
        )

        # Evaluate
        y_pred = model.predict(X_test)
        accuracy = (y_pred == y_test).mean()

        print(f"✓ XGBoost Accuracy: {accuracy:.4f}")

        self.models['xgboost'] = model
        self.results['xgboost'] = {
            'accuracy': accuracy,
            'predictions': y_pred,
            'model': model
        }

        return model

    def train_lightgbm(self, X_train, X_test, y_train, y_test):
        """Train a LightGBM classifier with balanced class weights.

        NOTE(review): the test split doubles as the early-stopping set, so
        the reported accuracy is optimistic; consider a separate validation
        split.

        Returns:
            The fitted ``lgb.LGBMClassifier``; it is also stored under
            ``'lightgbm'`` in ``self.models`` and ``self.results``.
        """

        print("\n Training LightGBM model...")

        # Train model
        model = lgb.LGBMClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            num_leaves=31,
            random_state=42,
            class_weight='balanced'
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='multi_logloss',
            callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]
        )

        # Evaluate
        y_pred = model.predict(X_test)
        accuracy = (y_pred == y_test).mean()

        print(f"✓ LightGBM Accuracy: {accuracy:.4f}")

        self.models['lightgbm'] = model
        self.results['lightgbm'] = {
            'accuracy': accuracy,
            'predictions': y_pred,
            'model': model
        }

        return model

    def evaluate_models(self, X_test, y_test):
        """Print per-class and overall metrics for every trained model.

        Args:
            X_test: Unused; kept for interface symmetry with the trainers.
            y_test: Encoded true labels of the test split.
        """

        print("\n" + "=" * 70)
        print("MODEL EVALUATION RESULTS")
        print("=" * 70)

        # Names must follow the encoder's sorted order; order of first
        # appearance would attach metrics to the wrong class names
        label_names = self._label_names()
        class_ids = list(range(len(label_names)))

        for model_name, result in self.results.items():
            print(f"\n📊 {model_name.upper()} Performance:")
            print("-" * 40)

            # Classification report; explicit labels keep names aligned even
            # if a class is missing from this split
            report = classification_report(
                y_test,
                result['predictions'],
                labels=class_ids,
                target_names=label_names,
                output_dict=True,
                zero_division=0
            )

            # Print per-class metrics
            for label in label_names:
                if label in report:
                    metrics = report[label]
                    print(f"{label:15s} - Precision: {metrics['precision']:.3f}, "
                          f"Recall: {metrics['recall']:.3f}, "
                          f"F1: {metrics['f1-score']:.3f}")

            # Overall metrics; with explicit labels some scikit-learn versions
            # omit the 'accuracy' entry, so fall back to the stored value
            overall_accuracy = report.get('accuracy', result['accuracy'])
            print(f"\nOverall Accuracy: {overall_accuracy:.4f}")
            print(f"Macro Avg F1: {report['macro avg']['f1-score']:.4f}")
            print(f"Weighted Avg F1: {report['weighted avg']['f1-score']:.4f}")

    def create_evaluation_visualizations(self, X_test, y_test):
        """Plot a confusion matrix and top feature importances per model.

        Args:
            X_test: Unused; kept for interface symmetry with the trainers.
            y_test: Encoded true labels of the test split.
        """

        if not self.results:
            # A zero-row subplot grid cannot be created
            print("No trained models to visualize")
            return

        # One row per model; squeeze=False keeps axes 2-D for a single model
        n_models = len(self.results)
        fig, axes = plt.subplots(n_models, 2, figsize=(15, 5*n_models), squeeze=False)

        label_names = self._label_names()
        class_ids = list(range(len(label_names)))

        # Models were trained on the selected features in that order, which
        # can differ from the dataframe's column order
        if self.selected_features is not None:
            feature_names = list(self.selected_features)
        else:
            feature_names = list(self.features_df.drop('label', axis=1).columns)

        for idx, (model_name, result) in enumerate(self.results.items()):
            # Confusion matrix
            cm = confusion_matrix(y_test, result['predictions'], labels=class_ids)

            # Plot confusion matrix
            ax1 = axes[idx, 0]
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                        xticklabels=label_names, yticklabels=label_names)
            ax1.set_title(f'{model_name.upper()} - Confusion Matrix')
            ax1.set_xlabel('Predicted')
            ax1.set_ylabel('Actual')

            # Feature importance (if available)
            ax2 = axes[idx, 1]

            if hasattr(result['model'], 'feature_importances_'):
                importances = np.asarray(result['model'].feature_importances_)

                # Get top 10 features
                indices = np.argsort(importances)[-10:]

                ax2.barh(range(len(indices)), importances[indices])
                ax2.set_yticks(range(len(indices)))
                ax2.set_yticklabels([
                    feature_names[i] if i < len(feature_names) else f'feature_{i}'
                    for i in indices
                ])
                ax2.set_xlabel('Importance')
                ax2.set_title(f'{model_name.upper()} - Feature Importance')

        plt.tight_layout()
        plt.show()

# ### Step 7: Export Results for Production Use

# In [None]:
"""
Cell 7: Export processed data and models for production deployment
Creates both human-readable reports and ML-ready datasets
"""

class ResultExporter:
    """
    Exports analysis results in multiple formats:
    - HDF5 for ML pipelines
    - CSV for data analysis
    - HTML reports for human review
    - Trained models for deployment
    """
    
    def __init__(self, config, features_df, models, results):
        self.config = config
        self.features_df = features_df
        self.models = models
        self.results = results
        self.export_dir = Path(config['output_dir'])
        
    def export_all(self):
        """Create the export folders and run every export step in order.

        Writes ML datasets, human-readable reports, trained models and a
        summary below ``self.export_dir``.
        """

        print("\n" + "=" * 70)
        print("EXPORTING RESULTS")
        print("=" * 70)

        # Create subdirectories; parents=True also creates the export root
        # when nothing has created it beforehand
        for subdir in ('ml_data', 'human_reports', 'models'):
            (self.export_dir / subdir).mkdir(parents=True, exist_ok=True)

        # Export ML-ready data
        self._export_ml_data()

        # Export human-readable reports
        self._export_human_reports()

        # Export trained models
        self._export_models()

        # Create summary file
        self._create_summary()

        print("\n✓ All results exported successfully!")
        print(f" Output directory: {self.export_dir}")
    
    def _export_ml_data(self):
        """Write the feature table as HDF5, CSV and NumPy files under ``ml_data``.

        The HDF5 file holds ``features`` (float32) and ``labels`` (int32)
        datasets plus attributes naming the features and classes.
        """

        print("\n Exporting ML datasets...")

        feature_frame = self.features_df.drop('label', axis=1)
        features = feature_frame.values

        # Keep the encoder so the stored class names match the integer codes
        encoder = LabelEncoder()
        labels = encoder.fit_transform(self.features_df['label'])

        # HDF5 format for large-scale ML
        h5_path = self.export_dir / 'ml_data' / 'network_features.h5'

        with h5py.File(h5_path, 'w') as f:
            # Store feature data
            f.create_dataset('features', data=features, dtype='float32')

            # Store labels
            f.create_dataset('labels', data=labels, dtype='int32')

            # Store metadata
            f.attrs['feature_names'] = list(feature_frame.columns)
            # Entry i names the class encoded as integer i; order of first
            # appearance in the data would not line up with the codes
            f.attrs['label_names'] = [str(name) for name in encoder.classes_]
            f.attrs['n_samples'] = len(features)
            f.attrs['n_features'] = features.shape[1]

        print(f"   HDF5 dataset: {h5_path}")

        # CSV for easy inspection
        csv_path = self.export_dir / 'ml_data' / 'network_features.csv'
        self.features_df.to_csv(csv_path, index=False)
        print(f"   CSV dataset: {csv_path}")

        # NumPy arrays for direct use
        np_path = self.export_dir / 'ml_data' / 'features.npy'
        np.save(np_path, features)
        print(f"   NumPy array: {np_path}")
    
    def _export_human_reports(self):
        """Write the HTML report and the text summary under ``human_reports``."""

        print("\n Generating human reports...")

        # HTML report
        html_content = self._generate_html_report()
        html_path = self.export_dir / 'human_reports' / 'analysis_report.html'

        # Pin the encoding: the report embeds data-derived text (class and
        # model names) that the platform's default codec may not be able to
        # represent
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"   HTML report: {html_path}")

        # Text summary
        text_path = self.export_dir / 'human_reports' / 'summary.txt'
        self._generate_text_summary(text_path)
        print(f"   Text summary: {text_path}")
    
    def _generate_html_report(self):
        """Generate comprehensive HTML report"""
        
        html = """
        <!DOCTYPE html>
        <html>
        <head>
            <title>Network Traffic Analysis Report</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; }
                h1 { color: #333; border-bottom: 2px solid #333; }
                h2 { color: #666; }
                table { border-collapse: collapse; width: 100%; }
                th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
                th { background-color: #f2f2f2; }
                .metric { background-color: #e8f4f8; padding: 10px; margin: 10px 0; }
                .warning { color: #ff6b6b; font-weight: bold; }
                .success { color: #51cf66; font-weight: bold; }
            </style>
        </head>
        <body>
            <h1>Network Traffic Analysis Report</h1>
            <p>Generated: {timestamp}</p>
            
            <h2>Dataset Overview</h2>
            <div class="metric">
                <p>Total Samples: {n_samples}</p>
                <p>Features Extracted: {n_features}</p>
                <p>Attack Types Detected: {n_classes}</p>
            </div>
            
            <h2>Class Distribution</h2>
            <table>
                <tr><th>Class</th><th>Count</th><th>Percentage</th></tr>
                {class_table}
            </table>
            
            <h2>Model Performance</h2>
            <table>
                <tr><th>Model</th><th>Accuracy</th><th>F1 Score</th></tr>
                {model_table}
            </table>
            
            <h2>Top Security Findings</h2>
            <ul>
                {findings}
            </ul>
            
            <h2>Recommendations</h2>
            <ul>
                <li>Monitor high-entropy domains (potential DGA activity)</li>
                <li>Investigate flows with command keywords detected</li>
                <li>Review connections with obfuscation indicators</li>
                <li>Analyze failed DNS queries for reconnaissance attempts</li>
            </ul>
        </body>
        </html>
        """
        
        # Fill in template
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        n_samples = len(self.features_df)
        n_features = len(self.features_df.columns) - 1
        n_classes = len(self.features_df['label'].unique())
        
        # Class distribution table
        class_table = ""
        for label, count in self.features_df['label'].value_counts().items():
            percentage = (count / n_samples) * 100
            class_table += f"<tr><td>{label}</td><td>{count}</td><td>{percentage:.1f}%</td></tr>"
        
        # Model performance table
        model_table = ""
        for model_name, result in self.results.items():
            accuracy = result['accuracy']
            model_table += f"<tr><td>{model_name}</td><td>{accuracy:.4f}</td><td>-</td></tr>"
        
        # Security findings
        findings = ""
        
        # Check for high-risk indicators
        high_entropy = (self.features_df['domain_entropy'] > 4).sum()
        if high_entropy > 0:
            findings += f"<li class='warning'>High entropy domains detected: {high_entropy} flows</li>"
        
        command_detected = (self.features_df['command_keywords'] > 0).sum()
        if command_detected > 0:
            findings += f"<li class='warning'>Command keywords detected: {command_detected} flows</li>"
        
        obfuscation = (self.features_df['obfuscation_indicators'] > 0).sum()
        if obfuscation > 0:
            findings += f"<li class='warning'>Obfuscation detected: {obfuscation} flows</li>"
        
        return html.format(
            timestamp=timestamp,
            n_samples=n_samples,
            n_features=n_features,
            n_classes=n_classes,
            class_table=class_table,
            model_table=model_table,
            findings=findings
        )
    
    def _generate_text_summary(self, output_path):
        """Write a plain-text summary of the analysis for quick review.

        Sections, in order: header, threat summary, attack types detected,
        high-risk indicators and model performance. Reads ``self.features_df``
        (columns ``label``, ``domain_entropy``, ``dga_probability``,
        ``command_keywords``, ``obfuscation_indicators``),
        ``self.config['pcap_file']`` and ``self.results``.

        Args:
            output_path: File to create or overwrite; written as UTF-8.

        Raises:
            KeyError: If a required column, the ``pcap_file`` config key or a
                result's ``accuracy`` entry is missing. The report is
                assembled before the file is opened, so nothing is written
                in that case.
        """
        frame = self.features_df
        total_flows = len(frame)

        # Any row whose label is not exactly 'Benign' is counted as a threat.
        # NOTE(review): raw CICIDS2017 CSVs spell this label 'BENIGN' - confirm
        # that the extractor normalises labels to 'Benign'.
        threat_flows = frame[frame['label'] != 'Benign']
        threat_total = len(threat_flows)
        # An empty dataset reports 0.0% instead of raising ZeroDivisionError.
        threat_percentage = (threat_total / total_flows) * 100 if total_flows else 0.0

        divider = "-" * 40 + "\n"
        report = [
            "NETWORK TRAFFIC ANALYSIS SUMMARY\n",
            "=" * 70 + "\n\n",
            f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
            f"PCAP File: {self.config['pcap_file']}\n",
            f"Total Flows Analyzed: {total_flows:,}\n\n",
            "THREAT SUMMARY\n",
            divider,
            f"Threats Detected: {threat_total:,} ({threat_percentage:.1f}%)\n",
            f"Benign Traffic: {total_flows - threat_total:,}\n\n",
            "ATTACK TYPES DETECTED\n",
            divider,
        ]

        # One line per distinct threat label, in order of first appearance.
        threat_labels = threat_flows['label']
        report.extend(
            f"  {label}: {(threat_labels == label).sum():,}\n"
            for label in threat_labels.unique()
        )

        report += ["\nHIGH-RISK INDICATORS\n", divider]
        indicator_counts = (
            ("High entropy domains", (frame['domain_entropy'] > 4).sum()),
            ("Suspected DGA domains", (frame['dga_probability'] > 0.7).sum()),
            ("Command injection attempts", (frame['command_keywords'] > 0).sum()),
            ("Obfuscated payloads", (frame['obfuscation_indicators'] > 0).sum()),
        )
        report.extend(f"  {title}: {count:,}\n" for title, count in indicator_counts)

        report += ["\nMODEL PERFORMANCE\n", divider]
        report.extend(
            f"  {model_name}: {result['accuracy']:.4f} accuracy\n"
            for model_name, result in self.results.items()
        )

        # Explicit UTF-8: labels are written verbatim and may not be encodable
        # in the platform's default locale encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(report)
    
    def _export_models(self):
        """Persist every entry of ``self.models`` for later deployment.

        Each value is dumped with joblib to
        ``<export_dir>/models/<name>_model.pkl`` and the destination path is
        echoed to stdout.
        """

        print("\n Saving trained models...")

        # joblib is imported here, inside the method, rather than at module level.
        from joblib import dump as save_model

        for name in self.models:
            destination = self.export_dir / 'models' / f'{name}_model.pkl'
            save_model(self.models[name], destination)
            print(f"  ✓ {name}: {destination}")
    
    def _create_summary(self):
        """Write ``analysis_summary.json`` with output paths and run metadata.

        The file is created in ``self.export_dir`` and records the current
        timestamp, ``self.config``, the expected locations of the data files,
        reports and one pickle per key of ``self.models``, plus basic
        statistics derived from ``self.features_df``. The paths are only
        recorded here; this method does not create those files.

        Raises:
            KeyError: If ``self.features_df`` has no ``label`` column.
            TypeError: If ``self.config`` holds a value ``json`` cannot
                serialize. Serialization happens before the file is opened,
                so no partial file is left behind.
        """
        export_dir = self.export_dir
        ml_dir = export_dir / 'ml_data'
        report_dir = export_dir / 'human_reports'
        model_dir = export_dir / 'models'

        total_samples = len(self.features_df)
        # Convert the pandas/numpy count to a built-in int so the arithmetic
        # below yields a plain float, and guard the empty dataset: a numpy
        # division by zero would produce NaN, which json emits as the
        # non-standard token ``NaN``.
        threat_count = int((self.features_df['label'] != 'Benign').sum())
        threat_percentage = threat_count / total_samples * 100 if total_samples else 0.0

        summary = {
            'timestamp': datetime.now().isoformat(),
            'config': self.config,
            'data_files': {
                'hdf5': str(ml_dir / 'network_features.h5'),
                'csv': str(ml_dir / 'network_features.csv'),
                'numpy': str(ml_dir / 'features.npy'),
            },
            'reports': {
                'html': str(report_dir / 'analysis_report.html'),
                'text': str(report_dir / 'summary.txt'),
            },
            'models': {
                name: str(model_dir / f'{name}_model.pkl')
                for name in self.models
            },
            'statistics': {
                'total_samples': total_samples,
                # Every column except 'label' is counted as a feature.
                'total_features': len(self.features_df.columns) - 1,
                'threat_percentage': threat_percentage,
            },
        }

        # Serialize first: if this raises, the target file is never truncated.
        payload = json.dumps(summary, indent=2)

        summary_path = export_dir / 'analysis_summary.json'
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write(payload)

        print(f"\n Summary file: {summary_path}")

# ### Step 8: Main Execution Pipeline

In [None]:
"""
Cell 8: Define the main execution pipeline that orchestrates the entire analysis.
Run this cell after configuration; the analysis itself is started by Cell 9.
"""

def run_analysis_pipeline():
    """
    Run the complete analysis pipeline and generate the final outputs.

    Steps: feature extraction, feature analysis, model training and
    evaluation for the configured ``ml_approach``, then export of all
    results. Progress and a closing summary are printed to stdout.

    Returns:
        tuple: ``(features_df, ml_pipeline, exporter)`` on success.
        None: when no configuration is available, or when any step raises;
            the error and its traceback are printed instead of propagated.
    """

    # Resolve the manager by name: if the configuration cell was never run the
    # global does not exist, and a bare reference would raise NameError
    # instead of showing the hint below.
    manager = globals().get('config_manager')
    config = getattr(manager, 'config', None)
    if not config:
        print(" Please configure settings first (run Cell 3)")
        return None

    print("\n" + "=" * 70)
    print("STARTING NETWORK TRAFFIC ANALYSIS PIPELINE")
    print("=" * 70)

    start_time = time.time()

    try:
        # Step 1: Feature Extraction
        print("\n Step 1: Feature Extraction")
        print("-" * 40)
        extractor = UnifiedFeatureExtractor(config)
        features_df = extractor.extract_all_features()
        visual_data = extractor.visual_data if config['generate_visuals'] else None

        # Step 2: Feature Analysis
        print("\n Step 2: Feature Analysis")
        print("-" * 40)
        analyzer = FeatureAnalyzer(features_df)
        selected_features = analyzer.analyze_features()

        # Step 3: Machine Learning
        print("\n Step 3: Machine Learning")
        print("-" * 40)
        ml_pipeline = MLPipeline(features_df, visual_data, config)

        # Prepare data
        X_train, X_test, y_train, y_test, scaler = ml_pipeline.prepare_data(selected_features)

        # Train models based on user selection
        approach = config['ml_approach']
        if approach == 'XGBoost (Fast)':
            ml_pipeline.train_xgboost(X_train, X_test, y_train, y_test)
        elif approach == 'LightGBM (Balanced)':
            ml_pipeline.train_lightgbm(X_train, X_test, y_train, y_test)
        elif approach == 'Ensemble (All)':
            ml_pipeline.train_xgboost(X_train, X_test, y_train, y_test)
            ml_pipeline.train_lightgbm(X_train, X_test, y_train, y_test)
        else:
            # Surface the skipped training instead of continuing silently.
            print(f" No training step is defined for ML approach {approach!r}; "
                  "no models were trained")

        # Evaluate models
        ml_pipeline.evaluate_models(X_test, y_test)

        # Create visualizations
        if config['generate_visuals']:
            ml_pipeline.create_evaluation_visualizations(X_test, y_test)

        # Step 4: Export Results
        print("\n Step 4: Exporting Results")
        print("-" * 40)
        exporter = ResultExporter(
            config,
            features_df,
            ml_pipeline.models,
            ml_pipeline.results
        )
        exporter.export_all()

        # Calculate execution time
        execution_time = time.time() - start_time

        print("\n" + "=" * 70)
        print(" ANALYSIS COMPLETE!")
        print("=" * 70)
        print(f"\n Total execution time: {execution_time:.2f} seconds")
        print(f" Results saved to: {config['output_dir']}")

        # Display final summary
        total_flows = len(features_df)
        print("\n Quick Summary:")
        print(f"  • Samples analyzed: {total_flows:,}")
        print(f"  • Features extracted: {len(selected_features)}")
        print(f"  • Models trained: {len(ml_pipeline.models)}")

        if ml_pipeline.results:
            best_model = max(ml_pipeline.results.items(),
                             key=lambda x: x[1]['accuracy'])
            print(f"  • Best model: {best_model[0]} ({best_model[1]['accuracy']:.4f} accuracy)")

        # Threat summary: every label other than 'Benign' counts as a threat.
        # An empty dataset reports 0.0% rather than dividing by zero, which
        # would print "nan%".
        threat_count = (features_df['label'] != 'Benign').sum()
        threat_percentage = (threat_count / total_flows) * 100 if total_flows else 0.0

        if threat_percentage > 10:
            print(f"\n HIGH THREAT LEVEL: {threat_percentage:.1f}% malicious traffic detected!")
        elif threat_percentage > 5:
            print(f"\n MODERATE THREAT: {threat_percentage:.1f}% suspicious traffic")
        else:
            print(f"\n LOW THREAT: {threat_percentage:.1f}% anomalous traffic")

    except Exception as e:
        # Notebook-level boundary: report the failure and return None so the
        # calling cell does not see a raised exception.
        print(f"\n Error during analysis: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

    return features_df, ml_pipeline, exporter

# ### Step 9: Run the Complete Analysis
# Execute this cell to start the analysis with your configured settings

In [None]:
"""
Cell 9: Execute the analysis pipeline
This cell runs the complete analysis based on your configuration
"""

# Run the analysis. The pipeline returns None both when no configuration is
# available and when a step fails (it prints the reason itself in either
# case), so the fallback message below must not assume the cause.
results = run_analysis_pipeline()

if results is not None:
    features_df, ml_pipeline, exporter = results
    print("\n Analysis completed successfully!")
    print("You can now:")
    print("  1. Review the generated reports in the output directory")
    print("  2. Use the exported models for real-time detection")
    print("  3. Analyze the feature importance for security insights")
else:
    print("\n Analysis did not complete. Review the messages above; "
          "if settings were not configured, run Cell 3 first")