# XGBoost Model Training on NFStream Features

**Goal:** Train an XGBoost model on NFStream-extracted features and compare with Random Forest.

## Why XGBoost?
- Often 5-10% better accuracy than Random Forest
- Better handling of complex patterns
- Faster inference
- Better for imbalanced datasets

## Current Results (Random Forest baseline):
- DDoS detection on Friday PCAP (501K flows): 83.39%
- Test accuracy: 70%

## Expected Improvement (XGBoost targets):
- DDoS detection on Friday PCAP: 85-90%
- Test accuracy: 75-80% or higher

## Approach:
1. Load same NFStream-extracted features from previous training
2. Train XGBoost model with hyperparameter tuning
3. Compare with Random Forest
4. Test on Friday PCAP
5. Save best model


In [None]:
# Setup
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Project layout: the project root is taken to be the parent of the current
# working directory, and every data/model folder hangs off that root.
BASE_DIR = Path.cwd().parent
DATA_DIR, MODELS_DIR, RESULTS_DIR, PCAP_DIR = (
    BASE_DIR / folder
    for folder in ('data_processed', 'models', 'results', 'pcap')
)

print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_DIR}")

# Cached NFStream feature tables; both must be present to skip PCAP extraction.
monday_features = DATA_DIR / 'nfstream_monday_features.csv'
friday_features = DATA_DIR / 'nfstream_friday_features.csv'

USE_SAVED = all(
    csv_path.exists() for csv_path in (monday_features, friday_features)
)
if not USE_SAVED:
    print(f"\n‚ö†Ô∏è  Saved features not found. Will extract from PCAP files...")
else:
    print(f"\n‚úÖ Found saved NFStream features from previous training")
    print(f"   Monday: {monday_features}")
    print(f"   Friday: {friday_features}")


## Step 1: Load or Extract NFStream Features


In [None]:
if USE_SAVED:
    # Fast path: reuse the feature tables cached by an earlier run.
    print("Loading saved NFStream features...")
    df_monday = pd.read_csv(monday_features)
    df_friday = pd.read_csv(friday_features)

    print(f"\nMonday features: {df_monday.shape}")
    print(f"Friday features: {df_friday.shape}")
    print(f"\nLabels:")
    print(f"  Monday: {df_monday['Label'].value_counts().to_dict()}")
    print(f"  Friday: {df_friday['Label'].value_counts().to_dict()}")
else:
    # Slow path: extract flow features from the PCAP files (same as notebook 05)
    print("Extracting features from PCAP files...")
    print("(This will take 30-60 minutes)")

    from itertools import islice

    # Guard only the imports, so an ImportError raised later during extraction
    # is not misreported as "NFStream not installed".
    try:
        import nfstream
        from nfstream import NFStreamer
    except ImportError as err:
        raise ImportError("NFStream not installed. Run: pip install nfstream") from err

    # Flow attributes copied from each NFStream flow into the feature table.
    NFSTREAM_ATTRIBUTES = [
        'dst_port',
        'bidirectional_duration_ms',
        'src2dst_packets', 'dst2src_packets', 'bidirectional_packets',
        'src2dst_bytes', 'dst2src_bytes', 'bidirectional_bytes',
        'src2dst_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps',
        'dst2src_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps',
        'bidirectional_min_ps', 'bidirectional_max_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps',
        'bidirectional_mean_piat_ms', 'bidirectional_stddev_piat_ms', 'bidirectional_max_piat_ms', 'bidirectional_min_piat_ms',
        'src2dst_duration_ms', 'src2dst_mean_piat_ms', 'src2dst_stddev_piat_ms', 'src2dst_max_piat_ms', 'src2dst_min_piat_ms',
        'dst2src_duration_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms', 'dst2src_max_piat_ms', 'dst2src_min_piat_ms',
        'src2dst_psh_packets', 'src2dst_urg_packets', 'src2dst_syn_packets', 'src2dst_fin_packets', 'src2dst_rst_packets', 'src2dst_ack_packets',
        'dst2src_psh_packets', 'dst2src_urg_packets', 'dst2src_syn_packets', 'dst2src_fin_packets', 'dst2src_rst_packets', 'dst2src_ack_packets',
    ]

    def extract_nfstream_features(pcap_path, max_flows=250000, label=None):
        """Read flows from a PCAP with NFStream and return them as a DataFrame.

        Args:
            pcap_path: Path of the capture file handed to ``NFStreamer``.
            max_flows: Maximum number of flows to read from the start of the
                capture; ``None`` reads every flow.
            label: Value written to a ``'Label'`` column on every row. When
                ``None`` no ``'Label'`` column is added.

        Returns:
            A DataFrame with one row per flow and one column per entry of
            ``NFSTREAM_ATTRIBUTES`` (plus ``'Label'`` when requested). Missing
            or ``None`` attribute values are stored as 0. The frame is empty
            when the capture yields no flows.
        """
        streamer = NFStreamer(
            source=str(pcap_path),
            statistical_analysis=True,
            splt_analysis=0,
            n_dissections=0,
        )

        flows_list = []
        for flow_number, flow in enumerate(islice(streamer, max_flows), start=1):
            flow_dict = {}
            for attr in NFSTREAM_ATTRIBUTES:
                # getattr's default already covers absent attributes; no
                # blanket `except` is used, so a kernel interrupt during the
                # long extraction is not silently swallowed.
                value = getattr(flow, attr, None)
                flow_dict[attr] = 0 if value is None else value

            # `is not None` keeps falsy but valid labels such as 0.
            if label is not None:
                flow_dict['Label'] = label

            flows_list.append(flow_dict)

            if flow_number % 25000 == 0:
                print(f"  Processed {flow_number:,} flows...")

        return pd.DataFrame(flows_list)

    monday_pcap = PCAP_DIR / 'Monday-WorkingHours.pcap'
    friday_pcap = PCAP_DIR / 'Friday-WorkingHours.pcap'

    print("\nExtracting from Monday PCAP...")
    df_monday = extract_nfstream_features(monday_pcap, max_flows=250000, label='BENIGN')

    # NOTE(review): every flow read from the Friday capture is labelled 'DDoS';
    # confirm the first 250,000 flows contain no benign traffic.
    print("\nExtracting from Friday PCAP...")
    df_friday = extract_nfstream_features(friday_pcap, max_flows=250000, label='DDoS')

    # Save for future use. Create the target folder first so a missing
    # directory cannot throw away a long extraction.
    monday_features.parent.mkdir(parents=True, exist_ok=True)
    friday_features.parent.mkdir(parents=True, exist_ok=True)
    df_monday.to_csv(monday_features, index=False)
    df_friday.to_csv(friday_features, index=False)
    print("\n‚úÖ Features saved for future use")


## Step 2: Prepare Data


In [None]:
# Stack the Monday and Friday feature tables into one frame.
print("Combining datasets...")
df_combined = pd.concat((df_monday, df_friday), ignore_index=True)

print(f"\nCombined shape: {df_combined.shape}")
print("Label distribution:", df_combined['Label'].value_counts(), sep="\n")

# Downsample the larger class so both labels have the same number of rows.
benign_count = df_combined['Label'].eq('BENIGN').sum()
ddos_count = df_combined['Label'].eq('DDoS').sum()
min_count = min(benign_count, ddos_count)

print(f"\nBalancing to {min_count:,} samples each...")
df_benign = df_combined.loc[df_combined['Label'] == 'BENIGN'].sample(n=min_count, random_state=42)
df_ddos = df_combined.loc[df_combined['Label'] == 'DDoS'].sample(n=min_count, random_state=42)

# Concatenate, then shuffle so the two classes are interleaved.
df_balanced = pd.concat([df_benign, df_ddos], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42)
df_balanced = df_balanced.reset_index(drop=True)

print(f"Balanced shape: {df_balanced.shape}")
print("Label distribution:", df_balanced['Label'].value_counts(), sep="\n")

# Feature matrix / target vector.
y = df_balanced['Label']
X = df_balanced.drop(columns='Label')

# Clean-up: infinities become NaN, NaN becomes the column median, and any
# column whose median is itself NaN falls back to 0.
# NOTE(review): the medians are computed on the full table, before the
# train/test split performed later.
print("\nPreprocessing...")
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median()).fillna(0)

print(f"Final features shape: {X.shape}")
print(f"Missing values: {X.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(X.select_dtypes(include=[np.number])).sum().sum()}")


## Step 3: Train-Test Split


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the label so both partitions keep the
# same class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print("\nTraining labels:", y_train.value_counts(), sep="\n")
print("\nTest labels:", y_test.value_counts(), sep="\n")


## Step 4: Train XGBoost Model


In [None]:
# Check if XGBoost is installed; install it on demand otherwise.
try:
    import xgboost as xgb
    print(f"‚úÖ XGBoost version: {xgb.__version__}")
except ImportError:
    print("‚ö†Ô∏è XGBoost not installed. Installing...")
    import subprocess
    import sys
    # `python -m pip` installs into the interpreter running this notebook;
    # a bare `pip` on PATH may belong to a different Python environment.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost'])
    import xgboost as xgb
    print(f"‚úÖ XGBoost installed: {xgb.__version__}")

# Convert labels to numeric for XGBoost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Class mapping
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"\nClass mapping: {class_mapping}")

# Carve a validation set out of the training data for early stopping.
# Monitoring the test set instead would let it pick the number of trees and
# make the reported test metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train_encoded,
    test_size=0.1,
    random_state=42,
    stratify=y_train_encoded
)

print("\n" + "="*60)
print("Training XGBoost Model")
print("="*60)

# XGBoost parameters (binary classification on balanced classes)
xgb_model = xgb.XGBClassifier(
    n_estimators=200,  # Upper bound; early stopping may use fewer trees
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=1.0,  # Balanced classes
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
    # Set on the estimator (XGBoost >= 1.6): passing it to fit() is
    # deprecated and rejected by XGBoost 2.x.
    early_stopping_rounds=20,
    verbosity=1
)

# Train, stopping once validation logloss has not improved for 20 rounds.
start_time = time.time()
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=True
)
training_time = time.time() - start_time

print(f"\n‚úÖ Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")


In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Score the held-out test set: encoded class predictions and probabilities.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)

# Map the encoded predictions back to the original string labels.
y_pred_labels = le.inverse_transform(y_pred_xgb)

# Headline metrics; 'DDoS' is treated as the positive class.
accuracy_xgb = accuracy_score(y_test, y_pred_labels)
precision_xgb, recall_xgb, f1_xgb = (
    scorer(y_test, y_pred_labels, pos_label='DDoS', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

print("\n".join([
    "="*60,
    "XGBOOST MODEL PERFORMANCE (Test Set)",
    "="*60,
    f"Accuracy:  {accuracy_xgb:.4f} ({accuracy_xgb*100:.2f}%)",
    f"Precision: {precision_xgb:.4f}",
    f"Recall:    {recall_xgb:.4f}",
    f"F1-Score:  {f1_xgb:.4f}",
    "="*60,
]))

# Per-class breakdown
print("\nClassification Report:", classification_report(y_test, y_pred_labels), sep="\n")

# Confusion matrix; the labels below read row 0 / column 0 as BENIGN and
# row 1 / column 1 as DDoS.
cm_xgb = confusion_matrix(y_test, y_pred_labels)
print("\nConfusion Matrix:", cm_xgb, sep="\n")
print(f"\nTrue Negatives (BENIGN correctly identified):  {cm_xgb[0, 0]}")
print(f"False Positives (BENIGN misclassified as DDoS): {cm_xgb[0, 1]}")
print(f"False Negatives (DDoS misclassified as BENIGN): {cm_xgb[1, 0]}")
print(f"True Positives (DDoS correctly identified):  {cm_xgb[1, 1]}")


In [None]:
# Side-by-side comparison against the previously trained Random Forest.
import joblib

rf_model_path = MODELS_DIR / 'random_forest_nfstream_from_scratch.joblib'
if not rf_model_path.exists():
    print("‚ö†Ô∏è Random Forest model not found for comparison")
else:
    print("Loading Random Forest model for comparison...")
    # joblib files are pickles: only load artifacts produced by this project.
    rf_model = joblib.load(rf_model_path)

    # Score the same held-out test set.
    # NOTE(review): presumably the saved forest predicts the string labels
    # 'BENIGN'/'DDoS' on the same feature columns - confirm.
    y_pred_rf = rf_model.predict(X_test)

    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    precision_rf, recall_rf, f1_rf = (
        scorer(y_test, y_pred_rf, pos_label='DDoS', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print("\n" + "="*60, "MODEL COMPARISON", "="*60, sep="\n")
    print(f"{'Metric':<15} {'Random Forest':<20} {'XGBoost':<20} {'Improvement':<15}")
    print("-"*60)
    print(f"{'Accuracy':<15} {accuracy_rf:.4f} ({accuracy_rf*100:.2f}%){'':<6} {accuracy_xgb:.4f} ({accuracy_xgb*100:.2f}%){'':<6} {(accuracy_xgb-accuracy_rf)*100:+.2f}%")
    print(f"{'Precision':<15} {precision_rf:.4f}{'':<16} {precision_xgb:.4f}{'':<16} {(precision_xgb-precision_rf)*100:+.2f}%")
    print(f"{'Recall':<15} {recall_rf:.4f}{'':<16} {recall_xgb:.4f}{'':<16} {(recall_xgb-recall_rf)*100:+.2f}%")
    print(f"{'F1-Score':<15} {f1_rf:.4f}{'':<16} {f1_xgb:.4f}{'':<16} {(f1_xgb-f1_rf)*100:+.2f}%")
    print("="*60)

    # Verdict is based on test accuracy only; the gain is relative to the
    # Random Forest accuracy.
    if not accuracy_xgb > accuracy_rf:
        print(f"\n‚ö†Ô∏è Random Forest performs slightly better on test set")
    else:
        improvement = (accuracy_xgb - accuracy_rf) / accuracy_rf * 100
        print(f"\n‚úÖ XGBoost is {improvement:.2f}% better than Random Forest!")


## Step 7: Test on Friday PCAP (Real-World Validation)


In [None]:
# Run the trained models on flows from the Friday PCAP that lie beyond the
# first `skip_first` flows of the capture.
print("Testing XGBoost model on Friday PCAP file...")
print("(This will extract features and make predictions)")

from itertools import islice

from nfstream import NFStreamer

friday_pcap = PCAP_DIR / 'Friday-WorkingHours.pcap'
if not friday_pcap.exists():
    print(f"‚ö†Ô∏è Friday PCAP not found: {friday_pcap}")
else:
    print(f"\nExtracting features from: {friday_pcap.name}")

    # Use exactly the feature columns the model was trained on.
    NFSTREAM_ATTRIBUTES = list(X.columns)

    test_streamer = NFStreamer(
        source=str(friday_pcap),
        statistical_analysis=True,
        splt_analysis=0,
        n_dissections=0,
    )

    # Extract flows starting from 500K (skip training portion)
    test_flows = []
    skip_first = 500000
    test_count = 50000

    print(f"Skipping first {skip_first:,} flows, extracting {test_count:,} test flows...")

    # islice discards the first `skip_first` flows and stops after
    # `test_count` more, replacing the manual continue/break bookkeeping.
    flow_window = islice(test_streamer, skip_first, skip_first + test_count)
    for extracted, flow in enumerate(flow_window, start=1):
        flow_dict = {}
        for attr in NFSTREAM_ATTRIBUTES:
            # getattr's default covers absent attributes; no blanket
            # `except`, so a kernel interrupt is not swallowed mid-loop.
            value = getattr(flow, attr, None)
            flow_dict[attr] = 0 if value is None else value

        test_flows.append(flow_dict)

        if extracted % 10000 == 0:
            print(f"  Extracted {extracted:,} flows...")

    # Fail with a clear message rather than a KeyError during column
    # selection when the capture holds no flows past the skipped portion.
    if not test_flows:
        raise RuntimeError(
            f"No flows found after skipping the first {skip_first:,} flows of "
            f"{friday_pcap.name}; lower skip_first."
        )

    df_test_pcap = pd.DataFrame(test_flows)
    print(f"\n‚úÖ Extracted {len(df_test_pcap):,} test flows")

    # Preprocess like the training data, imputing with the training medians.
    X_test_pcap = df_test_pcap.copy()
    X_test_pcap = X_test_pcap.replace([np.inf, -np.inf], np.nan)
    X_test_pcap = X_test_pcap.fillna(X_train.median())
    X_test_pcap = X_test_pcap.fillna(0)
    X_test_pcap = X_test_pcap[X_train.columns]  # Ensure same order

    # Predictions
    print("\nMaking predictions...")
    test_predictions_xgb = xgb_model.predict(X_test_pcap)
    test_predictions_labels = le.inverse_transform(test_predictions_xgb)

    # Compare with RF only if the comparison cell actually loaded it; a model
    # file on disk does not guarantee that `rf_model` exists in this session.
    rf_available = 'rf_model' in globals()
    if rf_available:
        test_predictions_rf = rf_model.predict(X_test_pcap)

    # Results. These flows carry no ground-truth labels, so "detection" below
    # is the share of flows each model predicts as DDoS.
    pred_counts_xgb = pd.Series(test_predictions_labels).value_counts()

    print("\n" + "="*60)
    print("FRIDAY PCAP TEST RESULTS")
    print("="*60)
    print(f"Total flows: {len(test_predictions_labels):,}")
    print(f"\nXGBoost Predictions:")
    for label, count in pred_counts_xgb.items():
        pct = count / len(test_predictions_labels) * 100
        print(f"  {label}: {count:,} ({pct:.2f}%)")

    if rf_available:
        pred_counts_rf = pd.Series(test_predictions_rf).value_counts()
        print(f"\nRandom Forest Predictions:")
        for label, count in pred_counts_rf.items():
            pct = count / len(test_predictions_rf) * 100
            print(f"  {label}: {count:,} ({pct:.2f}%)")

        # Compare DDoS detection
        ddos_xgb = pred_counts_xgb.get('DDoS', 0) / len(test_predictions_labels) * 100
        ddos_rf = pred_counts_rf.get('DDoS', 0) / len(test_predictions_rf) * 100

        print(f"\n{'='*60}")
        print("DDoS Detection Comparison:")
        print(f"  XGBoost:     {ddos_xgb:.2f}%")
        print(f"  Random Forest: {ddos_rf:.2f}%")
        print(f"  Improvement:   {ddos_xgb - ddos_rf:+.2f}%")

        if ddos_xgb > ddos_rf:
            print(f"\n‚úÖ XGBoost detects {ddos_xgb - ddos_rf:.2f}% more DDoS attacks!")
        elif ddos_xgb < ddos_rf:
            print(f"\n‚ö†Ô∏è Random Forest detects {ddos_rf - ddos_xgb:.2f}% more DDoS attacks")
        else:
            print(f"\n‚úì Both models perform similarly")

    # Success means more than half of the extracted flows were flagged DDoS.
    ddos_detected_xgb = pred_counts_xgb.get('DDoS', 0)
    print(f"\n{'='*60}")
    if ddos_detected_xgb > len(test_predictions_labels) * 0.5:
        print("‚úÖ SUCCESS! XGBoost correctly detects DDoS attacks in Friday PCAP!")
        print(f"   Detected {ddos_detected_xgb:,} DDoS flows ({ddos_detected_xgb/len(test_predictions_labels)*100:.1f}%)")
    else:
        print("‚ö†Ô∏è Low DDoS detection - may need more tuning")
    print("="*60)


## Step 8: Save Best Model


In [None]:
# Save the XGBoost model unless a Random Forest comparison was run and the
# Random Forest scored at least as well on test accuracy.
# Imported here too: this cell's fallback path exists for sessions where the
# comparison cell (which also imports joblib) was not run.
import joblib

comparison_available = 'accuracy_xgb' in globals() and 'accuracy_rf' in globals()

if comparison_available and not accuracy_xgb > accuracy_rf:
    print("Random Forest performs better - keeping Random Forest as primary model")
    print("XGBoost model available but not saved as primary")
else:
    if comparison_available:
        print("Saving XGBoost model (performs better than Random Forest)...")
    else:
        print("Saving XGBoost model...")

    # Make sure the target folder exists before writing any artifact.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # Save model
    model_filename = MODELS_DIR / 'xgboost_nfstream_from_scratch.joblib'
    joblib.dump(xgb_model, model_filename)
    print(f"‚úÖ Model saved: {model_filename}")

    # Save feature names (column order used for training)
    feature_names_file = MODELS_DIR / 'feature_names_nfstream_xgboost.joblib'
    joblib.dump(list(X.columns), feature_names_file)
    print(f"‚úÖ Feature names saved: {feature_names_file}")

    # Save class names and the label encoder needed to decode predictions
    class_names_file = MODELS_DIR / 'class_names_nfstream_xgboost.joblib'
    joblib.dump(['BENIGN', 'DDoS'], class_names_file)
    print(f"‚úÖ Class names saved: {class_names_file}")

    label_encoder_file = MODELS_DIR / 'label_encoder_nfstream_xgboost.joblib'
    joblib.dump(le, label_encoder_file)
    print(f"‚úÖ Label encoder saved: {label_encoder_file}")

    if comparison_available:
        print("\nüéâ XGBoost model saved successfully!")
        print("   Update predictor.py to use this model for better performance!")
