# NFStream From Scratch - Real PCAP Training

**Goal:** Train a model using ACTUAL NFStream-extracted features from PCAP files.

## Why This Approach?
- Previous attempts converted the CSV features into NFStream's format, but the resulting values still differed from what NFStream actually extracts
- This notebook extracts REAL features from PCAP files using NFStream
- Model trained on actual NFStream extraction = perfect match for inference

## Steps:
1. Extract features from Monday PCAP (BENIGN) using NFStream
2. Extract features from Friday PCAP (DDoS) using NFStream
3. Combine and label datasets
4. Train model on REAL NFStream features
5. Test on held-out PCAP data

## Success Criteria:
- ✅ Model detects DDoS attacks in Friday PCAP
- ✅ Accuracy > 90%
- ✅ Ready for production use


In [None]:
# Setup
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Fail fast with an actionable message when NFStream cannot be imported.
try:
    import nfstream
except ImportError:
    raise ImportError("NFStream not installed! Run: pip install nfstream")
else:
    print(f"‚úÖ NFStream version: {nfstream.__version__}")

# Project layout: every working directory sits directly under the parent of
# the current working directory.
BASE_DIR = Path.cwd().parent
PCAP_DIR, MODELS_DIR, RESULTS_DIR, DATA_DIR = (
    BASE_DIR / folder
    for folder in ('pcap', 'models', 'results', 'data_processed')
)

# Output directories are created on demand; the PCAP directory is only read
# from, so it is not created here.
for output_dir in (DATA_DIR, MODELS_DIR, RESULTS_DIR):
    output_dir.mkdir(exist_ok=True)

print(f"Base directory: {BASE_DIR}")
print(f"PCAP directory: {PCAP_DIR}")


## Step 1: Extract Features from Monday PCAP (BENIGN)


In [None]:
from nfstream import NFStreamer

# The two captures this notebook reads.
monday_pcap = PCAP_DIR / 'Monday-WorkingHours.pcap'
friday_pcap = PCAP_DIR / 'Friday-WorkingHours.pcap'

# Report each capture's size in GB, or flag it as missing.
print("Available PCAP files:")
for day_name, capture, traffic_kind in (
    ('Monday', monday_pcap, 'BENIGN'),
    ('Friday', friday_pcap, 'DDoS attacks'),
):
    if not capture.exists():
        print(f"  ‚ùå {day_name}: Not found")
        continue
    size_gb = capture.stat().st_size / (1024**3)
    print(f"  ‚úÖ {day_name}: {size_gb:.2f} GB ({traffic_kind})")

# Stop here when either capture is missing; both are needed below.
if not (monday_pcap.exists() and friday_pcap.exists()):
    raise FileNotFoundError("PCAP files not found! Check pcap/ directory.")


In [None]:
# Flow attributes copied from every NFStream flow, grouped by name pattern.
# Flattening the groups in order yields the attribute (and column) order.
_ATTRIBUTE_GROUPS = (
    # Destination port and overall flow duration
    ('dst_port', 'bidirectional_duration_ms'),
    # *_packets counters
    ('src2dst_packets', 'dst2src_packets', 'bidirectional_packets'),
    # *_bytes counters
    ('src2dst_bytes', 'dst2src_bytes', 'bidirectional_bytes'),
    # *_ps statistics, one group per direction
    ('src2dst_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps'),
    ('dst2src_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps'),
    (
        'bidirectional_min_ps', 'bidirectional_max_ps',
        'bidirectional_mean_ps', 'bidirectional_stddev_ps',
    ),
    # *_piat_ms statistics (plus per-direction durations)
    (
        'bidirectional_mean_piat_ms', 'bidirectional_stddev_piat_ms',
        'bidirectional_max_piat_ms', 'bidirectional_min_piat_ms',
    ),
    (
        'src2dst_duration_ms', 'src2dst_mean_piat_ms', 'src2dst_stddev_piat_ms',
        'src2dst_max_piat_ms', 'src2dst_min_piat_ms',
    ),
    (
        'dst2src_duration_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms',
        'dst2src_max_piat_ms', 'dst2src_min_piat_ms',
    ),
    # Per-flag packet counters (psh/urg/syn/fin/rst/ack), one group per direction
    (
        'src2dst_psh_packets', 'src2dst_urg_packets', 'src2dst_syn_packets',
        'src2dst_fin_packets', 'src2dst_rst_packets', 'src2dst_ack_packets',
    ),
    (
        'dst2src_psh_packets', 'dst2src_urg_packets', 'dst2src_syn_packets',
        'dst2src_fin_packets', 'dst2src_rst_packets', 'dst2src_ack_packets',
    ),
)

NFSTREAM_ATTRIBUTES = [name for group in _ATTRIBUTE_GROUPS for name in group]

print(f"NFStream attributes to extract: {len(NFSTREAM_ATTRIBUTES)}")

def extract_nfstream_features(pcap_path: Path, max_flows: int = 250000, label: str = None) -> pd.DataFrame:
    """Extract per-flow features from a PCAP file using NFStream.

    Reads flows from ``pcap_path`` in the order NFStream yields them, copies
    the attributes listed in ``NFSTREAM_ATTRIBUTES`` from each flow, and stops
    after ``max_flows`` flows. Progress is printed every 25,000 flows.

    Args:
        pcap_path: Capture file to read.
        max_flows: Upper bound on the number of flows to extract. Values
            less than or equal to zero extract nothing.
        label: When truthy, stored in a ``Label`` column on every extracted
            flow. The traffic itself is not inspected, so the caller is
            responsible for the label being valid for all extracted flows.

    Returns:
        A DataFrame with one row per flow and one column per entry of
        ``NFSTREAM_ATTRIBUTES`` (plus ``Label`` when a label was given).
        The columns are present even when no flow was extracted. Attributes
        that are missing or ``None`` on a flow are recorded as 0.
    """
    from itertools import islice

    print(f"\n{'='*60}")
    print(f"Extracting from: {pcap_path.name}")
    print(f"Max flows: {max_flows:,}")
    print(f"{'='*60}")

    streamer = NFStreamer(
        source=str(pcap_path),
        statistical_analysis=True,
        splt_analysis=0,
        n_dissections=0,
    )

    flows_list = []
    start_time = time.time()

    # islice enforces the cap up front, so max_flows <= 0 yields no flows
    # instead of letting one flow through before the limit is checked.
    limited_flows = islice(streamer, max(max_flows, 0))
    for flow_count, flow in enumerate(limited_flows, start=1):
        # Extract only the attributes we need
        flow_dict = {}
        for attr in NFSTREAM_ATTRIBUTES:
            try:
                value = getattr(flow, attr, 0)
            except Exception:
                # Best effort: an attribute that cannot be read becomes 0,
                # but KeyboardInterrupt/SystemExit still propagate.
                value = 0
            flow_dict[attr] = 0 if value is None else value

        # Add label if provided
        if label:
            flow_dict['Label'] = label

        flows_list.append(flow_dict)

        if flow_count % 25000 == 0:
            elapsed = time.time() - start_time
            rate = flow_count / elapsed if elapsed > 0 else 0
            print(f"  Processed {flow_count:,} flows... ({rate:.0f} flows/sec)")

    elapsed = time.time() - start_time
    # Guard the division the same way as the progress line above.
    final_rate = len(flows_list) / elapsed if elapsed > 0 else 0
    print(f"\n‚úÖ Extracted {len(flows_list):,} flows in {elapsed:.2f} seconds")
    print(f"   Rate: {final_rate:.0f} flows/second")

    # Passing the columns explicitly keeps the schema stable when no flow
    # was extracted (an empty list would otherwise give a column-less frame).
    columns = list(NFSTREAM_ATTRIBUTES)
    if label:
        columns.append('Label')
    return pd.DataFrame(flows_list, columns=columns)


In [None]:
# Step 1: pull the BENIGN sample (at most 250,000 flows) out of the Monday capture.
print("STEP 1: Extracting BENIGN features from Monday PCAP...")
df_monday = extract_nfstream_features(monday_pcap, max_flows=250000, label='BENIGN')

# Summarise what came back.
monday_summary = (
    f"\nMonday features shape: {df_monday.shape}\n"
    f"Columns: {len(df_monday.columns)}\n"
    "Label distribution:"
)
print(monday_summary)
if 'Label' in df_monday.columns:
    monday_label_counts = df_monday['Label'].value_counts()
    print(monday_label_counts)

# Keep a CSV copy of the extracted features for manual inspection.
monday_csv = DATA_DIR / 'nfstream_monday_features.csv'
df_monday.to_csv(monday_csv, index=False)
print(f"\nüíæ Saved to: {monday_csv}")


## Step 2: Extract Features from Friday PCAP (DDoS)


In [None]:
# Step 2: pull the attack sample (at most 250,000 flows) out of the Friday capture.
# NOTE(review): every extracted flow is labeled 'DDoS' without inspecting it —
# confirm that this slice of the capture contains only attack traffic.
print("STEP 2: Extracting DDoS features from Friday PCAP...")
df_friday = extract_nfstream_features(friday_pcap, max_flows=250000, label='DDoS')

# Summarise what came back.
friday_summary = (
    f"\nFriday features shape: {df_friday.shape}\n"
    f"Columns: {len(df_friday.columns)}\n"
    "Label distribution:"
)
print(friday_summary)
if 'Label' in df_friday.columns:
    friday_label_counts = df_friday['Label'].value_counts()
    print(friday_label_counts)

# Keep a CSV copy of the extracted features for manual inspection.
friday_csv = DATA_DIR / 'nfstream_friday_features.csv'
df_friday.to_csv(friday_csv, index=False)
print(f"\nüíæ Saved to: {friday_csv}")


## Step 3: Combine & Prepare Data


In [None]:
# Stack the Monday and Friday frames into one labeled dataset.
print("STEP 3: Combining datasets...")
df_combined = pd.concat([df_monday, df_friday], ignore_index=True)

print(f"\nCombined dataset shape: {df_combined.shape}")
print("Label distribution:")
print(df_combined['Label'].value_counts())

# Row masks for each class; reused for both counting and sampling.
is_benign = df_combined['Label'] == 'BENIGN'
is_ddos = df_combined['Label'] == 'DDoS'
benign_count = is_benign.sum()
ddos_count = is_ddos.sum()
min_count = min(benign_count, ddos_count)

# Downsample each class to the size of the smaller one (fixed seed).
print(f"\nBalancing classes to {min_count:,} samples each...")
df_benign = df_combined[is_benign].sample(n=min_count, random_state=42)
df_ddos = df_combined[is_ddos].sample(n=min_count, random_state=42)

# Concatenate the two classes, shuffle all rows (fixed seed), renumber.
df_balanced = pd.concat([df_benign, df_ddos], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42)
df_balanced = df_balanced.reset_index(drop=True)

print(f"\nBalanced dataset shape: {df_balanced.shape}")
print("Label distribution:")
print(df_balanced['Label'].value_counts())


In [None]:
def _count_missing_and_infinite(frame):
    """Return (NaN count over all columns, +/-inf count over numeric columns)."""
    missing = frame.isnull().sum().sum()
    infinite = np.isinf(frame.select_dtypes(include=[np.number])).sum().sum()
    return missing, infinite


# Target column on one side, every other column on the other.
y = df_balanced['Label']
X = df_balanced.drop(columns=['Label'])

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Report data-quality problems before touching anything.
missing_before, infinite_before = _count_missing_and_infinite(X)
print(f"\nMissing values: {missing_before}")
print(f"Infinite values: {infinite_before}")

# Turn +/-inf into NaN, impute NaN with the column median, and fall back to
# 0 for anything the median could not fill.
print("\nPreprocessing...")
X = X.replace([np.inf, -np.inf], np.nan)
column_medians = X.median()
X = X.fillna(column_medians).fillna(0)

missing_after, infinite_after = _count_missing_and_infinite(X)
print("After preprocessing:")
print(f"  Missing values: {missing_after}")
print(f"  Infinite values: {infinite_after}")


## Step 4: Train-Test Split


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the label.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Shapes first, then the per-class counts of each partition.
for split_name, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set shape: {split_features.shape}")

for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} label distribution:")
    print(split_labels.value_counts())


## Step 5: Train Random Forest Model


In [None]:
from sklearn.ensemble import RandomForestClassifier

print("Training Random Forest Classifier...")
print("=" * 60)

# Hyperparameters gathered in one place; the fixed seed keeps runs repeatable.
rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
    'verbose': 1,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training partition and time the fit.
start_time = time.time()
rf_model.fit(X_train, y_train)
training_time = time.time() - start_time

training_minutes = training_time / 60
print(f"\n‚úÖ Training completed in {training_time:.2f} seconds ({training_minutes:.2f} minutes)")


## Step 6: Evaluate Model


In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Predict on the held-back test partition.
y_pred = rf_model.predict(X_test)

# Headline metrics; DDoS is treated as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='DDoS', zero_division=0)
recall = recall_score(y_test, y_pred, pos_label='DDoS', zero_division=0)
f1 = f1_score(y_test, y_pred, pos_label='DDoS', zero_division=0)

print("="*60)
print("MODEL PERFORMANCE (Test Set)")
print("="*60)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print("="*60)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix. The label order is pinned explicitly so that row/column 0
# is always BENIGN and row/column 1 is always DDoS, and the matrix is always
# 2x2 even if one class is absent from y_test/y_pred; the TN/FP/FN/TP lines
# below depend on exactly that layout.
confusion_labels = ['BENIGN', 'DDoS']
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=confusion_labels)
print(cm)
true_neg, false_pos, false_neg, true_pos = cm.ravel()
print(f"\nTrue Negatives (BENIGN correctly identified):  {true_neg}")
print(f"False Positives (BENIGN misclassified as DDoS): {false_pos}")
print(f"False Negatives (DDoS misclassified as BENIGN): {false_neg}")
print(f"True Positives (DDoS correctly identified):  {true_pos}")


## Step 7: Test on New PCAP Data (Held-Out) - CRITICAL TEST


In [None]:
# Extract NEW flows from Friday PCAP (different portion) for testing
from itertools import islice

print("STEP 7: Testing on new PCAP data (held-out)...")
print("Extracting NEW flows from Friday PCAP starting from flow 250,000...")

# Re-open the Friday capture with the same NFStream settings used for training
test_streamer = NFStreamer(
    source=str(friday_pcap),
    statistical_analysis=True,
    splt_analysis=0,
    n_dissections=0,
)

test_flows = []
skip_first = 250000  # Skip flows we already used for training
test_count = 50000   # Test on 50K new flows

print(f"Skipping first {skip_first:,} flows, then extracting {test_count:,} flows...")

# islice drops the first `skip_first` flows and stops after `test_count`
# more, replacing the manual skip/break bookkeeping.
held_out_flows = islice(test_streamer, skip_first, skip_first + test_count)
for n_extracted, flow in enumerate(held_out_flows, start=1):
    flow_dict = {}
    for attr in NFSTREAM_ATTRIBUTES:
        try:
            value = getattr(flow, attr, 0)
        except Exception:
            # Best effort: an unreadable attribute becomes 0, but
            # KeyboardInterrupt/SystemExit still propagate.
            value = 0
        flow_dict[attr] = 0 if value is None else value

    test_flows.append(flow_dict)

    if n_extracted % 10000 == 0:
        print(f"  Extracted {n_extracted:,} test flows...")

# Without this check an exhausted capture only surfaces later as a KeyError
# when the (column-less) empty frame is reindexed to the training columns.
if not test_flows:
    raise ValueError(
        f"Friday PCAP yielded no flows beyond the first {skip_first:,}; "
        "there is no held-out data to test on."
    )

df_test_pcap = pd.DataFrame(test_flows)
print(f"\n‚úÖ Extracted {len(df_test_pcap):,} test flows")

# Preprocess test data: +/-inf -> NaN, then impute with the medians of the
# training features (not of the held-out data), then 0 for anything left.
X_test_pcap = df_test_pcap.copy()
X_test_pcap = X_test_pcap.replace([np.inf, -np.inf], np.nan)
X_test_pcap = X_test_pcap.fillna(X_train.median())
X_test_pcap = X_test_pcap.fillna(0)

# Ensure same column order
X_test_pcap = X_test_pcap[X_train.columns]

print(f"Test PCAP features shape: {X_test_pcap.shape}")


In [None]:
# Classify the held-out Friday flows with the trained forest.
print("Making predictions on test PCAP data...")
test_predictions = rf_model.predict(X_test_pcap)

pred_counts = pd.Series(test_predictions).value_counts()
total_flows = len(test_predictions)
banner = "=" * 60

print("\n" + banner)
print("TEST PCAP ANALYSIS RESULTS")
print(banner)
print(f"Total flows analyzed: {total_flows:,}")
print("\nPrediction Distribution:")
for label, count in pred_counts.items():
    pct = count / total_flows * 100
    print(f"  {label}: {count:,} ({pct:.2f}%)")

# Per-class totals; these also drive the save decision later on.
ddos_detected = (test_predictions == 'DDoS').sum()
benign_detected = (test_predictions == 'BENIGN').sum()
ddos_pct = ddos_detected / total_flows * 100
benign_pct = benign_detected / total_flows * 100

# NOTE(review): these flows carry no ground-truth labels, so the verdict only
# checks that the majority of predictions are DDoS — confirm this is an
# acceptable proxy for "detected correctly".
if ddos_detected > benign_detected:
    headline = "‚úÖ SUCCESS! Model correctly detects DDoS attacks in Friday PCAP!"
    closing = "\nüéâ NFStream model works! Ready for production."
else:
    headline = "‚ö†Ô∏è WARNING: Model did not detect DDoS attacks correctly"
    closing = "\n‚ö†Ô∏è Consider using CICFlowMeter instead."

print(f"\n{banner}")
print("VERDICT:")
print(banner)
print(headline)
print(f"   DDoS detected: {ddos_detected:,} ({ddos_pct:.1f}%)")
print(f"   BENIGN detected: {benign_detected:,} ({benign_pct:.1f}%)")
print(closing)
print(banner)


## Step 8: Save Model (If Successful)


In [None]:
# Save model if it works
import joblib

# Persist the model only when the held-out check above came out in its favour.
model_passed = ddos_detected > benign_detected

if not model_passed:
    print("‚ö†Ô∏è Model not saved - did not pass validation test")
    print("   Consider using CICFlowMeter instead.")
else:
    print("Saving model...")

    model_filename = MODELS_DIR / 'random_forest_nfstream_from_scratch.joblib'
    feature_names_file = MODELS_DIR / 'feature_names_nfstream_from_scratch.joblib'
    class_names_file = MODELS_DIR / 'class_names_nfstream_from_scratch.joblib'

    # Three artifacts: the fitted model, the training column names, and the
    # class names. Each is dumped and confirmed in turn.
    artifacts = (
        ("Model", rf_model, model_filename),
        ("Feature names", list(X_train.columns), feature_names_file),
        ("Class names", ['BENIGN', 'DDoS'], class_names_file),
    )
    for description, payload, destination in artifacts:
        joblib.dump(payload, destination)
        print(f"‚úÖ {description} saved: {destination}")

    print("\nüéâ Model saved successfully!")
    print("\nYou can now use this model for PCAP analysis with NFStream.")
