# NFStream Multiclass Model Training

**Goal:** Train a multiclass Random Forest directly on ALL 5 PCAP files using NFStream.

## Classes (9: BENIGN + 8 attack classes):
- BENIGN
- Brute Force (FTP-Patator, SSH-Patator)
- DoS (Slowloris, Slowhttp, Hulk, GoldenEye)
- Heartbleed
- Web Attack (Brute Force, XSS, SQL Injection)
- Infiltration
- Bot
- PortScan
- DDoS

## PCAP Locations:
| Day | Location |
|-----|----------|
| Monday | D:\ |
| Tuesday | D:\ |
| Wednesday | F:\ |
| Thursday | F:\ |
| Friday | pcap/ |

## Step 1: Setup & Configuration

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, time
import warnings
import gc
import joblib
warnings.filterwarnings('ignore')

# NFStream is a hard requirement: fail immediately with an install hint if it is missing.
try:
    from nfstream import NFStreamer
except ImportError:
    raise ImportError("NFStream not installed! Run: pip install nfstream")
else:
    print(f"✅ NFStream ready")

# Project layout: when the notebook is started from a `notebooks/` folder,
# the project root is its parent; otherwise the current directory is the root.
_cwd = Path.cwd()
BASE_DIR = _cwd.parent if _cwd.name == 'notebooks' else _cwd
MODELS_DIR = BASE_DIR / 'models'
DATA_DIR = BASE_DIR / 'data_processed'

# Create the output folders on first run (no error if they already exist).
for _out_dir in (MODELS_DIR, DATA_DIR):
    _out_dir.mkdir(exist_ok=True)

print(f"Base: {BASE_DIR}")
print(f"Models: {MODELS_DIR}")

In [None]:
# Location of each day's capture - UPDATE THESE IF NEEDED
PCAP_FILES = {
    'Monday': Path('D:/Monday-WorkingHours.pcap'),
    'Tuesday': Path('D:/Tuesday-WorkingHours.pcap'),
    'Wednesday': Path('F:/Wednesday-WorkingHours.pcap'),
    'Thursday': Path('F:/Thursday-WorkingHours.pcap'),
    'Friday': BASE_DIR / 'pcap' / 'Friday-WorkingHours.pcap'
}

# Report which captures are reachable (and how big they are) before any extraction starts
print("PCAP Files:")
for day, path in PCAP_FILES.items():
    if not path.exists():
        print(f"  ❌ {day}: {path} NOT FOUND")
        continue
    size_gb = path.stat().st_size / (1024**3)
    print(f"  ✅ {day}: {path} ({size_gb:.2f} GB)")

In [None]:
# NFStream flow attributes used as model inputs (46 in total), grouped by kind.
# Feature rows are built by iterating NFSTREAM_FEATURES, so the order of the
# groups (and of the names inside them) must not change.

# Packet and byte counters per direction
_VOLUME_FEATURES = [
    'src2dst_packets', 'dst2src_packets', 'bidirectional_packets',
    'src2dst_bytes', 'dst2src_bytes', 'bidirectional_bytes',
]

# Packet-size statistics (*_ps)
_PACKET_SIZE_FEATURES = [
    'src2dst_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps',
    'dst2src_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps',
    'bidirectional_min_ps', 'bidirectional_max_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps',
]

# Durations and packet inter-arrival-time statistics (*_piat_ms)
_TIMING_FEATURES = [
    'bidirectional_mean_piat_ms', 'bidirectional_stddev_piat_ms',
    'bidirectional_max_piat_ms', 'bidirectional_min_piat_ms',
    'src2dst_duration_ms', 'src2dst_mean_piat_ms', 'src2dst_stddev_piat_ms',
    'src2dst_max_piat_ms', 'src2dst_min_piat_ms',
    'dst2src_duration_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms',
    'dst2src_max_piat_ms', 'dst2src_min_piat_ms',
]

# TCP flag packet counters per direction
_TCP_FLAG_FEATURES = [
    'src2dst_psh_packets', 'src2dst_urg_packets', 'src2dst_syn_packets',
    'src2dst_fin_packets', 'src2dst_rst_packets', 'src2dst_ack_packets',
    'dst2src_psh_packets', 'dst2src_urg_packets', 'dst2src_syn_packets',
    'dst2src_fin_packets', 'dst2src_rst_packets', 'dst2src_ack_packets',
]

NFSTREAM_FEATURES = [
    'dst_port',
    'bidirectional_duration_ms',
    *_VOLUME_FEATURES,
    *_PACKET_SIZE_FEATURES,
    *_TIMING_FEATURES,
    *_TCP_FLAG_FEATURES,
]

print(f"Features: {len(NFSTREAM_FEATURES)}")

In [None]:
# Attack time schedules (CICIDS2017)
# Format: (start_hour, start_min, end_hour, end_min, label)
#
# Maps each capture day to the time-of-day windows during which flows are
# labeled as an attack; any flow outside every window is labeled BENIGN.
# get_label_for_time() treats both window bounds as inclusive (minute
# resolution) and returns the FIRST window that matches, so the order of the
# tuples matters wherever windows overlap.
# NOTE(review): hours/minutes are compared against datetime.fromtimestamp(),
# i.e. the local timezone of the machine running the notebook - confirm that
# it matches the timezone these windows are expressed in.

ATTACK_SCHEDULES = {
    'Monday': [],  # All BENIGN
    
    'Tuesday': [
        (9, 17, 10, 30, 'Brute Force'),   # FTP-Patator
        (13, 0, 16, 11, 'Brute Force'),   # SSH-Patator
    ],
    
    'Wednesday': [
        (9, 47, 10, 12, 'DoS'),      # Slowloris
        (10, 13, 10, 38, 'DoS'),     # Slowhttptest
        (10, 39, 11, 9, 'DoS'),      # Hulk
        (11, 10, 11, 23, 'DoS'),     # GoldenEye
        (15, 11, 15, 33, 'Heartbleed'),
    ],
    
    'Thursday': [
        (9, 10, 10, 12, 'Web Attack'),    # Brute Force
        (10, 13, 10, 37, 'Web Attack'),   # XSS
        (10, 39, 10, 45, 'Web Attack'),   # SQL Injection
        (14, 15, 15, 50, 'Infiltration'),
    ],
    
    # NOTE(review): the Bot and PortScan windows overlap (12:30-12:59); because
    # the first match wins, those minutes are labeled 'Bot'. Likewise 15:40 is
    # both the PortScan end and the DDoS start and is labeled 'PortScan'.
    # Confirm these boundaries are intended.
    'Friday': [
        (9, 30, 12, 59, 'Bot'),
        (12, 30, 15, 40, 'PortScan'),
        (15, 40, 16, 30, 'DDoS'),
    ],
}

def get_label_for_time(day: str, flow_time_ms: int, tz=None) -> str:
    """Return the traffic label for a flow based on the time of day it started.

    The flow's start timestamp is converted to an hour/minute and compared
    against the windows configured for ``day`` in ``ATTACK_SCHEDULES``.
    Window bounds are inclusive at minute resolution, and the first matching
    window wins.

    Args:
        day: Capture day name used as the key into ``ATTACK_SCHEDULES``
            (e.g. ``'Tuesday'``). Unknown days have no windows.
        flow_time_ms: Flow start time in milliseconds since the Unix epoch.
        tz: Optional ``datetime.tzinfo`` in which the schedule hours are
            expressed. When ``None`` (the default, and the original
            behavior) the local timezone of the running machine is used, so
            labels depend on where the notebook is executed.

    Returns:
        The label of the first matching window, or ``'BENIGN'`` when no
        window matches or the timestamp cannot be converted.
    """
    try:
        started = datetime.fromtimestamp(flow_time_ms / 1000, tz=tz)
    except (TypeError, ValueError, OverflowError, OSError):
        # Missing, non-numeric or out-of-range timestamp: the flow cannot be
        # placed in any attack window, so treat it as benign (best effort).
        return 'BENIGN'

    # Minutes since midnight; computed once because it is the same for every window.
    flow_mins = started.hour * 60 + started.minute

    for start_h, start_m, end_h, end_m, label in ATTACK_SCHEDULES.get(day, []):
        if start_h * 60 + start_m <= flow_mins <= end_h * 60 + end_m:
            return label

    return 'BENIGN'

print("Attack schedules configured")

## Step 2: Memory-Efficient Feature Extraction

In [None]:
def _flow_to_feature_row(flow) -> dict:
    """Build one float32 feature row from a flow; missing or non-numeric values become 0."""
    row = {}
    for attr in NFSTREAM_FEATURES:
        value = getattr(flow, attr, 0)
        try:
            # float32 halves the memory footprint compared with Python floats / float64
            row[attr] = np.float32(value) if value is not None else np.float32(0)
        except (TypeError, ValueError, OverflowError):
            row[attr] = np.float32(0)
    return row


def extract_features_chunked(pcap_path: Path, day: str, 
                             max_flows: int = 100000,
                             chunk_size: int = 25000) -> pd.DataFrame:
    """
    Extract labeled flow features from a PCAP, stopping after ``max_flows`` flows.

    Flows are consumed in the order the streamer yields them and extraction
    stops once ``max_flows`` rows were collected. The result is therefore the
    FIRST ``max_flows`` flows of the capture, not an even sample across it;
    attack windows that start later than the last flow read will not appear
    in the output. Check the printed label distribution after each call.

    Args:
        pcap_path: Path of the capture file to read.
        day: Capture day name, passed to ``get_label_for_time`` for labeling.
        max_flows: Stop after this many flows have been collected.
        chunk_size: Print progress and run ``gc.collect()`` every
            ``chunk_size`` flows. Values <= 0 disable progress reporting.

    Returns:
        A DataFrame with one float32 column per entry of ``NFSTREAM_FEATURES``
        plus a string ``Label`` column. An empty DataFrame is returned when
        ``pcap_path`` does not exist.
    """
    # Imported under an alias because `time` is also imported from datetime in this notebook.
    import time as time_module

    print(f"\n{'='*60}")
    print(f"Extracting: {pcap_path.name}")
    print(f"Max flows: {max_flows:,}")
    print(f"{'='*60}")
    
    if not pcap_path.exists():
        print(f"  ❌ File not found, skipping")
        return pd.DataFrame()
    
    streamer = NFStreamer(
        source=str(pcap_path),
        statistical_analysis=True,
        splt_analysis=0,
        n_dissections=0,
    )
    
    flows_list = []
    flow_count = 0
    label_counts = {}
    start_time = time_module.time()
    
    for flow in streamer:
        flow_dict = _flow_to_feature_row(flow)
        
        # Label the flow from the time its first packet was seen
        flow_time = getattr(flow, 'bidirectional_first_seen_ms', 0)
        label = get_label_for_time(day, flow_time)
        flow_dict['Label'] = label
        
        flows_list.append(flow_dict)
        flow_count += 1
        label_counts[label] = label_counts.get(label, 0) + 1
        
        # Periodic progress report; guarded so chunk_size <= 0 cannot divide by zero
        if chunk_size > 0 and flow_count % chunk_size == 0:
            elapsed = time_module.time() - start_time
            rate = flow_count / elapsed if elapsed > 0 else 0
            print(f"  Processed {flow_count:,} flows ({rate:.0f}/sec)")
            gc.collect()  # Free memory
        
        if flow_count >= max_flows:
            break
    
    elapsed = time_module.time() - start_time
    print(f"\n✅ Extracted {flow_count:,} flows in {elapsed:.1f}s")
    print(f"Label distribution: {label_counts}")
    
    df = pd.DataFrame(flows_list)
    
    # Defensive down-cast: any feature column that ended up as float64 is stored as float32
    for col in df.columns:
        if col != 'Label' and df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
    
    gc.collect()
    return df

print("Extraction function ready")

## Step 3: Extract Features from All PCAPs

**Run each cell one at a time** to control memory and monitor progress.

In [None]:
# Monday has no attack windows configured, so every flow is labeled BENIGN.
# A smaller cap is used here than for the attack days.
df_monday = extract_features_chunked(
    pcap_path=PCAP_FILES['Monday'],
    day='Monday',
    max_flows=80000,
)
monday_mb = df_monday.memory_usage(deep=True).sum() / 1024**2
print(f"\nMonday shape: {df_monday.shape}")
print(f"Memory: {monday_mb:.1f} MB")

In [None]:
# Tuesday: Brute Force windows (FTP-Patator, SSH-Patator)
df_tuesday = extract_features_chunked(
    pcap_path=PCAP_FILES['Tuesday'],
    day='Tuesday',
    max_flows=100000,
)
tuesday_mb = df_tuesday.memory_usage(deep=True).sum() / 1024**2
print(f"\nTuesday shape: {df_tuesday.shape}")
print(f"Memory: {tuesday_mb:.1f} MB")

In [None]:
# Wednesday: DoS and Heartbleed windows
df_wednesday = extract_features_chunked(
    pcap_path=PCAP_FILES['Wednesday'],
    day='Wednesday',
    max_flows=100000,
)
wednesday_mb = df_wednesday.memory_usage(deep=True).sum() / 1024**2
print(f"\nWednesday shape: {df_wednesday.shape}")
print(f"Memory: {wednesday_mb:.1f} MB")

In [None]:
# Thursday: Web Attack and Infiltration windows
df_thursday = extract_features_chunked(
    pcap_path=PCAP_FILES['Thursday'],
    day='Thursday',
    max_flows=100000,
)
thursday_mb = df_thursday.memory_usage(deep=True).sum() / 1024**2
print(f"\nThursday shape: {df_thursday.shape}")
print(f"Memory: {thursday_mb:.1f} MB")

In [None]:
# Friday: Bot, PortScan and DDoS windows (larger cap than the other days)
df_friday = extract_features_chunked(
    pcap_path=PCAP_FILES['Friday'],
    day='Friday',
    max_flows=150000,
)
friday_mb = df_friday.memory_usage(deep=True).sum() / 1024**2
print(f"\nFriday shape: {df_friday.shape}")
print(f"Memory: {friday_mb:.1f} MB")

In [None]:
# Optional checkpoint: write each day's flows to CSV so they can be recovered later.
# The frames are zipped inline (not stored in a named container) so no extra
# long-lived reference to them is created.
for day, df in zip(
    ('monday', 'tuesday', 'wednesday', 'thursday', 'friday'),
    (df_monday, df_tuesday, df_wednesday, df_thursday, df_friday),
):
    if len(df) == 0:
        continue  # nothing was extracted for this day
    path = DATA_DIR / f'nfstream_{day}_multiclass.csv'
    df.to_csv(path, index=False)
    print(f"Saved: {path.name} ({len(df):,} rows)")

## Step 4: Combine and Balance Dataset

In [None]:
# Combine all days into one frame, skipping days for which nothing was extracted
all_dfs = []
for name, df in [('Monday', df_monday), ('Tuesday', df_tuesday), 
                 ('Wednesday', df_wednesday), ('Thursday', df_thursday), ('Friday', df_friday)]:
    if len(df) > 0:
        print(f"{name}: {len(df):,} flows")
        all_dfs.append(df)

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what actually went wrong.
if not all_dfs:
    raise ValueError(
        "No flows were extracted from any PCAP - check the PCAP_FILES paths "
        "and re-run the extraction cells"
    )

df_combined = pd.concat(all_dfs, ignore_index=True)

# Free memory: the per-day frames are also referenced by `all_dfs` and by the
# loop variable `df`, so those must be dropped too or nothing is released.
del df_monday, df_tuesday, df_wednesday, df_thursday, df_friday
del all_dfs, df
gc.collect()

print(f"\nCombined: {len(df_combined):,} flows")
print(f"Memory: {df_combined.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

In [None]:
# Per-class flow counts and their share of the combined dataset
print("Label Distribution:")
label_counts = df_combined['Label'].value_counts()
total_flows = len(df_combined)
for label, count in label_counts.items():
    print(f"  {label}: {count:,} ({count / total_flows * 100:.1f}%)")

In [None]:
# Down-sample over-represented classes so that no single label dominates training.
# Per-class cap = 3x the rarest class size (that size is floored at 1,000),
# and never more than 50,000. Classes smaller than the cap are kept whole.
min_count = max(label_counts.min(), 1000)  # At least 1000 samples
target_per_class = min(min_count * 3, 50000)  # Cap at 50k per class

print(f"\nBalancing to ~{target_per_class:,} samples per class...")

# Labels are visited in label_counts order; with the fixed seeds this keeps the result reproducible.
balanced_dfs = []
for label in label_counts.index:
    rows_for_label = df_combined.loc[df_combined['Label'] == label]
    n_keep = min(len(rows_for_label), target_per_class)
    balanced_dfs.append(rows_for_label.sample(n=n_keep, random_state=42))
    print(f"  {label}: {n_keep:,} samples")

# Stack the per-class samples, then shuffle the rows so classes are interleaved
df_balanced = (
    pd.concat(balanced_dfs, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# The unbalanced frame is no longer needed
del df_combined
gc.collect()

balanced_mb = df_balanced.memory_usage(deep=True).sum() / 1024**2
print(f"\nBalanced dataset: {len(df_balanced):,} flows")
print(f"Memory: {balanced_mb:.1f} MB")

## Step 5: Prepare Features

In [None]:
# Target vector: the class label of every flow
y = df_balanced['Label']

# Feature matrix: everything except the label, with non-finite values neutralized
X = (
    df_balanced.drop(columns=['Label'])
    .replace([np.inf, -np.inf], np.nan)  # +/-inf -> NaN ...
    .fillna(0)                           # ... and every NaN -> 0
)

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"\nClasses: {sorted(y.unique())}")

In [None]:
# Hold out 20% of the balanced data for evaluation.
# `stratify=y` keeps the class proportions the same in both parts.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training: {len(X_train):,} samples")
print(f"Testing: {len(X_test):,} samples")

## Step 6: Train Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): this rebinds the name `time`, which the setup cell imported
# from `datetime`; from here on `time` refers to the stdlib time module.
import time

print("Training Random Forest (multiclass)...")
print("This may take 5-10 minutes.")
print("="*60)

# Hyper-parameters chosen to bound memory use and training time
rf_params = dict(
    n_estimators=100,
    max_depth=25,              # cap tree depth to limit memory
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',       # consider sqrt(n_features) candidates per split
    class_weight='balanced',   # re-weight classes that remain under-represented
    n_jobs=-1,                 # use all CPU cores
    random_state=42,
    verbose=1,
)
rf_model = RandomForestClassifier(**rf_params)

# Time only the fit itself
start = time.time()
rf_model.fit(X_train, y_train)
training_time = time.time() - start

print(f"\n✅ Training completed in {training_time:.1f} seconds")

## Step 7: Evaluate Model

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Score the held-out split
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# zero_division=0 reports 0 (instead of warning) for classes that were never predicted
report = classification_report(y_test, y_pred, zero_division=0)

banner = "="*60
print(banner)
print(f"MODEL PERFORMANCE")
print(banner)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\nClassification Report:")
print(report)

In [None]:
# Confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Pass the label order explicitly: without `labels=`, the matrix only covers
# labels that occur in y_test or y_pred, so its rows/columns could disagree
# with the tick labels below if a class is missing from the test split.
classes = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred, labels=classes)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes, yticklabels=classes)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

## Step 8: Save Model

In [None]:
# Computed once here so this cell does not depend on the (optional) plotting
# cell having been run, and so every artifact records the same values.
class_names = sorted(y.unique())
feature_names = list(X.columns)

# Save model
model_path = MODELS_DIR / 'random_forest_nfstream_multiclass.joblib'
joblib.dump(rf_model, model_path)
print(f"✅ Model saved: {model_path}")

# Save feature names (column order the model was trained with)
features_path = MODELS_DIR / 'feature_names_nfstream_multiclass.joblib'
joblib.dump(feature_names, features_path)
print(f"✅ Features saved: {features_path}")

# Save class names
classes_path = MODELS_DIR / 'class_names_nfstream_multiclass.joblib'
joblib.dump(class_names, classes_path)
print(f"✅ Classes saved: {classes_path}")

# Save a small plain-text summary of the run
info_path = MODELS_DIR / 'nfstream_multiclass_info.txt'
with open(info_path, 'w', encoding='utf-8') as f:
    f.write(f"accuracy: {accuracy}\n")
    f.write(f"n_classes: {len(class_names)}\n")
    f.write(f"n_features: {len(feature_names)}\n")
    f.write(f"classes: {class_names}\n")
    f.write(f"training_samples: {len(X_train)}\n")

print(f"\n{'='*60}")
print("TRAINING COMPLETE!")
print(f"{'='*60}")
print(f"Model: {model_path.name}")
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Classes: {class_names}")

## Step 9: Quick Test (Optional)

In [None]:
# Smoke-test the trained model on the first flows of the Friday PCAP.
# NOTE: the training extraction also reads the Friday capture from its start,
# so these flows overlap the data the model was built from. This checks that
# the prediction pipeline runs end to end; it is not an independent evaluation.
from itertools import islice

MAX_TEST_FLOWS = 10000

print("Testing model on fresh Friday PCAP sample...")

# Same streamer settings as the training extraction
test_streamer = NFStreamer(
    source=str(PCAP_FILES['Friday']),
    statistical_analysis=True,
    splt_analysis=0,
    n_dissections=0,
)

test_flows = [
    {attr: np.float32(getattr(flow, attr, 0)) for attr in NFSTREAM_FEATURES}
    for flow in islice(test_streamer, MAX_TEST_FLOWS)
]

# Fix the column order to the training feature order, even when no flow was read
df_test = pd.DataFrame(test_flows, columns=NFSTREAM_FEATURES)
df_test = df_test.replace([np.inf, -np.inf], np.nan).fillna(0)

if df_test.empty:
    print("\nNo flows were read from the Friday PCAP - nothing to predict")
else:
    predictions = rf_model.predict(df_test)

    # Report the number of flows actually scored (may be fewer than the cap)
    print(f"\nTest Results ({len(predictions):,} flows):")
    for label, count in pd.Series(predictions).value_counts().items():
        pct = count / len(predictions) * 100
        print(f"  {label}: {count:,} ({pct:.1f}%)")

---
## Next Steps

1. Update `predictor.py` to load new model
2. Update `analyzer.py` with `nfstream_multiclass` option
3. Update `ai_service/main.py` to use new model
4. Restart FastAPI and test