# CIC-IDS2017 EDA - Notebook 5: Final Preprocessing Plan

In [1]:
# =============================================================================
# IMPORTS
# =============================================================================

import pandas as pd
import numpy as np
import os
import json
import warnings
from datetime import datetime
import pickle

warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print(f"Final Preprocessing Plan: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Final Preprocessing Plan: 2025-12-23 19:35:33


In [2]:
# =============================================================================
# LOAD ALL EDA OUTPUTS
# =============================================================================

OUTPUT_DIR = "eda_outputs/"

# Load global sample
df = pd.read_csv(os.path.join(OUTPUT_DIR, 'global_sample.csv'), low_memory=False)
print(f"Sample: {df.shape[0]:,} rows × {df.shape[1]} columns")

# Load Phase 3 outputs
with open(os.path.join(OUTPUT_DIR, 'preliminary_transform_plan.json'), 'r') as f:
    transform_plan = json.load(f)
print("✓ Phase 3: Transform plan")

with open(os.path.join(OUTPUT_DIR, 'data_quality_issues.json'), 'r') as f:
    data_quality = json.load(f)
print("✓ Phase 3: Data quality issues")

# Load Phase 5 outputs
df_ranking = pd.read_csv(os.path.join(OUTPUT_DIR, 'feature_ranking.csv'), index_col=0)
print(f"✓ Phase 5: Feature ranking ({len(df_ranking)} features)")

with open(os.path.join(OUTPUT_DIR, 'target_analysis_stats.json'), 'r') as f:
    target_stats = json.load(f)
print("✓ Phase 5: Target stats")

# Load Phase 6-7 outputs
df_high_corr = pd.read_csv(os.path.join(OUTPUT_DIR, 'high_correlation_pairs.csv'))
print(f"✓ Phase 6: High correlation pairs ({len(df_high_corr)} pairs)")

with open(os.path.join(OUTPUT_DIR, 'phase67_summary.json'), 'r') as f:
    phase67_summary = json.load(f)
print("✓ Phase 7: Semantic summary")

Sample: 283,138 rows × 87 columns
✓ Phase 3: Transform plan
✓ Phase 3: Data quality issues
✓ Phase 5: Feature ranking (65 features)
✓ Phase 5: Target stats
✓ Phase 6: High correlation pairs (55 pairs)
✓ Phase 7: Semantic summary


---
# 1. Final Feature Inventory

In [3]:
# =============================================================================
# FINAL FEATURE INVENTORY
# =============================================================================

print("=" * 70)
print("FINAL FEATURE INVENTORY")
print("=" * 70)

# All columns
all_columns = set(df.columns)

# Identifiers (never use as features)
IDENTIFIERS = {'Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Source_File'}

# Target columns
TARGET_COLS = {'Label'}

# Derived targets (created during EDA)
DERIVED_TARGETS = {'Is_Attack', 'Attack_Family'}

# Features to DROP - Phase 3 (constant/zero-variance)
DROP_CONSTANT = set(transform_plan.get('drop', []))

# Features to DROP - Phase 6 (redundant, r >= 0.99)
DROP_REDUNDANT = set(phase67_summary['correlation_analysis']['features_to_drop_redundant'])

# All features to drop
DROP_ALL = DROP_CONSTANT | DROP_REDUNDANT

# Categorical features (special handling)
CATEGORICAL = {'Protocol', 'Destination Port'}

# Numerical features (usable)
NUMERICAL_USABLE = []
for col in df.columns:
    if col in IDENTIFIERS | TARGET_COLS | DERIVED_TARGETS | DROP_ALL | CATEGORICAL:
        continue
    if df[col].dtype in ['int64', 'int32', 'float64', 'float32']:
        NUMERICAL_USABLE.append(col)

NUMERICAL_USABLE = sorted(NUMERICAL_USABLE)

print(f"\n--- Column Categories ---\n")
print(f"{'Category':<30} {'Count':>10}")
print("-" * 42)
print(f"{'Total columns':<30} {len(all_columns):>10}")
print(f"{'Identifiers':<30} {len(IDENTIFIERS & all_columns):>10}")
print(f"{'Target columns':<30} {len(TARGET_COLS):>10}")
print(f"{'Derived targets':<30} {len(DERIVED_TARGETS & all_columns):>10}")
print(f"{'DROP (constant)':<30} {len(DROP_CONSTANT):>10}")
print(f"{'DROP (redundant)':<30} {len(DROP_REDUNDANT):>10}")
print(f"{'Categorical':<30} {len(CATEGORICAL):>10}")
print(f"{'Numerical (usable)':<30} {len(NUMERICAL_USABLE):>10}")

# Verify
accounted = len(IDENTIFIERS & all_columns) + len(TARGET_COLS) + len(DERIVED_TARGETS & all_columns) + \
            len(DROP_ALL) + len(CATEGORICAL) + len(NUMERICAL_USABLE)
print(f"\nVerification: {accounted} accounted vs {len(all_columns)} total")

FINAL FEATURE INVENTORY

--- Column Categories ---

Category                            Count
------------------------------------------
Total columns                          87
Identifiers                             5
Target columns                          1
Derived targets                         2
DROP (constant)                        12
DROP (redundant)                       20
Categorical                             2
Numerical (usable)                     45

Verification: 87 accounted vs 87 total


In [4]:
# =============================================================================
# DROP LISTS
# =============================================================================

print("\n--- Features to DROP (Constant/Zero-Variance) ---")
for feat in sorted(DROP_CONSTANT):
    print(f"  • {feat}")

print(f"\n--- Features to DROP (Redundant, r ≥ 0.99) ---")
for feat in sorted(DROP_REDUNDANT):
    print(f"  • {feat}")

print(f"\n→ Total to DROP: {len(DROP_ALL)} features")


--- Features to DROP (Constant/Zero-Variance) ---
  • Bwd Avg Bulk Rate
  • Bwd Avg Bytes/Bulk
  • Bwd Avg Packets/Bulk
  • Bwd PSH Flags
  • Bwd URG Flags
  • CWE Flag Count
  • ECE Flag Count
  • Fwd Avg Bulk Rate
  • Fwd Avg Bytes/Bulk
  • Fwd Avg Packets/Bulk
  • Fwd URG Flags
  • RST Flag Count

--- Features to DROP (Redundant, r ≥ 0.99) ---
  • Active Mean
  • Active Min
  • Average Packet Size
  • Avg Bwd Segment Size
  • Avg Fwd Segment Size
  • Bwd IAT Max
  • Bwd IAT Total
  • Flow Duration
  • Flow IAT Mean
  • Fwd IAT Mean
  • Fwd IAT Total
  • Fwd PSH Flags
  • Fwd Packets/s
  • Idle Mean
  • Idle Min
  • Packet Length Std
  • Subflow Bwd Bytes
  • Subflow Fwd Bytes
  • Total Backward Packets
  • Total Fwd Packets

→ Total to DROP: 32 features


In [5]:
# =============================================================================
# FEATURE TIERS (by discriminative power)
# =============================================================================

print("\n--- Feature Tiers (by Discriminative Power) ---\n")

# Get tiers from ranking (excluding dropped features)
df_ranking_clean = df_ranking[~df_ranking.index.isin(DROP_ALL)]

top_tier = df_ranking_clean[df_ranking_clean['combined_score'] >= 0.5].index.tolist()
mid_tier = df_ranking_clean[(df_ranking_clean['combined_score'] >= 0.2) & 
                            (df_ranking_clean['combined_score'] < 0.5)].index.tolist()
low_tier = df_ranking_clean[df_ranking_clean['combined_score'] < 0.2].index.tolist()

print(f"TOP TIER (score ≥ 0.5): {len(top_tier)} features")
for i, feat in enumerate(top_tier[:15]):
    score = df_ranking_clean.loc[feat, 'combined_score']
    print(f"  {i+1:2}. {feat}: {score:.3f}")
if len(top_tier) > 15:
    print(f"  ... and {len(top_tier) - 15} more")

print(f"\nMID TIER (0.2 ≤ score < 0.5): {len(mid_tier)} features")
print(f"LOW TIER (score < 0.2): {len(low_tier)} features")


--- Feature Tiers (by Discriminative Power) ---

TOP TIER (score ≥ 0.5): 14 features
   1. Init_Win_bytes_backward: 0.945
   2. Min Packet Length: 0.753
   3. Fwd Packet Length Min: 0.739
   4. Bwd Packet Length Std: 0.679
   5. Fwd IAT Std: 0.672
   6. Bwd Packet Length Min: 0.648
   7. Fwd IAT Max: 0.622
   8. Bwd Packets/s: 0.589
   9. Idle Max: 0.585
  10. Fwd Packet Length Mean: 0.556
  11. Fwd Packet Length Max: 0.535
  12. PSH Flag Count: 0.527
  13. Total Length of Fwd Packets: 0.517
  14. Flow IAT Std: 0.502

MID TIER (0.2 ≤ score < 0.5): 21 features
LOW TIER (score < 0.2): 10 features


---
# 2. Preprocessing Pipeline Definition

In [6]:
# =============================================================================
# PREPROCESSING PIPELINE DEFINITION
# =============================================================================

print("=" * 70)
print("PREPROCESSING PIPELINE")
print("=" * 70)

PREPROCESSING_PIPELINE = {
    'step_1_data_cleaning': {
        'description': 'Remove corrupted rows and handle special values',
        'actions': [
            'Remove rows with integer overflow (Fwd/Bwd Header Length < -1e9)',
            'Replace infinite values with 0 (Flow Bytes/s, Flow Packets/s)',
            'Clip small negative timing values to 0 (Flow IAT Min, Flow Duration)',
        ]
    },
    'step_2_feature_engineering': {
        'description': 'Create new features from semantic analysis',
        'new_features': [
            ('has_tcp_handshake', 'Init_Win_bytes_backward != -1'),
            ('is_zero_window', 'Init_Win_bytes_backward == 0'),
            ('is_high_window', 'Init_Win_bytes_backward > 10000'),
            ('is_zero_duration', 'Flow Duration == 0'),
            ('init_win_bwd_clean', 'max(0, Init_Win_bytes_backward)'),
            ('init_win_fwd_clean', 'max(0, Init_Win_bytes_forward)'),
        ]
    },
    'step_3_feature_selection': {
        'description': 'Drop constant and redundant features',
        'drop_constant': list(DROP_CONSTANT),
        'drop_redundant': list(DROP_REDUNDANT),
    },
    'step_4_categorical_encoding': {
        'description': 'Encode categorical features',
        'Protocol': 'One-hot encoding (3 values: TCP, UDP, HOPOPT)',
        'Destination Port': 'Frequency encoding (24K unique values)',
    },
    'step_5_numerical_scaling': {
        'description': 'Scale numerical features',
        'right_skewed': 'log1p + StandardScaler',
        'normal_like': 'StandardScaler',
        'sparse': 'No transform (keep as-is)',
        'binary_engineered': 'No transform (already 0/1)',
    },
}

for step, details in PREPROCESSING_PIPELINE.items():
    print(f"\n{step.upper()}")
    print(f"  {details['description']}")
    for key, value in details.items():
        if key == 'description':
            continue
        if isinstance(value, list):
            print(f"  {key}:")
            for item in value[:5]:
                if isinstance(item, tuple):
                    print(f"    • {item[0]}: {item[1]}")
                else:
                    print(f"    • {item}")
            if len(value) > 5:
                print(f"    ... and {len(value) - 5} more")
        else:
            print(f"  {key}: {value}")

PREPROCESSING PIPELINE

STEP_1_DATA_CLEANING
  Remove corrupted rows and handle special values
  actions:
    • Remove rows with integer overflow (Fwd/Bwd Header Length < -1e9)
    • Replace infinite values with 0 (Flow Bytes/s, Flow Packets/s)
    • Clip small negative timing values to 0 (Flow IAT Min, Flow Duration)

STEP_2_FEATURE_ENGINEERING
  Create new features from semantic analysis
  new_features:
    • has_tcp_handshake: Init_Win_bytes_backward != -1
    • is_zero_window: Init_Win_bytes_backward == 0
    • is_high_window: Init_Win_bytes_backward > 10000
    • is_zero_duration: Flow Duration == 0
    • init_win_bwd_clean: max(0, Init_Win_bytes_backward)
    ... and 1 more

STEP_3_FEATURE_SELECTION
  Drop constant and redundant features
  drop_constant:
    • Fwd Avg Bytes/Bulk
    • ECE Flag Count
    • Bwd Avg Packets/Bulk
    • Bwd Avg Bulk Rate
    • CWE Flag Count
    ... and 7 more
  drop_redundant:
    • Total Backward Packets
    • Fwd IAT Total
    • Subflow Fwd Bytes
 

In [7]:
# =============================================================================
# FEATURE GROUPS FOR SCALING
# =============================================================================

print("\n--- Feature Groups for Scaling ---\n")

# Get from Phase 3 transform plan
SPARSE_FEATURES = set(transform_plan.get('sparse', []))
PENDING_FEATURES = set(transform_plan.get('pending_phase7', []))

# Features that need log transform (right-skewed)
# Most network features are right-skewed
LOG_TRANSFORM = []
STANDARD_SCALE = []
NO_TRANSFORM = []

for feat in NUMERICAL_USABLE:
    if feat in SPARSE_FEATURES:
        NO_TRANSFORM.append(feat)
    elif feat in PENDING_FEATURES:
        # These will be handled by feature engineering
        NO_TRANSFORM.append(feat)
    elif 'Flag' in feat or 'Count' in feat:
        # Count features are sparse
        NO_TRANSFORM.append(feat)
    else:
        # Most continuous features are right-skewed
        LOG_TRANSFORM.append(feat)

print(f"Log1p + StandardScaler: {len(LOG_TRANSFORM)} features")
print(f"No transform (sparse/binary): {len(NO_TRANSFORM)} features")

# Binary features from engineering
BINARY_ENGINEERED = [
    'has_tcp_handshake',
    'is_zero_window',
    'is_high_window',
    'is_zero_duration',
]

print(f"Binary (engineered): {len(BINARY_ENGINEERED)} features")


--- Feature Groups for Scaling ---

Log1p + StandardScaler: 27 features
No transform (sparse/binary): 18 features
Binary (engineered): 4 features


---
# 3. Feature Engineering Implementation

In [8]:
# =============================================================================
# IMPLEMENT FEATURE ENGINEERING
# =============================================================================

print("=" * 70)
print("FEATURE ENGINEERING IMPLEMENTATION")
print("=" * 70)

df_processed = df.copy()

# Step 1: Data Cleaning
print("\n--- Step 1: Data Cleaning ---")

# Remove rows with integer overflow
overflow_mask = (df_processed['Fwd Header Length'] < -1e9) | \
                (df_processed['Bwd Header Length'] < -1e9)
n_overflow = overflow_mask.sum()
df_processed = df_processed[~overflow_mask].copy()
print(f"  Removed {n_overflow} rows with integer overflow")

# Replace infinite values
for col in ['Flow Bytes/s', 'Flow Packets/s']:
    if col in df_processed.columns:
        n_inf = np.isinf(df_processed[col]).sum()
        df_processed[col] = df_processed[col].replace([np.inf, -np.inf], 0)
        print(f"  Replaced {n_inf} inf values in {col}")

# Clip small negative timing values
timing_cols = ['Flow IAT Min', 'Flow Duration']
for col in timing_cols:
    if col in df_processed.columns:
        n_neg = (df_processed[col] < 0).sum()
        df_processed[col] = df_processed[col].clip(lower=0)
        print(f"  Clipped {n_neg} negative values in {col}")

print(f"\n  Dataset after cleaning: {len(df_processed):,} rows")

FEATURE ENGINEERING IMPLEMENTATION

--- Step 1: Data Cleaning ---
  Removed 1 rows with integer overflow
  Replaced 150 inf values in Flow Bytes/s
  Replaced 275 inf values in Flow Packets/s
  Clipped 300 negative values in Flow IAT Min
  Clipped 11 negative values in Flow Duration

  Dataset after cleaning: 283,137 rows


In [9]:
# Step 2: Create Engineered Features
print("\n--- Step 2: Feature Engineering ---")

# From Init_Win_bytes_backward (the #1 discriminator)
if 'Init_Win_bytes_backward' in df_processed.columns:
    # Binary: TCP handshake occurred
    df_processed['has_tcp_handshake'] = (df_processed['Init_Win_bytes_backward'] != -1).astype(int)
    print(f"  Created: has_tcp_handshake (1 if TCP handshake)")
    
    # Binary: PortScan signature (zero window)
    df_processed['is_zero_window'] = (df_processed['Init_Win_bytes_backward'] == 0).astype(int)
    print(f"  Created: is_zero_window (1 if Init_Win = 0, PortScan signature)")
    
    # Binary: Web Attack signature (high window)
    df_processed['is_high_window'] = (df_processed['Init_Win_bytes_backward'] > 10000).astype(int)
    print(f"  Created: is_high_window (1 if Init_Win > 10000, Web Attack signature)")
    
    # Cleaned version (sentinel replaced with 0)
    df_processed['init_win_bwd_clean'] = df_processed['Init_Win_bytes_backward'].clip(lower=0)
    print(f"  Created: init_win_bwd_clean (sentinel replaced with 0)")

# From Init_Win_bytes_forward
if 'Init_Win_bytes_forward' in df_processed.columns:
    df_processed['init_win_fwd_clean'] = df_processed['Init_Win_bytes_forward'].clip(lower=0)
    print(f"  Created: init_win_fwd_clean (sentinel replaced with 0)")

# From Flow Duration
if 'Flow Duration' in df_processed.columns:
    df_processed['is_zero_duration'] = (df_processed['Flow Duration'] == 0).astype(int)
    print(f"  Created: is_zero_duration (1 if instantaneous flow)")

# Ratios
if 'Total Fwd Packets' in df_processed.columns and 'Total Backward Packets' in df_processed.columns:
    df_processed['fwd_bwd_packet_ratio'] = df_processed['Total Fwd Packets'] / \
                                           (df_processed['Total Backward Packets'] + 1)
    print(f"  Created: fwd_bwd_packet_ratio")

if 'Total Length of Fwd Packets' in df_processed.columns and 'Total Length of Bwd Packets' in df_processed.columns:
    df_processed['fwd_bwd_bytes_ratio'] = df_processed['Total Length of Fwd Packets'] / \
                                          (df_processed['Total Length of Bwd Packets'] + 1)
    print(f"  Created: fwd_bwd_bytes_ratio")

print(f"\n  New features created: 8")


--- Step 2: Feature Engineering ---
  Created: has_tcp_handshake (1 if TCP handshake)
  Created: is_zero_window (1 if Init_Win = 0, PortScan signature)
  Created: is_high_window (1 if Init_Win > 10000, Web Attack signature)
  Created: init_win_bwd_clean (sentinel replaced with 0)
  Created: init_win_fwd_clean (sentinel replaced with 0)
  Created: is_zero_duration (1 if instantaneous flow)
  Created: fwd_bwd_packet_ratio
  Created: fwd_bwd_bytes_ratio

  New features created: 8


In [10]:
# Verify engineered features
print("\n--- Verify Engineered Features ---\n")

engineered = ['has_tcp_handshake', 'is_zero_window', 'is_high_window', 
              'is_zero_duration', 'init_win_bwd_clean', 'init_win_fwd_clean',
              'fwd_bwd_packet_ratio', 'fwd_bwd_bytes_ratio']

for feat in engineered:
    if feat in df_processed.columns:
        if df_processed[feat].dtype in ['int64', 'int32']:
            # Binary feature
            pct_1 = df_processed[feat].mean() * 100
            print(f"{feat}: {pct_1:.1f}% = 1")
        else:
            # Continuous feature
            print(f"{feat}: min={df_processed[feat].min():.2f}, median={df_processed[feat].median():.2f}, max={df_processed[feat].max():.2f}")


--- Verify Engineered Features ---

has_tcp_handshake: 49.0% = 1
is_zero_window: 9.6% = 1
is_high_window: 5.2% = 1
is_zero_duration: 0.1% = 1
init_win_bwd_clean: 197217.0% = 1
init_win_fwd_clean: 702987.1% = 1
fwd_bwd_packet_ratio: min=0.01, median=0.67, max=741.00
fwd_bwd_bytes_ratio: min=0.00, median=0.34, max=169105.00


In [11]:
# Validate attack signatures in engineered features
print("\n--- Validate Attack Signatures ---\n")

if 'Label' in df_processed.columns:
    # PortScan should have high is_zero_window
    portscan = df_processed[df_processed['Label'] == 'PortScan']
    print(f"PortScan (n={len(portscan):,}):")
    print(f"  is_zero_window = 1: {portscan['is_zero_window'].mean()*100:.1f}%")
    print(f"  has_tcp_handshake = 1: {portscan['has_tcp_handshake'].mean()*100:.1f}%")
    
    # Web Attacks should have high is_high_window
    web_attack = df_processed[df_processed['Label'].str.contains('Web Attack', na=False)]
    print(f"\nWeb Attacks (n={len(web_attack):,}):")
    print(f"  is_high_window = 1: {web_attack['is_high_window'].mean()*100:.1f}%")
    
    # BENIGN baseline
    benign = df_processed[df_processed['Label'] == 'BENIGN']
    print(f"\nBENIGN (n={len(benign):,}):")
    print(f"  is_zero_window = 1: {benign['is_zero_window'].mean()*100:.1f}%")
    print(f"  is_high_window = 1: {benign['is_high_window'].mean()*100:.1f}%")
    print(f"  has_tcp_handshake = 1: {benign['has_tcp_handshake'].mean()*100:.1f}%")


--- Validate Attack Signatures ---

PortScan (n=15,893):
  is_zero_window = 1: 99.3%
  has_tcp_handshake = 1: 99.9%

Web Attacks (n=237):
  is_high_window = 1: 78.5%

BENIGN (n=227,310):
  is_zero_window = 1: 4.8%
  is_high_window = 1: 6.3%
  has_tcp_handshake = 1: 41.9%


---
# 4. Train/Test Split Strategy

In [12]:
# =============================================================================
# TRAIN/TEST SPLIT STRATEGY
# =============================================================================

print("=" * 70)
print("TRAIN/TEST SPLIT STRATEGY")
print("=" * 70)

# Class distribution
print("\n--- Class Distribution ---")
binary_dist = target_stats['binary']
print(f"\nBinary (BENIGN vs ATTACK):")
print(f"  BENIGN: {binary_dist['benign_pct']:.1f}%")
print(f"  ATTACK: {binary_dist['attack_pct']:.1f}%")
print(f"  Imbalance ratio: {binary_dist['imbalance_ratio']:.2f}:1")

multi_stats = target_stats['multiclass']
print(f"\nMulti-class ({multi_stats['n_classes']} classes):")
print(f"  Max/Min ratio: {multi_stats['max_min_ratio']:.1f}:1")
print(f"  Rare classes (<1%): {len(multi_stats['rare_classes'])}")

print("\n--- Recommended Strategy ---")
print("""
1. STRATIFIED SPLIT:
   - Use stratified train/test split (80/20)
   - Preserve class proportions in both sets
   - For multi-class: stratify by Label (all 15 classes)

2. CROSS-VALIDATION:
   - Use Stratified K-Fold (k=5)
   - Each fold maintains class proportions

3. HANDLING RARE CLASSES:
   - For training: Consider SMOTE on rare classes
   - For evaluation: Use macro F1 (treats all classes equally)
   - Alternative: Hierarchical classification
     Step 1: Binary (BENIGN vs ATTACK)
     Step 2: Family (8 families)
     Step 3: Specific (within family)

4. METRICS:
   Binary: F1, Precision, Recall, ROC-AUC, PR-AUC
   Multi-class: Macro F1, Weighted F1, Confusion Matrix
""")

TRAIN/TEST SPLIT STRATEGY

--- Class Distribution ---

Binary (BENIGN vs ATTACK):
  BENIGN: 80.3%
  ATTACK: 19.7%
  Imbalance ratio: 4.07:1

Multi-class (15 classes):
  Max/Min ratio: 20664.6:1
  Rare classes (<1%): 11

--- Recommended Strategy ---

1. STRATIFIED SPLIT:
   - Use stratified train/test split (80/20)
   - Preserve class proportions in both sets
   - For multi-class: stratify by Label (all 15 classes)

2. CROSS-VALIDATION:
   - Use Stratified K-Fold (k=5)
   - Each fold maintains class proportions

3. HANDLING RARE CLASSES:
   - For training: Consider SMOTE on rare classes
   - For evaluation: Use macro F1 (treats all classes equally)
   - Alternative: Hierarchical classification
     Step 1: Binary (BENIGN vs ATTACK)
     Step 2: Family (8 families)
     Step 3: Specific (within family)

4. METRICS:
   Binary: F1, Precision, Recall, ROC-AUC, PR-AUC
   Multi-class: Macro F1, Weighted F1, Confusion Matrix



In [13]:
# =============================================================================
# IMPLEMENT TRAIN/TEST SPLIT
# =============================================================================

from sklearn.model_selection import train_test_split

# Create Is_Attack if not exists
if 'Is_Attack' not in df_processed.columns:
    df_processed['Is_Attack'] = (df_processed['Label'] != 'BENIGN').astype(int)

# Define feature columns (final list)
FINAL_FEATURES = [col for col in df_processed.columns 
                  if col not in IDENTIFIERS | TARGET_COLS | DERIVED_TARGETS | DROP_ALL
                  and df_processed[col].dtype in ['int64', 'int32', 'float64', 'float32']]

# Add categorical if encoded
# (For now, exclude - will encode during modeling)
FINAL_FEATURES = [f for f in FINAL_FEATURES if f not in CATEGORICAL]

print(f"Final feature count: {len(FINAL_FEATURES)}")

# Prepare X and y
X = df_processed[FINAL_FEATURES]
y_binary = df_processed['Is_Attack']
y_multi = df_processed['Label']

# Stratified split
X_train, X_test, y_train_binary, y_test_binary, y_train_multi, y_test_multi = \
    train_test_split(X, y_binary, y_multi, test_size=0.2, random_state=42, stratify=y_multi)

print(f"\n--- Split Results ---")
print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

# Verify stratification
print(f"\n--- Binary Distribution ---")
print(f"Train - ATTACK: {y_train_binary.mean()*100:.1f}%")
print(f"Test  - ATTACK: {y_test_binary.mean()*100:.1f}%")

print(f"\n--- Multi-class Distribution (Top 5) ---")
train_dist = y_train_multi.value_counts(normalize=True).head(5) * 100
test_dist = y_test_multi.value_counts(normalize=True).head(5) * 100

for label in train_dist.index:
    print(f"{label}: Train={train_dist[label]:.1f}%, Test={test_dist[label]:.1f}%")

Final feature count: 53

--- Split Results ---
Training set: 226,509 samples
Test set: 56,628 samples

--- Binary Distribution ---
Train - ATTACK: 19.7%
Test  - ATTACK: 19.7%

--- Multi-class Distribution (Top 5) ---
BENIGN: Train=80.3%, Test=80.3%
DoS Hulk: Train=8.2%, Test=8.2%
PortScan: Train=5.6%, Test=5.6%
DDoS: Train=4.5%, Test=4.5%
DoS GoldenEye: Train=0.4%, Test=0.4%


---
# 5. Export Preprocessing Artifacts

In [14]:
# =============================================================================
# EXPORT PREPROCESSING ARTIFACTS
# =============================================================================

print("=" * 70)
print("EXPORT PREPROCESSING ARTIFACTS")
print("=" * 70)

# Create artifacts directory
ARTIFACTS_DIR = os.path.join(OUTPUT_DIR, 'preprocessing_artifacts')
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# 1. Feature lists
feature_lists = {
    'final_features': FINAL_FEATURES,
    'drop_constant': list(DROP_CONSTANT),
    'drop_redundant': list(DROP_REDUNDANT),
    'top_tier': top_tier,
    'mid_tier': mid_tier,
    'low_tier': low_tier,
    'log_transform': LOG_TRANSFORM,
    'no_transform': NO_TRANSFORM,
    'binary_engineered': BINARY_ENGINEERED,
    'categorical': list(CATEGORICAL),
    'identifiers': list(IDENTIFIERS),
}

with open(os.path.join(ARTIFACTS_DIR, 'feature_lists.json'), 'w') as f:
    json.dump(feature_lists, f, indent=2)
print("✓ Saved: feature_lists.json")

# 2. Preprocessing config
preprocessing_config = {
    'pipeline': PREPROCESSING_PIPELINE,
    'class_distribution': {
        'binary': target_stats['binary'],
        'multiclass': target_stats['multiclass'],
    },
    'split_params': {
        'test_size': 0.2,
        'random_state': 42,
        'stratify': 'Label',
    },
    'feature_engineering': {
        'init_win_bwd_sentinel': -1,
        'init_win_fwd_sentinel': -1,
        'portscan_signature': 'Init_Win_bytes_backward == 0',
        'web_attack_signature': 'Init_Win_bytes_backward > 10000',
    },
}

with open(os.path.join(ARTIFACTS_DIR, 'preprocessing_config.json'), 'w') as f:
    json.dump(preprocessing_config, f, indent=2, default=str)
print("✓ Saved: preprocessing_config.json")

# 3. Save processed sample
df_processed.to_csv(os.path.join(ARTIFACTS_DIR, 'sample_processed.csv'), index=False)
print(f"✓ Saved: sample_processed.csv ({len(df_processed):,} rows)")

# 4. Save train/test splits
train_data = pd.concat([X_train, y_train_binary.rename('Is_Attack'), y_train_multi.rename('Label')], axis=1)
test_data = pd.concat([X_test, y_test_binary.rename('Is_Attack'), y_test_multi.rename('Label')], axis=1)

train_data.to_csv(os.path.join(ARTIFACTS_DIR, 'train_set.csv'), index=False)
test_data.to_csv(os.path.join(ARTIFACTS_DIR, 'test_set.csv'), index=False)
print(f"✓ Saved: train_set.csv ({len(train_data):,} rows)")
print(f"✓ Saved: test_set.csv ({len(test_data):,} rows)")

EXPORT PREPROCESSING ARTIFACTS
✓ Saved: feature_lists.json
✓ Saved: preprocessing_config.json
✓ Saved: sample_processed.csv (283,137 rows)
✓ Saved: train_set.csv (226,509 rows)
✓ Saved: test_set.csv (56,628 rows)


In [15]:
# =============================================================================
# FINAL EDA SUMMARY
# =============================================================================

print("=" * 70)
print("EDA COMPLETE: FINAL SUMMARY")
print("=" * 70)

print("\n--- Dataset ---")
print(f"  Original: 2.8M rows × 79 features")
print(f"  Sample: {len(df):,} rows (10% stratified)")
print(f"  After cleaning: {len(df_processed):,} rows")

print("\n--- Features ---")
print(f"  Original numerical: 77")
print(f"  Dropped (constant): {len(DROP_CONSTANT)}")
print(f"  Dropped (redundant): {len(DROP_REDUNDANT)}")
print(f"  Engineered (new): 8")
print(f"  Final features: {len(FINAL_FEATURES)}")

print("\n--- Class Balance ---")
print(f"  Binary: {binary_dist['imbalance_ratio']:.1f}:1 (manageable)")
print(f"  Multi-class: {multi_stats['max_min_ratio']:.0f}:1 (severe, needs handling)")

print("\n--- Top 5 Discriminating Features ---")
for i, feat in enumerate(top_tier[:5]):
    print(f"  {i+1}. {feat}")

print("\n--- Key Attack Signatures ---")
print("  • PortScan: Init_Win = 0 (99.3%)")
print("  • Web Attack: Init_Win > 10,000 (~28,960)")
print("  • DoS: Extreme timing values")

print("\n--- Output Artifacts ---")
for f in os.listdir(ARTIFACTS_DIR):
    fpath = os.path.join(ARTIFACTS_DIR, f)
    size = os.path.getsize(fpath) / 1024
    print(f"  ✓ {f}: {size:.1f} KB")

print("\n--- Ready for Modeling ---")
print("""
Next steps:
1. Load train_set.csv and test_set.csv
2. Apply scaling (log1p + StandardScaler for continuous features)
3. Train baseline models:
   - Binary: Logistic Regression, Random Forest, XGBoost
   - Multi-class: Same with class_weight='balanced'
4. Evaluate on test set
5. Consider hierarchical classification for rare classes
""")

print("\n" + "=" * 70)
print(f"EDA Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 70)

EDA COMPLETE: FINAL SUMMARY

--- Dataset ---
  Original: 2.8M rows × 79 features
  Sample: 283,138 rows (10% stratified)
  After cleaning: 283,137 rows

--- Features ---
  Original numerical: 77
  Dropped (constant): 12
  Dropped (redundant): 20
  Engineered (new): 8
  Final features: 53

--- Class Balance ---
  Binary: 4.1:1 (manageable)
  Multi-class: 20665:1 (severe, needs handling)

--- Top 5 Discriminating Features ---
  1. Init_Win_bytes_backward
  2. Min Packet Length
  3. Fwd Packet Length Min
  4. Bwd Packet Length Std
  5. Fwd IAT Std

--- Key Attack Signatures ---
  • PortScan: Init_Win = 0 (99.3%)
  • Web Attack: Init_Win > 10,000 (~28,960)
  • DoS: Extreme timing values

--- Output Artifacts ---
  ✓ feature_lists.json: 4.8 KB
  ✓ preprocessing_config.json: 4.9 KB
  ✓ sample_processed.csv: 131832.4 KB
  ✓ test_set.csv: 13887.9 KB
  ✓ train_set.csv: 55503.5 KB

--- Ready for Modeling ---

Next steps:
1. Load train_set.csv and test_set.csv
2. Apply scaling (log1p + StandardSc