# Phase 2 — Advanced Preprocessing & Feature Engineering

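This notebook splits the data into train and test sets before scaling, runs a correlation analysis and removes highly correlated features, creates Standard- and MinMax-scaled datasets, computes feature importances, and saves the processed data for reproducibility.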

**Works with:** CIC-IoT-2023 or CICIDS2017 (auto-detects from Phase 1)

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

# ===================================================================
# Auto-detect dataset from Phase 1
# ===================================================================
# Probe the known dataset folders in priority order and pick the first
# one that already contains a combined.csv.
_processed_root = Path("../data/processed")
_detected = next(
    (
        name
        for name in ("CICIoT2023", "CICIDS2017")
        if (_processed_root / name / "combined.csv").exists()
    ),
    None,
)
if _detected is None:
    raise FileNotFoundError("No processed dataset found. Please run Notebook 01 first!")

DATASET = _detected
DATA_DIR = _processed_root / DATASET

# All ML-ready artifacts produced by this notebook are written here.
OUT_DIR = Path("../data/processed/ml_ready")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Detected Dataset: {DATASET}")
print(f"Data Directory: {DATA_DIR}")

# ===================================================================
# Load consolidated data created in Phase 1
# ===================================================================
print(f"\nLoading consolidated data from Phase 1...")
df = pd.read_csv(DATA_DIR / "combined.csv")
print(f"Dataset shape: {df.shape}")

# Use 'label' when that column exists, otherwise fall back to 'Label'.
# That column and the 'Label_ID' target are both excluded from the features.
_text_label_col = 'label' if 'label' in df.columns else 'Label'
X = df.drop(columns=[_text_label_col, 'Label_ID'])
y = df['Label_ID']

_n_samples, _n_features = X.shape
print(f"Features: {_n_features}")
print(f"Samples: {_n_samples:,}")
print(f"Classes: {y.nunique()}")

# ===================================================================
# Train-test split BEFORE scaling
# ===================================================================
_rule = "=" * 60
print("\n" + _rule)
print("Train-Test Split")
print(_rule)

# 20% hold-out, stratified on the target, with a fixed seed so the split
# is reproducible across runs.
_split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

print(f"Train shape: {X_train.shape}")
print(f"Test shape:  {X_test.shape}")

# ===================================================================
# Correlation analysis on training set (original features)
# ===================================================================
_rule = "=" * 60
print("\n" + _rule)
print("Correlation Analysis")
print(_rule)

# The correlation matrix is computed from the training split only.
corr = X_train.corr()

# Draw the heatmap on an explicit figure/axes pair and persist it to OUT_DIR.
_fig, _ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    cmap='coolwarm',
    center=0,
    cbar_kws={'label': 'Correlation'},
    ax=_ax,
)
_ax.set_title(f"Feature Correlation - {DATASET} (Training Set)")
_fig.tight_layout()
_fig.savefig(OUT_DIR / "feature_correlation.png", dpi=300, bbox_inches='tight')
plt.show()

# ===================================================================
# Compute and remove highly correlated features
# ===================================================================
print("\nIdentifying highly correlated features...")
threshold = 0.95
cols = corr.columns

# Visit the strict lower triangle (row i, column j < i) so every unordered
# feature pair is examined exactly once, in row-major order.
_corr_values = corr.to_numpy()
_row_idx, _col_idx = np.tril_indices(len(cols), k=-1)
_exceeds = np.abs(_corr_values[_row_idx, _col_idx]) > threshold

# Each entry is (feature_i, feature_j, correlation) with j < i.
high_corr = [
    (cols[i], cols[j], _corr_values[i, j])
    for i, j in zip(_row_idx[_exceeds], _col_idx[_exceeds])
]
# The second feature of every flagged pair is the one that gets dropped.
to_drop = {second for _, second, _ in high_corr}

print(f'High correlation pairs (>|0.95|): {len(high_corr)} pairs found')

if not to_drop:
    print("No highly correlated features to remove")
else:
    _drop_cols = sorted(to_drop)
    print(f"Dropping {len(to_drop)} highly correlated features:")
    for _name in _drop_cols:
        print(f"  - {_name}")

    # Apply the same column removal to train and test so they stay aligned.
    X_train = X_train.drop(columns=_drop_cols)
    X_test = X_test.drop(columns=_drop_cols)
    print(f"\nNew feature count: {X_train.shape[1]}")

# ===================================================================
# Fit scalers on training data only (after feature removal)
# ===================================================================
_rule = "=" * 60
print("\n" + _rule)
print("Scaling Features")
print(_rule)

# Standardization: statistics are learned from the training split and then
# applied unchanged to the test split.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_std = std_scaler.transform(X_train)
X_test_std = std_scaler.transform(X_test)

# Min-max scaling: same fit-on-train / apply-to-both procedure.
mm_scaler = MinMaxScaler()
mm_scaler.fit(X_train)
X_train_mm = mm_scaler.transform(X_train)
X_test_mm = mm_scaler.transform(X_test)

print("✓ StandardScaler fitted and applied")
print("✓ MinMaxScaler fitted and applied")

# ===================================================================
# Save scalers and processed arrays
# ===================================================================
print("\n" + "=" * 60)
print("Saving Processed Data")
print("=" * 60)

# joblib.dump does not create missing parent folders, so make sure the
# model directory exists before writing (same treatment OUT_DIR gets).
MODEL_DIR = Path("../trained_models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(std_scaler, MODEL_DIR / "scaler_standard.pkl")
joblib.dump(mm_scaler, MODEL_DIR / "scaler_minmax.pkl")
print("✓ Saved scalers")

# The scaled arrays are plain ndarrays; re-attach the surviving column
# names so the CSVs carry a header.
_scaled_outputs = {
    "X_train_standard.csv": (X_train_std, X_train.columns),
    "X_test_standard.csv": (X_test_std, X_test.columns),
    "X_train_minmax.csv": (X_train_mm, X_train.columns),
    "X_test_minmax.csv": (X_test_mm, X_test.columns),
}
for _filename, (_values, _columns) in _scaled_outputs.items():
    pd.DataFrame(_values, columns=_columns).to_csv(OUT_DIR / _filename, index=False)

y_train.to_csv(OUT_DIR / "y_train.csv", index=False)
y_test.to_csv(OUT_DIR / "y_test.csv", index=False)

print(f"✓ Saved processed data to {OUT_DIR}")

# Print file sizes (sorted so the listing order is stable between runs)
for _csv_path in sorted(OUT_DIR.glob("*.csv")):
    size_mb = _csv_path.stat().st_size / (1024**2)
    print(f"  {_csv_path.name}: {size_mb:.2f} MB")

print(f"\n✓ Phase 2 complete!")
print(f"  Final feature count: {X_train.shape[1]}")
print(f"  Training samples: {X_train.shape[0]:,}")
print(f"  Test samples: {X_test.shape[0]:,}")

# Kept on its own line: it was fused onto the print() above, a SyntaxError.
import psutil
import gc


In [None]:
# ===================================================================
# Memory Optimization Utilities
# ===================================================================
import psutil
import gc

def get_memory_usage():
    """Return the current process's resident set size, in units of 1024**3 bytes."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024**3

def optimize_dtypes(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed and unsigned integer columns are narrowed to the smallest integer
    type of the same signedness whose range contains the column's minimum and
    maximum (bounds inclusive). Floating-point columns wider than 32 bits are
    converted to ``float32`` when their min and max fit its finite range.
    Columns are never widened, and every non-numeric column (object/string,
    bool, datetime, pandas extension dtypes, ...) is left untouched.

    Args:
        df: DataFrame to optimize. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("\nOptimizing data types...")
    start_mem = df.memory_usage(deep=True).sum() / 1024**3
    print(f"  Initial memory: {start_mem:.2f} GB")

    # Candidate target types, narrowest first, keyed by numpy dtype kind.
    int_targets = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (categorical, nullable ints, strings, ...) are not
        # plain numpy dtypes; min/max or astype may not be meaningful there.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind in int_targets:
            c_min = df[col].min()
            c_max = df[col].max()
            for target in int_targets[kind]:
                # Only ever narrow: stop once candidates are as wide as the column.
                if np.dtype(target).itemsize >= col_type.itemsize:
                    break
                bounds = np.iinfo(target)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(target)
                    break
        elif kind == 'f' and col_type.itemsize > np.dtype(np.float32).itemsize:
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN or infinite extrema fail these comparisons, so such columns
            # keep their original dtype.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage(deep=True).sum() / 1024**3
    saved = start_mem - end_mem
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * saved / start_mem if start_mem else 0.0
    print(f"  Final memory: {end_mem:.2f} GB")
    print(f"  Saved: {saved:.2f} GB ({saved_pct:.1f}%)")

    return df

# Report machine-wide RAM figures followed by this process's own footprint.
_total_ram = psutil.virtual_memory().total / 1024**3
print(f"System RAM: {_total_ram:.1f} GB")
_available_ram = psutil.virtual_memory().available / 1024**3
print(f"Available RAM: {_available_ram:.1f} GB")
_process_ram = get_memory_usage()
print(f"Current process memory: {_process_ram:.2f} GB")

In [None]:
# Feature importance-based selection using Random Forest
from sklearn.ensemble import RandomForestClassifier
import numpy as np

print("\n=== Feature Importance Analysis ===")
# Fit a depth-limited forest on the standardized training data purely to
# obtain per-feature importance scores.
rf_selector = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)
rf_selector.fit(X_train_std, y_train)

# Pair every remaining column name with its importance, highest first.
importances = rf_selector.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features:")
print(importance_df.head(20))

# Optional: Select top N features or features above threshold
# Uncomment to apply feature selection
# top_n_features = 50
# selected_features = importance_df.head(top_n_features)['feature'].tolist()
# X_train = X_train[selected_features]
# X_test = X_test[selected_features]
# print(f"\nReduced to top {top_n_features} features")

# Persist the full ranking next to the other ML-ready artifacts.
_importance_path = OUT_DIR / "feature_importance.csv"
importance_df.to_csv(_importance_path, index=False)
print("\nSaved feature importance to", _importance_path)

# Plot top 30 features as a horizontal bar chart, most important at the top
top_30 = importance_df.head(30)
_positions = range(len(top_30))

_fig, _ax = plt.subplots(figsize=(10, 12))
_ax.barh(_positions, top_30['importance'])
_ax.set_yticks(_positions)
_ax.set_yticklabels(top_30['feature'])
_ax.set_xlabel('Importance')
_ax.set_title('Top 30 Feature Importances')
# Bars are drawn bottom-up by default; flip so rank 1 appears first.
_ax.invert_yaxis()
_fig.tight_layout()
plt.show()