# Network Threat Detection - Random Forest Binary Classification

This notebook trains a Random Forest model to classify network traffic as BENIGN or ATTACK.

**Approach:**
- Load CICIDS2017 dataset
- Preprocess data (handle missing and infinite values, convert to binary classification)
- Train Random Forest model
- Evaluate performance
- Save model for later use


In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Resolve the project directories used throughout the notebook.
# NOTE(review): this assumes the notebook is launched from a subfolder of the
# project root (the root is taken as the parent of the working directory) — confirm.
BASE_DIR = Path.cwd().parent
DATASET_DIR, MODELS_DIR, RESULTS_DIR = (
    BASE_DIR / folder for folder in ('dataset', 'models', 'results')
)

# Echo the resolved locations and the CSV files currently visible in the dataset folder
print(f"Base directory: {BASE_DIR}")
print(f"Dataset directory: {DATASET_DIR}")
print(f"Dataset files: {list(DATASET_DIR.glob('*.csv'))}")


## 1. Load Dataset


In [None]:
# Discover every CSV in the dataset directory (sorted for a deterministic load order)
csv_files = sorted(DATASET_DIR.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files")

# Load each file independently so one unreadable file does not abort the whole run
dataframes = []
for file in csv_files:
    print(f"Loading {file.name}...")
    try:
        df = pd.read_csv(file, low_memory=False)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next file
        print(f"  Error loading {file.name}: {e}")
        continue

    # Strip stray whitespace from header names so that lookups such as 'Label'
    # work and columns line up when the per-file frames are concatenated.
    # NOTE(review): some distributions of the CICIDS2017 CSVs reportedly ship
    # headers with leading spaces (e.g. ' Label') — confirm against your copy.
    # This is a no-op when the headers are already clean.
    df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

    print(f"  Shape: {df.shape}, Columns: {len(df.columns)}")
    dataframes.append(df)

print(f"\nTotal dataframes loaded: {len(dataframes)}")


In [None]:
# Combine all per-file frames into one dataset with a fresh 0..n-1 index
print("Combining all dataframes...")

# The loading cell only prints read errors, so an empty list can reach this
# point. Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" (same exception type: ValueError).
if not dataframes:
    raise ValueError(
        f"No CSV files were loaded from {DATASET_DIR}; "
        "check the dataset path and any loading errors printed earlier."
    )

df = pd.concat(dataframes, ignore_index=True)
print(f"Combined dataset shape: {df.shape}")
# deep=True also counts the contents of object (string) columns
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


## 2. Explore Data


In [None]:
# Inspect the target column: show class frequencies when it exists,
# otherwise list the available columns to help locate it.
if 'Label' not in df.columns:
    print("Label column not found. Available columns:")
    print(df.columns.tolist())
else:
    labels = df['Label']
    print("Label column found!")
    print("\nUnique labels:")
    print(labels.value_counts())
    print(f"\nTotal unique labels: {labels.nunique()}")


In [None]:
# Summarize missing values per column: absolute count and share of all rows
print("Missing values per column:")
missing = df.isna().sum()
missing_percent = missing / len(df) * 100
missing_df = pd.DataFrame(
    {'Missing Count': missing, 'Missing Percent': missing_percent}
)

# Keep only the affected columns, worst offenders first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values('Missing Count', ascending=False)
print(missing_df)
print(f"\nTotal columns with missing values: {len(missing_df)}")


In [None]:
# Report numeric columns that contain +inf or -inf
print("Checking for infinite values...")
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Count one column at a time (avoids building a full-size boolean frame),
# keeping only the columns that actually contain infinities.
per_column_totals = ((name, np.isinf(df[name]).sum()) for name in numeric_cols)
inf_counts = {name: total for name, total in per_column_totals if total > 0}

if not inf_counts:
    print("No infinite values found.")
else:
    print("Columns with infinite values:")
    for name, total in inf_counts.items():
        print(f"  {name}: {total}")


## 3. Preprocess Data


In [None]:
# Work on a copy so the raw combined frame `df` stays untouched
df_processed = df.copy()
print(f"Starting preprocessing with shape: {df_processed.shape}")

# Step 1: turn +/-inf into NaN so infinities are handled together with the
# other missing values below.
print("Handling infinite values...")
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df_processed[col] = df_processed[col].replace([np.inf, -np.inf], np.nan)

# Step 2: drop feature columns that are mostly missing (the label is never dropped here)
print("Handling missing values...")
missing_threshold = 0.5
n_rows = len(df_processed)
cols_to_drop = [
    col
    for col in df_processed.columns
    if col != 'Label' and df_processed[col].isnull().sum() / n_rows > missing_threshold
]

if cols_to_drop:
    # The percentage is derived from the threshold so the message cannot drift from it
    print(f"Dropping {len(cols_to_drop)} columns with >{missing_threshold:.0%} missing values")
    df_processed = df_processed.drop(columns=cols_to_drop)

# Step 3: median-impute the remaining gaps in numeric columns.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place form
# is deprecated and does not modify the parent frame under pandas Copy-on-Write.
# NOTE: medians are computed on the full dataset, i.e. before the train/test
# split performed later in the notebook.
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df_processed[col].isnull().any():
        df_processed[col] = df_processed[col].fillna(df_processed[col].median())

# Step 4: rows without a label cannot be used for supervised training
if 'Label' in df_processed.columns:
    initial_rows = len(df_processed)
    df_processed = df_processed.dropna(subset=['Label'])
    dropped_rows = initial_rows - len(df_processed)
    if dropped_rows > 0:
        print(f"Dropped {dropped_rows} rows with missing labels")

print(f"\nAfter preprocessing shape: {df_processed.shape}")


In [None]:
# Collapse the multi-class labels into two classes: BENIGN = 0, everything else = 1 (ATTACK)
def _to_binary_label(raw_label):
    """Return 0 when the label reads BENIGN (ignoring case and padding), else 1."""
    return 0 if str(raw_label).strip().upper() == 'BENIGN' else 1


print("Converting labels to binary classification...")
print("\nOriginal label distribution:")
print(df_processed['Label'].value_counts())

# Add the binary target next to the original label column
df_processed['Label_Binary'] = df_processed['Label'].apply(_to_binary_label)

binary_labels = df_processed['Label_Binary']
print("\nBinary label distribution:")
print(binary_labels.value_counts())
print(f"\nBENIGN (0): {(binary_labels == 0).sum()}")
print(f"ATTACK (1): {(binary_labels == 1).sum()}")


In [None]:
# Split the processed frame into the feature matrix X and the binary target y.
# Both label columns are removed from X so the target is not among the features.
target_col = 'Label_Binary'
y = df_processed[target_col]
X = df_processed.drop(columns=['Label', target_col])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {X.shape[1]}")


## 4. Train-Test Split


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# distribution the same in both partitions; the fixed seed makes the split repeatable.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Show the class balance of each partition
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set label distribution:")
    print(split_labels.value_counts())


## 5. Train Random Forest Model


In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

print("Training Random Forest Classifier...")
print("Initializing model with default parameters...")

# 100 trees, fixed seed for reproducibility, all available CPUs, progress logging on
rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training partition and record the wall-clock duration
fit_started = time.time()
rf_model.fit(X_train, y_train)
training_time = time.time() - fit_started

print(f"\nTraining completed in {training_time:.2f} seconds")


## 6. Evaluate Model


In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score
)

# Score the held-out test partition
print("Making predictions on test set...")
y_pred = rf_model.predict(X_test)
attack_column = 1  # second probability column = class 1 (ATTACK)
y_pred_proba = rf_model.predict_proba(X_test)[:, attack_column]

# Threshold-based metrics use the hard predictions; ROC-AUC uses the ATTACK probability
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print a fixed-width report; labels are padded to 11 characters so values align
separator = "=" * 50
print("\n" + separator)
print("MODEL PERFORMANCE METRICS")
print(separator)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
    ("ROC-AUC:", roc_auc),
):
    print(f"{metric_label:<11}{metric_value:.4f}")
print(separator)


In [None]:
# Per-class precision / recall / F1 on the test partition
print("\nDetailed Classification Report:")
class_names = ['BENIGN', 'ATTACK']  # position matches the encoded label: 0, 1
report_text = classification_report(y_test, y_pred, target_names=class_names)
print(report_text)


In [None]:
# Confusion matrix on the test partition, followed by a plain-language reading of each cell
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# (row, column) = (actual class, predicted class), with 0 = BENIGN and 1 = ATTACK
cell_captions = (
    ("True Negatives (BENIGN correctly identified):  ", (0, 0)),
    ("False Positives (BENIGN misclassified as ATTACK): ", (0, 1)),
    ("False Negatives (ATTACK misclassified as BENIGN): ", (1, 0)),
    ("True Positives (ATTACK correctly identified):  ", (1, 1)),
)
print("\nInterpretation:")
for caption, position in cell_captions:
    print(f"{caption}{cm[position]}")


In [None]:
# Rank the features by the fitted forest's importance scores
print("\nTop 20 Most Important Features:")
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

top_features = feature_importance.head(20)
print(top_features.to_string(index=False))


## 7. Save Model and Results


In [None]:
import joblib
from datetime import datetime

# Make sure both output folders exist before writing anything
for output_dir in (MODELS_DIR, RESULTS_DIR):
    output_dir.mkdir(exist_ok=True)

# Persist the fitted model.
# NOTE(review): the date stamp in this filename is a fixed literal, not derived
# from the run date — confirm whether downstream code depends on this exact name.
model_filename = MODELS_DIR / 'random_forest_binary_20241211.joblib'
joblib.dump(rf_model, model_filename)
print(f"Model saved to: {model_filename}")

# Persist the training column names in order, for use at inference time
feature_names_file = MODELS_DIR / 'feature_names_binary.joblib'
joblib.dump(X.columns.tolist(), feature_names_file)
print(f"Feature names saved to: {feature_names_file}")

# One-row summary of this run: metrics, timing, sample counts and a timestamp
run_summary = {
    'Model': 'Random Forest (Binary)',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'ROC-AUC': roc_auc,
    'Training Time (seconds)': training_time,
    'Training Samples': len(X_train),
    'Test Samples': len(X_test),
    'Date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
}
results = {column: [value] for column, value in run_summary.items()}

results_df = pd.DataFrame(results)
results_file = RESULTS_DIR / 'random_forest_binary_results.csv'
results_df.to_csv(results_file, index=False)
print(f"\nResults saved to: {results_file}")

# Write the full feature ranking alongside the metrics
importance_file = RESULTS_DIR / 'feature_importance_binary.csv'
feature_importance.to_csv(importance_file, index=False)
print(f"Feature importance saved to: {importance_file}")


## 8. Quick Verification Test

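Let's run the model on a small sample from one of the original files to confirm the end-to-end pipeline works. Because this file was also part of the combined dataset used for training, treat the result as a sanity check, not an independent evaluation.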


In [None]:
# Sanity-check the trained model on a small slice of one raw file.
# NOTE: this file sits in the same dataset folder that was loaded for training,
# so the score below verifies the pipeline end-to-end; it is not a held-out evaluation.
print("Testing model on a small sample from Monday file...")
test_file = DATASET_DIR / 'Monday-WorkingHours.pcap_ISCX.csv'
test_sample = pd.read_csv(test_file, nrows=1000, low_memory=False)  # Just 1000 rows for quick test

# Strip stray whitespace from header names so that selecting the feature columns
# by name (and the 'Label' lookup below) does not fail on padded headers.
# This is a no-op when the headers are already clean.
test_sample = test_sample.rename(
    columns=lambda name: name.strip() if isinstance(name, str) else name
)

# Select the training feature columns in training order, then mirror the
# training preprocessing: inf -> NaN, and NaN -> the training-set median.
test_X_sample = (
    test_sample[X.columns]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(X_train.median())
)

# Make predictions
sample_predictions = rf_model.predict(test_X_sample)

# Compare against the actual labels when the file provides them
if 'Label' in test_sample.columns:
    # Same encoding as used for training: BENIGN -> 0, anything else -> 1
    test_y_sample = test_sample['Label'].apply(lambda x: 0 if str(x).strip().upper() == 'BENIGN' else 1)
    sample_accuracy = accuracy_score(test_y_sample, sample_predictions)
    print(f"\nSample Test Results:")
    print(f"  Accuracy: {sample_accuracy:.4f} ({sample_accuracy*100:.2f}%)")
    print(f"  Predictions - BENIGN: {(sample_predictions == 0).sum()}, ATTACK: {(sample_predictions == 1).sum()}")
    print(f"  Actual - BENIGN: {(test_y_sample == 0).sum()}, ATTACK: {(test_y_sample == 1).sum()}")
else:
    print(f"\nPredictions on sample:")
    print(f"  BENIGN: {(sample_predictions == 0).sum()}, ATTACK: {(sample_predictions == 1).sum()}")

print("\n" + "="*50)
print("✅ Model training and verification complete!")


## Summary

✅ **Model trained successfully!**

**What we did:**
1. Loaded and combined all CICIDS2017 CSV files
2. Preprocessed data (handled missing/infinite values)
3. Converted multi-class labels to binary (BENIGN vs ATTACK)
4. Trained a Random Forest classifier
5. Evaluated performance with comprehensive metrics
6. Saved the model and results

**Next Steps:**
1. Review the metrics above - is the model detecting attacks well?
2. Check the confusion matrix - are false positives/negatives acceptable?
3. If results look good, we can proceed to multiclass classification
4. If not, we can tune hyperparameters or try different preprocessing
5. Once satisfied, we can prepare for integration into your web app
