In [None]:
# Import libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Add project root to path (guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path)
if '..' not in sys.path:
    sys.path.append('..')
from config import DATA_DIR, DATASET_FILES

# Later cells write figures and a text summary into ../results; create the
# directory up front so savefig()/open() do not fail on a fresh checkout
os.makedirs('../results', exist_ok=True)

print("Libraries loaded successfully!")

## 1. Load Dataset

In [None]:
# Load the dataset through the project's DataLoader
# (load_all_files presumably combines the individual CSV files — confirm in data_loader.py)
from src.preprocessing.data_loader import DataLoader

loader = DataLoader()
df = loader.load_all_files()

# Report the size of the loaded frame
loaded_message = f"\nDataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns"
print(loaded_message)

In [None]:
# Display basic info
# NOTE(review): print_summary is a DataLoader method defined outside this notebook;
# presumably it prints an overview of the loaded frame — confirm in data_loader.py
loader.print_summary(df)

In [None]:
# Preview the first five rows (rendered as the cell output)
df.head(n=5)

In [None]:
# List every column with its 1-based position and dtype
print("Columns:")
for i, (col, dtype) in enumerate(df.dtypes.items()):
    print(f"{i+1:3}. {col:40} {dtype}")

## 2. Label Analysis

In [None]:
# Strip stray whitespace from the column names and from the label values
df.columns = df.columns.str.strip()
df['Label'] = df['Label'].str.strip()

# Number of records per label (value_counts orders them most frequent first)
label_counts = df['Label'].value_counts()
total_rows = len(df)

print("Label Distribution:")
print("="*60)
for label, count in label_counts.items():
    # Share of this label relative to all rows in the frame
    pct = count / total_rows * 100
    print(f"{label:40} {count:>10,} ({pct:6.2f}%)")

In [None]:
# Visualize label distribution: per-class counts (left) and benign/attack split (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Left panel: horizontal bars on a log-scaled x axis; BENIGN is green, every other label red
bar_colors = ['red' if label != 'BENIGN' else 'green' for label in label_counts.index]
bars = ax1.barh(label_counts.index, label_counts.values, color=bar_colors, alpha=0.7)
ax1.set(xlabel='Count', title='Label Distribution (Log Scale)', xscale='log')

# Write the exact count just past the end of each bar
for bar, count in zip(bars, label_counts.values):
    y_center = bar.get_y() + bar.get_height()/2
    ax1.text(count * 1.1, y_center, f'{count:,}', va='center', fontsize=9)

# Right panel: all non-benign labels collapsed into a single ATTACK slice
benign_count = label_counts.get('BENIGN', 0)
attack_count = label_counts.sum() - benign_count
sizes = [benign_count, attack_count]
labels = ['BENIGN', 'ATTACK']
colors = ['green', 'red']
explode = (0, 0.05)  # pull the ATTACK slice slightly out of the pie
ax2.pie(
    sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)
ax2.set_title('Binary Classification Distribution')

fig.tight_layout()
fig.savefig('../results/label_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Feature Analysis

In [None]:
# Names of all columns with a numeric dtype
numeric_cols = list(df.select_dtypes(include='number').columns)
print(f"Number of numeric features: {len(numeric_cols)}")

# Descriptive statistics for the numeric columns (rendered as the cell output)
df[numeric_cols].describe()

In [None]:
# Count nulls per column and keep only the affected columns, largest count first
null_totals = df.isna().sum()
missing = null_totals[null_totals > 0].sort_values(ascending=False)

if missing.empty:
    print("No missing values found!")
else:
    print("Columns with missing values:")
    total_rows = len(df)
    for col, count in missing.items():
        pct = count / total_rows * 100
        print(f"  {col}: {count:,} ({pct:.2f}%)")

In [None]:
# Count +/-inf entries per numeric column, then keep only the columns that have any
inf_per_column = {col: np.isinf(df[col]).sum() for col in numeric_cols}
inf_counts = {col: n_inf for col, n_inf in inf_per_column.items() if n_inf > 0}

if not inf_counts:
    print("No infinite values found!")
else:
    print("Columns with infinite values:")
    for col, count in inf_counts.items():
        print(f"  {col}: {count:,}")

In [None]:
# Convert +/-inf to NaN, then drop every row that contains a NaN
# (affected rows are removed, not imputed)
df_clean = df.replace([np.inf, -np.inf], np.nan).dropna()
print(f"After cleaning: {len(df_clean):,} rows (removed {len(df) - len(df_clean):,})")

## 4. Feature Distributions

In [None]:
# Features shown in the distribution plots
key_features = [
    'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Flow Bytes/s', 'Flow Packets/s', 'Fwd Packet Length Mean',
]

# The full data is too large to plot, so draw a reproducible random sample
# of at most 50,000 cleaned rows
sample_size = min(50000, len(df_clean))
df_sample = df_clean.sample(n=sample_size, random_state=42)

In [None]:
# Distribution plots: for every key feature, overlay the density histogram of
# BENIGN rows (green) and of all non-BENIGN rows (red, labelled "Attack").

# Only features that exist in the sample can be plotted; report the rest
# instead of silently leaving blank panels.
plot_features = [f for f in key_features if f in df_sample.columns]
skipped_features = [f for f in key_features if f not in df_sample.columns]
if skipped_features:
    print(f"Skipping features not present in the data: {skipped_features}")

# The class masks do not depend on the feature, so build them once
sample_is_benign = df_sample['Label'] == 'BENIGN'
class_groups = [
    ('BENIGN', sample_is_benign, 'green'),
    ('Attack', ~sample_is_benign, 'red'),
]

# Size the grid from the number of features (2x3 for the default six)
n_plot_cols = 3
n_plot_rows = max(1, -(-len(plot_features) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols,
                         figsize=(15, 5 * n_plot_rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, plot_features):
    plotted_any = False
    for class_name, class_mask, color in class_groups:
        data = df_sample.loc[class_mask, feature]
        if data.empty:
            # Nothing to draw for this class; avoids an empty histogram
            # and a legend entry with no data behind it
            continue

        # Clip outliers for visualization (cap at the class's 99th percentile)
        data_clipped = data.clip(upper=data.quantile(0.99))
        ax.hist(data_clipped, bins=50, alpha=0.5, label=class_name,
                color=color, density=True)
        plotted_any = True

    ax.set_title(feature)
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
    if plotted_any:
        ax.legend()

# Hide grid cells that have no feature assigned
for ax in axes[len(plot_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../results/feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Correlation Analysis

In [None]:
# Candidate features for the correlation heatmap
top_features = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Flow Bytes/s',
    'Flow Packets/s',
    'Flow IAT Mean',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Mean',
    'Packet Length Mean',
    'Packet Length Std',
    'Average Packet Size',
    'Init_Win_bytes_forward',
]

# Keep only the candidates that are actually present in the sample
existing_features = [name for name in top_features if name in df_sample.columns]

# Pairwise correlations between the selected features
corr_matrix = df_sample[existing_features].corr()

# Mask the diagonal and upper triangle so each pair is shown only once
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Draw the heatmap on an explicit figure/axes pair
corr_fig, corr_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix', fontsize=14)
corr_fig.tight_layout()
corr_fig.savefig('../results/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Attack Analysis by File/Day

In [None]:
# Attacks by source file (the cell is a no-op when there is no Source_File column)
if 'Source_File' in df.columns:
    # Rows: source file, columns: label, values: number of records (0 where a pair is absent)
    file_label_sizes = df.groupby(['Source_File', 'Label']).size()
    attack_by_file = file_label_sizes.unstack(fill_value=0)

    # Stacked bars per file on a log-scaled y axis
    fig, ax = plt.subplots(figsize=(14, 8))
    attack_by_file.plot.bar(stacked=True, ax=ax, colormap='tab20')
    ax.set(
        title='Attack Distribution by Source File',
        xlabel='Source File',
        ylabel='Count',
        yscale='log',
    )
    plt.xticks(rotation=45, ha='right')
    # Place the legend outside the plotting area, to the right
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig('../results/attacks_by_file.png', dpi=150, bbox_inches='tight')
    plt.show()

## 7. Summary Statistics for StealthMesh

In [None]:
# Create summary for paper/research

# Classify every row once instead of filtering the whole frame per entry
is_benign_row = df['Label'] == 'BENIGN'
benign_records = int(is_benign_row.sum())
attack_records = len(df) - benign_records

# Count the distinct non-benign labels directly; `nunique() - 1` would be
# off by one for data that contains no BENIGN rows
attack_types = df.loc[~is_benign_row, 'Label'].nunique()

# Guard the ratio: a frame without attack rows would raise ZeroDivisionError
if attack_records > 0:
    imbalance_ratio = f"{benign_records / attack_records:.2f}:1"
else:
    imbalance_ratio = 'N/A'

summary = {
    'Total Records': len(df),
    'Total Features': len(numeric_cols),
    'Benign Records': benign_records,
    'Attack Records': attack_records,
    'Attack Types': attack_types,
    'Imbalance Ratio': imbalance_ratio,
}

print("\n" + "="*60)
print("DATASET SUMMARY FOR RESEARCH PAPER")
print("="*60)
for key, value in summary.items():
    print(f"{key:25} {value:>20}")
print("="*60)

In [None]:
# Save summary to file
# Explicit UTF-8: label names may contain non-ASCII characters, which a
# non-UTF-8 platform default encoding cannot always represent
with open('../results/dataset_summary.txt', 'w', encoding='utf-8') as f:
    f.write("CICIDS 2017 Dataset Summary for StealthMesh\n")
    f.write("="*60 + "\n\n")
    # One "key: value" line per summary entry
    for key, value in summary.items():
        f.write(f"{key}: {value}\n")
    # Followed by the per-label record counts
    f.write("\n\nLabel Distribution:\n")
    for label, count in label_counts.items():
        f.write(f"  {label}: {count:,}\n")

print("Summary saved to results/dataset_summary.txt")

## 8. Next Steps

Based on this EDA, the next steps for StealthMesh are:

1. **Data Preprocessing**: Clean data, handle imbalance, scale features
2. **Feature Selection**: Select most important features for lightweight detection
3. **Model Training**: Train ML models (RF, XGBoost, Neural Networks)
4. **Evaluation**: Test detection accuracy and latency

Run the preprocessing pipeline:
```bash
python preprocess_data.py --classification binary --features 40
```