# SDN ML Traffic Management - Data Exploration

This notebook explores the collected datasets for training ML models.

## Setup
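1. When the upload cell in section 1 prompts you, upload `flows.csv` and `link_timeseries.csv` from your local `data/processed/` folder
2. Run all cells to explore the data; if either file is not found, the notebook falls back to randomly generated sample data for that dataset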

In [None]:
# Install dependencies (Colab has most of these)
!pip install -q pandas numpy matplotlib seaborn scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Use that name when it is available and fall back to the pre-3.6 name
# otherwise, so the notebook does not fail on an older Matplotlib.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

# Must run after plt.style.use(): the style sheet resets the colour cycle
# that set_palette() configures.
sns.set_palette('husl')

## 1. Upload Data Files

In [None]:
# Upload files from local machine
print("Upload flows.csv and link_timeseries.csv:")
uploaded = files.upload()  # mapping keyed by the name of each uploaded file

# Report expected files that did not arrive under their expected name, so a
# skipped or renamed upload is noticed here rather than further down.
expected_files = ('flows.csv', 'link_timeseries.csv')
missing_files = [name for name in expected_files if name not in uploaded]
if missing_files:
    print(f"Not uploaded under the expected name: {', '.join(missing_files)}")

In [None]:
# Load datasets
#
# Each CSV is read from the current working directory. When a file is not
# found, a synthetic stand-in with the columns used by the cells below is
# generated instead, so the notebook can still be run end to end.
SAMPLE_SEED = 42                 # fixed seed: sample data is identical on every run
LINK_CAPACITY_BPS = 10_000_000   # link capacity assumed by the sample utilization formula
SAMPLE_INTERVAL_SEC = 10         # spacing between consecutive sample link records


def make_sample_flows(n=100, seed=SAMPLE_SEED):
    """Build ``n`` synthetic flow records for testing the notebook.

    Args:
        n: Number of rows to generate.
        seed: Seed for the random generator; the same seed always yields the
            same frame.

    Returns:
        DataFrame with randomly drawn ``packet_count``, ``byte_count``,
        ``duration_sec``, ``dst_port`` and ``label`` columns, a sequential
        ``flow_id``, and the derived ``bytes_per_packet``,
        ``packets_per_sec`` and ``bytes_per_sec`` rates.
    """
    rng = np.random.default_rng(seed)
    sample = pd.DataFrame({
        'flow_id': range(n),
        'packet_count': rng.integers(10, 10000, n),
        'byte_count': rng.integers(1000, 1000000, n),
        'duration_sec': rng.uniform(1, 120, n),
        'dst_port': rng.choice([80, 443, 5001, 5002, 5003, 5000], n),
        'label': rng.choice(['P0', 'P1', 'P2', 'P3'], n, p=[0.2, 0.4, 0.2, 0.2]),
    })
    # packet_count >= 10 and duration_sec >= 1, so these divisions are safe.
    sample['bytes_per_packet'] = sample['byte_count'] / sample['packet_count']
    sample['packets_per_sec'] = sample['packet_count'] / sample['duration_sec']
    sample['bytes_per_sec'] = sample['byte_count'] / sample['duration_sec']
    return sample


def make_sample_links(n=500, seed=SAMPLE_SEED):
    """Build ``n`` synthetic link records for a single switch port.

    Args:
        n: Number of rows to generate.
        seed: Seed for the random generator; the same seed always yields the
            same frame.

    Returns:
        DataFrame with ``timestamp``, ``switch``, ``port``, ``bytes_delta``,
        ``hour_of_day``, ``label`` and ``utilization`` columns. Utilization is
        ``bytes_delta`` expressed as a fraction of what the link can carry in
        one sampling interval, so it lies in [0, 1).
    """
    rng = np.random.default_rng(seed)
    # Upper bound for bytes_delta: the bytes a fully loaded link moves in one
    # interval. Capping at the per-second capacity instead would limit the
    # sample utilization to 10% and leave the 70% threshold plots empty.
    max_bytes_per_interval = LINK_CAPACITY_BPS * SAMPLE_INTERVAL_SEC // 8
    sample = pd.DataFrame({
        'timestamp': pd.date_range('2026-01-01', periods=n, freq=f'{SAMPLE_INTERVAL_SEC}s'),
        'switch': 's1',
        'port': 1,
        'bytes_delta': rng.integers(0, max_bytes_per_interval, n),
        # Drawn independently of 'timestamp' so that every hour is represented.
        'hour_of_day': rng.integers(0, 24, n),
        # Drawn independently of 'utilization'; the sample label is not
        # derived from any threshold.
        'label': rng.choice([0, 1], n, p=[0.8, 0.2]),
    })
    sample['utilization'] = (
        sample['bytes_delta'] * 8 / (LINK_CAPACITY_BPS * SAMPLE_INTERVAL_SEC)
    )
    return sample


try:
    flows_df = pd.read_csv('flows.csv')
    print(f"Loaded flows.csv: {len(flows_df)} records")
except FileNotFoundError:
    print("flows.csv not found - using sample data")
    flows_df = make_sample_flows()

try:
    links_df = pd.read_csv('link_timeseries.csv')
    print(f"Loaded link_timeseries.csv: {len(links_df)} records")
except FileNotFoundError:
    print("link_timeseries.csv not found - using sample data")
    links_df = make_sample_links()

## 2. Flows Dataset Analysis

In [None]:
# Basic info
print("Flows Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
flows_df.info()
print("\nFirst 5 rows:")
flows_df.head()

In [None]:
# Class distribution: count the rows per label, print the table, then draw
# the same counts as a bar chart.
print("Label Distribution:")
label_counts = flows_df['label'].value_counts()
print(label_counts)

bar_colors = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4']
fig, ax = plt.subplots(figsize=(8, 5))
# rot=0 keeps the class names horizontal under the bars.
label_counts.plot.bar(ax=ax, color=bar_colors, rot=0)
ax.set(
    title='Traffic Priority Class Distribution',
    xlabel='Priority Class',
    ylabel='Count',
)
fig.tight_layout()
plt.show()

In [None]:
# Feature distributions by class: one panel per feature, with one
# semi-transparent histogram per label overlaid in each panel.
numeric_features = ['packet_count', 'byte_count', 'bytes_per_packet',
                    'packets_per_sec', 'bytes_per_sec', 'duration_sec']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# One boolean row mask per label, in order of first appearance in the data.
class_masks = {cls: flows_df['label'] == cls for cls in flows_df['label'].unique()}

for panel, feature in zip(axes, numeric_features):
    # A feature missing from the table leaves its panel blank.
    if feature not in flows_df.columns:
        continue
    for cls, mask in class_masks.items():
        panel.hist(flows_df.loc[mask, feature], bins=30, alpha=0.5, label=cls)
    panel.set_title(f'{feature} by Class')
    panel.legend()
    panel.set_xlabel(feature)

fig.tight_layout()
plt.show()

In [None]:
# Correlation matrix: pairwise correlations between all numeric columns of
# the flows table, drawn as an annotated heatmap centred on zero.
numeric_cols = flows_df.select_dtypes(include=[np.number]).columns
corr_matrix = flows_df.loc[:, numeric_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Feature Correlation Matrix')
corr_fig.tight_layout()
plt.show()

## 3. Link Timeseries Analysis

In [None]:
# Basic info
print("Links Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
links_df.info()
print("\nFirst 5 rows:")
links_df.head()

In [None]:
# Utilization distribution: histogram on the left, time series on the right,
# each with the 0.7 congestion threshold marked in red.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, time_ax = axes

# Left panel: histogram of all utilization samples
hist_ax.hist(links_df['utilization'], bins=50, edgecolor='black')
hist_ax.axvline(x=0.7, color='red', linestyle='--', label='Congestion threshold (70%)')
hist_ax.set(title='Link Utilization Distribution', xlabel='Utilization', ylabel='Count')
hist_ax.legend()

# Right panel: utilization over time (left blank when there is no timestamp column)
if 'timestamp' in links_df.columns:
    # Stored back on the frame so the column is datetime-typed from here on.
    links_df['timestamp'] = pd.to_datetime(links_df['timestamp'])
    time_ax.plot(links_df['timestamp'], links_df['utilization'])
    time_ax.axhline(y=0.7, color='red', linestyle='--', label='Congestion threshold')
    time_ax.set(title='Utilization Over Time', xlabel='Time', ylabel='Utilization')
    time_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Congestion by hour of day: mean utilization per hour as a bar chart, with
# the 0.7 congestion threshold drawn for reference.
if 'hour_of_day' in links_df.columns:
    hourly_util = links_df.groupby('hour_of_day')['utilization'].mean()

    hour_fig, hour_ax = plt.subplots(figsize=(10, 4))
    hourly_util.plot.bar(ax=hour_ax)
    hour_ax.axhline(y=0.7, color='red', linestyle='--', label='Congestion threshold')
    # Set after plotting so 'Hour' replaces the axis label pandas derives
    # from the index name.
    hour_ax.set(
        title='Average Utilization by Hour of Day',
        xlabel='Hour',
        ylabel='Average Utilization',
    )
    hour_ax.legend()
    hour_fig.tight_layout()
    plt.show()

In [None]:
# Congestion label distribution: row count per label value, followed by the
# mean of the label column formatted as a percentage.
congestion_labels = links_df['label']
print("Congestion Label Distribution:")
print(congestion_labels.value_counts())
congestion_rate = congestion_labels.mean()
print(f"\nCongestion rate: {congestion_rate:.1%}")

## 4. Summary Statistics

In [None]:
# Text summary of both datasets: record totals, per-class share of the flows
# table, and congestion/utilization figures for the links table.
banner = "=" * 50
total_flows = len(flows_df)

print(banner)
print("FLOWS DATASET SUMMARY")
print(banner)
print(f"Total records: {total_flows}")
print("\nClass distribution:")
for cls in sorted(flows_df['label'].unique()):
    cls_count = int((flows_df['label'] == cls).sum())
    cls_share = cls_count / total_flows * 100
    print(f"  {cls}: {cls_count} ({cls_share:.1f}%)")

link_utilization = links_df['utilization']

print("\n" + banner)
print("LINKS DATASET SUMMARY")
print(banner)
print(f"Total records: {len(links_df)}")
print(f"Congestion rate: {links_df['label'].mean():.1%}")
print(f"Average utilization: {link_utilization.mean():.1%}")
print(f"Max utilization: {link_utilization.max():.1%}")

## Next Steps

1. **Train Classifier**: Open `02_train_classifier.ipynb` to train the traffic classifier
2. **Train Predictor**: Open `03_train_predictor.ipynb` to train the congestion predictor
3. **Evaluate Models**: Open `04_model_evaluation.ipynb` to evaluate model performance