# SDN ML Traffic Management - Traffic Classifier Training

This notebook trains a RandomForest classifier to categorize network flows into priority classes:
- **P3**: Banking/Payment (highest priority)
- **P2**: Voice/Video (low jitter)
- **P1**: Web/Office (best effort)
- **P0**: Bulk/Background (lowest priority)

In [None]:
# Install the third-party packages imported by this notebook
# (-q keeps pip's output to a minimum)
!pip install -q pandas numpy scikit-learn matplotlib seaborn joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from google.colab import files

# Set random seed for reproducibility.
# RANDOM_STATE is passed as `random_state` to train_test_split and
# RandomForestClassifier in later cells; seeding NumPy's global generator
# makes the np.random-based synthetic data repeatable as well.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Load Data

In [None]:
# Load flows.csv from the working directory; if it is not there, ask for an
# upload through the Colab file picker.
import io

try:
    df = pd.read_csv('flows.csv')
except FileNotFoundError:
    print("Upload flows.csv:")
    uploaded = files.upload()
    if uploaded:
        # files.upload() returns {filename: file bytes}. Parse the bytes
        # directly so the load still works when the file was saved under a
        # different name (e.g. "flows (1).csv").
        uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
        df = pd.read_csv(io.BytesIO(uploaded_bytes))
        print(f"Loaded {uploaded_name}: {len(df)} records")
    else:
        # Nothing was uploaded: keep an empty frame so the next cell can fall
        # back to the synthetic dataset instead of failing on an undefined df.
        df = pd.DataFrame()
        print("No file uploaded; continuing with an empty dataset")
else:
    print(f"Loaded flows.csv: {len(df)} records")

In [None]:
# Fall back to a synthetic dataset when fewer than 100 real flows are available
if len(df) < 100:
    print("Creating synthetic training data...")
    np.random.seed(RANDOM_STATE)
    n_samples = 1000
    per_class = n_samples // 4  # equal share for each of the four classes

    # One sampler per priority class. Each sampler draws its random values in
    # a fixed order so the seeded dataset is reproducible.

    def _sample_banking():
        # P3 Banking: short duration, small packets, HTTPS
        return {
            'packet_count': np.random.randint(10, 100),
            'byte_count': np.random.randint(5000, 50000),
            'duration_sec': np.random.uniform(0.5, 10),
            'dst_port': np.random.choice([443, 5003]),
            'protocol': 'tcp',
            'label': 'P3',
        }

    def _sample_voice():
        # P2 Voice: constant bitrate, small packets, UDP
        seconds = np.random.uniform(30, 300)
        return {
            'packet_count': int(seconds * 50),   # ~50 packets/sec for voice
            'byte_count': int(seconds * 8000),   # 8000 bytes/sec, i.e. ~64kbps
            'duration_sec': seconds,
            'dst_port': np.random.choice([5060, 5002]),
            'protocol': 'udp',
            'label': 'P2',
        }

    def _sample_web():
        # P1 Web: variable size, medium duration
        return {
            'packet_count': np.random.randint(50, 500),
            'byte_count': np.random.randint(50000, 500000),
            'duration_sec': np.random.uniform(5, 60),
            'dst_port': np.random.choice([80, 443, 8080]),
            'protocol': 'tcp',
            'label': 'P1',
        }

    def _sample_bulk():
        # P0 Bulk: long duration, high bandwidth
        seconds = np.random.uniform(60, 600)
        return {
            'packet_count': np.random.randint(10000, 100000),
            'byte_count': np.random.randint(10000000, 100000000),
            'duration_sec': seconds,
            'dst_port': np.random.choice([20, 21, 5000]),
            'protocol': 'tcp',
            'label': 'P0',
        }

    # All P3 rows first, then P2, P1 and P0
    samplers = (_sample_banking, _sample_voice, _sample_web, _sample_bulk)
    df = pd.DataFrame([draw() for draw in samplers for _ in range(per_class)])

    # Derived rate features, appended after the raw columns
    df = df.assign(
        bytes_per_packet=df['byte_count'] / df['packet_count'],
        packets_per_sec=df['packet_count'] / df['duration_sec'],
        bytes_per_sec=df['byte_count'] / df['duration_sec'],
    )

    print(f"Created {len(df)} synthetic samples")

In [None]:
# Summarise the dataset: dimensions, rows per label, and a preview of the first rows
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)

label_counts = df['label'].value_counts()
print("\nLabel distribution:")
print(label_counts)

df.head()

## 2. Feature Engineering

In [None]:
# Select features for training
feature_columns = [
    'packet_count',
    'byte_count',
    'duration_sec',
    'bytes_per_packet',
    'packets_per_sec',
    'bytes_per_sec',
]

# Derive the rate features when the loaded data does not already carry them
# (a raw flows.csv may only hold counters and durations). Existing columns are
# left untouched.
derived_ratios = {
    'bytes_per_packet': ('byte_count', 'packet_count'),
    'packets_per_sec': ('packet_count', 'duration_sec'),
    'bytes_per_sec': ('byte_count', 'duration_sec'),
}
for derived_name, (numerator, denominator) in derived_ratios.items():
    if derived_name not in df.columns:
        # A zero denominator yields NaN (filled with 0 when the feature matrix
        # is prepared) rather than inf, which the classifier cannot ingest.
        df[derived_name] = df[numerator] / df[denominator].replace(0, np.nan)

# Add protocol as binary feature
if 'protocol' in df.columns:
    # astype(str) keeps the comparison working when the column is not string-typed
    df['is_udp'] = (df['protocol'].astype(str).str.lower() == 'udp').astype(int)
    feature_columns.append('is_udp')

# Add port-based features
if 'dst_port' in df.columns:
    df['is_https'] = (df['dst_port'] == 443).astype(int)
    df['is_voice_port'] = df['dst_port'].isin([5060, 5061, 5002]).astype(int)
    feature_columns.extend(['is_https', 'is_voice_port'])

print("Features used:", feature_columns)

In [None]:
# Prepare feature matrix and labels
X = df[feature_columns].copy()
y = df['label'].copy()

# Handle missing and non-finite values. Ratio features computed from a zero
# duration or packet count come out as +/-inf; fillna() alone leaves those in
# place and the classifier rejects non-finite input at fit time, so map them
# to NaN first and then fill everything with 0.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# Encode labels as integers (LabelEncoder assigns codes in sorted label order)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {i} -> {label}")

In [None]:
# Hold out 20% of the rows for testing; stratifying on the encoded labels keeps
# the class proportions the same in both splits.
split_options = {
    'test_size': 0.2,
    'random_state': RANDOM_STATE,
    'stratify': y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

## 3. Train Classifier

In [None]:
# Hyperparameters for the RandomForest classifier
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': RANDOM_STATE,  # same seed as the rest of the notebook
    'n_jobs': -1,                  # use all available CPU cores
}
clf = RandomForestClassifier(**rf_params)

print("Training classifier...")
clf.fit(X_train, y_train)
print("Training complete!")

In [None]:
# 5-fold cross-validation on the training split only (the test split stays unseen)
cv_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

## 4. Evaluate Model

In [None]:
# Predict on the held-out test split and report the overall accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Per-class precision/recall/F1, labelled with the original class names
print("\nClassification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report_text)

In [None]:
# Confusion matrix heatmap: true labels on the y-axis, predicted labels on the x-axis
cm = confusion_matrix(y_test, y_pred)
class_names = label_encoder.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:
# Rank the features by the importance the trained forest assigns to them
feature_importance = (
    pd.DataFrame({'feature': feature_columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

print("Feature Importance:")
print(feature_importance)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for Traffic Classification')
ax.invert_yaxis()  # put the most important feature at the top
fig.tight_layout()
plt.show()

## 5. Save Model

In [None]:
# Bundle everything needed at inference time into one file: the fitted
# classifier, the label encoder that maps class indices back to names, the
# feature column order, and the measured test accuracy.
model_path = 'classifier.pkl'
model_artifacts = {
    'classifier': clf,
    'label_encoder': label_encoder,
    'feature_columns': feature_columns,
    'accuracy': accuracy,
}

joblib.dump(model_artifacts, model_path)
print("Model saved to classifier.pkl")

In [None]:
# Send the saved model file to the browser as a download (Colab helper)
model_file = 'classifier.pkl'
files.download(model_file)
print("\nDownload the classifier.pkl file and place it in ml/models/ directory")

## 6. Test Inference

In [None]:
# Test inference with sample flows
def predict_priority(flow_features):
    """Predict the priority class for a single flow.

    The raw flow record is expanded into the columns listed in the
    notebook-level ``feature_columns`` and scored with the trained ``clf``.
    Indicator features are derived from ``protocol`` / ``dst_port`` with the
    same rules used when the training features were built, so a flow described
    by its port or protocol is scored consistently with the training data.

    Args:
        flow_features: dict of flow fields. ``packet_count``, ``byte_count``
            and ``duration_sec`` are needed for any rate feature that is not
            supplied explicitly. ``protocol`` and ``dst_port`` are optional.
            Explicit ``is_udp`` / ``is_https`` / ``is_voice_port`` values take
            precedence over derived ones.

    Returns:
        tuple: ``(label, proba)`` where ``label`` is the predicted class name
        taken from ``label_encoder.classes_`` and ``proba`` is the per-class
        probability array from ``clf.predict_proba`` for this flow.
    """
    features = pd.DataFrame([flow_features])

    # Rate features: derived column -> (numerator, denominator)
    derived_ratios = {
        'bytes_per_packet': ('byte_count', 'packet_count'),
        'packets_per_sec': ('packet_count', 'duration_sec'),
        'bytes_per_sec': ('byte_count', 'duration_sec'),
    }
    for derived_name, (numerator, denominator) in derived_ratios.items():
        if derived_name not in features.columns:
            features[derived_name] = features[numerator] / features[denominator]

    # Protocol indicator: derive from 'protocol' when given, otherwise default to 0
    if 'is_udp' not in features.columns:
        if 'protocol' in features.columns:
            is_udp = features['protocol'].astype(str).str.lower() == 'udp'
            features['is_udp'] = is_udp.astype(int)
        else:
            features['is_udp'] = 0

    # Port indicators: derive from 'dst_port' when given, otherwise default to 0.
    # Without this, a flow passed with dst_port=443 would be scored as non-HTTPS.
    has_port = 'dst_port' in features.columns
    if 'is_https' not in features.columns:
        features['is_https'] = (
            (features['dst_port'] == 443).astype(int) if has_port else 0
        )
    if 'is_voice_port' not in features.columns:
        features['is_voice_port'] = (
            features['dst_port'].isin([5060, 5061, 5002]).astype(int) if has_port else 0
        )

    # Select features in training order. A zero duration or packet count makes
    # the ratios +/-inf, which fillna() does not touch and the classifier
    # rejects, so convert those to NaN before filling with 0.
    X = features[feature_columns].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Predict
    pred = clf.predict(X)[0]
    proba = clf.predict_proba(X)[0]

    return label_encoder.classes_[pred], proba

# Hand-written sample flows; the trailing comment is the class each one is meant to resemble
test_samples = [
    {'packet_count': 50, 'byte_count': 25000, 'duration_sec': 2, 'dst_port': 443},  # Banking?
    {'packet_count': 3000, 'byte_count': 240000, 'duration_sec': 60, 'is_udp': 1},  # Voice?
    {'packet_count': 200, 'byte_count': 100000, 'duration_sec': 30, 'dst_port': 80},  # Web?
    {'packet_count': 50000, 'byte_count': 50000000, 'duration_sec': 120},  # Bulk?
]

print("Sample Predictions:")
print("-" * 60)

# Lazily pair every sample with its prediction so each flow is scored right
# before its own lines are printed.
scored_samples = ((flow, *predict_priority(flow)) for flow in test_samples)
for sample, priority, proba in scored_samples:
    print(f"Sample: {sample}")
    print(f"  Predicted: {priority}")
    print(f"  Probabilities: {dict(zip(label_encoder.classes_, proba.round(3)))}")
    print()

## Next Steps

1. Download `classifier.pkl` and place it in `ml/models/` directory
2. Train the congestion predictor: `03_train_predictor.ipynb`
3. The orchestrator will automatically use the trained model if present