# SDN ML Traffic Management - Congestion Predictor Training

This notebook trains two models to predict network congestion — a regressor for next-interval link utilization and a classifier that flags congestion — based on:
- Historical utilization patterns
- Time-of-day features
- Trend analysis

In [None]:
# Install dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install -q pandas numpy scikit-learn matplotlib seaborn joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from google.colab import files

# Single seed for the whole notebook: it seeds NumPy's global RNG (used by the
# synthetic data generator) and is passed as random_state to both forests.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Load Data

In [None]:
# Load link_timeseries.csv from the working directory. When it is missing, ask
# for an upload and read whichever file was actually provided.
DATA_FILE = 'link_timeseries.csv'

try:
    df = pd.read_csv(DATA_FILE)
    print(f"Loaded {DATA_FILE}: {len(df)} records")
except FileNotFoundError:
    print(f"Upload {DATA_FILE}:")
    uploaded = files.upload()  # maps saved file name -> content
    if uploaded:
        # The uploaded file is not guaranteed to be called DATA_FILE, so read
        # it under the name it was saved as instead of assuming the default.
        uploaded_name = next(iter(uploaded))
        df = pd.read_csv(uploaded_name)
        print(f"Loaded {uploaded_name}: {len(df)} records")
    else:
        # Upload cancelled: continue with an empty frame so the synthetic-data
        # cell (which triggers on len(df) < 100) can generate training data.
        print("No file uploaded - continuing with an empty dataset")
        df = pd.DataFrame()

In [None]:
# Fall back to generated traffic when the loaded dataset is too small
if len(df) < 100:
    print("Creating synthetic training data...")

    # One week of samples, one every 10 seconds (360 per hour)
    n_samples = 7 * 24 * 360
    timestamps = pd.date_range('2026-01-01', periods=n_samples, freq='10s')

    # Fractional hour of the day drives the daily load profile
    hours = timestamps.hour + timestamps.minute / 60

    def _load_band(h):
        """Return (floor, noise_width) of the utilization band for hour h."""
        if h < 6:       # night: low
            return 0.1, 0.1
        if h < 8:       # early morning: linear ramp-up
            return 0.2 + (h - 6) * 0.15, 0.1
        if h < 10:      # morning peak (login surge)
            return 0.6, 0.3
        if h < 12:      # late morning
            return 0.5, 0.2
        if h < 14:      # lunch dip
            return 0.4, 0.15
        if h < 17:      # afternoon work hours
            return 0.5, 0.2
        if h < 19:      # after work
            return 0.4, 0.15
        return 0.2, 0.1  # evening

    # One scalar draw per sample, in time order, so the seeded RNG stream is
    # consumed exactly once per interval.
    base_util = np.zeros(n_samples)
    for i, h in enumerate(hours):
        floor, noise_width = _load_band(h)
        base_util[i] = floor + np.random.uniform(0, noise_width)

    # Weekend traffic is half of the weekday level
    is_weekend = timestamps.dayofweek >= 5
    base_util = np.where(is_weekend, base_util * 0.5, base_util)

    # Random spikes on 1% of the samples
    n_spikes = n_samples // 100
    spike_indices = np.random.choice(n_samples, size=n_spikes, replace=False)
    base_util[spike_indices] += np.random.uniform(0.2, 0.4, len(spike_indices))

    # Utilization is a fraction of link capacity: keep it inside [0, 1]
    base_util = base_util.clip(0, 1)

    df = pd.DataFrame({
        'timestamp': timestamps,
        'switch': 's1',
        'port': 1,
        'utilization': base_util,
        'bytes_delta': (base_util * 10_000_000 * 10 / 8).astype(int),  # 10Mbps link, 10s interval
        'hour_of_day': timestamps.hour,
        'minute_of_hour': timestamps.minute,
        'is_weekday': ~is_weekend,
    })

    # Label: is the *next* interval congested? The comparison already yields
    # False for the final row (no next value), so no NaN filling is required.
    next_util = df['utilization'].shift(-1)
    df['label'] = (next_util > 0.7).astype(int)

    print(f"Created {len(df)} synthetic samples")

In [None]:
# Normalise the timestamp column and (re)derive the calendar features from it
if 'timestamp' in df.columns:
    ts = pd.to_datetime(df['timestamp'])
    df = df.assign(
        timestamp=ts,
        hour_of_day=ts.dt.hour,
        minute_of_hour=ts.dt.minute,
        day_of_week=ts.dt.dayofweek,
        is_weekday=ts.dt.dayofweek < 5,  # Monday=0 ... Friday=4
    )

print("Dataset shape:", df.shape)
df.head()

## 2. Feature Engineering

In [None]:
# Create lag features (previous utilization values)
LAG_STEPS = 6  # Use last 6 intervals (1 minute of history)

# A history must never run from one link into another, and it must follow time
# order: sort chronologically, then compute every shift / rolling statistic
# separately for each (switch, port) series when those columns exist.
if 'timestamp' in df.columns:
    df = df.sort_values('timestamp', kind='stable')
link_keys = [col for col in ('switch', 'port') if col in df.columns]


def _per_link(transform):
    """Apply `transform` to each link's utilization series, row-aligned with df."""
    if link_keys:
        return df.groupby(link_keys, sort=False)['utilization'].transform(transform)
    return transform(df['utilization'])


for i in range(1, LAG_STEPS + 1):
    # k=i binds the current lag to the lambda (avoids late binding of i)
    df[f'util_lag_{i}'] = _per_link(lambda s, k=i: s.shift(k))

# Rolling statistics over the current interval and the LAG_STEPS - 1 before it
df['util_rolling_mean'] = _per_link(lambda s: s.rolling(window=LAG_STEPS).mean())
df['util_rolling_std'] = _per_link(lambda s: s.rolling(window=LAG_STEPS).std())
df['util_rolling_max'] = _per_link(lambda s: s.rolling(window=LAG_STEPS).max())

# Trend feature: change from the previous interval to the current one
df['util_trend'] = df['utilization'] - df['util_lag_1']

# Drop rows with NaN (the warm-up rows of each link lack a full history)
df = df.dropna()

print(f"After feature engineering: {len(df)} samples")

In [None]:
# Define features.
# Lag column names are derived from LAG_STEPS so this list cannot drift from
# the columns the feature-engineering step actually created.
feature_columns = [
    *(f'util_lag_{i}' for i in range(1, LAG_STEPS + 1)),
    'util_rolling_mean', 'util_rolling_std', 'util_rolling_max',
    'util_trend',
    'hour_of_day', 'minute_of_hour',
]

if 'is_weekday' in df.columns:
    feature_columns.append('is_weekday')

# Fail here with a readable message rather than with a KeyError further down
# (e.g. a CSV without a timestamp column has no hour_of_day / minute_of_hour).
missing_features = [col for col in feature_columns if col not in df.columns]
assert not missing_features, f"Missing feature columns: {missing_features}"

print("Features:", feature_columns)

In [None]:
# Prepare data.
# Target: utilization of the *next* interval on the same link. Rows without a
# next interval (the last row of each link) have no real target, so they are
# dropped instead of being given a made-up value.
link_keys = [col for col in ('switch', 'port') if col in df.columns]
if link_keys:
    next_util = df.groupby(link_keys, sort=False)['utilization'].shift(-1)
else:
    next_util = df['utilization'].shift(-1)
has_target = next_util.notna()

X = df.loc[has_target, feature_columns].copy()

# Convert boolean to int
if 'is_weekday' in X.columns:
    X['is_weekday'] = X['is_weekday'].astype(int)

# Regression target (next utilization) and classification target (congestion)
y_regression = next_util[has_target]
y_classification = (y_regression > 0.7).astype(int)  # Congested if > 70%

assert len(X) == len(y_regression) == len(y_classification)

print(f"Congestion rate: {y_classification.mean():.1%}")

In [None]:
# Order-preserving 80/20 split: the first 80% of the rows train the models,
# the remaining 20% evaluate them (no shuffling)
split_idx = int(len(X) * 0.8)
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train_reg, y_test_reg = y_regression.iloc[train_rows], y_regression.iloc[test_rows]
y_train_cls, y_test_cls = y_classification.iloc[train_rows], y_classification.iloc[test_rows]

print(f"Training: {len(X_train)}, Test: {len(X_test)}")

## 3. Train Regression Model (Predict Utilization)

In [None]:
# Random forest that predicts the next interval's utilization
regressor_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    random_state=RANDOM_STATE,
    n_jobs=-1,  # use every available core
)
regressor = RandomForestRegressor(**regressor_params)

print("Training regressor...")
regressor.fit(X_train, y_train_reg)
print("Done!")

In [None]:
# Score the regressor on the held-out rows
y_pred_reg = regressor.predict(X_test)

mse, mae, r2 = (
    metric(y_test_reg, y_pred_reg)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Regression Metrics:")
for metric_name, metric_value, fmt in (
    ("MSE", mse, ".6f"),
    ("MAE", mae, ".6f"),
    ("R2 Score", r2, ".4f"),
):
    print(f"  {metric_name}: {metric_value:{fmt}}")

In [None]:
# Two views of regressor quality: predicted-vs-actual scatter and a time slice
fig, (ax_scatter, ax_series) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: points on the red diagonal are perfect predictions
ax_scatter.scatter(y_test_reg, y_pred_reg, alpha=0.3, s=10)
ax_scatter.plot([0, 1], [0, 1], 'r--', label='Perfect prediction')
ax_scatter.axhline(y=0.7, color='orange', linestyle=':', label='Congestion threshold')
ax_scatter.axvline(x=0.7, color='orange', linestyle=':')
ax_scatter.set(
    xlabel='Actual Utilization',
    ylabel='Predicted Utilization',
    title='Predicted vs Actual',
)
ax_scatter.legend()

# Right panel: the most recent test samples (at most 500)
n_plot = min(500, len(y_test_reg))
ax_series.plot(y_test_reg.values[-n_plot:], label='Actual', alpha=0.7)
ax_series.plot(y_pred_reg[-n_plot:], label='Predicted', alpha=0.7)
ax_series.axhline(y=0.7, color='red', linestyle='--', label='Congestion threshold')
ax_series.set(
    xlabel='Time Index',
    ylabel='Utilization',
    title='Prediction Over Time',
)
ax_series.legend()

fig.tight_layout()
plt.show()

## 4. Train Classification Model (Predict Congestion)

In [None]:
# Random forest that flags whether the next interval will be congested
classifier_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced',  # congested intervals are the minority class
    random_state=RANDOM_STATE,
    n_jobs=-1,  # use every available core
)
classifier = RandomForestClassifier(**classifier_params)

print("Training classifier...")
classifier.fit(X_train, y_train_cls)
print("Done!")

In [None]:
# Evaluate classifier
y_pred_cls = classifier.predict(X_test)

accuracy = accuracy_score(y_test_cls, y_pred_cls)
print(f"Classification Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
# labels=[0, 1] keeps the report two-class even when the test window contains
# a single class (otherwise the two target_names no longer match and
# scikit-learn raises); zero_division=0 reports 0 for a class with no samples
# or no predictions instead of emitting warnings.
print(classification_report(
    y_test_cls,
    y_pred_cls,
    labels=[0, 1],
    target_names=['Normal', 'Congested'],
    zero_division=0,
))

In [None]:
# Confusion matrix.
# labels=[0, 1] guarantees a 2x2 matrix that matches the two tick labels even
# if one class is absent from both the test targets and the predictions.
cm = confusion_matrix(y_test_cls, y_pred_cls, labels=[0, 1])

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Congested'],
            yticklabels=['Normal', 'Congested'])
plt.title('Congestion Prediction Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

In [None]:
# Rank the features by the importance the regressor assigned to them
importance_df = (
    pd.DataFrame({
        'feature': feature_columns,
        'importance': regressor.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("Feature Importance (Regressor):")
print(importance_df)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance for Congestion Prediction')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()

## 5. Save Models

In [None]:
# Bundle both models with the metadata needed to rebuild their inputs
evaluation_metrics = {
    'regression_r2': r2,
    'regression_mae': mae,
    'classification_accuracy': accuracy,
}

predictor_artifacts = dict(
    regressor=regressor,
    classifier=classifier,
    feature_columns=feature_columns,  # column order expected by both models
    lag_steps=LAG_STEPS,
    congestion_threshold=0.7,
    metrics=evaluation_metrics,
)

joblib.dump(predictor_artifacts, 'predictor.pkl')
print("Models saved to predictor.pkl")

In [None]:
# Hand the saved artifact bundle to the browser via google.colab's files helper
artifact_name = 'predictor.pkl'
files.download(artifact_name)
print(f"\nDownload {artifact_name} and place it in ml/models/ directory")

## 6. Test Inference

In [None]:
def predict_congestion(utilization_history, hour, minute, is_weekday=True):
    """
    Predict if congestion will occur in the next interval.

    Features are built the same way as the training columns: the newest
    sample is the current interval, ``util_lag_k`` is the sample ``k``
    intervals before it, the rolling statistics cover the newest
    ``LAG_STEPS`` samples (sample standard deviation, ddof=1, like pandas'
    ``rolling().std()``), and the trend is current minus previous.

    Args:
        utilization_history: Recent utilization values, oldest first. The
            last ``LAG_STEPS + 1`` values are used (current sample plus its
            lags). Shorter histories are left-padded with their oldest value
            (0.5 when empty), so a list of only ``LAG_STEPS`` values still
            works but ``util_lag_{LAG_STEPS}`` is then an approximation.
        hour: Current hour (0-23)
        minute: Current minute (0-59)
        is_weekday: Whether it's a weekday

    Returns:
        tuple: (predicted_utilization, will_be_congested, probability)
    """
    # Current sample + LAG_STEPS lags are required for exact training parity
    needed = LAG_STEPS + 1
    history = list(utilization_history)[-needed:]
    pad_value = history[0] if history else 0.5
    history = [pad_value] * (needed - len(history)) + history

    # Rolling window = current sample and the LAG_STEPS - 1 samples before it
    window = history[-LAG_STEPS:]

    # history[-1] is the current sample, so lag k sits at history[-1 - k]
    features = {f'util_lag_{k}': history[-1 - k] for k in range(1, LAG_STEPS + 1)}
    features.update({
        'util_rolling_mean': np.mean(window),
        'util_rolling_std': np.std(window, ddof=1),  # pandas rolling std uses ddof=1
        'util_rolling_max': np.max(window),
        'util_trend': history[-1] - history[-2],
        'hour_of_day': hour,
        'minute_of_hour': minute,
        'is_weekday': int(is_weekday),
    })

    # Column order must match the order the models were fitted with
    X = pd.DataFrame([features])[feature_columns]

    # Predict: either model crossing its threshold counts as congestion
    predicted_util = regressor.predict(X)[0]
    congested_proba = classifier.predict_proba(X)[0][1]
    will_be_congested = predicted_util > 0.7 or congested_proba > 0.5

    return predicted_util, will_be_congested, congested_proba

# Smoke-test the predictor on a few hand-written scenarios:
# (utilization history, hour, minute)
test_cases = [
    ([0.3, 0.35, 0.4, 0.45, 0.5, 0.55], 9, 0),   # Morning rise
    ([0.6, 0.65, 0.7, 0.72, 0.75, 0.78], 10, 0), # Already congested
    ([0.2, 0.2, 0.2, 0.2, 0.2, 0.2], 3, 0),      # Night, stable
    ([0.4, 0.5, 0.6, 0.65, 0.68, 0.72], 14, 30), # Afternoon spike
]

print("Congestion Predictions:")
print("-" * 70)
for history, hour, minute in test_cases:
    pred_util, congested, proba = predict_congestion(history, hour, minute)
    verdict = 'YES' if congested else 'NO'
    # One print per scenario; the trailing empty string yields the blank line
    print(
        f"History: {history[-3:]}... @ {hour:02d}:{minute:02d}",
        f"  Predicted utilization: {pred_util:.1%}",
        f"  Congestion: {verdict} (prob: {proba:.1%})",
        "",
        sep="\n",
    )

## Next Steps

1. Download `predictor.pkl` and place it in `ml/models/` directory
2. Run model evaluation notebook: `04_model_evaluation.ipynb`
3. The orchestrator will automatically use trained models if present