# 🚀 Enhanced Training Data Generator + Model Training

This notebook will:
1. Generate comprehensive training data with ALL spike types
2. Train XGBoost models on the enhanced data
3. Save .pkl files ready for download

**Run all cells in order!**

## 📦 Step 1: Install Dependencies

In [None]:
!pip install xgboost scikit-learn pandas numpy joblib -q
print("‚úÖ Dependencies installed!")

## 🎲 Step 2: Generate Enhanced Training Data

In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so that re-running the notebook from a fresh kernel
# produces the same random draws.
np.random.seed(42)

print("üöÄ Enhanced Training Data Generator")
print("=" * 60)

# Configuration
SECONDS_PER_DAY = 24 * 3600
TOTAL_DURATION = 10 * SECONDS_PER_DAY  # 10 days, expressed in seconds
BASE_PODS = 5                          # pod count used throughout the simulation

print(f"‚è±Ô∏è  Total duration: {TOTAL_DURATION // 86400} days")
print(f"üìä Expected rows: ~{TOTAL_DURATION:,}")

In [None]:
# Traffic Pattern Generators

def daily_baseline_pattern(t):
    """Return the baseline request rate for second ``t`` of the timeline.

    The rate is a 24-hour sinusoid centred on 60 with amplitude 20, plus a
    time-of-day boost and Gaussian noise (mean 0, std 5), floored at 10:

    * 06:00-09:00 - linear ramp from 0 up to +20
    * 12:00-13:00 - flat +30
    * 17:00-20:00 - half-sine hump peaking at +40

    Consumes exactly one draw from NumPy's global RNG per call.
    """
    hour_of_day = (t % 86400) / 3600

    # The three windows are disjoint, so at most one boost is non-zero.
    morning = lunch = evening = 0
    if 6 <= hour_of_day < 9:
        morning = 20 * ((hour_of_day - 6) / 3)
    elif 12 <= hour_of_day < 13:
        lunch = 30
    elif 17 <= hour_of_day < 20:
        evening = 40 * np.sin(np.pi * (hour_of_day - 17) / 3)

    sinusoid = 60 + 20 * np.sin(2 * np.pi * t / 86400)
    jitter = np.random.normal(0, 5)

    total = sinusoid + morning + lunch + evening + jitter
    return max(10, total)


def generate_sudden_spikes(duration, count=150):
    """Create ``count`` short, sharp spike events.

    Start times are drawn without replacement from ``range(duration)``. Each
    event lasts 30-79 seconds and has a multiplier drawn uniformly from
    [3.5, 5.0).

    Returns a list of ``('sudden', start, duration, magnitude)`` tuples.
    """
    starts = np.random.choice(duration, count, replace=False)
    # Tuple elements are evaluated left to right, so the length is drawn
    # before the magnitude for every event.
    return [
        ('sudden', int(begin), np.random.randint(30, 80), np.random.uniform(3.5, 5.0))
        for begin in starts
    ]


def generate_gradual_ramps(duration, count=80):
    """Create ``count`` slow ramp-up/ramp-down events.

    Start times are drawn without replacement from ``range(duration)``. Each
    ramp lasts 300-599 seconds and has a peak multiplier drawn uniformly
    from [2.0, 3.5).

    Returns a list of ``('gradual', start, duration, peak_magnitude)`` tuples.
    """
    def _ramp_at(begin):
        # Length first, then peak: keeps the RNG draw order fixed per event.
        length = np.random.randint(300, 600)
        peak = np.random.uniform(2.0, 3.5)
        return ('gradual', int(begin), length, peak)

    starts = np.random.choice(duration, count, replace=False)
    return [_ramp_at(begin) for begin in starts]


def generate_sustained_bursts(duration, count=60):
    """Create ``count`` long periods of sustained high load.

    Start times are drawn without replacement from ``range(duration)``. Each
    burst lasts 600-1799 seconds and has a multiplier drawn uniformly from
    [3.0, 4.5).

    Returns a list of ``('sustained', start, duration, magnitude)`` tuples.
    """
    def _bursts():
        for begin in np.random.choice(duration, count, replace=False):
            length = np.random.randint(600, 1800)
            level = np.random.uniform(3.0, 4.5)
            yield ('sustained', int(begin), length, level)

    return list(_bursts())


def generate_oscillating_patterns(duration, count=40):
    """Create ``count`` oscillating-traffic events.

    Start times are drawn without replacement from ``range(duration)``. Each
    pattern lasts 400-799 seconds and has a base magnitude drawn uniformly
    from [1.5, 2.5).

    Returns a list of ``('oscillating', start, duration, base_magnitude)``
    tuples.
    """
    starts = np.random.choice(duration, count, replace=False)
    events = []
    for begin in map(int, starts):
        # Draw the length before the magnitude for each event.
        length = np.random.randint(400, 800)
        events += [('oscillating', begin, length, np.random.uniform(1.5, 2.5))]
    return events


def generate_cascading_spikes(duration, count=30):
    """Create ``count`` cascades of short spikes with increasing magnitude.

    Cascade anchors are drawn without replacement from
    ``range(duration - 1000)``. Each cascade holds 3-5 spikes; spike ``i``
    starts at ``anchor + i * randint(100, 200)``, lasts 40-79 seconds and has
    magnitude ``2.0 + 0.5 * i``. The offset is re-drawn for every spike, so
    consecutive starts are not guaranteed to be evenly spaced.

    Returns a flat list of ``('cascading', start, duration, magnitude)``
    tuples, one per individual spike.
    """
    anchors = np.random.choice(duration - 1000, count, replace=False)
    events = []
    for anchor in map(int, anchors):
        # The spike count is drawn once per cascade, before any spike in it.
        for step in range(np.random.randint(3, 6)):
            offset = step * np.random.randint(100, 200)
            length = np.random.randint(40, 80)
            events.append(('cascading', anchor + offset, length, 2.0 + step * 0.5))
    return events


def generate_flash_crowds(duration, count=50):
    """Create ``count`` flash-crowd events (instant spike, then a decline).

    Start times are drawn without replacement from ``range(duration)``. The
    stored duration is a fixed 10-second spike phase plus a decline phase of
    300-599 seconds; only the sum is kept in the tuple. The magnitude is
    drawn uniformly from [4.0, 5.5).

    Returns a list of ``('flash_crowd', start, total_duration, magnitude)``
    tuples.
    """
    instant_phase = 10
    crowd_events = []
    for begin in np.random.choice(duration, count, replace=False):
        tail_length = np.random.randint(300, 600)
        peak = np.random.uniform(4.0, 5.5)
        crowd_events.append(('flash_crowd', int(begin), instant_phase + tail_length, peak))
    return crowd_events


def apply_traffic_events(t, base_traffic, all_events):
    """Scale ``base_traffic`` by the strongest event active at second ``t``.

    ``all_events`` is an iterable of ``(event_type, start, duration,
    magnitude)`` tuples. An event is active while ``start <= t < start +
    duration``. Each active event proposes a multiplier and the largest one
    wins; the multiplier never drops below 1.0.

    Multiplier per event type (``p`` is the elapsed fraction of the event):

    * ``sudden`` / ``cascading`` - ``magnitude``
    * ``gradual`` - ``1 + (magnitude - 1) * sin(pi * p)``
    * ``sustained`` - ``magnitude`` with multiplicative Gaussian noise (std
      0.05); draws one value from NumPy's global RNG
    * ``oscillating`` - ``1 + magnitude * sin(10 * pi * p)``
    * ``flash_crowd`` - ``magnitude`` for the first 10% of the event, then a
      linear decline towards 1

    Unknown event types are ignored. Returns ``base_traffic * multiplier``.
    """
    strongest = 1.0

    for kind, begin, length, magnitude in all_events:
        # Skip events whose [begin, begin + length) window does not cover t.
        if not (begin <= t < begin + length):
            continue

        elapsed = (t - begin) / length

        if kind in ('sudden', 'cascading'):
            factor = magnitude
        elif kind == 'gradual':
            factor = 1 + (magnitude - 1) * np.sin(np.pi * elapsed)
        elif kind == 'sustained':
            factor = magnitude * (1 + np.random.normal(0, 0.05))
        elif kind == 'oscillating':
            factor = 1 + magnitude * np.sin(10 * np.pi * elapsed)
        elif kind == 'flash_crowd':
            if elapsed < 0.1:
                factor = magnitude
            else:
                fade = (elapsed - 0.1) / 0.9
                factor = magnitude - (magnitude - 1) * fade
        else:
            continue

        strongest = max(strongest, factor)

    return base_traffic * strongest

# Reaching this line means every definition above in this cell executed.
print("‚úÖ Functions defined!")

In [None]:
# Generate all events
print("üìã Generating traffic events...")

# (summary label, event-type tag stored in each tuple, generator, requested count).
# The generators are called in this order, which fixes the sequence of RNG draws.
EVENT_PLAN = [
    ("Sudden spikes", "sudden", generate_sudden_spikes, 150),
    ("Gradual ramps", "gradual", generate_gradual_ramps, 80),
    ("Sustained bursts", "sustained", generate_sustained_bursts, 60),
    ("Oscillating patterns", "oscillating", generate_oscillating_patterns, 40),
    ("Cascading spikes", "cascading", generate_cascading_spikes, 30),
    ("Flash crowds", "flash_crowd", generate_flash_crowds, 50),
]

all_events = []
for _label, _tag, _generator, _requested in EVENT_PLAN:
    all_events.extend(_generator(TOTAL_DURATION, count=_requested))

print(f"‚úì Total events: {len(all_events)}")

# Report what was actually generated instead of hard-coded numbers, so the
# summary cannot drift from the plan. A cascade expands into several spike
# tuples, so its count is higher than the requested number of cascades.
for _label, _tag, _generator, _requested in EVENT_PLAN:
    _generated = sum(1 for _event in all_events if _event[0] == _tag)
    print(f"  - {_label}: {_generated}")

In [None]:
# Main simulation: walk the timeline one second at a time and record one
# metrics row per second. The order of the random draws inside the loop
# (baseline noise, event noise, payload, memory noise, latency noise) determines
# the generated data, so statements must not be reordered.
print("\n‚öôÔ∏è  Running simulation (this takes 3-5 minutes)...")

rows = []      # one list of metrics per simulated second
rr_index = 0   # running counter behind the round-robin pod index
queue = 0      # queue length carried over from the previous second

# Emit a progress line every 1/20th (5%) of the timeline.
progress_interval = TOTAL_DURATION // 20

for t in range(TOTAL_DURATION):
    if t % progress_interval == 0:
        progress = (t / TOTAL_DURATION) * 100
        print(f"  Progress: {progress:.0f}%")
    
    # Baseline rate for this second, scaled by the active events, then clamped
    # to the range [5, 500].
    base_req = daily_baseline_pattern(t)
    request_rate = apply_traffic_events(t, base_req, all_events)
    request_rate = max(5, min(500, request_rate))
    
    # Payload size is independent uniform noise; the pod index cycles
    # 0 .. BASE_PODS - 1.
    payload_kb = np.random.uniform(50, 500)
    pod = rr_index % BASE_PODS
    rr_index += 1
    
    # CPU is proportional to the request rate and saturates at 100 once the
    # rate reaches BASE_PODS * 15. Memory is 80% of CPU plus noise, capped at 100.
    cpu_used = min(100, (request_rate / (BASE_PODS * 15)) * 100)
    memory_used = min(100, cpu_used * 0.8 + np.random.normal(0, 2))
    # NOTE(review): request_rate is capped at 500 above, so with BASE_PODS = 5
    # (as configured in this notebook) request_rate * 0.1 never exceeds
    # BASE_PODS * 10. Starting from queue = 0 this expression therefore stays
    # at 0 for the whole run, making queue_length a constant column - confirm
    # whether that is intended.
    queue = max(0, queue * 0.9 + request_rate * 0.1 - BASE_PODS * 10)
    
    # Latency: 50 base + 0.8 per CPU point, an extra 2 per CPU point above 70,
    # 0.05 per queued item, Gaussian noise (std 3), floored at 30.
    latency = 50 + cpu_used * 0.8
    if cpu_used > 70:
        latency += (cpu_used - 70) * 2
    latency += queue * 0.05
    latency = max(30, latency + np.random.normal(0, 3))
    
    # Value order must match the column list used when the DataFrame is built.
    rows.append([
        t, request_rate, payload_kb, queue,
        cpu_used, memory_used, latency,
        BASE_PODS, pod
    ])

print("‚úì Simulation complete!")

In [None]:
# Create DataFrame
print("\nüìä Creating DataFrame...")

# Column names, listed in the same order as the values appended to each
# simulation row.
SIMULATION_COLUMNS = [
    "timestamp",
    "request_rate",
    "payload_size_kb",
    "queue_length",
    "cpu_used_pct",
    "memory_used_pct",
    "latency_ms",
    "active_pods",
    "rr_pod_index",
]

df = pd.DataFrame(rows, columns=SIMULATION_COLUMNS)

print(f"‚úì DataFrame created: {df.shape}")

In [None]:
# Feature Engineering
print("\nüîß Engineering features...")

# For each look-back window (in rows, i.e. seconds) add:
#   req_avg_<w>s   - trailing mean of request_rate over up to <w> rows
#   req_delta_<w>s - change in request_rate versus <w> rows earlier (0 where
#                    there is no earlier row)
request_series = df["request_rate"]
for window in (5, 10, 30):
    trailing_mean = request_series.rolling(window, min_periods=1).mean()
    change = request_series.diff(window).fillna(0)
    df[f"req_avg_{window}s"] = trailing_mean
    df[f"req_delta_{window}s"] = change

print("‚úì Rolling features created")

In [None]:
# Create Spike Labels
print("\nüè∑Ô∏è  Creating spike labels...")

# Horizon (rows ahead) -> multiplier the future request rate must exceed,
# relative to the current rate, for the row to be labelled a spike.
SPIKE_THRESHOLDS = {10: 1.5, 30: 1.6, 60: 1.7}

current_rate = df["request_rate"]
for horizon, threshold in SPIKE_THRESHOLDS.items():
    future_col = f"future_req_{horizon}s"
    label_col = f"spike_{horizon}s"

    # Request rate `horizon` rows ahead; NaN for the last `horizon` rows.
    df[future_col] = current_rate.shift(-horizon)
    df[label_col] = (df[future_col] > current_rate * threshold).astype(int)

    labels = df[label_col]
    spike_count = labels.sum()
    spike_pct = 100 * labels.mean()
    print(f"  spike_{horizon}s: {spike_count:,} ({spike_pct:.2f}%)")

# Drop the trailing rows whose future value is unknown (NaN from the shift).
df.dropna(inplace=True)
print(f"\n‚úì Final dataset: {df.shape}")

In [None]:
# Save CSV
output_file = "synthetic_k8s_load_enhanced.csv"
df.to_csv(output_file, index=False)  # index=False: do not write the row index

request_rates = df["request_rate"]
lowest_rate, highest_rate = request_rates.min(), request_rates.max()

print(f"\nüíæ Saved: {output_file}")
print(f"üìä Shape: {df.shape}")
print(f"üìà Request rate range: {lowest_rate:.1f} - {highest_rate:.1f}")

# Show sample (the bare last expression is rendered as the cell output)
print("\nüìù Sample data:")
df.head()

## 🤖 Step 3: Train XGBoost Models

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import joblib

print("üéØ Training XGBoost Models")
print("=" * 60)

# Define features: every column except the timestamp, the spike labels and the
# future request rates the labels were derived from.
ALL_COLUMNS = list(df.columns)

NON_FEATURE_COLUMNS = frozenset({
    "timestamp",
    "spike_10s", "spike_30s", "spike_60s",
    "future_req_10s", "future_req_30s", "future_req_60s",
})

# Built from ALL_COLUMNS so the original column order is preserved.
FEATURES = [name for name in ALL_COLUMNS if name not in NON_FEATURE_COLUMNS]

print(f"Features: {len(FEATURES)}")
print(FEATURES)

In [None]:
# Train models for each horizon
HORIZONS = [10, 30, 60]
trained_models = {}

# The feature matrix is the same for every horizon; only the label changes.
X = df[FEATURES]

for h in HORIZONS:
    print(f"\n{'='*60}")
    print(f"Training model for spike_{h}s")
    print(f"{'='*60}")
    
    y = df[f"spike_{h}s"]
    
    print(f"Dataset: {X.shape[0]:,} samples, {X.shape[1]} features")
    print(f"Positive samples: {y.sum():,} ({100*y.mean():.2f}%)")
    
    # Split data (80/20). shuffle=False keeps the rows in time order, so the
    # test set is the final 20% of the timeline.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )
    
    print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")
    
    # Train XGBoost
    model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        random_state=42
    )
    
    print("\n‚öôÔ∏è  Training...")
    model.fit(X_train, y_train, verbose=False)
    
    # Evaluate
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    print("\nüìä Performance:")
    # labels=[0, 1] keeps the report aligned with the two target names even
    # when the test split or the predictions contain only one class (otherwise
    # classification_report raises). zero_division=0 reports 0 for a class that
    # was never predicted, without emitting a warning.
    print(classification_report(
        y_test, y_pred,
        labels=[0, 1],
        target_names=['No Spike', 'Spike'],
        zero_division=0,
    ))
    # ROC-AUC is undefined when the test labels contain a single class.
    if y_test.nunique() < 2:
        print("ROC-AUC: n/a (test split contains a single class)")
    else:
        print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
    
    trained_models[h] = model
    print(f"\n‚úÖ Model for spike_{h}s trained successfully!")

print("\n" + "="*60)
print("‚úÖ ALL MODELS TRAINED!")
print("="*60)

## 💾 Step 4: Save Models as .pkl Files

In [None]:
print("üíæ Saving models...\n")

# One pickle per prediction horizon, written to the current working directory.
for h in HORIZONS:
    filename = "xgb_spike_{}s_enhanced.pkl".format(h)
    joblib.dump(trained_models[h], filename)
    print(f"‚úì Saved: {filename}")

banner = "=" * 60

# Closing summary and hand-off instructions, printed line by line.
closing_lines = [
    "\n" + banner,
    "üéâ SUCCESS! All models saved!",
    banner,
    "\nüì• Download these files:",
    "  - xgb_spike_10s_enhanced.pkl",
    "  - xgb_spike_30s_enhanced.pkl",
    "  - xgb_spike_60s_enhanced.pkl",
    "\nüîÑ Next steps:",
    "  1. Download the .pkl files from Colab",
    "  2. Copy to: ai_load_balancer_test/models/",
    "  3. Rename (remove '_enhanced' or update main.py)",
    "  4. Re-run tests: python main.py test_cases/sudden_spike.csv",
    "  5. Watch the magic happen! üöÄ",
    banner,
]
for line in closing_lines:
    print(line)

## 📊 Step 5: Quick Verification Test

In [None]:
# Test on a sudden spike pattern
print("üß™ Testing on synthetic sudden spike...\n")

# Create test scenario: 40 req/s then jump to 180
test_traffic = np.concatenate([
    np.ones(100) * 40,  # Baseline
    np.ones(50) * 180,  # Spike
    np.ones(100) * 40   # Back to baseline
])

# Rebuild the raw metrics with the same formulas as the training simulation
# (minus the random noise), including the extra latency above 70% CPU.
# Without that penalty the spike rows would not match the training data.
test_cpu = np.minimum(100, (test_traffic / (BASE_PODS * 15)) * 100)
test_latency = 50 + test_cpu * 0.8 + np.where(test_cpu > 70, (test_cpu - 70) * 2, 0)

test_df = pd.DataFrame({
    "request_rate": test_traffic,
    "payload_size_kb": np.random.uniform(50, 500, size=len(test_traffic)),
    "queue_length": 0,
    "cpu_used_pct": test_cpu,
    "memory_used_pct": test_cpu * 0.8,
    "latency_ms": test_latency,
    "active_pods": BASE_PODS,
    "rr_pod_index": np.arange(len(test_traffic)) % BASE_PODS,
})

# Rolling features must be computed exactly as in the feature-engineering
# step: a trailing mean over `window` rows and a difference versus the value
# `window` rows earlier. Hand-rolled slices here would yield window + 1 samples
# and a "value minus mean" delta, which the models were never trained on.
for window in [5, 10, 30]:
    test_df[f"req_avg_{window}s"] = test_df["request_rate"].rolling(window, min_periods=1).mean()
    test_df[f"req_delta_{window}s"] = test_df["request_rate"].diff(window).fillna(0)

# Select by name so the column order always matches what the models expect.
test_df = test_df[FEATURES]

# Predict
prob_30 = trained_models[30].predict_proba(test_df)[:, 1]
prob_60 = trained_models[60].predict_proba(test_df)[:, 1]

print("üìà Prediction at spike point (t=100):")
print(f"  Traffic: 40 ‚Üí 180 req/s")
print(f"  Spike prob (30s): {prob_30[100]:.3f}")
print(f"  Spike prob (60s): {prob_60[100]:.3f}")

# NOTE: every feature at t=90 is derived from traffic up to t=90, which is a
# flat 40 req/s, so a high probability here is not guaranteed.
print("\nüìà Prediction BEFORE spike (t=90):")
print(f"  Traffic: still at 40 req/s")
print(f"  Spike prob (30s): {prob_30[90]:.3f} (should be HIGH if model learned!)")
print(f"  Spike prob (60s): {prob_60[90]:.3f}")

if prob_30[90] > 0.3:
    print("\n‚úÖ Model is detecting spike BEFORE it happens! Perfect!")
else:
    print("\n‚ö†Ô∏è  Model might need more tuning, but should still perform better!")

print("\n" + "="*60)
print("Download the .pkl files and test them! üéâ")
print("="*60)