# 🚀 AI Hacking Detection - Colab Training Pipeline

**Source**: `src/stress_test/train_pipeline.py`  
**Dataset**: 96.5 million training samples  
**GPU**: NVIDIA T4 (16GB VRAM)  

## Features
- Mixed precision training (AMP)
- Auto-checkpointing to Google Drive
- Discord webhook notifications
- Timeout warning after 11 hours, ahead of Colab's 12-hour session limit
- GPU memory monitoring

## Training Schedule
- Payload CNN: ~8-10 hours
- **Total**: ~10-12 hours

In [None]:
import torch
import sys
from datetime import datetime, timedelta

# ---- GPU verification ----
# Confirms a CUDA device is attached, reports its name and memory, and picks
# the batch-size multiplier used later when the training config is built.
print('='*60)
print('🚀 GPU Verification')
print('='*60)

# Training is GPU-only: stop this cell right away when CUDA is missing.
if not torch.cuda.is_available():
    print('❌ NO GPU! Go to Runtime → Change runtime type → GPU')
    sys.exit(1)

gpu_name = torch.cuda.get_device_name(0)
# total_memory is reported in bytes; convert to decimal gigabytes.
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9

print(f'✅ GPU: {gpu_name}')
print(f'💾 VRAM: {gpu_memory:.1f} GB')

# (substring of the device name, multiplier) pairs, checked in order; the
# first match wins and unrecognised GPUs fall back to 2x.
_MULTIPLIER_BY_GPU = (('T4', 4), ('V100', 8), ('A100', 12))
BATCH_MULTIPLIER = next(
    (factor for tag, factor in _MULTIPLIER_BY_GPU if tag in gpu_name),
    2,
)

print(f'🔧 Batch multiplier: {BATCH_MULTIPLIER}x')
# Wall-clock markers for the 12-hour session window.
print(f'⏰ Session start: {datetime.now().strftime("%H:%M:%S")}')
print(f'⏱️  Timeout at: {(datetime.now() + timedelta(hours=12)).strftime("%H:%M:%S")}')

In [None]:
# Notebook shell escape: quietly install the mmh3 package into the runtime.
# NOTE(review): no cell visible in this notebook imports mmh3 — presumably it
# is needed by code outside this file; confirm before removing.
!pip install -q mmh3
print('✅ Dependencies installed')

In [None]:
from google.colab import drive
from pathlib import Path
import os

# Mount Google Drive; force_remount=True remounts even if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Every input and output of the pipeline lives under one Drive folder.
project_root = Path('/content/drive/MyDrive/AI-hacking-detection')
datasets_dir = project_root / 'datasets'
models_dir = project_root / 'models'
checkpoints_dir = project_root / 'checkpoints'
results_dir = project_root / 'results'

# Output folders are created on demand; the datasets folder is only checked.
for out_dir in (models_dir, checkpoints_dir, results_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Report what is available so a missing upload is noticed before training.
if datasets_dir.exists():
    print(f'✅ Project root: {project_root}')
    print(f'📊 Datasets found:')
    for entry in datasets_dir.iterdir():
        if not entry.is_file():
            # Sub-directories are listed by name only.
            print(f'   {entry.name}/')
            continue
        size_mb = entry.stat().st_size / 1e6
        print(f'   {entry.name}: {size_mb:.1f} MB')
else:
    print(f'❌ ERROR: {datasets_dir} not found!')
    print('Upload datasets to Google Drive first!')

In [None]:
import os
import requests
from datetime import datetime, timezone

# SECURITY NOTE(review): a Discord webhook URL is a credential — anyone who can
# read it can post to the channel. Prefer supplying it via the
# DISCORD_WEBHOOK_URL environment variable; the literal below is kept only as a
# backward-compatible fallback and should be rotated and removed.
WEBHOOK_URL = os.environ.get(
    'DISCORD_WEBHOOK_URL',
    'https://discord.com/api/webhooks/1452715933398466782/Ajftu5_fHelFqifTRcZN3S7fCDddXPs89p9w8dTHX8pF1xUO59ckac_DyCTQsRKC1H8O',
)

# Discord rejects embeds whose description is longer than 4096 characters.
_DISCORD_DESCRIPTION_LIMIT = 4096


def send_discord_update(message, color=0x3498db):
    """Send a status update to Discord as a single embed.

    Delivery is best-effort: any failure is printed and swallowed so that a
    notification problem can never abort a training run.

    Args:
        message: Text shown as the embed description (truncated to Discord's
            4096-character limit).
        color: Integer RGB colour of the embed's side bar.

    Returns:
        True when Discord accepted the request, False otherwise.
    """
    embed = {
        'title': '🤖 Colab Training',
        'description': str(message)[:_DISCORD_DESCRIPTION_LIMIT],
        'color': color,
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
        # produces a naive value with no offset.
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }
    try:
        response = requests.post(WEBHOOK_URL, json={'embeds': [embed]}, timeout=10)
        # requests does not raise on 4xx/5xx by itself; surface rejected posts.
        response.raise_for_status()
    except Exception as e:  # deliberate catch-all: notifications must never crash training
        print(f'Discord error: {e}')
        return False
    return True


# Test webhook
if send_discord_update(f'🚀 Training pipeline starting\n💾 GPU: {gpu_name}\n⏰ {datetime.now().strftime("%Y-%m-%d %H:%M")}'):
    print('✅ Discord webhook configured')
else:
    print('⚠️ Discord webhook test failed - notifications may not be delivered')

In [None]:
import torch.nn as nn

class Config:
    """Hyper-parameters and file locations for the PayloadCNN training run.

    All values are plain class attributes; paths are built from the Drive
    directories defined in the mount cell.
    """

    # --- Input data: each entry is a file or a directory under datasets/ ---
    benign_paths = [
        datasets_dir / 'benign_5m.jsonl',
        datasets_dir / 'curated_benign',
    ]
    malicious_paths = [
        datasets_dir / 'security_payloads',
    ]

    # --- Output artefacts ---
    model_save_path = models_dir / 'payload_cnn.pt'
    checkpoint_path = checkpoints_dir / 'checkpoint.pt'

    # --- Optimisation ---
    # Base batch of 256 scaled by the per-GPU multiplier chosen earlier.
    batch_size = BATCH_MULTIPLIER * 256
    epochs = 10
    learning_rate = 1e-3
    weight_decay = 1e-5
    num_workers = 2
    val_split = 0.1

    # --- Checkpointing / early stopping ---
    checkpoint_every = 2  # in epochs
    patience = 3          # epochs without val-loss improvement before stopping


CONFIG = Config()
print(f'🔧 Batch size: {CONFIG.batch_size}')
print(f'🔧 Epochs: {CONFIG.epochs}')

In [None]:
import torch.nn.functional as F

class PayloadCNN(nn.Module):
    """Character-level CNN for malicious payload detection.

    Pipeline: token embedding -> three stacked 1-D convolutions (kernel sizes
    3, 5 and 7 with length-preserving padding) -> global max pool over the
    sequence -> two-layer classifier head producing one logit per sample.
    With the default arguments the model has 1,635,649 parameters (~1.6M).

    Args:
        vocab_size: Number of distinct token ids; id 0 is the padding token.
        embed_dim: Width of each token embedding.
        num_filters: Channels of the second and third convolution; the first
            convolution uses ``num_filters // 2``.
        max_len: Not used by any layer (global pooling makes the network
            independent of sequence length). Kept so existing callers that
            pass it keep working.
        hidden_dim: Width of the hidden classifier layer.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, vocab_size=256, embed_dim=192, num_filters=384, max_len=500,
                 hidden_dim=192, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(embed_dim, num_filters // 2, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(num_filters // 2, num_filters, kernel_size=5, padding=2)
        self.conv3 = nn.Conv1d(num_filters, num_filters, kernel_size=7, padding=3)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(num_filters, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one raw logit per sample.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch,)`` holding un-normalised logits
            (apply a sigmoid to obtain probabilities).
        """
        x = self.embedding(x)          # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)         # Conv1d expects channels before length
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x).squeeze(-1)   # global max over the sequence -> (batch, num_filters)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x).squeeze(-1)

print('✅ PayloadCNN defined')

In [None]:
import json
import random
from torch.utils.data import IterableDataset, DataLoader
from typing import Iterator, List, Tuple

class StreamingPayloadDataset(IterableDataset):
    """Stream labelled text samples from files without loading them all in RAM.

    Yields ``(text, label)`` tuples where label 0 marks a sample read from
    ``benign_paths`` and label 1 a sample read from ``malicious_paths``.
    Samples are read into two bounded buffers, shuffled inside each buffer and
    interleaved at random, so shuffling is local (within ``buffer_size``),
    not global.

    Args:
        benign_paths: Files or directories with benign samples.
        malicious_paths: Files or directories with malicious samples.
        max_length: Every yielded text is truncated to this many characters.
        buffer_size: Maximum number of samples held per class between refills.

    Notes:
        * Paths that are neither a file nor a directory are skipped silently.
        * When iterated inside DataLoader worker processes, each worker takes
          every ``num_workers``-th sample of each stream, so the workers
          together cover the data once instead of each replaying all of it.
    """

    def __init__(self, benign_paths, malicious_paths, max_length=512, buffer_size=10000):
        self.benign_paths = [Path(p) for p in benign_paths]
        self.malicious_paths = [Path(p) for p in malicious_paths]
        self.max_length = max_length
        self.buffer_size = buffer_size

    def _stream_file(self, path: Path) -> Iterator[str]:
        """Yield the truncated samples of one file, one per non-blank line.

        ``.jsonl`` files are parsed per line and the ``text`` field (or
        ``payload`` when ``text`` is absent) is used; every other file is
        treated as plain text. Malformed or non-string JSONL records are
        skipped.
        """
        is_jsonl = path.suffix == '.jsonl'
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if not is_jsonl:
                    yield line[:self.max_length]
                    continue
                # Keep the try body to the parse only: a bare except around the
                # yield would swallow GeneratorExit/KeyboardInterrupt.
                try:
                    data = json.loads(line)
                except (ValueError, RecursionError):
                    continue
                if not isinstance(data, dict):
                    continue
                text = data.get('text', data.get('payload', ''))
                if isinstance(text, str) and text:
                    yield text[:self.max_length]

    def _stream_paths(self, paths) -> Iterator[str]:
        """Yield samples from every file in ``paths``, recursing into directories."""
        for path in paths:
            if path.is_file():
                yield from self._stream_file(path)
            elif path.is_dir():
                # Sorted so every worker process walks the files in the same
                # order, which the per-worker sharding relies on.
                for f in sorted(path.rglob('*')):
                    if f.suffix in ('.txt', '.jsonl') and f.is_file():
                        yield from self._stream_file(f)

    @staticmethod
    def _shard_for_worker(samples: Iterator[str]) -> Iterator[str]:
        """Restrict ``samples`` to this DataLoader worker's round-robin share."""
        from torch.utils.data import get_worker_info

        info = get_worker_info()
        if info is None or info.num_workers <= 1:
            return samples  # single-process loading: keep everything
        return (
            sample for index, sample in enumerate(samples)
            if index % info.num_workers == info.id
        )

    def _refill(self, buffer: List[str], source: Iterator[str]) -> None:
        """Top ``buffer`` up to ``buffer_size`` items from ``source``."""
        if len(buffer) >= self.buffer_size:
            return
        # An exhausted generator simply yields nothing, so no "done" flag is needed.
        for sample in source:
            buffer.append(sample)
            if len(buffer) >= self.buffer_size:
                break

    def __iter__(self) -> Iterator[Tuple[str, int]]:
        benign_iter = self._shard_for_worker(self._stream_paths(self.benign_paths))
        malicious_iter = self._shard_for_worker(self._stream_paths(self.malicious_paths))
        benign_buffer: List[str] = []
        malicious_buffer: List[str] = []

        while True:
            self._refill(benign_buffer, benign_iter)
            self._refill(malicious_buffer, malicious_iter)
            if not benign_buffer and not malicious_buffer:
                return  # both streams exhausted

            random.shuffle(benign_buffer)
            random.shuffle(malicious_buffer)

            # Drain both buffers completely before refilling: a coin flip
            # picks the class while both have samples; once one runs dry the
            # other supplies every remaining sample.
            while benign_buffer or malicious_buffer:
                if random.random() < 0.5 and benign_buffer:
                    yield (benign_buffer.pop(), 0)
                elif malicious_buffer:
                    yield (malicious_buffer.pop(), 1)
                else:
                    yield (benign_buffer.pop(), 0)

def encode_text(text: str, max_length: int = 512) -> List[int]:
    """Turn ``text`` into a fixed-length list of integer ids.

    Each of the first ``max_length`` characters becomes its code point modulo
    256; shorter inputs are right-padded with 0 up to ``max_length``.
    """
    codes = [ord(ch) % 256 for ch in text[:max_length]]
    pad_count = max_length - len(codes)
    return codes + [0] * pad_count

def collate_fn(batch):
    """Collate ``(text, label)`` pairs into a pair of batch tensors.

    Returns:
        A ``torch.long`` tensor with one ``encode_text`` row per sample and a
        ``torch.float32`` tensor holding the labels.
    """
    texts, labels = zip(*batch)
    token_ids = torch.tensor([encode_text(sample) for sample in texts], dtype=torch.long)
    targets = torch.tensor(labels, dtype=torch.float32)
    return token_ids, targets

print('✅ StreamingDataset defined')

In [None]:
import time
from torch.cuda.amp import autocast, GradScaler
from tqdm.notebook import tqdm

class EnhancedTrainer:
    """Mixed-precision (AMP) trainer with checkpointing and Discord notifications.

    The trainer optimises a single-logit binary classifier with
    ``BCEWithLogitsLoss``, tracks per-epoch loss/accuracy in ``history`` and
    remembers the best validation loss seen so far.

    Args:
        model: Network returning one logit per sample; moved to ``device``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        device: Device that the model and every batch are moved to.
    """

    def __init__(self, model, optimizer, device='cuda'):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device
        self.criterion = nn.BCEWithLogitsLoss()
        self.scaler = GradScaler()

        # The timeout clock starts when the trainer is created.
        self.start_time = time.time()
        self.epoch_times = []
        self.best_val_loss = float('inf')
        self.best_val_acc = 0.0
        self.history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    @staticmethod
    def _epoch_metrics(total_loss, num_batches, correct, total):
        """Return ``(mean batch loss, accuracy)`` for one pass over a loader.

        An empty pass yields ``(nan, 0.0)`` instead of dividing by zero; NaN
        never compares as an improvement, so it cannot become the "best" loss.
        """
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        accuracy = correct / total if total else 0.0
        return mean_loss, accuracy

    def train_epoch(self, dataloader, epoch):
        """Run one optimisation pass and return ``(mean loss, accuracy)``.

        Args:
            dataloader: Iterable of ``(inputs, targets)`` batches.
            epoch: Zero-based epoch index, used only for the progress label.
        """
        self.model.train()
        total_loss, correct, total = 0.0, 0, 0
        # Batches are counted by hand: a DataLoader over an IterableDataset
        # has no len(), so len(dataloader) would raise TypeError.
        num_batches = 0

        pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}')
        for inputs, targets in pbar:
            inputs, targets = inputs.to(self.device), targets.to(self.device)

            self.optimizer.zero_grad()
            with autocast():
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)

            # Scaled backward/step/update is the standard AMP sequence.
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            preds = (torch.sigmoid(outputs) > 0.5).float()
            correct += (preds == targets).sum().item()
            total += targets.size(0)

            pbar.set_postfix({
                'loss': f'{batch_loss:.4f}',
                'acc': f'{100*correct/total:.1f}%',
                'gpu': f'{torch.cuda.memory_allocated()/1e9:.1f}GB'
            })

        return self._epoch_metrics(total_loss, num_batches, correct, total)

    @torch.no_grad()
    def validate(self, dataloader):
        """Evaluate without gradient tracking and return ``(mean loss, accuracy)``."""
        self.model.eval()
        total_loss, correct, total = 0.0, 0, 0
        num_batches = 0  # see train_epoch: len(dataloader) is not available

        for inputs, targets in tqdm(dataloader, desc='Validating'):
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            with autocast():
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)
            total_loss += loss.item()
            num_batches += 1
            preds = (torch.sigmoid(outputs) > 0.5).float()
            correct += (preds == targets).sum().item()
            total += targets.size(0)

        return self._epoch_metrics(total_loss, num_batches, correct, total)

    def save_checkpoint(self, epoch, is_best=False):
        """Write the full training state to ``checkpoint_epoch_{epoch}.pt``.

        Args:
            epoch: Number used in the checkpoint file name and stored inside it.
            is_best: When true, the same state is also written to
                ``best_model.pt`` and a Discord notification is sent.
        """
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scaler_state_dict': self.scaler.state_dict(),
            'best_val_loss': self.best_val_loss,
            'best_val_acc': self.best_val_acc,
            'history': self.history,
        }
        torch.save(checkpoint, checkpoints_dir / f'checkpoint_epoch_{epoch}.pt')
        if is_best:
            torch.save(checkpoint, checkpoints_dir / 'best_model.pt')
            send_discord_update(f'💾 New best model! Val Acc: {self.best_val_acc*100:.2f}%', color=0x2ecc71)

    def check_timeout(self, limit_hours=11):
        """Return True (and notify Discord) once ``limit_hours`` have elapsed.

        Elapsed time is measured from trainer construction.
        """
        elapsed_hours = (time.time() - self.start_time) / 3600
        if elapsed_hours > limit_hours:
            send_discord_update('⚠️ ALERT: Approaching 12-hour limit! Saving checkpoint...', color=0xf39c12)
            return True
        return False

    def train(self, train_loader, val_loader, epochs, patience=3):
        """Train for up to ``epochs`` epochs with early stopping.

        After every epoch the model is validated, progress is printed and
        posted to Discord, and a checkpoint is written every
        ``CONFIG.checkpoint_every`` epochs or whenever validation loss
        improves. Training stops early after ``patience`` epochs without
        improvement, or when ``check_timeout`` reports the session is about
        to expire.

        Returns:
            ``(model, history)`` — the trained model and the metric history.
        """
        patience_counter = 0

        for epoch in range(epochs):
            epoch_start = time.time()

            # Train
            train_loss, train_acc = self.train_epoch(train_loader, epoch)
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)

            # Validate
            val_loss, val_acc = self.validate(val_loader)
            self.history['val_loss'].append(val_loss)
            self.history['val_acc'].append(val_acc)

            # Timing: ETA is the mean epoch time times the epochs left.
            epoch_time = time.time() - epoch_start
            self.epoch_times.append(epoch_time)
            avg_time = sum(self.epoch_times) / len(self.epoch_times)
            eta_hours = (avg_time * (epochs - epoch - 1)) / 3600

            print(f'Epoch {epoch+1}: Train Loss={train_loss:.4f}, Acc={train_acc*100:.2f}% | Val Loss={val_loss:.4f}, Acc={val_acc*100:.2f}%')
            print(f'⏱️  {epoch_time/60:.1f}m | ETA: {eta_hours:.1f}h')

            # Discord update
            send_discord_update(f'✅ Epoch {epoch+1}/{epochs}\nTrain: {train_acc*100:.2f}% | Val: {val_acc*100:.2f}%\nETA: {eta_hours:.1f}h')

            # Best model check
            is_best = val_loss < self.best_val_loss
            if is_best:
                self.best_val_loss = val_loss
                self.best_val_acc = val_acc
                patience_counter = 0
            else:
                patience_counter += 1

            # Checkpoint
            saved_this_epoch = False
            if (epoch + 1) % CONFIG.checkpoint_every == 0 or is_best:
                self.save_checkpoint(epoch + 1, is_best)
                saved_this_epoch = True

            # Early stopping
            if patience_counter >= patience:
                send_discord_update(f'⏹️ Early stopping at epoch {epoch+1}', color=0xf39c12)
                break

            # Timeout check
            if self.check_timeout():
                # Persist the latest state, but never as "best": forcing
                # is_best here would overwrite best_model.pt with weights that
                # did not improve validation loss.
                if not saved_this_epoch:
                    self.save_checkpoint(epoch + 1, is_best=False)
                break

        return self.model, self.history

print('✅ EnhancedTrainer defined')

In [None]:
import zlib

print('📊 Loading data...')
send_discord_update('📊 Loading training data...')

dataset = StreamingPayloadDataset(
    benign_paths=CONFIG.benign_paths,
    malicious_paths=CONFIG.malicious_paths,
    max_length=512,
    buffer_size=10000,
)


class _HoldoutSplit(IterableDataset):
    """One side (train or validation) of a deterministic split of a stream.

    Each ``(text, label)`` sample is assigned to a side by a CRC32 hash of its
    text, so the split needs no index, is identical in every epoch and worker,
    and sends duplicate texts to the same side (no train/validation leakage).

    Args:
        source: Iterable dataset yielding ``(text, label)`` tuples.
        val_fraction: Approximate share of samples routed to validation.
        validation: True to yield the validation side, False for training.
    """

    _BUCKETS = 10_000

    def __init__(self, source, val_fraction, validation):
        self.source = source
        self.threshold = round(val_fraction * self._BUCKETS)
        self.validation = validation

    def __iter__(self):
        for text, label in self.source:
            # surrogatepass: JSON escapes can yield lone surrogates that plain
            # UTF-8 encoding would reject.
            bucket = zlib.crc32(text.encode('utf-8', 'surrogatepass')) % self._BUCKETS
            if (bucket < self.threshold) == self.validation:
                yield text, label


_loader_kwargs = dict(
    batch_size=CONFIG.batch_size,
    collate_fn=collate_fn,
    num_workers=CONFIG.num_workers,
)
if CONFIG.num_workers > 0:
    # prefetch_factor is only valid when worker processes are used.
    _loader_kwargs['prefetch_factor'] = 2

train_loader = DataLoader(
    _HoldoutSplit(dataset, CONFIG.val_split, validation=False),
    **_loader_kwargs,
)

# Validation uses the held-out share of the same stream. Reusing train_loader
# here would score the model on the data it was trained on, making early
# stopping and best-model selection meaningless.
val_loader = DataLoader(
    _HoldoutSplit(dataset, CONFIG.val_split, validation=True),
    **_loader_kwargs,
)

print(f'✅ DataLoader ready (batch_size={CONFIG.batch_size})')
print(f'💾 GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB')
send_discord_update(f'✅ Data ready\nBatch size: {CONFIG.batch_size}')

In [None]:
print('='*60)
print('🚀 Starting Training')
print('='*60)

# Initialize
model = PayloadCNN().to('cuda')
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG.learning_rate, weight_decay=CONFIG.weight_decay)
trainer = EnhancedTrainer(model, optimizer)

num_params = sum(p.numel() for p in model.parameters())
print(f'Model parameters: {num_params:,}')
send_discord_update(f'🎯 Training started\nModel params: {num_params:,}')

# Every failure path notifies Discord and then re-raises so the cell still
# shows the original traceback.
try:
    model_trained, history = trainer.train(
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=CONFIG.epochs,
        patience=CONFIG.patience,
    )

except KeyboardInterrupt:
    send_discord_update('⏸️ Training interrupted - saving checkpoint', color=0xf39c12)
    # Save the partially trained weights as a regular checkpoint only:
    # is_best=True would overwrite best_model.pt with an unvalidated state.
    trainer.save_checkpoint(len(trainer.epoch_times), is_best=False)
    raise

except Exception as e:
    # CUDA out-of-memory surfaces as a RuntimeError whose message contains
    # "out of memory"; give a targeted hint for that case.
    if isinstance(e, RuntimeError) and 'out of memory' in str(e):
        send_discord_update(f'❌ GPU OOM! Reduce batch to {CONFIG.batch_size//2}', color=0xe74c3c)
    else:
        send_discord_update(f'❌ Error: {str(e)[:200]}', color=0xe74c3c)
    raise

else:
    send_discord_update(f'🎉 Training complete!\nBest Val Acc: {trainer.best_val_acc*100:.2f}%', color=0x2ecc71)

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime

# Plot training curves
# ---- Training curves: loss on the left, accuracy (in percent) on the right ----
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (title, train series, validation series) for each panel, left to right.
panels = (
    ('Loss', history['train_loss'], history['val_loss']),
    (
        'Accuracy (%)',
        [a*100 for a in history['train_acc']],
        [a*100 for a in history['val_acc']],
    ),
)
for ax, (title, train_series, val_series) in zip(axes, panels):
    ax.plot(train_series, label='Train')
    ax.plot(val_series, label='Val')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.legend()

plt.tight_layout()
# Write the figure to Drive before displaying it.
plt.savefig(results_dir / 'training_curves.png', dpi=150)
plt.show()

# ---- JSON summary of the run, stored next to the plot ----
total_hours = sum(trainer.epoch_times) / 3600
summary = {
    'training_date': datetime.now().isoformat(),
    'gpu': torch.cuda.get_device_name(0),
    'best_val_acc': trainer.best_val_acc,
    'best_val_loss': trainer.best_val_loss,
    'total_epochs': len(trainer.epoch_times),
    'total_time_hours': total_hours,
    'history': history,
}

with open(results_dir / 'training_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f'\n📊 Training Summary')
print(f'Best Val Accuracy: {trainer.best_val_acc*100:.2f}%')
print(f'Total Time: {total_hours:.2f} hours')
print(f'Saved to: {results_dir}')

In [None]:
# Save final model: the same state dict is written under both the .pt and
# .pth extensions (identical content, two file names).
final_state = model_trained.state_dict()
torch.save(final_state, CONFIG.model_save_path)
torch.save(final_state, CONFIG.model_save_path.with_suffix('.pth'))

print(f'✅ Model saved to {CONFIG.model_save_path}')
print(f'✅ Model saved to {CONFIG.model_save_path.with_suffix(".pth")}')

# Verify: reload the file into a fresh CPU model. map_location='cpu' keeps the
# check from restoring the tensors onto the GPU first, so it also works when
# no CUDA device is attached.
test_model = PayloadCNN()
test_model.load_state_dict(torch.load(CONFIG.model_save_path, map_location='cpu'))
print('✅ Model verification passed')

send_discord_update(f'💾 Final model saved to Google Drive\n{CONFIG.model_save_path}', color=0x2ecc71)

## ✅ Training Complete!

### Files Saved to Google Drive:
- `models/payload_cnn.pt` - Final model weights
- `models/payload_cnn.pth` - Final model weights (same content, `.pth` extension)
- `checkpoints/best_model.pt` - Best checkpoint
- `results/training_curves.png` - Loss/accuracy plots
- `results/training_summary.json` - Training metrics

### Next Steps:
1. Download models from Google Drive
2. Run stress test: `python scripts/stress_test.py`
3. Deploy to production

### Troubleshooting:
- **OOM Error**: Reduce `BATCH_MULTIPLIER` in the GPU verification cell (the first code cell), then re-run the config cell
- **Slow Training**: Check GPU is T4/V100/A100
- **Disconnected**: Resume manually from the latest `checkpoint_epoch_N.pt` (or `best_model.pt`) in `checkpoints/`; this notebook has no automatic resume cell