In [1]:
import torch
import numpy as np
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformer_lens import HookedTransformer

# --- Run configuration ---
# GPT-2 Small keeps iteration fast and stable inside Codespaces.
MODEL_NAME = "gpt2-small"

# Residual-stream layer to harvest. Index 6 sits mid-stack in the 12-layer model.
TARGET_LAYER = 6

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"üöÄ Environment Setup: Device={DEVICE} | Model={MODEL_NAME} | Target Layer={TARGET_LAYER}")

  from .autonotebook import tqdm as notebook_tqdm


üöÄ Environment Setup: Device=cpu | Model=gpt2-small | Target Layer=6


In [7]:
# 1. Load the Hooked Transformer
# TransformerLens was built for GPT-2, so this is the native happy path.
try:
    print(f"Loading {MODEL_NAME}... (Fast load)")
    model = HookedTransformer.from_pretrained(
        MODEL_NAME,
        device=DEVICE
    )
    print("‚úÖ Model Loaded Successfully")
except Exception:
    print(f"‚ö†Ô∏è Error loading {MODEL_NAME}.")
    # Bare `raise` re-raises the active exception with its traceback intact.
    raise

# 2. Load the Calibration Dataset
# Prefer the Codespaces absolute path (unchanged behaviour there). When that
# location does not exist, fall back to the repo-relative path that mirrors the
# '../data/processed' layout used when saving, so the notebook also runs from
# a checkout outside /workspaces (assuming the kernel starts in notebooks/).
_CODESPACES_DATA_RAW = Path('/workspaces/Latent-Space-Firewall/data/raw/ground_truth_dataset.json')
_RELATIVE_DATA_RAW = Path('../data/raw/ground_truth_dataset.json')
DATA_RAW = _CODESPACES_DATA_RAW if _CODESPACES_DATA_RAW.exists() else _RELATIVE_DATA_RAW

if not DATA_RAW.exists():
    # Defensive programming: Ensure the user actually ran the previous step
    raise FileNotFoundError(f"Could not find {DATA_RAW}. Please run 'python -m src.data_loader' first.")

# JSON is UTF-8 by specification; do not depend on the platform default
# encoding, which differs between operating systems and locales.
with open(DATA_RAW, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print(f"‚úÖ Loaded {len(dataset)} prompts for harvesting.")

Loading gpt2-small... (Fast load)
Loaded pretrained model gpt2-small into HookedTransformer
‚úÖ Model Loaded Successfully
‚úÖ Loaded 20 prompts for harvesting.


The Harvesting Loop (Layer 6 Hooks)

We hook `blocks.6.hook_resid_post`, the residual stream as it leaves transformer block 6 (blocks are zero-indexed). This is the same activation that the FirewallEngine will need to intercept later.

In [8]:
def get_activations(text, model, layer):
    """
    Run a forward pass and return the residual-stream vector at the final
    token position for the specified layer.

    Args:
        text: Prompt to run. The result is read from batch index 0, so this
            is intended for a single prompt.
        model: Object exposing a TransformerLens-style ``run_with_cache``
            (the notebook passes a ``HookedTransformer``).
        layer: Zero-based block index whose ``hook_resid_post`` is read.

    Returns:
        NumPy array holding the activation at the last position of the
        sequence (one value per residual-stream dimension).
    """
    # Residual stream after block `layer` (TransformerLens hook naming).
    hook_name = f"blocks.{layer}.hook_resid_post"

    # Inference only: no gradients needed.
    with torch.no_grad():
        _, cache = model.run_with_cache(
            text,
            # Cache just the one activation we read below.
            names_filter=hook_name,
            # Nothing after block `layer` is used, so skip the remaining
            # blocks and the unembedding instead of computing and discarding them.
            stop_at_layer=layer + 1,
        )

    # cache entry is indexed [batch, pos, d_model]; take batch 0, last position.
    return cache[hook_name][0, -1, :].cpu().numpy()

# Parallel accumulators: position i in each list describes the same prompt.
activations, labels, categories = [], [], []

print(f"üß† Harvesting Latent Vectors from Layer {TARGET_LAYER}...")

for record in tqdm(dataset):
    # Fields are read outside the try block, so a record missing a key still
    # raises instead of being reported as a harvesting failure.
    # label convention: 0 = Safe, 1 = Harmful
    prompt, label, category = record['text'], record['label'], record['category']

    try:
        resid_vec = get_activations(prompt, model, TARGET_LAYER)
    except Exception as e:
        # Best-effort: report the failing prompt and continue with the rest.
        print(f"‚ùå Failed on prompt: '{prompt[:20]}...' | Error: {e}")
    else:
        # Append to all three lists together so they stay index-aligned.
        activations.append(resid_vec)
        labels.append(label)
        categories.append(category)

# Materialise as NumPy arrays (activations stack to [N, 768] for GPT-2).
X = np.asarray(activations)
y = np.asarray(labels)
meta = np.asarray(categories)

print(f"\n‚ú® Harvesting Complete. Tensor Shape: {X.shape} (Expected: [N, 768])")

üß† Harvesting Latent Vectors from Layer 6...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.08it/s]


‚ú® Harvesting Complete. Tensor Shape: (20, 768) (Expected: [N, 768])





In [9]:
# Persist the harvested arrays for the Phase 2 classifier.
OUTPUT_DIR = Path('../data/processed')
OUTPUT_PATH = OUTPUT_DIR / "activations.npz"

# Create the target directory (and any missing parents) before writing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Keyword names become the array keys inside the .npz archive.
np.savez(OUTPUT_PATH, activations=X, labels=y, categories=meta)

print(f"üíæ Artifact Saved: {OUTPUT_PATH}")
print("Next Step: Create 'notebooks/02_train_probe.ipynb' to build the Conformal Predictor.")

üíæ Artifact Saved: ../data/processed/activations.npz
Next Step: Create 'notebooks/02_train_probe.ipynb' to build the Conformal Predictor.
