<a href="https://colab.research.google.com/github/Mudit280/stealth-build/blob/main/Training_Probes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Base Model

ModuleNotFoundError: No module named 'src'

In [3]:
# Clone the repository
!git clone https://github.com/Mudit280/stealth-build

# Add the repository to the system path
import sys
sys.path.append('stealth-build')


Cloning into 'stealth-build'...
remote: Enumerating objects: 310, done.[K
remote: Counting objects: 100% (310/310), done.[K
remote: Compressing objects: 100% (219/219), done.[K
remote: Total 310 (delta 168), reused 204 (delta 69), pack-reused 0 (from 0)[K
Receiving objects: 100% (310/310), 115.68 KiB | 1.19 MiB/s, done.
Resolving deltas: 100% (168/168), done.


In [5]:
import src.models.base_model as base_model
import src.models.gpt2_model as gpt2_model

# Train Probe - Doing a small batch test on IMDB Dataset To Understand Mechanics

In [6]:
"""
1. **Imports and Argument Parsing**
    * Import necessary libraries (transformers, datasets, torch, etc.)
    * Parse command-line arguments for flexibility (e.g., batch size, layer, pooling type)

2. **Load Dataset**
    * Load IMDb dataset using HuggingFace Datasets

3. **Load GPT-2 Model and Tokenizer**
    * Set output_hidden_states=True

4. **Extract Hidden States**
    * Tokenize and batch the dataset
    * Pass through GPT-2
    * Pool/flatten hidden states as features

5. **Train Linear Probe**
    * Use PyTorch (or optionally scikit-learn for quick prototyping)
    * Train on extracted features and labels

6. **Evaluate and Save Results**
    * Evaluate on test set
    * Print and/or save metrics
"""

import transformers
import datasets
import torch
import numpy as np
import argparse
import logging
logging.basicConfig(level=logging.INFO)

def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for flexibility (e.g., batch size, layer, pooling type)"""
    parser = argparse.ArgumentParser(description="Train a linear probe on GPT-2 activations for sentiment.")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch size for processing data")
    parser.add_argument("--probe_layer", type=int, default=-1, help="Which GPT-2 layer to extract (default: last)")
    parser.add_argument("--pooling", type=str, choices=["mean", "last"], default="mean", help="Pooling strategy")
    return parser.parse_args()

def load_imdb() -> datasets.DatasetDict:
    """Load IMDb dataset using HuggingFace Datasets"""
    dataset = datasets.load_dataset("imdb")
    logging.info("Train example: %s", dataset["train"][0])
    logging.info("Train size: %d, Test size: %d", len(dataset['train']), len(dataset['test']))
    return dataset

In [8]:
from src.models.gpt2_model import GPT2Model

In [9]:
# args = parse_args() # This is not needed in Colab
dataset = load_imdb()

# --- Quick batch extraction for sanity check ---
# We run this script from terminal
# from models.gpt2_model import GPT2Model # Not needed as it's defined in the notebook
# Take a small batch
batch_size = 32

# Exploratory/debugging info (visible only at DEBUG level)
logging.debug("Dataset keys: %s", dataset.keys())
logging.debug("First item in train: %s", dataset["train"][0])
logging.debug("Type of dataset['train']: %s", type(dataset["train"]))
logging.debug("Type of dataset['train'][:batch_size]: %s", type(dataset["train"][:batch_size]))
logging.debug("Type of dataset['train'][:batch_size]['text']: %s", type(dataset["train"][:batch_size]['text']))
logging.debug("Type of dataset['train'][:batch_size]['label']: %s", type(dataset["train"][:batch_size]['label']))

train_texts = dataset["train"]["text"][:batch_size]
train_labels = dataset["train"]["label"][:batch_size]

logging.info("Loading GPT-2 model... (this may take 10+ minutes)")

# Load GPT-2 model (on CPU for now)
model = GPT2Model(model_name="gpt2", device="cpu")
model.load_model()

logging.info("Model loaded successfully!")

# Extract mean-pooled activations from layer 7
logging.info("Extracting features from GPT-2...")
features = model.extract_features(train_texts, layer=7, pooling="mean")
logging.info("Feature extraction complete.")

# Final user-facing results
print("Features shape:", features.shape)
# shape is (batch_size, size of model hidden layer - in gpt2, this is 768)
print("First feature vector (first 10 dims):", features[0][:10])
print("First 5 labels:", train_labels[:5])

# === Mini PyTorch probe training on a single batch ===
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(42)

# Prepare data as tensors
X = torch.tensor(features, dtype=torch.float32)  # shape: (32, 768)
y = torch.tensor(train_labels, dtype=torch.long) # shape: (32,)

# Define a simple linear probe (for binary sentiment: 2 classes)
probe = nn.Linear(X.shape[1], 2)  # 768 -> 2
# Link for a visualisation of nn.Linear: https://www.google.com/url?sa=i&url=https%3A%2F%2Fwww.sharetechnote.com%2Fhtml%2FPython_PyTorch_nn_Linear_01.html&psig=AOvVaw1pct9tCSv-KGhvbPSfnqy1&ust=1753167420609000&source=images&cd=vfe&opi=89978449&ved=0CBMQjRxqFwoTCLjR6POvzY4DFQAAAAAdAAAAABAK
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(probe.parameters(), lr=0.01)

print("X shape:", X.shape, "dtype:", X.dtype)
print("y shape:", y.shape, "dtype:", y.dtype)

Features shape: (32, 768)
First feature vector (first 10 dims): [-0.54970014 -0.2438514   0.28589007 -0.83663857 -0.15804946 -0.6619847
  2.523274   -0.13884257  0.08925382 -0.00859352]
First 5 labels: [0, 0, 0, 0, 0]
X shape: torch.Size([32, 768]) dtype: torch.float32
y shape: torch.Size([32]) dtype: torch.int64


Now thinking through how to programme full training run

In [13]:
## Turn logging into prints and see waht happens
# Swithc to gpu as and when neccesary

# Track training time
import time
train_start = time.time()
print("Starting probe training...")

# Training loop
max_epochs = 2
for epoch in range(max_epochs):
    print(f"Epoch {epoch}")
    optimizer.zero_grad()
    logits = probe(X)  # shape: (32, 2)
    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0 or loss.item() < 0.1:
        print(f"Epoch {epoch}: loss = {loss.item():.4f}")
    if loss.item() < 0.1:
        print("Early stopping: loss below threshold.")
        break

train_end = time.time()
print(f"Probe training completed in {train_end - train_start:.2f} seconds.")

# Evaluate on the same batch
with torch.no_grad():
    preds = torch.argmax(probe(X), dim=1)
    accuracy = (preds == y).float().mean().item()
print(f"Probe accuracy on this batch: {accuracy*100:.1f}% (expect high, will not generalize)")

Starting probe training...
Epoch 0
Epoch 0: loss = 1.2076
Epoch 1
Epoch 1: loss = 0.0077
Early stopping: loss below threshold.
Probe training completed in 0.00 seconds.
Probe accuracy on this batch: 100.0% (expect high, will not generalize)


# Training Toxicity Probe

In [14]:
# Let's do more rigorous training for this probe. The above was to ensure the training script worked
# We'll now train on the full training set and evaluate on the test set.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- 1. Prepare Full Dataset ---
# We'll use the full IMDb training and test sets
# Note: This will be much slower than the single-batch example

# Extract features for the training set
print("Extracting features for the training set...")
train_texts_full = dataset["train"]["text"]
train_labels_full = dataset["train"]["label"]
train_features_full = model.extract_features(train_texts_full, layer=7, pooling="mean")

# Extract features for the test set
print("Extracting features for the test set...")
test_texts_full = dataset["test"]["text"]
test_labels_full = dataset["test"]["label"]
test_features_full = model.extract_features(test_texts_full, layer=7, pooling="mean")

# Convert to PyTorch Tensors
X_train = torch.tensor(train_features_full, dtype=torch.float32)
y_train = torch.tensor(train_labels_full, dtype=torch.long)
X_test = torch.tensor(test_features_full, dtype=torch.float32)
y_test = torch.tensor(test_labels_full, dtype=torch.long)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# --- 2. Define and Train the Probe ---
# Re-initialize the probe and optimizer for a fresh start
probe = nn.Linear(X_train.shape[1], 2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(probe.parameters(), lr=0.001) # Using a smaller learning rate for more stable training

# Training loop
print("Starting full probe training...")
max_epochs = 5
for epoch in range(max_epochs):
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        logits = probe(batch_X)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch}: Average Loss = {epoch_loss / len(train_loader):.4f}")

print("Probe training completed.")

# --- 3. Evaluate the Probe ---
with torch.no_grad():
    test_logits = probe(X_test)
    test_preds = torch.argmax(test_logits, dim=1)
    accuracy = (test_preds == y_test).float().mean().item()
    print(f"Probe accuracy on the full test set: {accuracy*100:.2f}%")

Extracting features for the training set...


TypeError: len() of a 0-d tensor

# Training Toxicity Ptobe

In [None]:
from datasets import load_dataset

# Load the toxicity dataset
toxicity_dataset = load_dataset("civil_comments")

# Let's see some examples
print(toxicity_dataset['train'][0])

In [None]:
# Next steps above and see if working with LLMs - a way to smartly pull base_model.py etc, so don't have to keep updating this notebook?