In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np

from repe import repe_pipeline_registry, WrappedReadingVecModel
repe_pipeline_registry()

from utils import literary_openings_dataset, quotes_dataset, quote_completion_test, historical_year_test, extract_year, eval_completions

In [4]:
model_name_or_path = "meta-llama/Meta-Llama-3-8B"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto", token=True).eval()
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=use_fast_tokenizer, padding_side="left", legacy=False, token=True)
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
tokenizer.bos_token_id = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Reading

In [5]:
rep_token = -1
hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1))
n_difference = 1
direction_method = 'pca'
rep_reading_pipeline =  pipeline("rep-reading", model=model, tokenizer=tokenizer)

Device set to use mps


In [6]:
data_dir = "../../data/memorization"
lit_train_data, lit_train_labels, _ = literary_openings_dataset(data_dir)
quote_train_data, quote_train_labels, _ = quotes_dataset(data_dir)

In [7]:
import repe.rep_readers

def safe_project_onto_direction(H, direction):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Convert to float32 tensors and move to the same device
    H = torch.tensor(H, dtype=torch.float32, device=device) if not isinstance(H, torch.Tensor) else H.to(device=device, dtype=torch.float32)
    direction = torch.tensor(direction, dtype=torch.float32, device=device) if not isinstance(direction, torch.Tensor) else direction.to(device=device, dtype=torch.float32)

    projection = torch.matmul(H, direction) / torch.norm(direction)
    return projection

# Monkey-patch the unsafe function
repe.rep_readers.project_onto_direction = safe_project_onto_direction

lit_rep_reader = rep_reading_pipeline.get_directions(
    lit_train_data, 
    rep_token=rep_token, 
    hidden_layers=hidden_layers, 
    n_difference=n_difference, 
    train_labels=lit_train_labels, 
    direction_method=direction_method,
)

quote_rep_reader = rep_reading_pipeline.get_directions(
    quote_train_data, 
    rep_token=rep_token, 
    hidden_layers=hidden_layers, 
    n_difference=n_difference, 
    train_labels=quote_train_labels, 
    direction_method=direction_method,
)

## Quote Completions Control

In [8]:
# Early layers work
layer_id = list(range(-1,-9,-1))

block_name="decoder_block"
control_method="reading_vec"
batch_size=64
coeff=2.0 # tune this parameter
max_new_tokens=16

### We do manually instead of rep_control_pipeline here as an example
wrapped_model = WrappedReadingVecModel(model, tokenizer)
wrapped_model.unwrap()
# wrap model at desired layers and blocks
wrapped_model.wrap_block(layer_id, block_name=block_name)
inputs, targets = quote_completion_test(data_dir)

In [9]:
print(len(wrapped_model.model.model.layers))

32


In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def apply_activations(wrapped_model, inputs, activations, batch_size, use_tqdm, tokenizer, **generation_kwargs):
    generated = []
    iterator = range(0, len(inputs), batch_size)
    
    if use_tqdm:
        from tqdm import tqdm
        iterator = tqdm(iterator)
    
    for i in iterator:
        inputs_b = inputs[i:i+batch_size]
        
        tokenized_inputs = tokenizer(inputs_b, return_tensors="pt", padding=True, truncation=True)
        input_ids = tokenized_inputs["input_ids"].to(wrapped_model.model.device)
        attention_mask = tokenized_inputs.get("attention_mask", None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(wrapped_model.model.device)
        
        outputs = wrapped_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            **generation_kwargs
        )
        
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        
        decoded_outputs = [o.replace(inp, "", 1) for o, inp in zip(decoded_outputs, inputs_b)]
        generated.extend(decoded_outputs)
    
    return generated



In [15]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    padding_side="left" 
)
tokenizer.pad_token = tokenizer.eos_token

# This is to offload processing to CPU's for Mac
if torch.backends.mps.is_available():
    print("MPS device detected. Using CPU fallback for compatibility.")
    # Set environment variable to avoid MPS issues
    import os
    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
    # Alternative: move model to CPU
    device = 'cpu'
else:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

if hasattr(wrapped_model, 'model'):
    wrapped_model.model = wrapped_model.model.to(device)

for t, rep_reader in zip(['literature openings', 'quotes'], [lit_rep_reader, quote_rep_reader]):
    activations = {}
    for layer in layer_id:
        # Use consistent dtype and device
        direction_tensor = torch.tensor(
            0 * coeff * rep_reader.directions[layer] * rep_reader.direction_signs[layer], 
            dtype=torch.float32
        ).to(device)
        activations[layer] = direction_tensor.half() if device != 'cpu' else direction_tensor
        
    print("RepReader:", t)
    print("No Control")
    baseline_outputs = apply_activations(
        wrapped_model,
        inputs, 
        activations,
        batch_size=64,
        max_new_tokens=max_new_tokens, 
        use_tqdm=False,
        tokenizer=tokenizer
    )
    
    print(eval_completions(baseline_outputs, targets))
    
    # + Memorization
    activations = {}
    for layer in layer_id:
        direction_tensor = torch.tensor(
            coeff * rep_reader.directions[layer] * rep_reader.direction_signs[layer], 
            dtype=torch.float32
        ).to(device)
        activations[layer] = direction_tensor.half() if device != 'cpu' else direction_tensor
        
    print("+ Memorization")
    pos_outputs = apply_activations(
        wrapped_model,
        inputs, 
        activations,
        batch_size=64,
        max_new_tokens=max_new_tokens, 
        use_tqdm=False,
        tokenizer=tokenizer  
    )
    
    print(eval_completions(pos_outputs, targets))
    
    activations = {}
    for layer in layer_id:
        direction_tensor = torch.tensor(
            -coeff * rep_reader.directions[layer] * rep_reader.direction_signs[layer], 
            dtype=torch.float32
        ).to(device)
        activations[layer] = direction_tensor.half() if device != 'cpu' else direction_tensor
        
    print("- Memorization")
    neg_outputs = apply_activations(
        wrapped_model,
        inputs, 
        activations,
        batch_size=64,
        max_new_tokens=max_new_tokens, 
        use_tqdm=False,
        tokenizer=tokenizer  # Add missing tokenizer parameter
    )
    print(eval_completions(neg_outputs, targets))

MPS device detected. Using CPU fallback for compatibility.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RepReader: literature openings
No Control
{'em': 0.7864077669902912, 'sim': 0.9123673641565934}
+ Memorization
{'em': 0.7961165048543689, 'sim': 0.9085250688554014}
- Memorization
{'em': 0.7766990291262136, 'sim': 0.9094798905612196}
RepReader: quotes
No Control
{'em': 0.7864077669902912, 'sim': 0.9073351899397026}
+ Memorization
{'em': 0.7669902912621359, 'sim': 0.9034268602873515}
- Memorization
{'em': 0.7572815533980582, 'sim': 0.8820471857014356}
