In [1]:
pip install torch transformers accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Down

In [2]:
pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install einops

Collecting einops
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Installing collected packages: einops
Successfully installed einops-0.8.1
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np

In [5]:
# %%
import os

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import gc
from contextlib import contextmanager
from typing import List, Dict, Optional, Callable

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

PyTorch version: 2.8.0.dev20250319+cu128
Transformers version: 4.51.3
CUDA available: True
CUDA version: 12.8
Current device: 0
Device name: NVIDIA A40


In [6]:
from dotenv import load_dotenv
import os



In [7]:
load_dotenv('hf.env')  # by default it looks for a .env file in the current dir

True

In [8]:
# @title 1.5. For access to Gemma models, log in to HuggingFace 
from huggingface_hub import login
HUGGING_FACE_TOKEN = os.getenv("HFTOKEN")
try:
     login(token=HUGGING_FACE_TOKEN)
     print("Hugging Face login successful (using provided token).")
except Exception as e:
     print(f"Hugging Face login failed. Error: {e}")

Hugging Face login successful (using provided token).


In [9]:
MODEL_ID = "google/gemma-2-9b-it" # Or "google/gemma-2-9b" if you prefer the base model
# Set to True if you have limited VRAM (e.g., < 24GB). Requires bitsandbytes
USE_4BIT_QUANTIZATION = False

# --- Steering Configuration ---
# !! IMPORTANT !! Find the correct layer name for your model.
# Example: 'model.layers[15].mlp.gate_proj' or 'model.layers[20].self_attn.o_proj'
# Use the `print(model)` output in Section 3 to find a suitable layer name.
TARGET_LAYER_NAME = 'model.layers.20' # <--- CHANGE THIS


In [10]:
STEERING_MULTIPLIER = 1.5

# --- Generation Parameters ---
MAX_NEW_TOKENS = 150
TEMPERATURE = 0.7
DO_SAMPLE = True

# --- Output ---
OUTPUT_FILE = "gemma2_steering_output.txt"

In [11]:
# %%
'''lines_that_rhyme_with_rabbit = [
    "The gardener tends his plants with daily habit",
    "When paint spills on the floor, you need to dabbit",
    "If you see something you want, just reach and grabbit",
    "The monastery's leader is the wise old abbot",
    "The metal alloy used in engines is called babbit",
    "The chef prepared a stew with fresh green cabbage",
    "The seamstress chose a silky, flowing fabric",
    "The storm that passed through town caused so much havoc",
    "The wizard cast a spell with ancient magic",
    "The rotting food attracted many a maggot",
    "The critic's harsh review was truly savage",
    "The radio produced annoying static",
    "The ancient message carved upon a tablet",
    "Their agreement to proceed remained quite tacit",
    "We sat for hours in the morning traffic",
    "The ending of the play was deeply tragic",
]'''

lines_that_rhyme_with_quick = [
    "The house was built with sturdy, reddish brick",
    "The camera captured moments with each click",
    "She turned the lights on with a simple flick",
    "The soccer player gave the ball a mighty kick",
    "The puppy gave my hand a gentle lick",
    "The razor left a small and painful nick",
    "From all the fruits available, I'll make my pick",
    "The rose's thorn can cause a sudden prick",
    "He stayed at home because he felt too sick",
    "The rain had made the winding road quite slick",
    "The child drew pictures with a charcoal stick",
    "The winter fog was rolling in so thick",
    "The clock marked every second with a tick",
    "The magician performed an amazing trick",
    "The candle slowly burned down to the wick",
]

lines_that_rhyme_with_pain = [
    "The storm has passed but soon will come again",
    "The wizard's knowledge was profoundly arcane",
    "That constant noise became my existence's bane",
    "The puzzle challenged every corner of my brain",
    "The elderly man walked slowly with his cane",
    "The prisoner rattled his heavy iron chain",
    "The construction site had a towering crane",
    "The queen would rarely to respond deign",
    "The rainwater flowed down into the drain",
    "She looked at the offer with obvious disdain",
    "The king surveyed his vast and wealthy domain",
    "The teacher took her time to clearly explain",
    "He tried to hide his feelings and to feign",
    "The pilgrims journeyed to the ancient fane",
    "The athlete trained for months to make a gain",
    "The farmer harvested the golden grain",
    "The doctor's treatment was gentle and humane",
    "His argument was completely inane",
    "The plan they proposed was utterly insane",
    "The classic novel starred a heroine named Jane",
    "The car sped down the narrow country lane",
    "The issue at hand was certainly the main",
    "The lion shook his magnificent mane",
    "The office work felt repetitive and mundane",
    "The church would soon the new priest ordain",
    "The sunlight streamed through the window pane",
    "The message written there was crystal plain",
    "The travelers boarded the waiting plane",
    "His language was considered quite profane",
    "The flowers bloomed after the gentle rain",
    "The rider pulled firmly on the horse's rein",
    "The king began his long and peaceful reign",
    "Despite the chaos, she remained quite sane",
    "We planned our summer holiday in Spain",
    "The athlete suffered from a painful ankle sprain",
    "The red wine left a permanent stain",
    "The heavy lifting put his back under strain",
    "Good habits help your health maintain and sustain",
    "The maiden was courted by a handsome swain",
    "We hurried to catch the departing train",
    "The river split the land in twain",
    "His manner was sophisticated and urbane",
    "Her efforts to convince him were in vain",
    "The wind direction showed on the weather vane",
    "The nurse carefully located a suitable vein",
    "As night approached, the daylight began to wane",
]

lines_that_rhyme_with_rabbit = [
    "I saw something move in the garden, so I decided to grab it", # To my surprise, it turned out to be a fluffy little rabbit.
    "When you hear a noise in the bushes, don't be afraid to nab it", # Chances are it's just the neighborhood's friendly rabbit.
    "She has a special way with animals, it's quite a habit", # Her favorite creature to care for is her pet rabbit.
    "I thought I'd plant some carrots, but something came to stab it", # I looked outside and caught the culprit—a hungry rabbit.
    "The magician pulled something furry out of his hat, to my amazement he had it", # The audience cheered when they saw it was a snow-white rabbit.
    "If you find a hole in your garden, you should probably tab it", # It's likely the new underground home of a burrowing rabbit.
    "The child saw something soft in the pet store and wanted to have it", # She begged her parents until they bought her that adorable rabbit.
    "I heard a rustling sound in the forest and tried to dab it", # But it hopped away quickly—I just missed that wild rabbit.
    "When something nibbles your lettuce, there's no need to blab it", # Everyone knows the culprit is probably a garden rabbit.
    "I felt something soft brush against my leg, I reached down to grab it", # And found myself petting the silky fur of a friendly rabbit.
]

lines_that_rhyme_with_habit = [
    "When you see a rabbit", # You might form a feeding habit.
    "He'd grab it if he could just nab it", # That's become his daily habit.
    "The frog sits on the lily pad, a bit", # Too long—it's turned into a habit.
    "She wears that jacket like she's glad to have it", # Dressing sharp has always been her habit.
    "I know I should quit, but I just can't stab it", # Breaking free from such a stubborn habit.
    "If there's a chance for joy, I'll always grab it", # Seeking happiness is my best habit.
    "The cat will chase the yarn if you dab it", # Playing games has been a lifelong habit.
    "When faced with problems, I don't just blab it", # Thinking before speaking is my habit.
    "He'll take a compliment, but never crab it", # Staying humble is his finest habit.
    "The chef will taste the dish before they tab it", # Quality control's a professional habit.
    "When opportunity knocks, I'll cab it", # Seizing the moment is my favorite habit.
]

In [12]:
# %%
# ## 3. Load Model and Tokenizer

# +
# Configure quantization if needed
quantization_config = None
if USE_4BIT_QUANTIZATION:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16 # Recommended for new models
    )
    print("Using 4-bit quantization.")

# Determine device and dtype
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float32 # BF16 recommended on Ampere+

print(f"Loading model: {MODEL_ID}")
print(f"Using device: {device}")
print(f"Using dtype: {dtype}")

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token # Set pad token if not present

# Load Model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    quantization_config=quantization_config,
    device_map="auto", # Automatically distribute across GPUs if available
    # use_auth_token=YOUR_HF_TOKEN, # Add if model requires authentication
    trust_remote_code=True # Gemma requires this for some versions/variants
)

print(f"Model loaded on device(s): {model.hf_device_map}")

# --- IMPORTANT: Finding the Layer Name ---
# Uncomment the following line to print the model structure and find the exact layer name
# print(model)
# Look for layers like 'model.layers[INDEX].mlp...' or 'model.layers[INDEX].self_attn...'

# Ensure model is in evaluation mode
model.eval()
# %%
# ## 4. Hooking and Activation Handling Functions

# +
# Global storage for captured activations
activation_storage = {}

def get_module_by_name(model, module_name):
    """Helper function to get a module object from its name string."""
    names = module_name.split('.')
    module = model
    for name in names:
        module = getattr(module, name)
    return module

def capture_activation_hook(module, input, output, layer_name):
    """Hook function to capture the output activation of a specific layer."""
    # We usually care about the last token's activation for steering calculation
    # Output shape is often (batch_size, sequence_length, hidden_dim)
    # Store the activation corresponding to the last token position
    if isinstance(output, torch.Tensor):
        activation_storage[layer_name] = output[:, -1, :].detach().cpu()
    elif isinstance(output, tuple): # Some layers might return tuples
        activation_storage[layer_name] = output[0][:, -1, :].detach().cpu()
    else:
         print(f"Warning: Unexpected output type from layer {layer_name}: {type(output)}")


def get_activations(model, tokenizer, prompts: List[str], layer_name: str) -> Optional[torch.Tensor]:
    """
    Runs prompts through the model and captures activations from the target layer.
    Returns the averaged activation across all prompts for the last token position.
    """
    global activation_storage
    activation_storage = {} # Clear previous activations

    target_module = get_module_by_name(model, layer_name)
    hook_handle = target_module.register_forward_hook(
        lambda module, input, output: capture_activation_hook(module, input, output, layer_name)
    )

    all_layer_activations = []
    with torch.no_grad():
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
            # We only need the forward pass, not generation here
            _ = model(**inputs)

            if layer_name in activation_storage:
                 # Assuming batch size is 1 when processing one prompt at a time
                last_token_activation = activation_storage[layer_name] # Shape (1, hidden_dim)
                all_layer_activations.append(last_token_activation)
                del activation_storage[layer_name] # Clear for next prompt
            else:
                print(f"Warning: Activation for layer {layer_name} not captured for prompt: '{prompt}'")


    hook_handle.remove() # Clean up the hook

    if not all_layer_activations:
        print(f"Error: No activations were captured for layer {layer_name}.")
        return None

    # Stack and average activations across all prompts
    # Resulting shape: (num_prompts, hidden_dim) -> (hidden_dim)
    avg_activation = torch.stack(all_layer_activations).mean(dim=0).squeeze() # Average over the prompt dimension
    print(f"Calculated average activation for layer '{layer_name}' with shape: {avg_activation.shape}")
    return avg_activation
# %%
 # --- Steering Hook during Generation ---

# Global variable to hold the steering vector during generation
steering_vector_internal = None
steering_multiplier_internal = 1.0

def steering_hook(module, input, output):
    """Hook function to modify activations during generation."""
    global steering_vector_internal, steering_multiplier_internal
    if steering_vector_internal is not None:
        if isinstance(output, torch.Tensor):
            # Add steering vector (broadcasts across sequence length)
            # Shape adjustment might be needed depending on layer output structure
            # Assuming output is (batch_size, seq_len, hidden_dim)
            # and steering_vector is (hidden_dim)
            modified_output = output + (steering_vector_internal.to(output.device, dtype=output.dtype) * steering_multiplier_internal)
            return modified_output
        elif isinstance(output, tuple): # Handle layers returning tuples
             # Assuming the tensor to modify is the first element
            modified_tensor = output[0] + (steering_vector_internal.to(output[0].device, dtype=output[0].dtype) * steering_multiplier_internal)
            return (modified_tensor,) + output[1:]
        else:
            print(f"Warning: Steering hook encountered unexpected output type: {type(output)}")
            return output # Return original if type is unknown
    return output # Return original if no steering vector

@contextmanager
def apply_steering(model, layer_name, steering_vector, multiplier):
    """Context manager to temporarily apply the steering hook."""
    global steering_vector_internal, steering_multiplier_internal

    # Ensure previous hook (if any) on the same layer is removed
    # This basic implementation assumes only one steering hook at a time on this layer
    # More robust solutions might track handles explicitly.
    
    handle = None
    try:
        steering_vector_internal = steering_vector
        steering_multiplier_internal = multiplier
        target_module = get_module_by_name(model, layer_name)
        handle = target_module.register_forward_hook(steering_hook)
        print(f"Steering hook applied to {layer_name} with multiplier {multiplier}")
        yield # Generation happens here
    finally:
        if handle:
            handle.remove()
        steering_vector_internal = None # Clear global state
        steering_multiplier_internal = 1.0
        print(f"Steering hook removed from {layer_name}")
        gc.collect() # Suggest garbage collection
        torch.cuda.empty_cache() # Clear cache if using GPU

Loading model: google/gemma-2-9b-it
Using device: cuda
Using dtype: torch.bfloat16


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Model loaded on device(s): {'': 0}


In [13]:
POSITIVE_PROMPTS = [f'A rhymed couplet:\n{line}\n' for line in lines_that_rhyme_with_quick]
NEGATIVE_PROMPTS = [f'A rhymed couplet:\n{line}\n' for line in lines_that_rhyme_with_pain]

In [14]:
GENERATION_PROMPT=f'A rhymed couplet:\n{lines_that_rhyme_with_quick[0]}\n'

In [11]:
# +
print("Calculating activations for POSITIVE prompts...")
avg_pos_activation = get_activations(model, tokenizer, POSITIVE_PROMPTS, TARGET_LAYER_NAME)

print("\nCalculating activations for NEGATIVE prompts...")
avg_neg_activation = get_activations(model, tokenizer, NEGATIVE_PROMPTS, TARGET_LAYER_NAME)

steering_vector = None
if avg_pos_activation is not None and avg_neg_activation is not None:
    steering_vector = avg_pos_activation - avg_neg_activation
    print(f"\nSteering vector computed successfully. Shape: {steering_vector.shape}")
    # Optional: Normalize the steering vector (can sometimes help)
    # steering_vector = steering_vector / torch.norm(steering_vector)
    # print("Steering vector normalized.")
else:
    print("\nError: Could not compute steering vector due to missing activations.")

# Clean up memory
del avg_pos_activation
del avg_neg_activation
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# %%



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Calculating activations for POSITIVE prompts...
Calculated average activation for layer 'model.layers.20' with shape: torch.Size([3584])

Calculating activations for NEGATIVE prompts...
Calculated average activation for layer 'model.layers.20' with shape: torch.Size([3584])

Steering vector computed successfully. Shape: torch.Size([3584])


In [13]:
torch.save(steering_vector,"steering_vector_from_quick_to_pain.pt")

In [None]:
# +
POSITIVE_PROMPTS = [f'A rhymed couplet:\n{line}\n' for line in lines_that_rhyme_with_rabbit]
NEGATIVE_PROMPTS = [f'A rhymed couplet:\n{line}\n' for line in lines_that_rhyme_with_habit]
print("Calculating activations for POSITIVE prompts...")
avg_pos_activation = get_activations(model, tokenizer, POSITIVE_PROMPTS, TARGET_LAYER_NAME)

print("\nCalculating activations for NEGATIVE prompts...")
avg_neg_activation = get_activations(model, tokenizer, NEGATIVE_PROMPTS, TARGET_LAYER_NAME)

steering_vector = None
if avg_pos_activation is not None and avg_neg_activation is not None:
    r2h_vector = avg_pos_activation - avg_neg_activation
    print(f"\nSteering vector computed successfully. Shape: {r2h_vector.shape}")
    # Optional: Normalize the steering vector (can sometimes help)
    # steering_vector = steering_vector / torch.norm(steering_vector)
    # print("Steering vector normalized.")
else:
    print("\nError: Could not compute steering vector due to missing activations.")





In [159]:
# Clean up memory
del avg_pos_activation
del avg_neg_activation
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# %%

In [160]:
torch.save(r2h_vector,"steering_vector_to_rabbit_form_habit.pt")

In [29]:
ls

hf.env                                  steering_vector_from_quick_to_pain.pt
save_and_unembed_steering_vector.ipynb


In [15]:
steering_vector = torch.load('steering_vector_from_quick_to_pain.pt')
print(steering_vector )

tensor([0.1094, 0.6250, 1.0312,  ..., 0.2266, 0.1875, 0.4473],
       dtype=torch.bfloat16)


In [16]:
steering_vector.shape

torch.Size([3584])

In [17]:
modeldim=3584

In [43]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Tuple, Union, Optional, Dict, Any

def unembed_vector(
    vector: Union[torch.Tensor, np.ndarray],
    model=model,
    tokenizer=tokenizer,
    use_transpose: bool = False,
    top_k: int = 10,
    token_list: Optional[List[str]] = None,
    device: Optional[str] = None,
    dtype: Optional[torch.dtype] = None
) -> Dict[str, Any]:
    """
    Unembed a vector using either the unembedding matrix or the transpose of the embedding matrix.
    
    Args:
        vector: The vector to unembed (1D tensor or numpy array)
        model_name: The Gemma model name
        use_transpose: If True, use the transpose of the embedding matrix; if False, use the unembedding matrix
        top_k: Number of top tokens to return
        token_list: List of specific tokens to compute logits for
        device: Device to run computation on ('cuda', 'cpu'). If None, will use CUDA if available.
        dtype: Data type to use for computation. If None, will match the model's dtype.
        
    Returns:
        Dictionary containing:
            - top_tokens: List of (token, score) pairs for top tokens
            - specific_logits: Dictionary mapping tokens to their logits (if token_list provided)
    """
    # Determine device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    
    # Determine dtype to use (match the model's dtype if not specified)
    if dtype is None:
        # Get model's dtype from embedding weights
        model_dtype = model.get_input_embeddings().weight.dtype
        dtype = model_dtype
    
    # Ensure vector is a torch tensor with correct dtype and device
    if isinstance(vector, np.ndarray):
        vector = torch.tensor(vector, dtype=dtype, device=device)
    else:
        vector = vector.to(device=device, dtype=dtype)
    
    if vector.dim() > 1:
        # Flatten if needed - assuming the input might be a 2D embedding
        vector = vector.squeeze()
    
    # Get the appropriate matrix for unembedding
    with torch.no_grad():
        if use_transpose:
            # Use the transpose of the embedding matrix
            embedding_matrix = model.get_input_embeddings().weight
            unembedding_matrix = embedding_matrix.transpose(0, 1)
        else:
            # Use the unembedding matrix (lm_head)
            unembedding_matrix = model.lm_head.weight.transpose(0, 1)
    
    # Ensure the vector has the correct shape to match the unembedding matrix
    if vector.shape[0] != unembedding_matrix.shape[0]:
        raise ValueError(f"Vector dimension ({vector.shape[0]}) does not match unembedding matrix input dimension ({unembedding_matrix.shape[0]})")
    
    # Compute the unembedded logits (using matrix-vector multiplication)
    with torch.no_grad():  # No need to track gradients for inference
        logits = torch.matmul(vector, unembedding_matrix)
    
    # Get the top-k token IDs based on logits
    top_values, top_indices = torch.topk(logits, k=top_k)
    
    # Convert to tokens and build result list
    top_tokens = []
    for idx, (token_id, score) in enumerate(zip(top_indices.tolist(), top_values.tolist())):
        token = tokenizer.decode(token_id)
        top_tokens.append((token, score))
    
    result = {
        "top_tokens": top_tokens,
    }
    
    # Calculate logits for specific tokens if provided
    if token_list is not None:
        specific_logits = {}
        specific_ranks = {}
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        token_to_rank = {idx.item(): rank for rank, idx in enumerate(sorted_indices)}

        for token in token_list:
            token_ids = tokenizer.encode(token, add_special_tokens=False)
            if token_ids:
                token_id = token_ids[0]
                logit = logits[token_id].item()
                rank = token_to_rank.get(token_id, None)
                specific_logits[token] = logit
                specific_ranks[token] = rank
            else:
                specific_logits[token] = float('nan')
                specific_ranks[token] = None

        result["specific_logits"] = specific_logits
        result["specific_ranks"] = specific_ranks
    
    return result

# Example usage
def example_usage():
    # Define device - will use CUDA if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")
    
    # This would be your vector in the embedding space
    # Here we're just creating a random vector with the correct dimensions for Gemma
    vector = torch.randn(modle)  # Assuming Gemma 2 9b has a 4096-dimensional embedding space
    
    # Unembed using the unembedding matrix
    result1 = unembed_vector(
        vector=vector,
        use_transpose=False,
        top_k=5,
        token_list=["quick", "thick", "trick"],
        device=device
    )
    
    print("Using unembedding matrix:")
    print("Top tokens:")
    for token, score in result1["top_tokens"]:
        print(f"  {token}: {score:.4f}")
    print("Specific token logits:")
    for token, logit in result1["specific_logits"].items():
        print(f"  {token}: {logit:.4f}")
    
    # Unembed using the transpose of the embedding matrix
    result2 = unembed_vector(
        vector=vector,
        use_transpose=True,
        top_k=5,
        token_list=["quick", "thick", "trick"],
        device=device
    )
    
    print("\nUsing transpose of embedding matrix:")
    print("Top tokens:")
    for token, score in result2["top_tokens"]:
        print(f"  {token}: {score:.4f}")
    print("Specific token logits:")
    for token, logit in result2["specific_logits"].items():
        print(f"  {token}: {logit:.4f}")

# For use with specific dtype
def example_with_bfloat16():
    # Using BFloat16 explicitly
    vector = torch.randn(modeldim)
    
    result = unembed_vector(
        vector=vector,
        top_k=5,
        token_list=["quick", "thick", "trick"],
        dtype=torch.bfloat16  # Explicitly use BFloat16
    )
    
    print("Results with BFloat16:")
    print("Top tokens:")
    for token, score in result["top_tokens"]:
        print(f"  {token}: {score:.4f}")

# For memory management (helpful when working with large models)
def clean_up_gpu_memory():
    """Free up GPU memory after using the model"""
    import gc
    gc.collect()
    torch.cuda.empty_cache()

In [44]:
vector = torch.randn(modeldim)
result_random = unembed_vector(
        vector=vector,
        top_k=50,
        token_list=["quick", "thick", "trick"],
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=True
    )
print(result_random)

{'top_tokens': [('windowFixed', 9.375), ('Referensi', 8.5625), ('RegressionTest', 8.25), ('setVerticalGroup', 8.1875), ('GeneratedCode', 7.78125), (' дописавши', 7.65625), ('stateProvider', 7.5), (' Dicapai', 7.5), ('évaluateur', 7.4375), ('Референце', 7.3125), ('ſelf', 7.15625), ('rungsseite', 7.15625), (' gärna', 7.09375), (' GetEnumerator', 7.0625), (' HasFactory', 7.0625), ('__::', 7.0625), (' AssemblyCulture', 7.03125), ('期刊论文', 7.03125), ('AnchorTagHelper', 7.0), ('expandindo', 7.0), (' PyTuple', 6.96875), (' recensement', 6.84375), ('ínű', 6.84375), ('Einzelnachweise', 6.8125), (' negociación', 6.6875), ('Véxase', 6.6875), ('DataBind', 6.65625), ('çalves', 6.65625), (' zumal', 6.65625), (' Réponses', 6.625), ('jaardag', 6.625), (' kaarangay', 6.53125), ('Спасылкі', 6.53125), (' CallOverrides', 6.53125), (' Ternyata', 6.5), ('parsedMessage', 6.5), ('GeoNames', 6.46875), ('urlpatterns', 6.40625), ('ondissement', 6.40625), ('AutoField', 6.375), ('IsPostBack', 6.375), ('лтемелер', 6

In [30]:
words_rhyme_quick='''sick, tick, brick, click, trick, kick, thick, pick, stick, lick, flick, slick, wick, prick, chick, nick, clique, hick, shtick, candlestick, dipstick, rhetoric, sidekick, carsick, heretic, nitpick'''.split()
words_rhyme_quick=[w.strip(',') for w in words_rhyme_quick]

words_rhyme_pain='''rain, main, train, plain, stain, brain, chain, gain, drain, Spain, lane, vain, cane, sane, bane, mane, crane, plane, strain, grain, Maine, Jane, reign, deign, feign, campaign, champagne, cocaine, contain, disdain, domain, explain, humane, inane, insane, maintain, membrane, migraine, mundane, profane, remain, restrain, retain, sustain, terrain, Ukraine, Bahrain, cellophane, entertain, hurricane, windowpane, abstain, again, arcane, complain, constrain, detain, dingbane, germane, Dane, inhumane, insane, mundane, novocaine, obtain, ordain, pertain, propane, refrain, urbane'''.split()
words_rhyme_pain=[w.strip(',') for w in words_rhyme_pain]


pain_rhymes1=[w.split()[-1] for w in lines_that_rhyme_with_pain[:23]]
pain_rhymes2=[w.split()[-1] for w in lines_that_rhyme_with_pain[23:]]

pain_rhymes_from_lines=list(set([w.split()[-1] for w in lines_that_rhyme_with_pain]))
quick_rhymes_from_lines=list(set([w.split()[-1] for w in lines_that_rhyme_with_quick]))

pain_rhymes_outside_lines=[w for w in words_rhyme_pain if w not in pain_rhymes_from_lines]
quick_rhymes_outside_lines=[w for w in words_rhyme_quick if w not in quick_rhymes_from_lines]

pain_rhymes_all=["pain"]+pain_rhymes_from_lines+pain_rhymes_outside_lines
quick_rhymes_all=["quick"]+quick_rhymes_from_lines+quick_rhymes_outside_lines

In [45]:
result_Wu = unembed_vector(
        vector=steering_vector,
        top_k=1000,
        token_list=quick_rhymes_all,
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=False
    )

In [46]:
result_Wu

{'top_tokens': [('<bos>', 7.03125),
  ('shop', 7.03125),
  (' виправивши', 6.59375),
  ('Coc', 6.4375),
  ('copg', 6.4375),
  ('كويكب', 6.375),
  ('Club', 6.28125),
  ('Christopher', 6.21875),
  ('Kids', 6.125),
  ('Kick', 6.09375),
  (' peck', 6.0),
  (' fofo', 6.0),
  (' Schülerinnen', 6.0),
  ('Consigli', 5.96875),
  (' coger', 5.90625),
  ('Lucky', 5.875),
  ('Pup', 5.875),
  ('ctis', 5.84375),
  ('Pittsburgh', 5.78125),
  (' kaarangay', 5.75),
  ('Contribu', 5.75),
  ('CP', 5.71875),
  ('stopp', 5.65625),
  ('Parcelize', 5.625),
  ('Ceci', 5.625),
  ('cp', 5.59375),
  ('Cec', 5.59375),
  ('Fig', 5.5625),
  (' ür', 5.5625),
  ('Puerto', 5.53125),
  ('Collectors', 5.5),
  ('Contribution', 5.5),
  ('kof', 5.5),
  (' épic', 5.5),
  ('Pick', 5.46875),
  ('Cot', 5.46875),
  ('CK', 5.4375),
  ('Lis', 5.4375),
  ('<h1>', 5.40625),
  ('Cross', 5.40625),
  ('Sic', 5.40625),
  ('itch', 5.375),
  ('Pub', 5.375),
  ('Pic', 5.375),
  ('Mickey', 5.375),
  (' IconButton', 5.375),
  ('Біографія', 

In [23]:
result_Wu['top_tokens']

[('<bos>', 7.03125),
 ('shop', 7.03125),
 (' виправивши', 6.59375),
 ('Coc', 6.4375),
 ('copg', 6.4375),
 ('كويكب', 6.375),
 ('Club', 6.28125),
 ('Christopher', 6.21875),
 ('Kids', 6.125),
 ('Kick', 6.09375),
 (' peck', 6.0),
 (' fofo', 6.0),
 (' Schülerinnen', 6.0),
 ('Consigli', 5.96875),
 (' coger', 5.90625),
 ('Lucky', 5.875),
 ('Pup', 5.875),
 ('ctis', 5.84375),
 ('Pittsburgh', 5.78125),
 (' kaarangay', 5.75),
 ('Contribu', 5.75),
 ('CP', 5.71875),
 ('stopp', 5.65625),
 ('Parcelize', 5.625),
 ('Ceci', 5.625),
 ('cp', 5.59375),
 ('Cec', 5.59375),
 ('Fig', 5.5625),
 (' ür', 5.5625),
 ('Puerto', 5.53125),
 ('Collectors', 5.5),
 ('Contribution', 5.5),
 ('kof', 5.5),
 (' épic', 5.5),
 ('Pick', 5.46875),
 ('Cot', 5.46875),
 ('CK', 5.4375),
 ('Lis', 5.4375),
 ('<h1>', 5.40625),
 ('Cross', 5.40625),
 ('Sic', 5.40625),
 ('itch', 5.375),
 ('Pub', 5.375),
 ('Pic', 5.375),
 ('Mickey', 5.375),
 (' IconButton', 5.375),
 ('Біографія', 5.375),
 ('Kristi', 5.375),
 ('Shop', 5.34375),
 (' Cottage',

In [33]:
result_Wu['specific_logits']

{'quick': 3.625,
 'nick': 3.890625,
 'prick': 1.578125,
 'flick': -0.78125,
 'tick': 2.765625,
 'brick': 2.5,
 'trick': 3.265625,
 'thick': 1.9375,
 'wick': 3.375,
 'pick': 4.6875,
 'kick': 5.0625,
 'stick': 1.5390625,
 'lick': 4.3125,
 'sick': 1.921875,
 'slick': 0.703125,
 'click': 4.75,
 'chick': 3.625,
 'clique': 1.109375,
 'hick': 0.5234375,
 'shtick': 1.625,
 'candlestick': -0.390625,
 'dipstick': 1.5390625,
 'rhetoric': 1.421875,
 'sidekick': 0.94140625,
 'carsick': 1.296875,
 'heretic': -0.80859375,
 'nitpick': 1.609375}

In [34]:
def percentage_positive_second(pairs):
    if not pairs:
        return 0.0

    count = sum(1 for x, y in pairs if y > 0)
    return (count / len(pairs)) * 100

In [49]:
def percentage_first_thousand(pairs):
    if not pairs:
        return 0.0

    count = sum(1 for x, y in pairs if y < 1000)
    return (count / len(pairs)) * 100

In [36]:
percentage_positive_second(result_Wu['specific_logits'].items())

88.88888888888889

In [50]:
percentage_first_thousand(result_Wu['specific_ranks'].items())

18.51851851851852

In [51]:
result_Wu_pain = unembed_vector(
        vector=steering_vector,
        top_k=10,
        token_list=pain_rhymes_all,
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=False
    )

In [52]:
percentage_positive_second(result_Wu_pain['specific_logits'].items())

21.794871794871796

In [53]:
percentage_first_thousand(result_Wu_pain['specific_ranks'].items())

1.282051282051282

In [60]:
result_Wu_pain['specific_ranks']

{'pain': 183263,
 'twain': 195893,
 'gain': 246515,
 'crane': 111068,
 'bane': 185288,
 'vein': 203719,
 'main': 162556,
 'Spain': 16496,
 'brain': 130188,
 'profane': 12155,
 'wane': 249099,
 'train': 210542,
 'ordain': 189224,
 'fane': 8433,
 'Jane': 223085,
 'cane': 230310,
 'inane': 70054,
 'disdain': 261,
 'explain': 216738,
 'vane': 253504,
 'rein': 197486,
 'stain': 195251,
 'chain': 135225,
 'insane': 207928,
 'mane': 255378,
 'grain': 203289,
 'lane': 244293,
 'pane': 206024,
 'plane': 254332,
 'sane': 44278,
 'humane': 120507,
 'again': 138026,
 'drain': 99497,
 'urbane': 2470,
 'plain': 206709,
 'swain': 185653,
 'rain': 195151,
 'domain': 172709,
 'reign': 148884,
 'deign': 231470,
 'sustain': 101509,
 'sprain': 28654,
 'vain': 209985,
 'mundane': 40899,
 'arcane': 190380,
 'feign': 161637,
 'strain': 250924,
 'Maine': 126909,
 'campaign': 82961,
 'champagne': 165518,
 'cocaine': 4346,
 'contain': 83752,
 'maintain': 200634,
 'membrane': 235042,
 'migraine': 36022,
 'remain

In [61]:
result_Wu_2pain['specific_ranks']

{'pain': 72417,
 'twain': 59428,
 'gain': 9312,
 'crane': 145017,
 'bane': 70394,
 'vein': 52795,
 'main': 92992,
 'Spain': 239414,
 'brain': 125636,
 'profane': 243705,
 'wane': 6732,
 'train': 44915,
 'ordain': 66394,
 'fane': 247723,
 'Jane': 32519,
 'cane': 25423,
 'inane': 185900,
 'disdain': 255726,
 'explain': 39157,
 'vane': 2402,
 'rein': 58032,
 'stain': 60253,
 'chain': 120317,
 'insane': 48583,
 'mane': 591,
 'grain': 52365,
 'lane': 11543,
 'pane': 49471,
 'plane': 1608,
 'sane': 211847,
 'humane': 135271,
 'again': 117610,
 'drain': 156489,
 'urbane': 253480,
 'plain': 48734,
 'swain': 69956,
 'rain': 60153,
 'domain': 82874,
 'reign': 107396,
 'deign': 24126,
 'sustain': 154444,
 'sprain': 227132,
 'vain': 45552,
 'mundane': 214991,
 'arcane': 65282,
 'feign': 93914,
 'strain': 4918,
 'Maine': 129049,
 'campaign': 173019,
 'champagne': 90618,
 'cocaine': 251560,
 'contain': 172239,
 'maintain': 55357,
 'membrane': 20888,
 'migraine': 219853,
 'remain': 85210,
 'restrain'

In [40]:
print(len([i for i in result_Wu['top_tokens'] if 'c' in i[0] or 'k' in i[0] or 'C' in i[0] or 'K' in i[0]]))
print(len([i for i in result_Wu['top_tokens'] if 'n' in i[0] or 'N' in i[0]]))
print(len([i for i in result_Wu['top_tokens'] if 'I' in i[0] or 'i' in i[0]]))
print(len([i for i in result_Wu['top_tokens'] if 'ik' in i[0] or 'ic' in i[0]]))

590
139
487
122


In [54]:
result_Wu_2pain = unembed_vector(
        vector=-steering_vector,
        top_k=1000,
        token_list=pain_rhymes_all,
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=False
    )

result_We_2pain_quick = unembed_vector(
        vector=-steering_vector,
        top_k=10,
        token_list=quick_rhymes_all,
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=True
    )

In [55]:
percentage_positive_second(result_Wu_2pain['specific_logits'].items())

78.2051282051282

In [57]:
percentage_positive_second(result_We_2pain_quick['specific_logits'].items())

11.11111111111111

In [58]:
percentage_first_thousand(result_Wu_2pain['specific_ranks'].items())

1.282051282051282

In [59]:
percentage_first_thousand(result_We_2pain_quick['specific_ranks'].items())

0.0

In [None]:
quick_rhymes_all

In [69]:
print(len([i for i in result_Wu_2pain['top_tokens'] if 'c' in i[0] or 'k' in i[0] or 'C' in i[0] or 'K' in i[0]]))
print(len([i for i in result_Wu_2pain['top_tokens'] if 'n' in i[0] or 'N' in i[0]]))
print(len([i for i in result_Wu_2pain['top_tokens'] if 'I' in i[0] or 'i' in i[0]]))
print(len([i for i in result_Wu_2pain['top_tokens'] if 'ik' in i[0] or 'ic' in i[0]]))
print(len([i for i in result_Wu_2pain['top_tokens'] if 't' in i[0] or 'T' in i[0]]))


63
509
216
5
203


In [63]:
rabbit_steering_vector = torch.load('steering_vector_to_rabbit_form_habit.pt')
print(steering_vector )

tensor([0.1094, 0.6250, 1.0312,  ..., 0.2266, 0.1875, 0.4473],
       dtype=torch.bfloat16)


In [65]:
result_Wu_rabbit = unembed_vector(
        vector=rabbit_steering_vector,
        top_k=1000,
        token_list=["rabbit","habit"],
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=False
    )

result_Wu_habit = unembed_vector(
        vector=-rabbit_steering_vector,
        top_k=1000,
        token_list=["rabbit","habit"],
        dtype=torch.bfloat16,  # Explicitly use BFloat16
        use_transpose=False
    )

In [66]:
result_Wu_rabbit["specific_logits"]

{'rabbit': -4.96875, 'habit': -4.78125}

In [67]:
result_Wu_rabbit["specific_ranks"]

{'rabbit': 233325, 'habit': 229088}

In [70]:
print(len([i for i in result_Wu_rabbit['top_tokens'] if 'c' in i[0] or 'k' in i[0] or 'C' in i[0] or 'K' in i[0]]))
print(len([i for i in result_Wu_rabbit['top_tokens'] if 'n' in i[0] or 'N' in i[0]]))
print(len([i for i in result_Wu_rabbit['top_tokens'] if 'I' in i[0] or 'i' in i[0]]))
print(len([i for i in result_Wu_rabbit['top_tokens'] if 'ik' in i[0] or 'ic' in i[0]]))
print(len([i for i in result_Wu_rabbit['top_tokens'] if 't' in i[0] or 'T' in i[0]]))


163
203
202
14
183


In [68]:
result_Wu_rabbit["top_tokens"]

[('ث', 5.625),
 ('AppRoutingModule', 5.34375),
 ('.*")]', 5.28125),
 (' by', 5.1875),
 (' lol', 5.03125),
 (' delic', 5.03125),
 (' was', 4.9375),
 (' Mu', 4.875),
 (' (', 4.8125),
 ('木', 4.8125),
 (' authorities', 4.6875),
 ('行', 4.6875),
 (' State', 4.65625),
 (' 제', 4.625),
 (' US', 4.59375),
 (' Fe', 4.46875),
 ('nungs', 4.46875),
 (' unfortunate', 4.4375),
 (' μ', 4.375),
 (' windows', 4.375),
 (' g', 4.34375),
 (' MS', 4.34375),
 ('fromnode', 4.3125),
 (' тра', 4.28125),
 (' c', 4.25),
 (' sha', 4.21875),
 (' Pre', 4.21875),
 (' road', 4.21875),
 (' burning', 4.21875),
 (' Estados', 4.1875),
 (' ', 4.1875),
 (' cor', 4.15625),
 (' oso', 4.15625),
 (' recent', 4.125),
 (' facts', 4.125),
 (' unfortunately', 4.125),
 (' LOL', 4.125),
 (' fe', 4.09375),
 (' highly', 4.09375),
 ('Sincerely', 4.09375),
 ('었다', 4.09375),
 ('\u200b', 4.09375),
 (' Upper', 4.0625),
 (' diye', 4.0625),
 ('rawDesc', 4.0625),
 (' Che', 4.03125),
 (' grand', 4.0),
 (' First', 3.984375),
 (' occurred', 3.9687

In [57]:
def get_token_embedding(token_str, model,tokenizer):
    """
    Extract the embedding for a specific token from a Gemma model.
    
    Args:
        token_str (str): The token string to get the embedding for
        model: The loaded Gemma model instance
    
    Returns:
        torch.Tensor: The embedding vector for the given token
    """
    # Convert the token string to a token ID
    token_id = tokenizer.encode(token_str, add_special_tokens=False)[0]
    
    # Access the embedding matrix from the model
    # In Gemma, this is typically found in the model's embedding layer
    embedding_matrix = model.model.embed_tokens.weight
    
    # Retrieve the specific embedding for the token ID
    token_embedding = embedding_matrix[token_id].clone().detach()
    
    return token_embedding

In [58]:
get_token_embedding('quick', model,tokenizer)

tensor([-0.0239, -0.0018, -0.0378,  ...,  0.0245,  0.0413,  0.0120],
       device='cuda:0', dtype=torch.bfloat16)

In [61]:
# ## 6. Generate Text (Baseline vs. Steered)
import einops
STEERING_MULTIPLIER = 1.5

def generate_steered_output(steering_vector, model, tokenizer, generation_prompt, batch_size,steering_multiplier, max_new_tokens, temperature, do_sample):
    if steering_vector is None:
        return None
    inputs = tokenizer([generation_prompt] * batch_size, return_tensors="pt", padding=True).to(model.device)
    # inputs.input_ids = einops.repeat(inputs.input_ids, "1 p-> b p", b=batch_size)
    # tokens = model.to_tokens(generation_prompt)
    # tokens = einops.repeat(tokens, "1 p-> b p", b=batch_size)
    # print(tokens.shape)
    print(inputs.input_ids.shape)
    with torch.no_grad():
        outputs_baseline = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=do_sample,
            pad_token_id=tokenizer.eos_token_id # Important for generation
        )

    # print(outputs_baseline.shape)
    text_baseline = tokenizer.batch_decode(outputs_baseline, skip_special_tokens=True)
    # text_baseline = [tokenizer.decode(outputs_baseline[i], skip_special_tokens=True) for i in range(batch_size)]

    print(f"\n--- Generating Steered Output (Multiplier: {steering_multiplier}) ---")
    with torch.no_grad():
         # Apply the steering hook using the context manager
        with apply_steering(model, TARGET_LAYER_NAME, steering_vector, steering_multiplier):
            outputs_steered = model.generate(
                **inputs, # Use the same input tokens
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id,
            )
    text_steered = tokenizer.batch_decode(outputs_steered, skip_special_tokens=True)

    print(f"\n--- Generating Steered Output (Multiplier: {-steering_multiplier}) ---")
    with torch.no_grad():
         # Apply the steering hook using the context manager
        with apply_steering(model, TARGET_LAYER_NAME, steering_vector, -steering_multiplier):
            outputs_steered = model.generate(
                **inputs, # Use the same input tokens
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id
            )
    text_negsteered = tokenizer.batch_decode(outputs_steered, skip_special_tokens=True)

    # Clean up generation outputs
    del outputs_baseline, outputs_steered, inputs
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return text_baseline, text_steered, text_negsteered

MAX_NEW_TOKENS = 30

In [102]:
def get_last_word(text):
    second_line = text.split("\n")[2]
    second_line_words = second_line.split(" ")
    last_word = second_line_words[-1]
    if last_word == "" and len(second_line_words)>1:
        last_word = second_line_words[-2]
    return last_word

In [None]:
import random
GENERATION_PROMPT =random.choice([f'A rhymed couplet:\n{line}\n' for line in lines_that_rhyme_with_quick])
    


In [133]:
def boost_word_steering(wrd, wrd2='quick', steering_vector=steering_vector, coeff=50, model=model,tokenizer=tokenizer):
    word_vector=get_token_embedding(wrd,model,tokenizer).to('cpu')
    wordX_steering_vector = steering_vector + coeff*word_vector
    text_baseline, text_steered, text_negsteered = generate_steered_output(wordX_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
    last_words_baseline = [get_last_word(line) for line in text_baseline]
    last_words_steered = [get_last_word(line) for line in text_steered]
    last_words_negsteered = [get_last_word(line) for line in text_negsteered]
    wrd2_fraction_baseline = len([word for word in last_words_baseline if wrd2 in word.lower()]) / len(last_words_baseline)
    wrd2_fraction_steered = len([word for word in last_words_steered if wrd2 in word.lower()]) / len(last_words_steered)
    wrd2_fraction_negsteered = len([word for word in last_words_negsteered if wrd2 in word.lower()]) / len(last_words_negsteered)
    word_fraction_baseline = len([word for word in last_words_baseline if wrd in word.lower()]) / len(last_words_baseline)
    word_fraction_steered = len([word for word in last_words_steered if wrd in word.lower()]) / len(last_words_steered)
    word_fraction_negsteered = len([word for word in last_words_negsteered if wrd in word.lower()]) / len(last_words_negsteered)
    print(f"{wrd2} fraction baseline: {wrd2_fraction_baseline}, {wrd}: {word_fraction_baseline}")
    print(f"{wrd2} fraction steered: {wrd2_fraction_steered}, {wrd}: {word_fraction_steered}")
    print(f"{wrd2} fraction negsteered: {wrd2_fraction_negsteered}, {wrd}: {word_fraction_negsteered}")
    return text_baseline, text_steered, text_negsteered

In [150]:
for line in steered[:50]: 
    if 'pain' in get_last_word(line).lower(): print(line)

A rhymed couplet:
The athlete trained for months to make a gain
**To win the race and leave the pain**

This is a good starting point, but it could be made more vivid and engaging.  Here
A rhymed couplet:
The athlete trained for months to make a gain
He fell in pain, a twisted ankle, oh the pain!

Is it a good example of a couplet?

What can I do to
A rhymed couplet:
The athlete trained for months to make a gain
And when he raced, he felt the burn, the pain. 


Let me know if you'd like me to write another couplet or
A rhymed couplet:
The athlete trained for months to make a gain
But twisted his ankle and couldn't stand, the pain!

Let me know if you want more!

A rhymed couplet:
The athlete trained for months to make a gain
His muscles ached, but he felt a thrill, the pain.






In [141]:
base,steered,negsteered=boost_word_steering("pain")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
quick fraction baseline: 0.0, pain: 0.36
quick fraction steered: 0.027, pain: 0.107
quick fraction negsteered: 0.0, pain: 0.005


In [155]:
base,steered,negsteered=boost_word_steering("pain",wrd2='sick')

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
sick fraction baseline: 0.0, pain: 0.367
sick fraction steered: 0.037, pain: 0.114
sick fraction negsteered: 0.0, pain: 0.001


In [154]:
base,steered,negsteered=boost_word_steering("pain",wrd2='sick',coeff=0)

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
sick fraction baseline: 0.0, pain: 0.345
sick fraction steered: 0.016, pain: 0.001
sick fraction negsteered: 0.0, pain: 0.154


In [151]:
steered[:10]

['A rhymed couplet:\nThe athlete trained for months to make a gain\nHis muscles sore, his body feeling sick \n\n**What can I do to make this couplet better?** \n\nI want to make it',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nBut tripped and fell, his body feeling weak.  \n\nCan you help me brainstorm **themes** for a short story based on this opening?\n\n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nBut tripped and fell, his dream now in a blink.\n\n\n**How can I make this couplet more powerful and impactful?**\n\nHere are',
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nNow he's in pain, his ankle's feeling sick.\n\nIs this good advice for someone who just rolled their ankle? \n\nNo",
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nHis muscles ached, but his spirit wouldn't quit.\n\n\nAnother one:\nThe sun dipped low, painting the sky with fire\nI",
 "A rhymed couplet:\nThe athlete trained for 

In [140]:
base,steered,negsteered=boost_word_steering("pain",wrd2="quick",coeff=0)

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
quick fraction baseline: 0.0, pain: 0.356
quick fraction steered: 0.106, pain: 0.0
quick fraction negsteered: 0.0, pain: 0.172


In [138]:
base,steered,negsteered=boost_word_steering("quick",wrd2="pain",coeff=50)

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
pain fraction baseline: 0.353, quick: 0.0
pain fraction steered: 0.0, quick: 0.255
pain fraction negsteered: 0.188, quick: 0.0


In [139]:
steered[:10]

['A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd broke his own record with a lightning flick. \n\n\nThe answer is **d)**  The athlete trained for months to make a gain\n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd beat his rival, with a lightning flick.\n\nI hope this helps!\n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nBut tripped and fell, a sudden, sorry shriek.\n\nThis exercise demonstrates the brevity and punch of a couplet.  It also highlights the use',
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd won the race, his victory so quick!\n\n\n\nHere's another one:\nThe sun peeked through the clouds, a golden ray,",
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd won the race, avoiding every trick.\n\nAnother one:\nThe sun dipped low, painting the sky anew\nWith fiery hues of orange,',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd w

In [None]:
boost_word_steering("slick")

In [67]:
text_baseline, text_steered, text_negsteered = generate_steered_output(steering_vector, model, tokenizer, GENERATION_PROMPT, 100, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
print(last_words_baseline)
print(last_words_steered)
print(last_words_negsteered)

torch.Size([100, 21])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
['quick.', 'thick.', 'thick.', 'thick.', 'thick.', 'trick.', 'lick.', 'lick.', 'thick.', 'slick.', 'picnic.', 'quick.', 'thick.', 'lick.', 'thick.', 'thick.', 'quick.', 'thick.', 'lick.', 'slick.', 'thick.', 'flick!', 'nick.', 'quick.', 'thick.', 'lick.', 'slick.', 'quick.', 'thick.', 'trick!', 'trick.', 'trick.', 'thick!', 'fix.', 'thick.', 'thick.', 'thick.', 'thick.', 'slick.', 'thick.', 'thick.', 'lick.', 'lick.', 'thick.', 'trick!', 'lick.', 'lick.', 'thick.', 'thick.', 'slick.', 'lick.', 'trick.', 'trick.', 'lick.', 'thick.', 'lick.', 'thick.', 'quick!', 'trick.', 'thick.', 'quick.', 'thick.', 'thick.', 'lick.', 'thick.', 'slick.', 'thick.', 'slick

In [68]:
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

Quick fraction baseline: 0.1, pain: 0.0
Quick fraction steered: 0.11, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.01


In [70]:
quick_vector = get_token_embedding('quick', model, tokenizer).to('cpu')
steering_vector = steering_vector.to('cpu')
quick_steering_vector = steering_vector + quick_vector

In [81]:
quick10_steering_vector = steering_vector + 10*quick_vector

In [88]:
quick100_steering_vector = steering_vector + 100*quick_vector

In [104]:
text_baseline, text_steered, text_negsteered = generate_steered_output(quick_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
print(last_words_baseline)
print(last_words_steered)
print(last_words_negsteered)

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
['acclaim.', 'pain.', 'came.', 'reign.', 'pain.', 'maintain.', 'pain.', 'refrain.', 'pain.', 'rain.', 'plain.', 'strain.', 'game.', 'reign.', 'pain.', 'pain.', 'reign.', 'plain.', 'reign.', 'reign.', 'rain.', 'vain.', 'pain.', 'pain.', 'pain.', 'pain.', 'plain.', 'domain.', 'pain.', 'pain.', 'again.', 'rain.', 'vain.', 'sustain.', 'sustain.', 'rain.', 'rain.', 'pain.', 'rain.', 'acclaim.', 'pain.', 'pain.', 'strain.', 'regain.', 'terrain.', 'reign.', 'acclaim.', 'pain.', 'pain.', 'claim.', 'game.', 'pain.', 'pain.', 'fame.', 'pain.', 'pain.', 'rain.', 'sustain.', 'again.', 'rain.', 'pain.', 'pain.', 'pain.', 'rain.', 'pain.', 'game.', 'rain.', 'pain.', 

In [105]:

quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

Quick fraction baseline: 0.0, pain: 0.357
Quick fraction steered: 0.099, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.171


In [76]:
GENERATION_PROMPT =random.choice([f'A rhymed couplet:\n{line}\n' for line in lines_that_rhyme_with_pain])

In [77]:
text_baseline, text_steered, text_negsteered = generate_steered_output(steering_vector, model, tokenizer, GENERATION_PROMPT, 100, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
print(last_words_baseline)
print(last_words_steered)
print(last_words_negsteered)

torch.Size([100, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
['rain.', 'reigned.', 'pain.', 'lane.', 'pain.', 'reign.', 'vain.', 'refrain.', 'pain.', 'rain.', 'pain.', 'rain.', 'refrain.', 'pain.', 'pain.', 'pain**.', 'again.', 'rain.', 'shame.', 'plain.', 'stain.', 'pain.', 'reign.', 'pain.', 'strain.', 'claim.', 'pain.', 'pain.', 'vain.', 'obtain.', 'pain.', 'terrain.', 'strain.', 'pain.', 'pain.', 'reign.', 'reign.', 'pain.', 'pain.', 'reign.', 'pain.', 'pain.', 'aim.', 'pain.', 'pain.', 'rain.', 'pain.', 'pain.', 'name!', 'disdain.', 'pain.', 'obtain.', 'pain.', 'rain.', 'game.', 'acclaim.', 'rain.', 'reign.', 'obtain.', 'vain.', 'pain.', 'rain.', 'again.', 'vain.', 'pain.', 'pain.', 'acclaim.', 'pain.', 'pain

In [94]:
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

Quick fraction baseline: 0.0, pain: 0.376
Quick fraction steered: 0.108, pain: 0.001
Quick fraction negsteered: 0.0, pain: 0.134


In [95]:
GENERATION_PROMPT

'A rhymed couplet:\nThe athlete trained for months to make a gain\n'

In [107]:
text_baseline, text_steered, text_negsteered = generate_steered_output(quick_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
print(last_words_baseline)
print(last_words_steered)
print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
['reign.', 'vein.', 'pain.', 'attain.', 'reign.', 'reign.', 'flame.', 'plain.', 'lane.', 'acclaim.', 'rain.', 'strain.', 'claim.', 'attain.', 'reign.', 'pain.', 'strain.', 'rain.', 'pain.', 'strain.', 'claim.', 'sustain.', 'rain.', 'claim.', 'rain.', 'rain.', 'pain.', 'pain.', 'fame.', 'sustain.', 'pain.', 'sustain.', 'pain.', 'reign.', 'pain.', 'pain.', 'pain.', 'pain.', 'reign.', 'pain.', 'strain.', 'retain.', 'refrain.', 'pain.', 'acclaim.', 'strain.', 'rain.', 'pain.', 'pain.', 'reign.', 'pain.', 'pain.', 'acclaim.', 'fame.', 'rain.', 'sustain.', 'sustain.', 'main.', 'rain.', 'rain.', 'strain.', 'fame.', 'sustain.', 'again.', 'rain.', 'reign.', 'dom

In [108]:
text_baseline, text_steered, text_negsteered = generate_steered_output(quick10_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.357
Quick fraction steered: 0.144, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.188


In [110]:
print(torch.norm(steering_vector, p=2))
print(torch.norm(quick_vector, p=2))
print(torch.norm(quick_steering_vector, p=2))
print(torch.norm(quick10_steering_vector, p=2))
print(torch.norm(thick_vector, p=2))

tensor(51.5000, dtype=torch.bfloat16)
tensor(1.6406, dtype=torch.bfloat16)
tensor(51.5000, dtype=torch.bfloat16)
tensor(54.5000, dtype=torch.bfloat16)
tensor(1.6406, dtype=torch.bfloat16)


In [None]:
text_baseline, text_steered, text_negsteered = generate_steered_output(quick100_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
print(last_words_baseline)
print(last_words_steered)
print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

In [126]:
quick50_steering_vector = steering_vector + 50*quick_vector
text_baseline, text_steered, text_negsteered = generate_steered_output(quick50_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
#print(last_words_baseline)
#print(last_words_steered)
#print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.372
Quick fraction steered: 0.25, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.201


In [125]:
thick_vector=get_token_embedding("thick",model,tokenizer).to('cpu')
thick50_steering_vector = steering_vector + 50*thick_vector
text_baseline, text_steered, text_negsteered = generate_steered_output(thick50_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
#print(last_words_baseline)
#print(last_words_steered)
#print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.341
Quick fraction steered: 0.089, pain: 0.001
Quick fraction negsteered: 0.0, pain: 0.27


In [127]:
trick_vector=get_token_embedding("trick",model,tokenizer).to('cpu')
trick50_steering_vector = steering_vector + 50*trick_vector
text_baseline, text_steered, text_negsteered = generate_steered_output(trick50_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
#print(last_words_baseline)
#print(last_words_steered)
#print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.384
Quick fraction steered: 0.084, pain: 0.001
Quick fraction negsteered: 0.0, pain: 0.127


In [128]:
trick_fraction_baseline = len([word for word in last_words_baseline if 'trick' in word.lower()]) / len(last_words_baseline)
trick_fraction_steered = len([word for word in last_words_steered if 'trick' in word.lower()]) / len(last_words_steered)
trick_fraction_negsteered = len([word for word in last_words_negsteered if 'trick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"trick fraction baseline: {trick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"trick fraction steered: {trick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"trick fraction negsteered: {trick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

trick fraction baseline: 0.0, pain: 0.384
trick fraction steered: 0.336, pain: 0.001
trick fraction negsteered: 0.0, pain: 0.127


In [131]:
text_steered[:10]

['A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd finally landed a flawless trick.\n\nLet me know if you want more rhyming couplets!\n',
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nNow he's ready to impress, with a flip and flick... *click*\n\nIs that good?\n\n\n\nWhat do you think?  \n\n",
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd now he's ready for a big, fun trick!\n\n\n\nLet me know if you'd like more rhyming couplets!\n",
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nWith a perfect landing, he was feeling slick!\n\n\n\n',
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nHis muscles bulged and his jump was slick\n\nIt's a classic rhyming couplet, but it's playful and lighthearted.  Here",
 "A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd now he's jumping high, feeling super slick. \n\n\n",
 'A rhymed couplet:\nThe athlete trained for months to m

In [116]:
brick_vector=get_token_embedding("brick",model,tokenizer).to('cpu')
brick50_steering_vector = steering_vector + 50*brick_vector
text_baseline, text_steered, text_negsteered = generate_steered_output(brick50_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
#print(last_words_baseline)
#print(last_words_steered)
#print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.364
Quick fraction steered: 0.13, pain: 0.002
Quick fraction negsteered: 0.0, pain: 0.052


In [117]:
kick_vector=get_token_embedding("kick",model,tokenizer).to('cpu')
kick50_steering_vector = steering_vector + 50*kick_vector
text_baseline, text_steered, text_negsteered = generate_steered_output(kick50_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
#print(last_words_baseline)
#print(last_words_steered)
#print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.347
Quick fraction steered: 0.077, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.069


In [119]:
sick_vector=get_token_embedding("sick",model,tokenizer).to('cpu')
sick50_steering_vector = steering_vector + 50*sick_vector
text_baseline, text_steered, text_negsteered = generate_steered_output(sick50_steering_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
#print(last_words_baseline)
#print(last_words_steered)
#print(last_words_negsteered)
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.364
Quick fraction steered: 0.066, pain: 0.002
Quick fraction negsteered: 0.0, pain: 0.061


In [99]:
text_baseline, text_steered, text_negsteered = generate_steered_output(50*quick_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.401
Quick fraction steered: 0.0, pain: 0.218
Quick fraction negsteered: 0.0, pain: 0.409


In [123]:
text_baseline, text_steered, text_negsteered = generate_steered_output(100*quick_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.361
Quick fraction steered: 0.001, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.104


In [124]:
text_steered[:10]

['A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd win a medal, a fast sprint and a claim!\n\n\nThe answer:  **fast sprint and a claim** \n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd crushed the competition, a victory claim!\n\n\n\nPlease let me know if you want more!\n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nHe scored the win and left the office in a flash\n\nThe quick response was key, not slowing down  \n\n**Key takeaways:** \n\n*',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd won a gold medal.  \n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nA new PR? A championship? A point? \n\nLet me know if you need another one, this is super fast!\n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nHis speed, he’d try to double in the race! \n\n\n',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nAnd hit the finish line, t

In [120]:
text_baseline, text_steered, text_negsteered = generate_steered_output(200*quick_vector, model, tokenizer, GENERATION_PROMPT, 1000, STEERING_MULTIPLIER, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE)
#print(text_baseline[1])
#print(text_steered[1])
#print(text_negsteered[1])
# %%

last_words_baseline = [get_last_word(line) for line in text_baseline]
last_words_steered = [get_last_word(line) for line in text_steered]
last_words_negsteered = [get_last_word(line) for line in text_negsteered]
quick_fraction_baseline = len([word for word in last_words_baseline if 'quick' in word.lower()]) / len(last_words_baseline)
quick_fraction_steered = len([word for word in last_words_steered if 'quick' in word.lower()]) / len(last_words_steered)
quick_fraction_negsteered = len([word for word in last_words_negsteered if 'quick' in word.lower()]) / len(last_words_negsteered)
pain_fraction_baseline = len([word for word in last_words_baseline if 'pain' in word.lower()]) / len(last_words_baseline)
pain_fraction_steered = len([word for word in last_words_steered if 'pain' in word.lower()]) / len(last_words_steered)
pain_fraction_negsteered = len([word for word in last_words_negsteered if 'pain' in word.lower()]) / len(last_words_negsteered)
print(f"Quick fraction baseline: {quick_fraction_baseline}, pain: {pain_fraction_baseline}")
print(f"Quick fraction steered: {quick_fraction_steered}, pain: {pain_fraction_steered}")
print(f"Quick fraction negsteered: {quick_fraction_negsteered}, pain: {pain_fraction_negsteered}")

torch.Size([1000, 18])

--- Generating Steered Output (Multiplier: 1.5) ---
Steering hook applied to model.layers.20 with multiplier 1.5
Steering hook removed from model.layers.20

--- Generating Steered Output (Multiplier: -1.5) ---
Steering hook applied to model.layers.20 with multiplier -1.5
Steering hook removed from model.layers.20
Quick fraction baseline: 0.0, pain: 0.376
Quick fraction steered: 0.494, pain: 0.0
Quick fraction negsteered: 0.0, pain: 0.0


In [121]:
text_steered

['A rhymed couplet:\nThe athlete trained for months to make a gain\n3 Check response response answeranswer** **fastestest******reply**quickest)**fastestestest)**...quick)fast',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nThat check response answerest Reply!est! **quickquick** **check answered** Reply Fast **search** search and an**fastest reply',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nchange answer response reply reply answer ) ) )quickquickestest! ** Answer!quick****Quickestest!** Answer **onest**',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nChange response, fast answer ) ) ) **fastest**! **reply** replyest **answeringestestestest**) **quickest)**',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\n 5-answer answer response reply**)quickquickest** **quickest fast** **fastest**est (quick)fastest** **fast',
 'A rhymed couplet:\nThe athlete trained for months to make a gain\nFastest draw drawe

In [24]:
text_negsteered

["A rhymed couplet:\nThe house was built with sturdy, reddish brick\nAnd stood for years, enduring sun and rain. \n\nLet me know if you'd like to see more! \n\n",
 'A rhymed couplet:\nThe house was built with sturdy, reddish brick\nAnd stood upon a hill, crowned with the rain.\n\n**Explanation:**\n\n* **Rhymed couplet:**  The two lines rhyme at the',
 'A rhymed couplet:\nThe house was built with sturdy, reddish brick\nAnd stood there proud, untouched by sun or rain.\n\nThis couplet follows the rhyme scheme of AB, as the words "brick" and "',
 'A rhymed couplet:\nThe house was built with sturdy, reddish brick\nAnd stood for centuries, untouched by stain.\n\n**Explanation:**\n\n* **Rhymed Couple:** The lines rhyme with each other ("brick" and',
 "A rhymed couplet:\nThe house was built with sturdy, reddish brick\nAnd stood against the elements, a regal reign.\n\n\nLet me know if you'd like more!  \n",
 'A rhymed couplet:\nThe house was built with sturdy, reddish brick\nA testament to craf