In [1]:
import circuitsvis as cv
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from taker import Model
from taker.hooks import HookConfig
import csv
import json
from datetime import datetime
from os import listdir
from os.path import exists
import einops
import copy

  from tqdm.autonotebook import tqdm, trange


In [2]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [27]:
# Usage
m = Model(model_repo="meta-llama/Llama-3.2-1B")
m_orig = Model(model_repo="meta-llama/Llama-3.2-1B")
# m = Model(model_repo="nickypro/tinyllama-15m")
# m_orig = Model(model_repo="nickypro/tinyllama-15m")
m.show_details()

# Initialize PEFT
from peft import LoraConfig

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

m.init_peft(peft_config)

Loaded model 'meta-llama/Llama-3.2-1B' with bfp16:
- Added 256 hooks across 16 layers
Loaded model 'meta-llama/Llama-3.2-1B' with bfp16:
- Added 256 hooks across 16 layers
 - n_layers : 16
 - d_model  : 2048
 - n_heads  : 32
 - d_head   : 64
 - d_mlp    : 8192
Initialized PEFT model
trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


In [28]:
def get_model_activations(model, text):
        model.hooks.disable_all_collect_hooks()
        model.hooks.enable_collect_hooks(["mlp_pre_out", "attn_pre_out"])
        
        # Run model
        if model.tokenizer.pad_token is None:
            model.tokenizer.pad_token = model.tokenizer.eos_token
        inputs = model.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        input_ids = inputs.input_ids
        logits = model.get_logits(text, input_ids)        # Collect and return activaitons
        acts = {
            "attn": model.collect_recent_attn_pre_out(),
            "mlp":  model.collect_recent_mlp_pre_out(),
        }
        return acts, logits

In [29]:
def replace_acts(model, text):
    [h.reset() for h in model.hooks.neuron_replace.values()]

    neutral_prompt = ".\n\n"

    # Find where to position token insertions
    orig_token_index = model.get_ids(text).shape[1] - 1
    new_token_index  = model.get_ids(neutral_prompt).shape[1] - 1

    # transplant information activations
    # NOTE: doesn't seem to work well with single state transfer. Better with multiple
    acts = model.get_midlayer_activations(text)
        
    for layer_index in range(0,16):
        m.hooks.neuron_replace[f"layer_{layer_index}_mlp_pre_out"].add_token(new_token_index, acts["mlp"][0, layer_index, orig_token_index])
        m.hooks.neuron_replace[f"layer_{layer_index}_attn_pre_out"].add_token(new_token_index, acts["attn"][0, layer_index, orig_token_index])


In [30]:
def train_peft_model(model, num_epochs=5, learning_rate=1e-3):
    # Prepare the training data
    # input_text = "Generate the letter a: "
    # target_text = "a " * 20  # 20 'a' tokens
    # full_text = input_text + target_text
    text = "Tell me about a weekend in a mountain cabin in 150 words and then tell me about disconnecting from technology in another 150 words. Only do that. Make sure you don’t add any headings or comments.\n\nThe scent of pine needles filled the air as we drove up the winding mountain road.  Our cozy cabin, nestled amongst towering trees, welcomed us with warmth and the promise of a peaceful escape.  Days were spent hiking through sun-dappled forests, the sound of birdsong our only soundtrack. Evenings were spent by the crackling fireplace, sharing stories and laughter. The stars, unfiltered by city lights, blazed across the night sky, a breathtaking spectacle.  \n\n"

    tokenizer = model.tokenizer

    # Ensure the tokenizer has a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the full text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Create labels by shifting the input_ids to the right
    labels = input_ids.clone()
    labels[:, :-1] = input_ids[:, 1:]
    labels[:, -1] = -100  # Ignore the last token when computing loss

    # Prepare optimizer and scheduler
    optimizer = AdamW(model.peft_predictor.parameters(), lr=learning_rate)
    total_steps = num_epochs * 10  # 10 steps per epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    # Training loop
    model.peft_predictor.train()
    for epoch in range(num_epochs):
        for _ in range(10):  # 10 steps per epoch
            outputs = model.peft_predictor(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            with torch.no_grad():  # Wrap updates in no_grad
                acts_orig, logits_orig = get_model_activations(m_orig, text)
                replace_acts(model, text)
                acts_lora, logits_lora = get_model_activations(model, text)
                
                # Calculate loss
            loss = torch.nn.MSELoss()(acts_lora['attn'], acts_orig['attn']) + torch.nn.CrossEntropyLoss()(logits_lora, logits_orig)
            
            # Update model parameters
            optimizer.step()  # Update parameters without tracking gradients
            optimizer.zero_grad()  # Clear gradients for the next step

            loss.backward()  # Backpropagate the loss  
            optimizer.zero_grad()         
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

    model.peft_predictor.eval()

In [31]:
train_peft_model(m)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [32]:
text1 = "Tell me about a weekend in a mountain cabin in 150 words and then tell me about disconnecting from technology in another 150 words. Only do that. Make sure you don’t add any headings or comments.\n\nThe scent of pine needles filled the air as we drove up the winding mountain road.  Our cozy cabin, nestled amongst towering trees, welcomed us with warmth and the promise of a peaceful escape.  Days were spent hiking through sun-dappled forests, the sound of birdsong our only soundtrack. Evenings were spent by the crackling fireplace, sharing stories and laughter. The stars, unfiltered by city lights, blazed across the night sky, a breathtaking spectacle.  \n\n"

m_orig.generate(text1, num=100)
m.generate(text1, num=100)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('Tell me about a weekend in a mountain cabin in 150 words and then tell me about disconnecting from technology in another 150 words. Only do that. Make sure you don’t add any headings or comments.\n\nThe scent of pine needles filled the air as we drove up the winding mountain road.  Our cozy cabin, nestled amongst towering trees, welcomed us with warmth and the promise of a peaceful escape.  Days were spent hiking through sun-dappled forests, the sound of birdsong our only soundtrack. Evenings were spent by the crackling fireplace, sharing stories and laughter. The stars, unfiltered by city lights, blazed across the night sky, a breathtaking spectacle.  \n\n',
 'The weekend was spent enjoying each other’s company and creating memories that would last a lifetime.  We were a family, united in our love for each other, our shared passion for nature, and our desire to connect with one another.  We were a family, a family, a family.  \n\nIn the morning, we set out on our hikes, our lungs fi