In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Checkpoint under test: Qwen2.5-1.5B-Instruct (no gating, high context support)
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# The tokenizer and the weights are both resolved from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# dtype and device placement are both left on "auto" rather than pinned here.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")





In [7]:
# 2. Needle-in-a-haystack fixtures
# The question asks for the fact that the needle sentence states.
question = "What is the secret password?"
needle = "The secret password is: 'LEMON-MERINGUE-PIE'."

# Synthetic filler: one short sentence repeated 4000 times.
# For a real test, load a long text (like a book or WikiText) here instead.
haystack_text = "".join(["The sky is blue. "] * 4000)

In [8]:
def run_evaluation_fixed(model, tokenizer, haystack, needle, depth_percent, question=None):
    """Hide ``needle`` inside ``haystack`` at a relative depth and query the model.

    The haystack and the needle are tokenized separately, the needle tokens
    are spliced into the haystack tokens at ``depth_percent`` of the haystack
    length, and the result is decoded back to text and wrapped in a
    ``Context / Question / Answer`` prompt. The model then generates up to
    20 new tokens, and only those new tokens are decoded and returned.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Object exposing ``encode``, ``decode``, ``__call__``,
            ``pad_token``, ``eos_token`` and ``pad_token_id``.
        haystack: Filler text that surrounds the needle.
        needle: Sentence to hide inside the haystack.
        depth_percent: Insertion position as a percentage of the haystack
            token count. Values outside ``[0, 100]`` are clamped, so 0 puts
            the needle first and 100 (or more) puts it last.
        question: Question appended to the prompt. When omitted, the
            module-level ``question`` variable is used, which keeps existing
            five-argument calls working.

    Returns:
        The decoded text of the newly generated tokens (prompt excluded,
        special tokens skipped).

    Raises:
        NameError: If ``question`` is omitted and no module-level
            ``question`` exists.

    Side effects:
        Sets ``tokenizer.pad_token`` to ``tokenizer.eos_token`` when the
        tokenizer has no pad token.
    """
    # Prefer the explicit argument; fall back to the notebook global only so
    # that callers written against the old signature keep working.
    if question is None:
        try:
            question = globals()["question"]
        except KeyError:
            raise NameError("name 'question' is not defined") from None

    # 1. Manually set a pad token if it's missing (padding=True below needs one).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 2. Tokenize without special tokens: these ids are spliced together, so
    #    any BOS/EOS marker added here would land in the middle of the context.
    haystack_ids = tokenizer.encode(haystack, add_special_tokens=False)
    needle_ids = tokenizer.encode(needle, add_special_tokens=False)

    # Clamp the depth so a negative value cannot become a negative slice index
    # (which would count from the end of the haystack instead of the start).
    depth_fraction = min(max(depth_percent, 0), 100) / 100
    insertion_idx = int(len(haystack_ids) * depth_fraction)

    spliced_ids = (
        haystack_ids[:insertion_idx]
        + needle_ids
        + haystack_ids[insertion_idx:]
    )

    context_text = tokenizer.decode(spliced_ids)
    prompt = f"Context: {context_text}\n\nQuestion: {question}\nAnswer:"

    # 3. Tokenize the full prompt; this also produces the attention_mask.
    # NOTE(review): truncation=True can shorten an over-long prompt; confirm
    # the prompt fits the tokenizer's limit so the question is not cut off.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    # 4. Generate, passing the mask and pad id explicitly.
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=20,
            pad_token_id=tokenizer.pad_token_id,
        )

    # The output starts with the prompt tokens; keep only what was generated.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

In [None]:
from transformers import MambaConfig, MambaForCausalLM
