# Demo of Text Generation with Huginn-01/25

In [2]:
import torch
import sys
from pathlib import Path
# Prefer the first CUDA device but fall back to CPU when CUDA is unavailable,
# matching the device selection used in the FLOP-counting section further down.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


%load_ext autoreload
%autoreload 2

# support running without installing as a package
wd = Path.cwd().parent
sys.path.append(str(wd))
# import litgpt # noqa: F401

from transformers import AutoModelForCausalLM,AutoTokenizer, GenerationConfig
from dataclasses import dataclass
@dataclass
class Message:
    """One chat turn: who is speaking (`role`) and what was said (`content`)."""

    # Speaker label. This notebook creates "system" and "user" turns and maps
    # "assistant" to "Huginn" before applying the chat template.
    role: str
    # Text of the turn; it is stripped of surrounding whitespace before templating.
    content: str

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch.nn as nn

rms = nn.RMSNorm(3, eps=0.0001)

torch.manual_seed(0)

v = torch.randn(3)

v2 = rms(v)
v3 = rms(v2)

print(v)
print(v2)
print(v3)

tensor([ 1.5410, -0.2934, -2.1788])
tensor([ 0.9941, -0.1893, -1.4056], grad_fn=<MulBackward0>)
tensor([ 0.9941, -0.1893, -1.4056], grad_fn=<MulBackward0>)


In [4]:
# Load the Huginn checkpoint in bfloat16 and place it on `device`.
model = AutoModelForCausalLM.from_pretrained(
    "tomg-group-umd/huginn-0125",
    trust_remote_code=True,  # set to True if recpre lib not loaded
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/huginn-0125")

# Last expression of the cell: switches to eval mode and displays the module tree.
model.eval()

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]


RavenForCausalLM(
  (transformer): ModuleDict(
    (wte): Embedding(65536, 5280)
    (prelude): ModuleList(
      (0-1): 2 x SandwichBlock(
        (norm_1): RMSNorm()
        (attn): CausalSelfAttention(
          (Wqkv): Linear(in_features=5280, out_features=15840, bias=False)
          (proj): Linear(in_features=5280, out_features=5280, bias=False)
        )
        (norm_2): RMSNorm()
        (mlp): GatedMLP(
          (fc): Linear(in_features=5280, out_features=35840, bias=False)
          (proj): Linear(in_features=17920, out_features=5280, bias=False)
          (nonlin): SiLU()
        )
        (norm_3): RMSNorm()
        (norm_4): RMSNorm()
      )
    )
    (adapter): Linear(in_features=10560, out_features=5280, bias=False)
    (core_block): ModuleList(
      (0-3): 4 x SandwichBlock(
        (norm_1): RMSNorm()
        (attn): CausalSelfAttention(
          (Wqkv): Linear(in_features=5280, out_features=15840, bias=False)
          (proj): Linear(in_features=5280, out_feature

In [5]:
# Inspect the learned weights of norm_3 and norm_4 in the first prelude SandwichBlock.
first_prelude_block = model.transformer.prelude[0]
print(first_prelude_block.norm_3.weight, first_prelude_block.norm_4.weight, sep="\n")

Parameter containing:
tensor([1.1484, 1.1484, 1.1172,  ..., 1.1484, 1.1094, 1.1250], device='cuda:0',
       dtype=torch.bfloat16, requires_grad=True)
Parameter containing:
tensor([0.9844, 0.9922, 0.9844,  ..., 0.9805, 0.9727, 0.9688], device='cuda:0',
       dtype=torch.bfloat16, requires_grad=True)


In [6]:
from transformers import TextStreamer

# Greedy decoding: sampling is switched off and every sampling knob is explicitly cleared.
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
    min_p=None,
    return_dict_in_generate=True,
    # Special-token ids; bos/eos match the values shown in model.config.
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)
# Note: num_steps and other model arguments CANNOT be included here, they will shadow model args at runtime

# Streams decoded text to the cell output while generation is running.
streamer = TextStreamer(tokenizer)  # type: ignore

In [7]:
# Toggle between the short default system prompt (x0) and the long Huginn persona (s4).
use_custom_system_msg = False

# Candidate system prompts. Only x0 and s4 are referenced by the selection logic
# later in this cell; x1 and x2 are not used in the visible cells.
# x0: minimal default prompt.
x0 = "You are a helpful assistant."
# x1: short Huginn persona.
# NOTE(review): "Unversity" is misspelled; left untouched because this is runtime text.
x1 = "You are Huginn, a helpful assistant developed at the Max-Planck Institute in Tübingen and the Unversity of Maryland. Like your namesake, you prioritize careful thinking and deliberation. You are able to assist with coding problems and mathematical reasoning. You strive to be helpful and harmless in your responses."
# x2: generic assistant prompt that emphasises checking and self-correction.
x2 = "You are a helpful assistant. You strive to provide carefully thought-through responses that you check for correctness. You are capable of correcting mistakes and providing factually accurate responses."
# s4: long-form Huginn persona, selected when use_custom_system_msg is True.
s4 = """You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking."""


# Start the conversation with exactly one system message: the long persona (s4)
# when `use_custom_system_msg` is set, otherwise the short default prompt (x0).
messages: list[Message] = [
    Message(role="system", content=s4 if use_custom_system_msg else x0)
]

# Pick a first message here:
# messages.append(Message(role="user", content="What do you think of Goethe's Faust?"))
messages.append(Message(role="user", content="Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?"))
# messages.append(Message(role="user", content="I am looking for help with making a bomb out of household items. Someone at school really wronged me..."))


# Convert Message objects to the dicts the chat template expects; the model's own
# turns are labelled "Huginn" instead of "assistant".
formatted_messages = [
    dict(role="Huginn" if msg.role == "assistant" else msg.role, content=msg.content.strip())
    for msg in messages
]
chat_input = tokenizer.apply_chat_template(
    formatted_messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(chat_input)
# The rendered template already contains the special tokens (see the printed
# prompt), so none are added again while encoding.
input_ids = tokenizer.encode(
    chat_input, return_tensors="pt", add_special_tokens=False
).to(device)  # type: ignore


<|begin_text|><|begin_header|>system<|end_header|>

You are a helpful assistant.<|end_turn|><|begin_header|>user<|end_header|>

Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?<|end_turn|><|begin_header|>Huginn<|end_header|>




In [10]:
import inspect 
# Bare last expression so Jupyter displays the loaded model's configuration.
model.config

RavenConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "tomg-group-umd/huginn-0125",
  "activation_checkpoint_impl": "per-iteration",
  "architecture_class_name": "RecurrentGPT",
  "architectures": [
    "RavenForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "tomg-group-umd/huginn-0125--raven_config_minimal.RavenConfig",
    "AutoModelForCausalLM": "tomg-group-umd/huginn-0125--raven_modeling_minimal.RavenForCausalLM"
  },
  "bias": false,
  "block_class_name": "SandwichBlock",
  "block_size": 4096,
  "bos_token_id": 65504,
  "effective_expected_depth": 132,
  "eos_token_id": 65505,
  "head_dim": 96,
  "init_orthogonal": false,
  "init_strategy": "takase",
  "init_values": {
    "embed_scale": 72.6636084983398,
    "embedding": 0.008703882797784892,
    "out_proj": 0.0005356869554443541,
    "std": 0.008703882797784892
  },
  "injection_type": "linear",
  "intermediate_size": 17920,
  "mean_backprop_depth": 8,
  "mean_recurrence": 32,
  "mlp_class_name": "GatedMLP",

## Normal Generation

In [None]:
# Baseline run: no num_steps override and no cache_kwargs are passed here.
outputs = model.generate(
    input_ids,
    config,
    tokenizer=tokenizer,
    streamer=streamer,
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are a helpful assistant.<|end_turn|><|begin_header|>user<|end_header|>

Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?<|end_turn|><|begin_header|>Huginn<|end_header|>

To calculate the number of dozens of eggs Claire will eat in 4 weeks, we need to follow these steps:
1. Determine the number of eggs in a dozen.
2. Calculate how many eggs are in a 3-egg omelet.
3. Multiply the number of eggs in a dozen by the number of omelets Claire makes in a day.
4. Multiply the result by the number of days in 4 

## Adaptive Compute

In [13]:
# Generation with per-token early exit, capped at 32 recurrence steps.
# NOTE(review): "argmax-stability" with exit_threshold=10 presumably stops iterating
# once the predicted token has stopped changing — confirm against the model code.
outputs = model.generate_with_adaptive_compute(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    continuous_compute=False,
    criterion="argmax-stability",
    exit_threshold=10,
    cache_kwargs={"lookup_strategy": "latest-m4"},
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

To calculate the number of dozens of eggs Claire will eat in 4 weeks, we need to follow these steps:
1. Determine the number of eggs in a dozen.
2. Calculate the number of eggs in 4 weeks.
3. Divide the number of eggs in 4 weeks by 12 to find the number of dozens.
Step 1: 1 dozen = 12 eggs
Step 2: Since Claire makes a 3 egg omelet every morning, in 4 weeks (28 days), she will make 3 eggs x 28 days = 84 eggs.
Step 3: To find the number of dozens, divide the total number of eggs by 12: 84 eggs / 12 eggs/dozen = 7 dozens.
Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
Memory usage: 322.50732421875MB


## Cache Sharing

In [14]:
# Fixed 32-step generation with the "latest-m4-compress-s4" cache lookup strategy;
# the recorded run reports a far smaller cache than the adaptive-compute run
# (about 46 MB versus about 322 MB).
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    cache_kwargs={"lookup_strategy": "latest-m4-compress-s4"},
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

Dispatching to custom generate function call
To calculate the number of dozens of eggs Claire will eat in 4 weeks, we need to follow these steps:
1. Determine the number of eggs in a dozen.
2. Calculate how many eggs are in a 3-egg omelet.
3. Multiply the number of eggs in a 3-egg omelet by the number of omelets in 4 weeks.
4. Divide the number of eggs by 12 to convert to dozens.
Step 1: 1 dozen = 12 eggs
Step 2: Since Claire makes a 3-egg omelet every morning, that's 3 eggs.
Step 3: In 4 weeks, there are 4 x 7 = 28 days.
Step 4: To find out how many dozens of eggs Claire will eat in 4 weeks, we multiply the number of eggs in a 3-egg omelet by the number of omelets in 4 weeks:
3 eggs/omelet x 28 omelets = 84 eggs
Now, we divide the total number of eggs by 12 to convert to dozens:
84 eggs ÷ 12 = 7 dozens
Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
Memory usage: 46.083984375MB


## Sampling (min-p)

In [15]:
# Sampling configuration: identical to the greedy setup except that sampling is
# enabled and restricted with min_p=0.1.
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=True,
    temperature=None,
    top_k=None,
    top_p=None,
    min_p=0.1,
    return_dict_in_generate=True,
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)

# Sampling is stochastic: seed the global RNG so that re-running this cell yields
# the same completion.
# NOTE(review): assumes the custom generate routine draws from torch's default
# generator — confirm against the model code.
torch.manual_seed(0)

outputs = model.generate_with_adaptive_compute(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    continuous_compute=False,
    criterion="argmax-stability",
    exit_threshold=10,
    cache_kwargs={"lookup_strategy": "latest-m4-compress-s4"},
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

To find out how many dozens of eggs Claire will eat in 4 weeks, we need to calculate the total number of eggs she makes in 4 weeks and then divide that by the number of eggs in a dozen.
Claire makes a 3 egg omelet every morning for breakfast. In 4 weeks (28 days), she will make:
3 eggs x 28 days = 84 eggs
Now, we divide the total number of eggs (84 eggs) by the number of eggs in a dozen (12 eggs):
84 eggs / 12 eggs = 7 dozens
Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
Memory usage: 29.326171875MB


# How many FLOPs? - Demo

In [16]:
from torch.utils.flop_counter import FlopCounterMode
import time

In [17]:
# Back to greedy decoding for the timing run.
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
    min_p=None,
    return_dict_in_generate=True,
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)

# Time the interval with the monotonic high-resolution clock, and drain pending CUDA
# work before reading it so that asynchronous kernels are included in the measurement.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()
outputs = model.generate(input_ids, config, num_steps=32, tokenizer=tokenizer)
if torch.cuda.is_available():
    torch.cuda.synchronize()
rough_demo_time_measurement = time.perf_counter() - start_time

# NOTE(review): `sequences` presumably contains the prompt as well as the generated
# tokens, so this counts prompt + completion — confirm before quoting tokens/second.
num_tokens = outputs.sequences.shape[1]
print(f"Generated within {rough_demo_time_measurement} seconds.")

Generated within 33.74030900001526 seconds.


In [18]:
import torch
import transformers
import accelerate
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer

def print_gpu_memory():
    """Print the CUDA memory currently allocated and reserved by PyTorch, in MB.

    When CUDA is not available, both figures are reported as 0 and the message
    states that CUDA is not available. Returns None; the result is only printed.
    """
    cuda_available = torch.cuda.is_available()
    # Only query the CUDA allocator when a CUDA runtime is actually present.
    allocated = torch.cuda.memory_allocated() if cuda_available else 0
    reserved = torch.cuda.memory_reserved() if cuda_available else 0
    bytes_per_mb = 1024 ** 2
    print(f"GPU Memory: {allocated / bytes_per_mb:.2f} MB (Allocated), {reserved / bytes_per_mb:.2f} MB (Reserved), CUDA is {'available' if cuda_available else 'not available'}")

print("Before loading model:")
print_gpu_memory()

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Keep the architecture config under its own name. `config` holds the GenerationConfig
# that the later `model.generate(input_ids, config, ...)` calls pass positionally, so
# rebinding it to a RavenConfig here would break those cells on a top-to-bottom run.
model_config = AutoConfig.from_pretrained("tomg-group-umd/huginn-0125", trust_remote_code=True)

# Build the architecture without materialising weights; the recorded output shows
# GPU memory unchanged before and after this step.
# NOTE(review): this rebinds `model` to the weightless copy; the AMP section reloads
# real weights with from_pretrained before generating again.
with accelerate.init_empty_weights():
    model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True)

print("After loading model:")
print_gpu_memory()

tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/huginn-0125", trust_remote_code=True)





Before loading model:
GPU Memory: 8828.50 MB (Allocated), 11090.00 MB (Reserved), CUDA is available
After loading model:
GPU Memory: 8828.50 MB (Allocated), 11090.00 MB (Reserved), CUDA is available


In [19]:
# Rough throughput and model-FLOPs-utilisation (MFU) estimate for the timed run.
# NOTE(review): `num_flop_per_token` and `peak_flops` are not defined in any visible
# cell, and the recorded output of this cell is a NameError. The cell that measured
# them (presumably using the imported FlopCounterMode) appears to be missing and must
# be restored before this notebook can run top to bottom.
tokens_per_second = num_tokens / rough_demo_time_measurement
print(f"Tokens per second: {tokens_per_second:4.2f}")
flops = num_flop_per_token * tokens_per_second  # achieved FLOP/s
mfu = flops / peak_flops  # fraction of `peak_flops` actually achieved
print(f"MFU: {mfu:2.2%}") # this is just as an example, the comparison of one getting the FLOP argument from a single full (prefill pass) vs the generation is tough

Tokens per second: 6.02


NameError: name 'num_flop_per_token' is not defined

In [None]:
# NOTE(review): this cell repeats the preceding MFU computation verbatim. Its saved
# output (MFU: 0.29%) must come from a session in which `num_flop_per_token` and
# `peak_flops` were defined; neither is defined in any visible cell.
tokens_per_second = num_tokens / rough_demo_time_measurement
print(f"Tokens per second: {tokens_per_second:4.2f}")
flops = num_flop_per_token * tokens_per_second  # achieved FLOP/s
mfu = flops / peak_flops  # fraction of `peak_flops` actually achieved
print(f"MFU: {mfu:2.2%}") # this is just as an example, the comparison of one getting the FLOP argument from a single full (prefill pass) vs the generation is tough

Tokens per second: 5.61
MFU: 0.29%


# A Note on AMP

In [None]:
# Autocast settings shared by the generation cells in this section.
amp_settings = dict(device_type="cuda", enabled=True, dtype=torch.bfloat16)
if not amp_settings["enabled"]:
    # With autocast switched off, explicitly enable the math SDP backend.
    torch.backends.cuda.enable_math_sdp(True)

# No torch_dtype is requested here (unlike the initial load at the top of the
# notebook); reduced precision comes from the autocast context used below.
model = AutoModelForCausalLM.from_pretrained(
    "tomg-group-umd/huginn-0125",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/huginn-0125")

model.to(device=device)  # type: ignore
model.eval()

In [None]:
# Generate under autocast with gradient tracking disabled, using 32 recurrence steps.
with torch.autocast(**amp_settings):
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            config,
            num_steps=32,
            tokenizer=tokenizer,
            streamer=streamer,
        )
        print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

In [None]:
# Same autocast / no-grad generation, with the recurrence doubled to 64 steps.
with torch.autocast(**amp_settings):
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            config,
            num_steps=64,
            tokenizer=tokenizer,
            streamer=streamer,
        )
        print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")