In [1]:
import torch

## Load the TinyLlamas checkpoint

In [23]:
from model import *
# Rebuild the Transformer from a saved checkpoint and put it in inference mode.
device = "cpu"
ckpt_name = "stories260K/stories260K"
ckpt_path = f"tinyllamas/{ckpt_name}.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint["model_args"]

# Copy over only the hyperparameters that are passed to ModelArgs below.
arch_keys = ("dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len")
model_args = {key: checkpoint_model_args[key] for key in arch_keys}

# Instantiate an empty model with the checkpoint's configuration.
gptconf = ModelArgs(**model_args)
model = Transformer(gptconf)

# Some checkpoints carry an "_orig_mod." prefix on every parameter name
# (presumably left behind by torch.compile wrapping the module - TODO confirm).
# Strip it so the keys match the plain Transformer's parameter names.
state_dict = checkpoint["model"]
unwanted_prefix = "_orig_mod."
for param_name in list(state_dict):
    if param_name.startswith(unwanted_prefix):
        state_dict[param_name[len(unwanted_prefix):]] = state_dict.pop(param_name)
model.load_state_dict(state_dict)

# Training metadata, carried along so it can be written into derived checkpoints.
iter_num = checkpoint["iter_num"]
best_val_loss = checkpoint["best_val_loss"]

model.to(device)
# Last expression of the cell: switches to eval mode and displays the module tree.
model.eval()

Transformer(
  (tok_embeddings): Embedding(512, 64)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-4): 5 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=64, out_features=64, bias=False)
        (wk): Linear(in_features=64, out_features=32, bias=False)
        (wv): Linear(in_features=64, out_features=32, bias=False)
        (wo): Linear(in_features=64, out_features=64, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=64, out_features=172, bias=False)
        (w2): Linear(in_features=172, out_features=64, bias=False)
        (w3): Linear(in_features=64, out_features=172, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=64, out_features=512,

## Test model

In [31]:
from tokenizer import Tokenizer

# Tokenize a short prompt into a (1, seq_len) tensor of token ids.
vocab_size = gptconf.vocab_size
# Tokenizer file stored alongside the stories260K checkpoint.
tokenizer_model = "tinyllamas/stories260K/tok512.model"
enc = Tokenizer(tokenizer_model=tokenizer_model)

start = "Lily is happy"

# Prepend BOS but no EOS, so the model continues the prompt.
start_ids = enc.encode(start, bos=True, eos=False)
prompt_ids = torch.tensor(start_ids, dtype=torch.long, device=device)
# Add a leading batch dimension of size 1.
input_data = prompt_ids.unsqueeze(0)
print(input_data)

tensor([[  1, 317, 410, 293, 393]])


## Quantization

In [35]:
import os
from contextlib import nullcontext

# Dynamically quantize every nn.Linear layer to int8 weights.
# quantize_dynamic converts a copy by default (inplace=False), so `model`
# remains the fp32 reference used for the comparison below.
model_int8 = torch.ao.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# Same layout as the loaded checkpoint, but holding the quantized weights.
checkpoint_int8 = {
    "model": model_int8.state_dict(),
    "model_args": model_args,
    "iter_num": iter_num,
    "best_val_loss": best_val_loss,
}
out_dir = "quant/"
quant_ckpt_path = os.path.join(out_dir, ckpt_name + "_quant.pt")
# ckpt_name contains a sub-directory, so create the whole parent path first;
# torch.save does not create missing directories.
os.makedirs(os.path.dirname(quant_ckpt_path), exist_ok=True)
# Persist the quantized checkpoint, not the fp32 `checkpoint` loaded earlier.
torch.save(checkpoint_int8, quant_ckpt_path)

num_samples = 1
max_new_tokens = 256
top_k = 300

# Generate from both models with identical settings so the outputs can be
# compared side by side. nullcontext() is a no-op context manager.
with torch.no_grad(), nullcontext():
    for sample_idx in range(num_samples):
        # fp32 reference model
        y = model.generate(input_data, max_new_tokens, temperature=0.0, top_k=top_k)
        print(enc.decode(y[0].tolist()))
        print('---------------')

        # int8 dynamically-quantized model: decode y_q, not y
        y_q = model_int8.generate(input_data, max_new_tokens, temperature=0.0, top_k=top_k)
        print(enc.decode(y_q[0].tolist()))
        print('---------------')
# Last expression of the cell: display the quantized module tree.
model_int8

Lily is happy to see her mom. She liked to play with her toys and see what was inside. She saw a big box with a big box. She wanted to play with it.
"Look, Mommy, can I play with you?" Lily asked.
"Yes, I can help you."
"OK," Lily said. "It's a small box. It is a small box. It is a small box."
Lily and Mommy went to the box. They saw a big box. They wanted to play with the box. They did not know what to do.
"Wow, it's so pretty!" Lily said.
"Yes, it's a good boy. You are a good friend. You are a good friend. You are a good friend."
Lily
---------------
Lily is happy to see her mom. She liked to play with her toys and see what was inside. She saw a big box with a big box. She wanted to play with it.
"Look, Mommy, can I play with you?" Lily asked.
"Yes, I can help you."
"OK," Lily said. "It's a small box. It is a small box. It is a small box."
Lily and Mommy went to the box. They saw a big box. They wanted to play with the box. They did not know what to do.
"Wow, it's so pretty!" Lily sa

Transformer(
  (tok_embeddings): Embedding(512, 64)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-4): 5 x TransformerBlock(
      (attention): Attention(
        (wq): DynamicQuantizedLinear(in_features=64, out_features=64, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
        (wk): DynamicQuantizedLinear(in_features=64, out_features=32, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
        (wv): DynamicQuantizedLinear(in_features=64, out_features=32, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
        (wo): DynamicQuantizedLinear(in_features=64, out_features=64, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): DynamicQuantizedLinear(in_features=64, out_features=172, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
        (w2): DynamicQuantizedLinear(in_features=172, o

## llama2.c internal quantization

In [None]:
from export import model_export

# Export the fp32 `model` through llama2.c's own exporter.
# NOTE(review): version=2 is presumably the int8-quantized export format -
# confirm against export.py.
quant_bin_path = os.path.join(out_dir, f"{ckpt_name}_quant.bin")
model_export(model, quant_bin_path, version=2)