In [1]:
import sys
import os

# we want to import some llama source later
os.getcwd()
project_path = os.path.abspath("LLM")

if project_path not in sys.path:
    sys.path.append(project_path)

In [2]:
os.chdir('/home/matt/.llama/checkpoints/Llama3.2-1B')
os.getcwd()

'/home/matt/.llama/checkpoints/Llama3.2-1B'

In [3]:
os.chdir('/home/matt/.llama/checkpoints')

In [4]:
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, AutoModelForCausalLM
import torch

break # It would be great to run this, but it OOMs with my meagre 8Gb of RAM

model_path = 'Llama3.2-1B'
#https://stackoverflow.com/a/78911943

# Load the tokenizer directly from the model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
print("tokenizer loaded")

# Load model configuration from params.json
config = LlamaConfig.from_json_file(f'{model_path}/params.json')
print("config loaded")

# load the model with the specific configs. 
model = LlamaForCausalLM(config=config)
print("model loaded")

# Load the weights of the model
state_dict = torch.load(f'{model_path}/consolidated.00.pth', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
print("weights loaded")

model.eval()
print("eval called")

SyntaxError: 'break' outside loop (1351421061.py, line 4)

## Converting from the default file download format

When downloaded from llama.com, the files look like this

```
checklist.chk  config.json  consolidated.00.pth  params.json  tokenizer.model
```

We want them in the HuggingFace format, to do that I ran this script from the `transformers` package (included here for convenience)

```bash
python3 convert_llama_to_hf.py --input_dir /home/matt/.llama/checkpoints/Llama3.2-1B --model_size 1B --output_dir /home/matt/.llama/checkpoints/Llama3.2-1B-hf --llama_version 3.2
```

That then populates the output directory with the desired files, which look like:

```
config.json  generation_config.json  model.safetensors  special_tokens_map.json  tokenizer.json  tokenizer_config.json
```

In [None]:
from transformers import AutoTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

model_path = "Llama3.2-1B-hf"

tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(model_path)
print("tokenizer loaded")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer loaded


In [None]:
from transformers import AutoModelForCausalLM

model_path = "Llama3.2-1B-hf"

# load model with reduced precision
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",  # automatically uses float16/bfloat16 if available
    low_cpu_mem_usage=True,  # prevents high RAM usage
    device_map="auto"  # automatica`lly assigns layers to GPU/CPU based on available memory
)

print("model loaded")


In [None]:
input_text = "hello how are you?"

inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
inputs["attention_mask"] = (inputs["input_ids"] != tokenizer.pad_token_id).long()

with torch.no_grad():  # reduces memory usage
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,  
        temperature=1.0,
        pad_token_id=tokenizer.pad_token_id 
    )

output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output)

## Loading into `Transfomer` instance from source code

We are interested in the activations through the layers of this model, so it would be good to load create an instance of the `Transformer` object defined in the `model.py` file

In [4]:
import json
import torch
from safetensors.torch import load_file
from llama.model_new import Transformer, ModelArgs  # Ensure your model classes are imported

config_path = "/home/matt/.llama/checkpoints/Llama3.2-1B-hf/config.json"
with open(config_path, "r") as f:
    config = json.load(f)

# extract the necessary fields
model_args = ModelArgs(
    dim=config.get("hidden_size", 4096), 
    n_layers=config.get("num_hidden_layers", 32),  
    n_heads=config.get("num_attention_heads", 32), 
    n_kv_heads=config.get("num_key_value_heads", None), 
    vocab_size=config.get("vocab_size", -1), 
    multiple_of=256, # not in config so use the default
    norm_eps=config.get("rms_norm_eps", 1e-5),  # map "rms_norm_eps"
    max_batch_size=32,  # not in config so use the default
    rotary_embed_len=config.get("max_position_embeddings", 2048),  # map "max_position_embeddings"
    cache_len = 2048,
    ffn_dim_multiplier = 16 / 11
)

print(model_args)


ModelArgs(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=256, ffn_dim_multiplier=1.4545454545454546, norm_eps=1e-05, rope_theta=500000, use_scaled_rope=False, max_batch_size=32, rotary_embed_len=131072, cache_len=2048, vision_chunk_size=-1, vision_max_num_chunks=4, vision_num_cross_attention_layers=-1)


In [5]:
from llama.model_new import Transformer

# I upgraded to 16Gb of RAM and now this will run - just need to tune the max sequence length as it will preallocate
# the caches in the attention blocks based on that value

# RAM preservation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16 

model = Transformer(model_args)

#.to(dtype=torch_dtype)

print("Transformer created")

safetensors_path = "/home/matt/.llama/checkpoints/Llama3.2-1B-hf/model.safetensors"  
weights = load_file(safetensors_path)

print("weights in RAM")


attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
attention block initialised
Transformer created
weights in RAM


In [6]:
# the weights' names don't match what is in the code I found
# so we rename them

weights = load_file(safetensors_path)

# create a new state dict with corrected names
fixed_state_dict = {}

for key in weights.keys():
    
    new_key = key
    new_key = new_key.replace("model.", "")
    
    new_key = new_key.replace("embed_tokens.weight", "tok_embeddings.weight")

    new_key = new_key.replace("self_attn.q_proj", "attention.wq")
    new_key = new_key.replace("self_attn.k_proj", "attention.wk")
    new_key = new_key.replace("self_attn.v_proj", "attention.wv")
    new_key = new_key.replace("self_attn.o_proj", "attention.wo")

    new_key = new_key.replace("mlp.gate_proj", "feed_forward.w1")
    new_key = new_key.replace("mlp.up_proj", "feed_forward.w3")
    new_key = new_key.replace("mlp.down_proj", "feed_forward.w2")

    new_key = new_key.replace("input_layernorm", "attention_norm")
    new_key = new_key.replace("post_attention_layernorm", "ffn_norm")

    new_key = new_key.replace("model.norm", "norm")

    fixed_state_dict[new_key] = weights[key]

In [7]:
fixed_state_dict.keys()

dict_keys(['tok_embeddings.weight', 'layers.0.attention_norm.weight', 'layers.0.feed_forward.w2.weight', 'layers.0.feed_forward.w1.weight', 'layers.0.feed_forward.w3.weight', 'layers.0.ffn_norm.weight', 'layers.0.attention.wk.weight', 'layers.0.attention.wo.weight', 'layers.0.attention.wq.weight', 'layers.0.attention.wv.weight', 'layers.1.attention_norm.weight', 'layers.1.feed_forward.w2.weight', 'layers.1.feed_forward.w1.weight', 'layers.1.feed_forward.w3.weight', 'layers.1.ffn_norm.weight', 'layers.1.attention.wk.weight', 'layers.1.attention.wo.weight', 'layers.1.attention.wq.weight', 'layers.1.attention.wv.weight', 'layers.10.attention_norm.weight', 'layers.10.feed_forward.w2.weight', 'layers.10.feed_forward.w1.weight', 'layers.10.feed_forward.w3.weight', 'layers.10.ffn_norm.weight', 'layers.10.attention.wk.weight', 'layers.10.attention.wo.weight', 'layers.10.attention.wq.weight', 'layers.10.attention.wv.weight', 'layers.11.attention_norm.weight', 'layers.11.feed_forward.w2.weight',

In [8]:
# Load the corrected state dict
model.load_state_dict(fixed_state_dict, strict = False) # since the output weights are tied, these are already correct and not loaded - but the Transformer class expects the; so load non-strictly

print("loaded model with corrected state_dict")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

loaded model with corrected state_dict


In [28]:
model

Transformer(
  (tok_embeddings): Embedding(128256, 2048)
  (layers): ModuleList(
    (0-15): 16 x TransformerBlock(
      (attention): Attention(
        (wq): ColumnParallelLinear(in_features=2048, out_features=2048, bias=False)
        (wk): ColumnParallelLinear(in_features=2048, out_features=512, bias=False)
        (wv): ColumnParallelLinear(in_features=2048, out_features=512, bias=False)
        (wo): RowParallelLinear(in_features=2048, out_features=2048, bias=False)
      )
      (feed_forward): FeedForward(
        (w1): ColumnParallelLinear(in_features=2048, out_features=8192, bias=False)
        (w2): RowParallelLinear(in_features=8192, out_features=2048, bias=False)
        (w3): ColumnParallelLinear(in_features=2048, out_features=8192, bias=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): ColumnParallelLinear(in_features=2048, out_features=128256, bias=False)
)

In [None]:
# tie the output embedding manually

model.output.weight.data = model.tok_embeddings.weight.data


In [50]:
from llama.tokenizer import Tokenizer

# go here to find the model file
# https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/tokenizer.model

new_tok_path = "/home/matt/.llama/checkpoints/Llama3.2-1B-hf-tok/tokenizer.model"
new_tok = Tokenizer(new_tok_path)
new_tok

<llama.tokenizer.Tokenizer at 0x7f39a6d08c20>

In [51]:
from llama.generation_new import Llama

llama = Llama(model, new_tok, model_args)

In [52]:
llama

<llama.generation_new.Llama at 0x7f39a6d09190>

In [None]:
from llama.generation_new import CompletionPrediction

# A high top_p seems necessary here, otherwise we just get strings of numbers
res: CompletionPrediction = llama.text_completion("The capital of France is", max_gen_len=50, top_p = 0.99, temperature = 0.3)

[31mInput to model:
<|begin_of_text|>The capital of France is
[0m


In [69]:
res.generation

' Paris, the capital of France. It is the largest city of France. It is the capital of France. It is the largest city of France. It is the capital of France. It is the capital of France. It is the capital of France'

In [70]:
res.logprobs

[[-2.0769612789154053],
 [-1.274505853652954],
 [-1.5088263750076294],
 [-0.7424589395523071],
 [-0.27164074778556824],
 [-0.2794206738471985],
 [-1.534311294555664],
 [-1.5697606801986694],
 [-0.44362616539001465],
 [-1.2413744926452637],
 [-2.3812146186828613],
 [-0.43809354305267334],
 [-0.9235026836395264],
 [-0.547737181186676],
 [-0.9087198376655579],
 [-1.3606833219528198],
 [-0.4652999937534332],
 [-0.7953785061836243],
 [-0.6358326077461243],
 [-0.19882136583328247],
 [-0.17556649446487427],
 [-0.364422082901001],
 [-0.3728608191013336],
 [-0.08018721640110016],
 [-0.10402780026197433],
 [-0.2573114037513733],
 [-0.08796743303537369],
 [-0.537945568561554],
 [-0.08411218225955963],
 [-0.33591169118881226],
 [-0.20135679841041565],
 [-0.04040507599711418],
 [-0.04492664337158203],
 [-0.27832546830177307],
 [-0.06479234248399734],
 [-0.07717431336641312],
 [-0.6966521143913269],
 [-0.3307095766067505],
 [-0.08723539859056473],
 [-0.07333430647850037],
 [-0.4753025770187378],
 [-