In [1]:
import sys
import os

# Make the local llama sources importable by later cells (they import from `llama.*`).
# The relative "LLM" folder is resolved against the working directory *now*, i.e. before
# any later os.chdir call moves the notebook elsewhere.
project_path = os.path.abspath("LLM")

# Guard against appending a duplicate entry when the cell is re-run.
if project_path not in sys.path:
    sys.path.append(project_path)

In [2]:
# Switch into the downloaded Llama3.2-1B checkpoint directory.
os.chdir('/home/matt/.llama/checkpoints/Llama3.2-1B')
# Last expression of the cell: displays the new working directory to confirm the move.
os.getcwd()

'/home/matt/.llama/checkpoints/Llama3.2-1B'

In [3]:
# Work from the checkpoints root so the relative model paths used in later cells
# ('Llama3.2-1B', 'Llama3.2-1B-hf') resolve against it.
os.chdir('/home/matt/.llama/checkpoints')

In [None]:
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, AutoModelForCausalLM
import torch

# It would be great to run this, but it OOMs with a meagre 8Gb of RAM, so the load is
# opt-in. (A bare `break` outside a loop is a SyntaxError: it stops the *whole* cell,
# including its imports, from running, so an explicit flag is used instead.)
RUN_RAW_CHECKPOINT_LOAD = False

if RUN_RAW_CHECKPOINT_LOAD:
    model_path = 'Llama3.2-1B'
    # https://stackoverflow.com/a/78911943

    # Load the tokenizer directly from the model path
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("tokenizer loaded")

    # Load model configuration from params.json
    config = LlamaConfig.from_json_file(f'{model_path}/params.json')
    print("config loaded")

    # Build the model from the config alone; weights are filled in below.
    model = LlamaForCausalLM(config=config)
    print("model loaded")

    # Load the weights of the model onto the CPU.
    # NOTE: torch.load unpickles the file, so only use it on checkpoints you trust.
    # NOTE(review): presumably the raw checkpoint's parameter names must match what
    # LlamaForCausalLM expects for the strict load below to succeed — confirm.
    state_dict = torch.load(f'{model_path}/consolidated.00.pth', map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    print("weights loaded")

    # Switch to inference mode.
    model.eval()
    print("eval called")

## Converting from the default file download format

When downloaded from llama.com, the files look like this:

```
checklist.chk  config.json  consolidated.00.pth  params.json  tokenizer.model
```

We want them in the HuggingFace format. To do that, I ran this script from the `transformers` package (included here for convenience):

```bash
python3 convert_llama_to_hf.py --input_dir /home/matt/.llama/checkpoints/Llama3.2-1B --model_size 1B --output_dir /home/matt/.llama/checkpoints/Llama3.2-1B-hf --llama_version 3.2
```

That then populates the output directory with the desired files, which look like:

```
config.json  generation_config.json  model.safetensors  special_tokens_map.json  tokenizer.json  tokenizer_config.json
```

In [None]:
from transformers import AutoTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

# Folder (relative to the current working directory) holding the converted HuggingFace files.
model_path = "Llama3.2-1B-hf"

tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(model_path)
print("tokenizer loaded")

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer loaded


In [None]:
from transformers import AutoModelForCausalLM

model_path = "Llama3.2-1B-hf"

# Options chosen to load the model with reduced precision and a small RAM footprint.
load_options = {
    "torch_dtype": "auto",  # automatically uses float16/bfloat16 if available
    "low_cpu_mem_usage": True,  # prevents high RAM usage
    "device_map": "auto",  # automatically assigns layers to GPU/CPU based on available memory
}
model = AutoModelForCausalLM.from_pretrained(model_path, **load_options)

print("model loaded")


In [None]:
# Imported here because, on a fresh kernel, no earlier cell is guaranteed to have
# imported torch successfully before this one runs.
import torch

input_text = "hello how are you?"

# The tokenizer already returns a correct attention mask. Rebuilding it from
# pad_token_id would also mask genuine EOS tokens whenever pad_token is aliased to
# eos_token (as the tokenizer cell does when no pad token is defined).
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# The model is loaded with device_map="auto", so it may not live on the CPU;
# the input tensors have to be on the same device as the model.
inputs = inputs.to(model.device)

with torch.no_grad():  # no gradients needed for generation; reduces memory usage
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        temperature=1.0,
        pad_token_id=tokenizer.pad_token_id
    )

# Decode the first (only) sequence of the batch back to text.
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output)

## Loading into `Transformer` instance from source code

We are interested in the activations through the layers of this model, so it would be good to create an instance of the `Transformer` object defined in the `model.py` file and load the weights into it

In [4]:
import json
import torch
from safetensors.torch import load_file
from llama.model import Transformer, ModelArgs  # Ensure your model classes are imported

# Read the HuggingFace config that sits next to the converted weights.
config_path = "/home/matt/.llama/checkpoints/Llama3.2-1B-hf/config.json"
with open(config_path, "r") as f:
    config = json.load(f)

# Translate HuggingFace config keys into ModelArgs fields; values absent from the
# config fall back to the defaults given here.
model_arg_values = {
    "dim": config.get("hidden_size", 4096),
    "n_layers": config.get("num_hidden_layers", 32),
    "n_heads": config.get("num_attention_heads", 32),
    "n_kv_heads": config.get("num_key_value_heads", None),
    "vocab_size": config.get("vocab_size", -1),
    "multiple_of": 256,  # not in config so use the default
    "ffn_dim_multiplier": None,  # not in config so use the default
    "norm_eps": config.get("rms_norm_eps", 1e-5),  # map "rms_norm_eps"
    "max_batch_size": 32,  # not in config so use the default
    "rotary_embed_len": config.get("max_position_embeddings", 2048),  # map "max_position_embeddings"
    "cache_len": 2048,
}
model_args = ModelArgs(**model_arg_values)

print(model_args)


ModelArgs(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, rotary_embed_len=131072, cache_len=2048)


In [5]:
from llama.model import Transformer

# I upgraded to 16Gb of RAM and now this will run - just need to tune the max sequence length as it will preallocate
# the caches in the attention blocks based on that value

# RAM preservation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16

model = Transformer(model_args)

#.to(dtype=torch_dtype)

print("Transformer created")

safetensors_path = "/home/matt/.llama/checkpoints/Llama3.2-1B-hf/model.safetensors"
weights = load_file(safetensors_path)

print("weights in RAM")

# strict=False silently skips every tensor whose name does not match a model
# parameter, so a checkpoint with a different naming scheme would leave the model
# randomly initialised without any error. Keep the result and report it.
# NOTE(review): assumes Transformer is a torch.nn.Module, whose load_state_dict
# returns (missing_keys, unexpected_keys) — confirm.
load_result = model.load_state_dict(weights, strict=False)

print("weights in model")

matched_count = len(weights) - len(load_result.unexpected_keys)
print(f"  checkpoint tensors matched: {matched_count}/{len(weights)}")
print(f"  model entries not found in checkpoint: {len(load_result.missing_keys)}")
if load_result.unexpected_keys:
    print(f"  first unmatched checkpoint keys: {load_result.unexpected_keys[:5]}")
if load_result.missing_keys:
    print(f"  first missing model keys: {load_result.missing_keys[:5]}")
if matched_count == 0:
    print("WARNING: no checkpoint tensor matched a model parameter - the model still has its initial weights")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 done wv
 done wo
 done cache k
 done cache v
attention initialising
 done wq
 done wk
 d

In [6]:
# Bare expression: displays the model's repr (its module tree) as the cell output.
model

Transformer(
  (tok_embeddings): ParallelEmbedding(128256, 2048)
  (layers): ModuleList(
    (0-15): 16 x TransformerBlock(
      (attention): Attention(
        (wq): ColumnParallelLinear(in_features=2048, out_features=2048, bias=False)
        (wk): ColumnParallelLinear(in_features=2048, out_features=512, bias=False)
        (wv): ColumnParallelLinear(in_features=2048, out_features=512, bias=False)
        (wo): RowParallelLinear(in_features=2048, out_features=2048, bias=False)
      )
      (feed_forward): FeedForward(
        (w1): ColumnParallelLinear(in_features=2048, out_features=5632, bias=False)
        (w2): RowParallelLinear(in_features=5632, out_features=2048, bias=False)
        (w3): ColumnParallelLinear(in_features=2048, out_features=5632, bias=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): ColumnParallelLinear(in_features=2048, out_features=128256, bias=False)
)

In [None]:
from llama.tokenizer import Tokenizer

# The tokenizer model file can be found here:
# https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/tokenizer.model

# Build the tokenizer from the llama source code using the local tokenizer.model file.
new_tok_path = "/home/matt/.llama/checkpoints/Llama3.2-1B-hf-tok/tokenizer.model"
new_tok = Tokenizer(new_tok_path)
# Bare expression: displays the tokenizer object as the cell output.
new_tok

<llama.tokenizer.Tokenizer at 0x7fc4821f49b0>