# Running Llama locally

In this notebook we load the Llama 3.2 1-billion-parameter model and use it to perform text completion.

In [1]:
import gc

# Run a full collection pass (generation 2 is the oldest generation, which is
# also what gc.collect() uses by default). The value displayed by the cell is
# the number of unreachable objects the collector found.
gc.collect(generation=2)

31

In [2]:
import os

# Root folder that holds the Llama checkpoints. The default is the location
# this notebook was originally written against; set the LLAMA_CHECKPOINTS_DIR
# environment variable to run it on a machine with a different layout.
checkpoints_dir = os.environ.get('LLAMA_CHECKPOINTS_DIR', '/home/matt/.llama/checkpoints')

# Step into the raw Llama3.2-1B download and display the resulting working
# directory as the cell output.
os.chdir(os.path.join(checkpoints_dir, 'Llama3.2-1B'))
os.getcwd()

'/home/matt/.llama/checkpoints/Llama3.2-1B'

In [3]:
# The model folders used below ("Llama3.2-1B", "Llama3.2-1B-hf") are given as
# relative paths, so make the checkpoints root the working directory.
# LLAMA_CHECKPOINTS_DIR overrides the default location the notebook was
# originally written against.
os.chdir(os.environ.get('LLAMA_CHECKPOINTS_DIR', '/home/matt/.llama/checkpoints'))

In [4]:
# Kept for reference only: loading the raw llama.com checkpoint directly into
# LlamaForCausalLM ran out of memory, so this cell is switched off by default.
#
# NOTE(review): params.json is the native parameter file shipped with the
# download, not a Hugging Face config.json. If its keys do not match the
# LlamaConfig field names, LlamaConfig would presumably fall back to its own
# default sizes, which could explain the OOM -- confirm before re-enabling.
# NOTE(review): the consolidated checkpoint may also use different weight names
# from the ones LlamaForCausalLM expects, in which case load_state_dict would
# raise -- confirm. The converted "-hf" folder used later avoids both issues.
RUN_RAW_CHECKPOINT_LOAD = False  # flip to True to retry the direct load

if RUN_RAW_CHECKPOINT_LOAD:

    from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
    import torch

    model_path = 'Llama3.2-1B'
    #https://stackoverflow.com/a/78911943

    # Load the tokenizer directly from the model path
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("tokenizer loaded")

    # Load model configuration from params.json
    config = LlamaConfig.from_json_file(f'{model_path}/params.json')
    print("config loaded")

    # Build the model from the config. This only instantiates the architecture;
    # the trained weights are loaded in the next step.
    model = LlamaForCausalLM(config=config)
    print("model loaded")

    # Load the weights of the model onto the CPU. torch.load unpickles the
    # file, so weights_only=True restricts it to tensors and plain containers
    # instead of allowing arbitrary pickled objects.
    state_dict = torch.load(
        f'{model_path}/consolidated.00.pth',
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    print("weights loaded")

    # Switch to inference mode (disables dropout and similar training-only behaviour).
    model.eval()
    print("eval called")

## Converting from the default file download format

When downloaded from llama.com, the files look like this:

```
checklist.chk  config.json  consolidated.00.pth  params.json  tokenizer.model
```

We want them in the Hugging Face format. To do that, I ran the conversion script from the `transformers` package (included here for convenience):

```bash
python3 convert_llama_to_hf.py --input_dir /home/matt/.llama/checkpoints/Llama3.2-1B --model_size 1B --output_dir /home/matt/.llama/checkpoints/Llama3.2-1B-hf --llama_version 3.2
```

That then populates the output directory with the desired files, which look like:

```
config.json  generation_config.json  model.safetensors  special_tokens_map.json  tokenizer.json  tokenizer_config.json
```

In [5]:
from transformers import AutoTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

# Folder with the converted (Hugging Face format) checkpoint, relative to the
# current working directory.
model_path = "Llama3.2-1B-hf"

tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(model_path)
print("tokenizer loaded")

# Padding needs a pad token; when the tokenizer does not define one, reuse the
# end-of-sequence token for that purpose.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer loaded


In [6]:
from transformers import AutoModelForCausalLM

# Folder with the converted (Hugging Face format) checkpoint.
model_path = "Llama3.2-1B-hf"

# Loading options, gathered in one place:
#   torch_dtype="auto"      - let transformers pick the weight precision
#   low_cpu_mem_usage=True  - keep RAM usage down while the weights are read
#   device_map="auto"       - assign layers to GPU/CPU based on available memory
load_options = {
    "torch_dtype": "auto",
    "low_cpu_mem_usage": True,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_path, **load_options)

print("model loaded")


model loaded


In [None]:
import torch

# Baseline completion: repetition_penalty=1.0 applies no penalty to tokens the
# model has already produced.
# NOTE(review): the prompt ends with a space; this may hurt the completion,
# since word tokens usually carry their own leading space -- consider
# dropping it.
input_text = "The capital of France is "

# Seed the RNG so the cell gives the same text on every run, in case the
# generation settings in effect use sampling.
torch.manual_seed(0)

# Use the attention mask produced by the tokenizer. Rebuilding it from
# pad_token_id is unsafe here: the pad token may have been aliased to the EOS
# token when the tokenizer was loaded, so genuine EOS tokens would be masked.
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# The model was loaded with device_map="auto", so it is not necessarily on the
# CPU; put the inputs on the same device as the model.
inputs = inputs.to(model.device)

with torch.no_grad():  # reduces memory usage
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        temperature=1.0,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.0
    )

# Decode the single returned sequence back to text, dropping special tokens.
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output)

The capital of France is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is  France  is


In [None]:
import torch

# Same prompt as the baseline run, but with repetition_penalty=1.2 so that
# tokens already produced are penalised. Note that temperature is also set to
# 1.2 here, whereas the baseline cell uses 1.0.
# NOTE(review): the prompt ends with a space; this may hurt the completion,
# since word tokens usually carry their own leading space -- consider
# dropping it.
input_text = "The capital of France is "

# Seed the RNG so the cell gives the same text on every run, in case the
# generation settings in effect use sampling.
torch.manual_seed(0)

# Use the attention mask produced by the tokenizer. Rebuilding it from
# pad_token_id is unsafe here: the pad token may have been aliased to the EOS
# token when the tokenizer was loaded, so genuine EOS tokens would be masked.
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# The model was loaded with device_map="auto", so it is not necessarily on the
# CPU; put the inputs on the same device as the model.
inputs = inputs.to(model.device)

with torch.no_grad():  # reduces memory usage
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        temperature=1.2,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2
    )

# Decode the single returned sequence back to text, dropping special tokens.
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output)

The capital of France is 100% French (in terms of culture and traditions), and Paris has something for everybody to see and do. It is a must-see city for anyone wishing to explore France.
The city of Paris has a population of 2.14 million (city area), and 20.2 million people (including suburbs and metropolitan area).
It has the largest GDP in France.
It has the best infrastructure in France, with one of the world’s busiest airports (Charles de Gaulle).
The capital city of France is 100% French (in terms of culture and traditions), and Paris has something for everybody to see and do. It is a must-see city for anyone wishing to explore France.
The city of


## Penalising repetition

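This is done via the `repetition_penalty` argument to `model.generate` - compare the two runs above (`repetition_penalty=1.0` vs `1.2`) to see the large difference it makes. Note that the second run also raises `temperature` from 1.0 to 1.2.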

In [19]:
from transformers import AutoConfig

# Look up the RoPE parameters, for reference when working with the source code.

# Folder with the converted (Hugging Face format) checkpoint.
model_path = "Llama3.2-1B-hf"

# Read just the configuration; no model weights are loaded here.
config = AutoConfig.from_pretrained(model_path)

# Show the RoPE scaling settings stored in the configuration.
rope_scaling = config.rope_scaling
print(rope_scaling)


{'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}


In [9]:
# Show the rope_theta value stored in the loaded configuration.
rope_theta = config.rope_theta
print(rope_theta)

500000.0


In [10]:
# List the instance attribute names of the top-level model object.
vars(model).keys()



In [11]:
# List the instance attribute names of the inner `model` submodule.
vars(model.model).keys()



In [12]:
# Display the `original_inv_freq` tensor held by the rotary embedding module.
model.model.rotary_emb.original_inv_freq

tensor([1.0000e+00, 6.6360e-01, 4.4037e-01, 2.9223e-01, 1.9392e-01, 1.2869e-01,
        8.5397e-02, 5.6670e-02, 3.7606e-02, 2.4955e-02, 1.6560e-02, 1.0990e-02,
        7.2927e-03, 4.8394e-03, 3.2114e-03, 1.2905e-03, 4.2956e-04, 9.7083e-05,
        1.9462e-05, 1.2915e-05, 8.5703e-06, 5.6872e-06, 3.7741e-06, 2.5045e-06,
        1.6620e-06, 1.1029e-06, 7.3187e-07, 4.8567e-07, 3.2229e-07, 2.1387e-07,
        1.4193e-07, 9.4183e-08])

In [13]:
# List the instance attribute names of the rotary embedding module.
vars(model.model.rotary_emb).keys()

dict_keys(['training', '_parameters', '_buffers', '_non_persistent_buffers_set', '_backward_pre_hooks', '_backward_hooks', '_is_full_backward_hook', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_hooks_always_called', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_state_dict_hooks', '_state_dict_pre_hooks', '_load_state_dict_pre_hooks', '_load_state_dict_post_hooks', '_modules', 'rope_type', 'max_seq_len_cached', 'original_max_seq_len', 'config', 'rope_init_fn', 'attention_scaling', 'original_inv_freq', '_is_hf_initialized'])

In [14]:
# Display the rotary embedding's `max_seq_len_cached` attribute.
model.model.rotary_emb.max_seq_len_cached

131072

In [15]:
# Display the rotary embedding's `original_max_seq_len` attribute.
model.model.rotary_emb.original_max_seq_len

131072

In [16]:
# Display the rotary embedding's `attention_scaling` attribute.
model.model.rotary_emb.attention_scaling

1.0

In [17]:
# Display the module tree of the inner `model` submodule.
model.model

LlamaModel(
  (embed_tokens): Embedding(128256, 2048)
  (layers): ModuleList(
    (0-15): 16 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_features=512, bias=False)
        (v_proj): Linear(in_features=2048, out_features=512, bias=False)
        (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
        (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
        (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
    )
  )
  (norm): LlamaRMSNorm((2048,), eps=1e-05)
  (rotary_emb): LlamaRotaryEmbedding()
)