# Use Torchtune Llama_32_1b_lora for Classification Tasks

## 1 Imports

In [2]:
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchtune.modules import RotaryPositionalEmbeddings,MultiHeadAttention,RMSNorm, TransformerDecoder,KVCache
from torchtune.models.llama3 import llama3_tokenizer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# for pre-trained weights
from safetensors.torch import load
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [8]:
import os
os.getcwd()

'c:\\Users\\MRM\\Desktop\\Data_Analytics\\Medium_and_PPB\\Machine_Learning\\Machine_Learning_Projects\\NLP\\Build_a_LLM_from_scratch\\llm_from_scratch'

In [None]:
tokenizer = llama3_tokenizer(
    path="./Llama32_1B/original/tokenizer.model"
    )
print(tokenizer.special_tokens)

FileNotFoundError: [Errno 2] No such file or directory: '/Llama32_1B/original/tokenizer.model'

In [3]:
# Calling a transformer encoder from torchtune with 1B
from torchtune.models.llama3_2 import lora_llama3_2_1b, llama3_2_1b
base_model = llama3_2_1b()
#  ["q_proj", "k_proj", "v_proj", "output_proj"]
latt_mods = ["q_proj", "v_proj"]
lora_llama32_1b = lora_llama3_2_1b(lora_attn_modules=latt_mods)
lora_llama32_1b


TransformerDecoder(
  (tok_embeddings): Embedding(128256, 2048)
  (layers): ModuleList(
    (0-15): 16 x TransformerSelfAttentionLayer(
      (attn): MultiHeadAttention(
        (q_proj): LoRALinear(
          (dropout): Identity()
          (lora_a): Linear(in_features=2048, out_features=8, bias=False)
          (lora_b): Linear(in_features=8, out_features=2048, bias=False)
        )
        (k_proj): Linear(in_features=2048, out_features=512, bias=False)
        (v_proj): LoRALinear(
          (dropout): Identity()
          (lora_a): Linear(in_features=2048, out_features=8, bias=False)
          (lora_b): Linear(in_features=8, out_features=512, bias=False)
        )
        (output_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (pos_embeddings): Llama3ScaledRoPE()
      )
      (mlp): FeedForward(
        (w1): Linear(in_features=2048, out_features=8192, bias=False)
        (w2): Linear(in_features=8192, out_features=2048, bias=False)
        (w3): Linear(in_

In [19]:
base_model.parameters

<bound method Module.parameters of TransformerDecoder(
  (tok_embeddings): Embedding(128256, 2048)
  (layers): ModuleList(
    (0-15): 16 x TransformerSelfAttentionLayer(
      (attn): MultiHeadAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_features=512, bias=False)
        (v_proj): Linear(in_features=2048, out_features=512, bias=False)
        (output_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (pos_embeddings): Llama3ScaledRoPE()
      )
      (mlp): FeedForward(
        (w1): Linear(in_features=2048, out_features=8192, bias=False)
        (w2): Linear(in_features=8192, out_features=2048, bias=False)
        (w3): Linear(in_features=2048, out_features=8192, bias=False)
        (activation): SiLU()
      )
      (sa_norm): RMSNorm()
      (mlp_norm): RMSNorm()
      (sa_scale): Identity()
      (mlp_scale): Identity()
    )
  )
  (norm): RMSNorm()
)>

In [4]:
total_params = sum(p.numel() for p in lora_llama32_1b.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 1,236,666,368


In [5]:
def model_memory_size(model, input_dtype=torch.float32):
    total_params = 0
    total_grads = 0
    for param in model.parameters():
        # Calculate total number of elements per parameter
        param_size = param.numel()
        total_params += param_size
        # Check if gradients are stored for this parameter
        if param.requires_grad:
            total_grads += param_size

    # Calculate buffer size (non-parameters that require memory)
    total_buffers = sum(buf.numel() for buf in model.buffers())

    # Size in bytes = (Number of elements) * (Size of each element in bytes)
    # We assume parameters and gradients are stored in the same type as input dtype
    element_size = torch.tensor(0, dtype=input_dtype).element_size()
    total_memory_bytes = (total_params + total_grads + total_buffers) * element_size

    # Convert bytes to gigabytes
    total_memory_gb = total_memory_bytes / (1024**3)

    return total_memory_gb

print('lora_llama32_1b memory footprint')
print(f"float32 (PyTorch default): {model_memory_size(lora_llama32_1b, input_dtype=torch.float32):.2f} GB")
print(f"bfloat16: {model_memory_size(lora_llama32_1b, input_dtype=torch.bfloat16):.2f} GB")
print('')
print('base llama32_1b memory footprint')
print(f"float32 (PyTorch default): {model_memory_size(base_model, input_dtype=torch.float32):.2f} GB")
print(f"bfloat16: {model_memory_size(base_model, input_dtype=torch.bfloat16):.2f} GB")



lora_llama32_1b memory footprint
float32 (PyTorch default): 9.25 GB
bfloat16: 4.62 GB

base llama32_1b memory footprint
float32 (PyTorch default): 9.24 GB
bfloat16: 4.62 GB


In [None]:
from safetensors.torch import load

file_path = "./Llama3p2_1B/model.safetensors"
with open(file_path, "rb") as f:
    data = f.read()

loaded_llama = load(data)

In [16]:
loaded_llama.keys()

dict_keys(['model.layers.7.self_attn.o_proj.weight', 'model.layers.9.self_attn.k_proj.weight', 'model.layers.15.mlp.down_proj.weight', 'model.layers.6.mlp.down_proj.weight', 'model.layers.3.self_attn.k_proj.weight', 'model.layers.9.self_attn.o_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.12.self_attn.v_proj.weight', 'model.layers.13.self_attn.k_proj.weight', 'model.layers.5.mlp.gate_proj.weight', 'model.layers.8.post_attention_layernorm.weight', 'model.layers.15.self_attn.o_proj.weight', 'model.layers.3.post_attention_layernorm.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.15.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.6.mlp.gate_proj.weight', 'model.layers.10.mlp.down_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.14.self_attn.q_proj.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.11.mlp.gate_proj.weight', 'model.layers.12.mlp.gat

In [20]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    if isinstance(right, torch.Tensor):
        return torch.nn.Parameter(right.clone().detach())
    else:
        return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_llama(model, param_config, params):
    
    model.tok_embeddings.weight = assign(model.tok_embeddings.weight, params.embed_tokens.weight)

    for l in range(param_config["n_layers"]):

        # Load attention weights
        model.trf_blocks[l].att.W_query.weight = assign(
            model.trf_blocks[l].att.W_query.weight,
            params[f"layers.{l}.attention.wq.weight"]
        )
        model.trf_blocks[l].att.W_key.weight = assign(
            model.trf_blocks[l].att.W_key.weight,
            params[f"layers.{l}.attention.wk.weight"]
        )
        model.trf_blocks[l].att.W_value.weight = assign(
            model.trf_blocks[l].att.W_value.weight,
            params[f"layers.{l}.attention.wv.weight"]
        )
        model.trf_blocks[l].att.out_proj.weight = assign(
            model.trf_blocks[l].att.out_proj.weight,
            params[f"layers.{l}.attention.wo.weight"]
        )
        model.trf_blocks[l].norm1.weight = assign(
            model.trf_blocks[l].norm1.weight,
            params[f"layers.{l}.attention_norm.weight"]
        )

        # Load FeedForward weights
        model.trf_blocks[l].ff.fc1.weight = assign(
            model.trf_blocks[l].ff.fc1.weight,
            params[f"layers.{l}.feed_forward.w1.weight"]
        )
        # For some reason w2 and w3 are provided in the wrong order in the weights file
        model.trf_blocks[l].ff.fc2.weight = assign(
            model.trf_blocks[l].ff.fc2.weight,
            params[f"layers.{l}.feed_forward.w3.weight"]
        )
        model.trf_blocks[l].ff.fc3.weight = assign(
            model.trf_blocks[l].ff.fc3.weight,
            params[f"layers.{l}.feed_forward.w2.weight"]
        )
        model.trf_blocks[l].norm2.weight = assign(
            model.trf_blocks[l].norm2.weight,
            params[f"layers.{l}.ffn_norm.weight"]
        )

    # Load output layer weights
    model.final_norm.weight = assign(model.final_norm.weight, params["norm.weight"])
    model.out_head.weight = assign(model.out_head.weight, params["output.weight"])


In [21]:
load_weights_into_llama(lora_llama32_1b, base_model, loaded_llama)
lora_llama32_1b.to(device)

AttributeError: 'dict' object has no attribute 'embed_tokens'

In [6]:
# Assuming that base_model already has the pretrained Llama2 weights,
# this will directly load them into your LoRA model without any conversion necessary.
lora_llama32_1b.load_state_dict(base_model.state_dict(), strict=False)

_IncompatibleKeys(missing_keys=['layers.0.attn.q_proj.lora_a.weight', 'layers.0.attn.q_proj.lora_b.weight', 'layers.0.attn.v_proj.lora_a.weight', 'layers.0.attn.v_proj.lora_b.weight', 'layers.1.attn.q_proj.lora_a.weight', 'layers.1.attn.q_proj.lora_b.weight', 'layers.1.attn.v_proj.lora_a.weight', 'layers.1.attn.v_proj.lora_b.weight', 'layers.2.attn.q_proj.lora_a.weight', 'layers.2.attn.q_proj.lora_b.weight', 'layers.2.attn.v_proj.lora_a.weight', 'layers.2.attn.v_proj.lora_b.weight', 'layers.3.attn.q_proj.lora_a.weight', 'layers.3.attn.q_proj.lora_b.weight', 'layers.3.attn.v_proj.lora_a.weight', 'layers.3.attn.v_proj.lora_b.weight', 'layers.4.attn.q_proj.lora_a.weight', 'layers.4.attn.q_proj.lora_b.weight', 'layers.4.attn.v_proj.lora_a.weight', 'layers.4.attn.v_proj.lora_b.weight', 'layers.5.attn.q_proj.lora_a.weight', 'layers.5.attn.q_proj.lora_b.weight', 'layers.5.attn.v_proj.lora_a.weight', 'layers.5.attn.v_proj.lora_b.weight', 'layers.6.attn.q_proj.lora_a.weight', 'layers.6.attn.q_p