In [3]:
import certifix
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchtune.modules import RotaryPositionalEmbeddings,MultiHeadAttention,RMSNorm, TransformerDecoder,KVCache
from torchtune.models.llama3 import llama3_tokenizer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# for pre-trained weights
from safetensors.torch import load
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Llama 3.2 1B

LLAMA32_CONFIG = {
    "vocab_size": 128_256,      # Vocabulary size
    "max_seq_length": 131_072,  # Context length
    "embed_dim": 2048,           # Embedding dimension for ecah head in self-attention
    "num_heads": 32,            # Number of attention heads
    "head_dim":2048 // 32,       # embed_dim // num_heads.
    "layers": 16,               # Number of layers
    "hidden_dim": 8192,         # Size of the intermediate dimension in FeedForward
    "num_kv_heads": 8,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000.0,     # The base in RoPE's "theta"
    "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}
'''
# Llama 3.2 3B
LLAMA32_CONFIG = {
    "vocab_size": 128_256,      # Vocabulary size
    "max_seq_length": 131_072,  # Context length
    "emb_dim": 3072,           # Embedding dimension = to emb_dim in S.Raschka book
    "num_heads": 24,            # Number of attention heads
    "head_dim":3072 // 24,       # embed_dim // num_heads.
    "layers": 28,               # Number of layers
    "hidden_dim": 8192,         # Size of the intermediate dimension in FeedForward
    "num_kv_heads": 8,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000.0,     # The base in RoPE's "theta"
    "dtype": torch.bfloat16,    # Lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}
'''
LLAMA_SIZE_STR = "1B" if LLAMA32_CONFIG["embed_dim"] == 2048 else "3B"
config = LLAMA32_CONFIG

In [5]:

class ChatFormat:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encode_header(self, message):
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
        tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
        return tokens

    def encode(self, text):
        message = {
            "role": "user",
            "content": text
        }

        tokens = self.encode_header(message)
        tokens.extend(
            self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
        )
        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
        return tokens

    def decode(self, token_ids):
        return self.tokenizer.decode(token_ids)

In [6]:
from GPT2Model import text_to_token_ids, token_ids_to_text


tokenizer = llama3_tokenizer(
    path="./Llama3p2_1B/tokenizer.model"
    )
print(tokenizer.special_tokens)

tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
print(tokenized_text)

{'<|begin_of_text|>': 128000, '<|end_of_text|>': 128001, '<|reserved_special_token_0|>': 128002, '<|reserved_special_token_1|>': 128003, '<|finetune_right_pad_id|>': 128004, '<|step_id|>': 128005, '<|start_header_id|>': 128006, '<|end_header_id|>': 128007, '<|eom_id|>': 128008, '<|eot_id|>': 128009, '<|python_tag|>': 128010, '<|image|>': 128256, '<|video|>': 128012, '<|reserved_special_token_2|>': 128013, '<|reserved_special_token_3|>': 128014, '<|reserved_special_token_4|>': 128015, '<|reserved_special_token_5|>': 128016, '<|reserved_special_token_6|>': 128017, '<|reserved_special_token_7|>': 128018, '<|reserved_special_token_8|>': 128019, '<|reserved_special_token_9|>': 128020, '<|reserved_special_token_10|>': 128021, '<|reserved_special_token_11|>': 128022, '<|reserved_special_token_12|>': 128023, '<|reserved_special_token_13|>': 128024, '<|reserved_special_token_14|>': 128025, '<|reserved_special_token_15|>': 128026, '<|reserved_special_token_16|>': 128027, '<|reserved_special_toke

In [17]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(config["embed_dim"],config['hidden_dim'], dtype=config["dtype"], bias=False)
        self.fc2 = nn.Linear(config["embed_dim"], config['hidden_dim'], dtype=config["dtype"], bias=False)
        self.fc3 = nn.Linear(config['hidden_dim'], config["embed_dim"], dtype=config["dtype"], bias=False)
        self.silu = nn.SiLU()

    def forward(self, x):
        x_fc1 = self.fc1(x)
        x_fc2 = self.fc2(x)
        x = self.silu(x_fc1) * x_fc2
        return self.fc3(x)

In [8]:

from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE
import torch

dim = config['embed_dim']
context_len = 18
base = 10000
# Settings
batch_size = 1 # Bigger than 1 does not work independently
context_len = 5
num_heads = 4
head_dim = 16
# Instantiate the model
rope = Llama3ScaledRoPE(dim = dim)  # Example dimension

queries = torch.randn(batch_size, num_heads, context_len, head_dim)

# Apply the RoPE to the input tensor
output_tensor = rope(queries)

print(output_tensor)

#help(Llama3ScaledRoPE)

tensor([[[[-0.2245, -0.6918,  0.7142,  ...,  0.7376, -0.0289,  1.7565],
          [ 0.6138,  1.4070, -1.2883,  ...,  0.6648, -0.6648, -0.7108],
          [-0.5665,  0.5587,  1.8857,  ...,  0.4827,  0.8625, -0.9762],
          [-0.8107, -0.6530,  0.2214,  ...,  0.2573,  1.4244,  1.3021],
          [-1.2610,  0.9528, -1.1208,  ...,  0.8364,  2.1360, -1.1294]],

         [[ 0.4216, -1.0776, -0.8886,  ...,  0.0750, -0.5544,  0.7537],
          [-0.9787, -1.6997,  1.5933,  ...,  0.3558, -0.5989, -0.9587],
          [ 0.7587, -1.4337, -0.1143,  ..., -1.2783,  0.0923, -0.6573],
          [-1.9951,  0.5230,  0.3864,  ...,  1.1472,  0.9759,  0.5960],
          [ 1.2686, -0.9858,  0.8565,  ..., -0.7090, -1.0185,  0.4284]],

         [[ 0.3841,  0.4506, -1.6809,  ...,  1.7402,  0.6556, -0.9067],
          [-0.6769,  2.3183, -0.1740,  ..., -1.2995, -0.3551,  0.8997],
          [ 1.1586,  0.7523,  0.8372,  ..., -0.1053,  0.2025, -0.2722],
          [-0.2000, -0.0749,  0.3194,  ..., -1.1773,  0.1397

In [9]:
pos_embeddings = RotaryPositionalEmbeddings(config['embed_dim'],config['rope_base'],config['max_seq_length'])
pos_embeddings(queries) # This works

tensor([[[[-0.2245, -0.6918,  0.7142,  ...,  0.7376, -0.0289,  1.7565],
          [ 0.6138,  1.4070, -1.2883,  ...,  0.6648, -0.6648, -0.7108],
          [-0.5665,  0.5587,  1.8857,  ...,  0.4827,  0.8625, -0.9762],
          [-0.8107, -0.6530,  0.2214,  ...,  0.2573,  1.4244,  1.3021],
          [-1.2610,  0.9528, -1.1208,  ...,  0.8364,  2.1360, -1.1294]],

         [[ 0.4216, -1.0776, -0.8886,  ...,  0.0750, -0.5544,  0.7537],
          [-0.9787, -1.6997,  1.5933,  ...,  0.3558, -0.5989, -0.9587],
          [ 0.7587, -1.4337, -0.1143,  ..., -1.2783,  0.0923, -0.6573],
          [-1.9951,  0.5230,  0.3864,  ...,  1.1472,  0.9759,  0.5960],
          [ 1.2686, -0.9858,  0.8565,  ..., -0.7090, -1.0185,  0.4284]],

         [[ 0.3841,  0.4506, -1.6809,  ...,  1.7402,  0.6556, -0.9067],
          [-0.6769,  2.3183, -0.1740,  ..., -1.2995, -0.3551,  0.8997],
          [ 1.1586,  0.7523,  0.8372,  ..., -0.1053,  0.2025, -0.2722],
          [-0.2000, -0.0749,  0.3194,  ..., -1.1773,  0.1397

In [18]:
class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.att =  MultiHeadAttention(
            embed_dim=config["embed_dim"],
            max_seq_len=config["max_seq_length"],
            num_heads=config["num_heads"],
            num_kv_heads=config["num_kv_heads"],
            head_dim=config['head_dim'],
            q_proj = nn.Linear(config['embed_dim'], config["embed_dim"], bias=False, dtype=config["dtype"]), 
            k_proj = nn.Linear(config['embed_dim'], config["num_kv_heads"] * config['head_dim'], bias=False, dtype=config["dtype"]), 
            v_proj = nn.Linear(config['embed_dim'], config["num_kv_heads"] * config['head_dim'], bias=False, dtype=config["dtype"]), 
            output_proj = nn.Linear(config["embed_dim"], config["embed_dim"], bias=False, dtype=config["dtype"]),
            pos_embeddings = Llama3ScaledRoPE(config['embed_dim'])
        )
        self.ff = FeedForward(config)
        self.norm1 = RMSNorm(config["head_dim"], eps=1e-06)
        self.norm2 = RMSNorm(config["head_dim"], eps=1e-06)

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x.to(torch.bfloat16))   # Shape [batch_size, num_tokens, emb_size]
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x.to(torch.bfloat16))
        x = x + shortcut  # Add the original input back

        return x

In [19]:
config

{'vocab_size': 128256,
 'max_seq_length': 131072,
 'embed_dim': 2048,
 'num_heads': 32,
 'head_dim': 64,
 'layers': 16,
 'hidden_dim': 8192,
 'num_kv_heads': 8,
 'rope_base': 500000.0,
 'dtype': torch.bfloat16,
 'rope_freq': {'factor': 32.0,
  'low_freq_factor': 1.0,
  'high_freq_factor': 4.0,
  'original_context_length': 8192}}

In [20]:
class Llama3Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config["vocab_size"], config["embed_dim"], dtype=config["dtype"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config["layers"])])

        self.final_norm = RMSNorm(config["embed_dim"], eps=1e-06)
        self.out_head = nn.Linear(config["embed_dim"], config["vocab_size"], bias=False, dtype=config["dtype"])

    def forward(self, in_idx):
        tok_embeds = self.tok_emb(in_idx)
        x = tok_embeds
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x.to(torch.bfloat16))
        return logits
    
model = Llama3Model(LLAMA32_CONFIG)
model.eval()

Llama3Model(
  (tok_emb): Embedding(128256, 2048)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_features=512, bias=False)
        (v_proj): Linear(in_features=2048, out_features=512, bias=False)
        (output_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (pos_embeddings): Llama3ScaledRoPE()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=2048, out_features=8192, bias=False)
        (fc2): Linear(in_features=2048, out_features=8192, bias=False)
        (fc3): Linear(in_features=8192, out_features=2048, bias=False)
        (silu): SiLU()
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_feat

In [21]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 1,498,419,200


In [22]:
# Account for weight tying
total_params_normalized = total_params - model.tok_emb.weight.numel()
print(f"\nTotal number of unique parameters: {total_params_normalized:,}")


Total number of unique parameters: 1,235,750,912


In [14]:
# Calling a transformer encoder from torchtune with 1B
from torchtune.models.llama3_2 import llama3_2_1b

llama32_1b = llama3_2_1b()
llama32_1b


TransformerDecoder(
  (tok_embeddings): Embedding(128256, 2048)
  (layers): ModuleList(
    (0-15): 16 x TransformerSelfAttentionLayer(
      (attn): MultiHeadAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_features=512, bias=False)
        (v_proj): Linear(in_features=2048, out_features=512, bias=False)
        (output_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (pos_embeddings): Llama3ScaledRoPE()
      )
      (mlp): FeedForward(
        (w1): Linear(in_features=2048, out_features=8192, bias=False)
        (w2): Linear(in_features=8192, out_features=2048, bias=False)
        (w3): Linear(in_features=2048, out_features=8192, bias=False)
        (activation): SiLU()
      )
      (sa_norm): RMSNorm()
      (mlp_norm): RMSNorm()
      (sa_scale): Identity()
      (mlp_scale): Identity()
    )
  )
  (norm): RMSNorm()
)

In [15]:
total_params = sum(p.numel() for p in llama32_1b.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 1,235,814,400


In [17]:
llama32_1b.tok_embeddings.weight.numel()

262668288

In [23]:
llama32_1b.tok_embeddings.weight

Parameter containing:
tensor([[ 0.2699,  1.5265, -0.4588,  ..., -1.5401,  1.0571, -2.0669],
        [-1.7566,  0.1267, -0.1152,  ...,  1.6729,  1.3174, -0.8286],
        [ 2.0697, -0.7493, -2.4904,  ..., -0.6723,  1.1954, -0.7240],
        ...,
        [ 1.3217, -0.1147,  1.1989,  ...,  0.4049,  0.7400,  2.4869],
        [ 0.6836, -1.8381, -0.2505,  ..., -1.2482,  0.3985, -0.7639],
        [-0.9251, -1.3305,  0.3496,  ..., -0.6322, -0.3491, -0.7308]],
       requires_grad=True)

In [24]:
# Account for weight tying
total_params_normalized = total_params - llama32_1b.tok_embeddings.weight.numel()
print(f"\nTotal number of unique parameters: {total_params_normalized:,}")


Total number of unique parameters: 1,235,750,912
