In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in to Kaggle through kagglehub before the download calls in the next cell are made.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Request the three data sources (a competition, a dataset and a model) through kagglehub
# and keep whatever each call returns in a *_path variable.
# NOTE(review): no cell visible in this notebook reads these variables; the model-loading
# cell uses a hard-coded '/kaggle/input/...' path instead - confirm both point at the same files.
llm_prompt_recovery_path = kagglehub.competition_download('llm-prompt-recovery')
tsr564_gemma_rewrite_nbroad_1_path = kagglehub.dataset_download('tsr564/gemma-rewrite-nbroad-1')
mistral_ai_mixtral_pytorch_8x7b_instruct_v0_1_hf_1_path = kagglehub.model_download('mistral-ai/mixtral/PyTorch/8x7b-instruct-v0.1-hf/1')

print('Data source import complete.')


In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git  -U
!pip install -q git+https://github.com/huggingface/accelerate.git  -U
!pip install -q -U bitsandbytes
!pip install -q git+https://github.com/huggingface/peft.git  -U

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


In [None]:
import torch
import random
import numpy as np
import pandas as pd
import time



#https://github.com/Lightning-AI/lit-gpt/issues/327
# torch.backends.cuda.enable_mem_efficient_sdp(False)
# torch.backends.cuda.enable_flash_sdp(False)

# Later cells move tensors to CUDA, so tell the user early when no GPU is visible.
# This only prints a message; execution is not stopped.
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

To save and load a PyTorch model, follow these steps:

### Saving the Model

1. **Save the Entire Model**:
   ```python
   torch.save(model, 'model.pth')
   ```

2. **Save Only the Model State Dict**:
   ```python
   torch.save(model.state_dict(), 'model_state_dict.pth')
   ```

### Loading the Model

1. **Load the Entire Model**:
   ```python
   model = torch.load('model.pth')
   model.eval()  # Set the model to evaluation mode
   ```

2. **Load the Model State Dict**:
   ```python
   model = EnhancedRNN(...)  # Initialize the model architecture
   model.load_state_dict(torch.load('model_state_dict.pth'))
   model.eval()  # Set the model to evaluation mode
   ```


### Base Model Files Overview:

1. **`config.json`**: Contains model architecture settings like hyperparameters and initialization details.
2. **`generation_config.json`**: Includes text generation settings such as sequence length and sampling strategies.
3. **`model.safetensors.index.json`**: Stores metadata for managing model weights in `safetensors` format.
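4. **`model-*.safetensors`**: Contains the model weights, split (sharded) across multiple files in the `safetensors` format. The checkpoint itself is stored in `bfloat16`; in this notebook 4-bit quantization is applied at load time via `BitsAndBytesConfig`.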
5. **`special_tokens_map.json`**: Maps special tokens to their respective identifiers.
6. **`tokenizer.json`**: Includes the tokenizer’s vocabulary and configuration.
7. **`tokenizer.model`**: The binary model used for tokenization.
8. **`tokenizer_config.json`**: Configures how the tokenizer processes text.

### Summary:
- **Config Files**: Define model and tokenizer setup.
- **Model Weights**: Contain the trained weights (quantized only when loaded with a quantization config).
- **Tokenizer Files**: Used for text tokenization and detokenization, including vocabulary and special tokens.

These files are needed to properly load the model and tokenizer, typically handled by libraries like `transformers`.

## Inference with Alpaca style Prompt

```python
# {
#     "description": "Template used by Alpaca-LoRA.",
#     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
#     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
#     "response_split": "### Response:"    
# }
# Alpaca-style template with three positional slots, filled in this order by format_prompt:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
# End-of-sequence marker that format_prompt appends to every formatted example.
# NOTE(review): relies on a `tokenizer` object that this snippet does not create itself.
EOS_TOKEN = tokenizer.eos_token
def format_prompt(sample):
    """Render a batch of Alpaca records into single prompt strings.

    Args:
        sample: Batched mapping with parallel sequences under the keys
            ``"instruction"`` (system prompt), ``"input"`` (user prompt) and
            ``"output"`` (response; may be empty strings).

    Returns:
        ``{"text": [...]}`` with one entry per record: the record's three fields
        substituted into ``alpaca_prompt`` followed by ``EOS_TOKEN``. Everything is
        placed in a single column for SFTTrainer.
    """
    rows = zip(sample["instruction"], sample["input"], sample["output"])
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": rendered}
    
# Illustrative data preparation: load the "train" split of yahma/alpaca-cleaned and run
# format_prompt over it in batches, which adds the "text" column that format_prompt returns.
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned",split="train")
dataset = dataset.map(format_prompt,batched=True)


def prepare_for_peft(model):
    """Freeze a causal LM in place so that adapters can be trained on top of it.

    Steps performed on ``model`` (which is modified and also returned):
      * every parameter gets ``requires_grad = False``;
      * 1-D parameters (e.g. layernorm weights) are cast to float32 for stability;
      * config flags are set: gradient checkpointing on, KV cache off,
        hidden states and attention weights returned;
      * ``model.lm_head`` is wrapped in an ``nn.Sequential``.

    The existing head is wrapped rather than replaced: building a brand-new
    ``nn.Linear`` here would throw away the pretrained output projection and
    substitute randomly initialised weights. ``torch.nn`` is referenced through
    ``torch`` because the notebook never imports ``nn`` on its own.

    Args:
        model: A module exposing ``parameters()``, a ``config`` object and an
            ``lm_head`` sub-module.

    Returns:
        The same ``model`` instance.
    """
    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.dim() == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    # NOTE(review): presumably the installed transformers version honours this config
    # attribute; if not, gradient checkpointing has to be enabled on the model itself.
    model.config.gradient_checkpointing = True  # enable gradient checkpointing
    model.config.use_cache = False  # disable cache for memory efficiency
    model.config.output_hidden_states = True  # set to True if you want hidden states
    model.config.output_attentions = True  # set to True if you want attention weights

    # No separate wrapper class is needed: nn.Sequential around the *existing* head keeps
    # the pretrained weights while still giving the head a container module.
    model.lm_head = torch.nn.Sequential(model.lm_head)
    return model

```

In [None]:
# Cap on the number of newly generated tokens; a small value can help speed up inference.
# NOTE(review): not referenced by any cell visible here - the generate() call further down
# passes max_new_tokens=100 as a literal.
max_new_tokens = 30

# Output text is meant to be trimmed to this many sentences.
# NOTE(review): no trimming code is visible in this part of the notebook.
max_sentences_in_response = 1

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig


# Local Kaggle path of the Mixtral 8x7B Instruct v0.1 checkpoint.
model_name = '/kaggle/input/mixtral/pytorch/8x7b-instruct-v0.1-hf/1'

# Tokenizer: pads on the left and adds both BOS and EOS to every encoded text.
# NOTE(review): add_eos_token=True makes each prompt end with </s> (visible in the recorded
# tokenization output further down) - confirm that is wanted when prompting for generation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    model_max_length=512,  # Reduce maximum sequence length
)

# Quantization settings for the base model (Mixtral 8x7B, per model_name):
# 4-bit NF4 weights with double quantization and float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload= True
)


# Load the checkpoint's config and set the gradient-checkpointing attribute on it.
# NOTE(review): presumably meant to enable gradient checkpointing - confirm the installed
# transformers version actually reads this attribute.
config = AutoConfig.from_pretrained(model_name)
config.gradient_checkpointing = True

# Load the quantized model; device_map="auto" leaves device placement to the library and
# the offload_* arguments name a directory for anything that gets offloaded.
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        attn_implementation = "eager",
        device_map="auto",
        trust_remote_code=True,
        config=config,
        # max_memory={0: "8GiB",1:"13GiB"},  # Limit GPU memory usage
        offload_folder="offload",  # Specify offload directory
        offload_state_dict=True  # Enable state dict offloading
)



Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [None]:
# # LoRA config
# from peft import LoraConfig,PeftModel, prepare_model_for_kbit_training, get_peft_model
# peft_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
# )
# model = get_peft_model(model, peft_config)

In [None]:
import gc

import torch

# Collect unreachable Python objects first, so that any CUDA tensors they were keeping
# alive are released back to PyTorch's caching allocator ...
collected = gc.collect()
# ... and only then return the now-unused cached blocks to the GPU driver. Calling
# empty_cache() before the collection would leave that freshly freed memory cached.
torch.cuda.empty_cache()

# Show how many objects the collector found (the cell displayed this value before too).
collected

111

## Testing Model

In [None]:
# Background colours used to highlight consecutive tokens when printing them.
# Each entry is an "R;G;B" string that is dropped straight into an ANSI 24-bit
# background escape sequence by the token-printing cells.
_token_rgb_triples = (
    (102, 194, 165),
    (252, 141, 98),
    (141, 160, 203),
    (231, 138, 195),
    (166, 216, 84),
    (255, 217, 47),
)
colors_list = [';'.join(str(channel) for channel in rgb) for rgb in _token_rgb_triples]

In [None]:
# Free-form test question; it is tokenized exactly as written (no chat template is applied here).
prompt = """Qwen2.5 vs Phi-3-mini which one is better for what purpose ? Tell me technical usecase comparison"""

# Tokenize the prompt into PyTorch tensors and move them to the GPU.
tokens = tokenizer(prompt, return_tensors='pt').to("cuda")
# input_ids = tokens.input_ids.to("cuda")
# attention_masks = tokens.attention_mask.to("cuda")
print(f" input_ids : {tokens.input_ids.shape}")

# Print each prompt token on its own coloured background (ANSI escape sequence),
# cycling through colors_list, so that token boundaries are visible.
for idx, token_id in enumerate(tokens.input_ids[0]):
        print(
            f'\x1b[0;30;48;2;{colors_list[idx % len(colors_list)]}m' +
            tokenizer.decode(token_id) +
            '\x1b[0m',
            end=' '
        )

# Let the model continue the prompt for at most 100 new tokens, then print the shape of the
# returned sequence and its decoded text.
generation_output = model.generate(**tokens,max_new_tokens=100)
print("generated output(max_tokens + input_ids) : ",generation_output[0].shape) #
print(tokenizer.decode(generation_output[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 input_ids : torch.Size([1, 29])
[0;30;48;2;102;194;165m<s>[0m [0;30;48;2;252;141;98mQ[0m [0;30;48;2;141;160;203mwen[0m [0;30;48;2;231;138;195m2[0m [0;30;48;2;166;216;84m.[0m [0;30;48;2;255;217;47m5[0m [0;30;48;2;102;194;165mvs[0m [0;30;48;2;252;141;98mPh[0m [0;30;48;2;141;160;203mi[0m [0;30;48;2;231;138;195m-[0m [0;30;48;2;166;216;84m3[0m [0;30;48;2;255;217;47m-[0m [0;30;48;2;102;194;165mmin[0m [0;30;48;2;252;141;98mi[0m [0;30;48;2;141;160;203mwhich[0m [0;30;48;2;231;138;195mone[0m [0;30;48;2;166;216;84mis[0m [0;30;48;2;255;217;47mbetter[0m [0;30;48;2;102;194;165mfor[0m [0;30;48;2;252;141;98mwhat[0m [0;30;48;2;141;160;203mpurpose[0m [0;30;48;2;231;138;195m?[0m [0;30;48;2;166;216;84mTell[0m [0;30;48;2;255;217;47mme[0m [0;30;48;2;102;194;165mtechnical[0m [0;30;48;2;252;141;98muse[0m [0;30;48;2;141;160;203mcase[0m [0;30;48;2;231;138;195mcomparison[0m [0;30;48;2;166;216;84m</s>[0m generated output(max_tokens + input_ids) :  torch.S

## Forward pass



#### `logits` and `past_key_values` share the batch and sequence-length dimensions (their other dimensions differ); here we pass the tokens through the whole model (`model.model` + `model.lm_head`)

In [None]:
# Inspection-only forward pass through the whole model (backbone + lm_head).
# It is run under torch.no_grad() because nothing here is back-propagated; without it an
# autograd graph for the entire pass would be built and kept alive through `output`.
with torch.no_grad():
    output = model(**tokens)
#
print("logits shape ([batch_sz,seq_len,vocab_sz]) : ",output.logits.shape)
############################
print("-"*100)
print("-"*100)
# past_key_values holds one entry per decoder layer; each entry is a (key, value) pair.
# Note: the second dimension of the printed K/V shapes is 8 in the recorded output, which
# matches "num_key_value_heads" in the model config rather than the 32 attention heads
# named in the labels below.
print(f"No. of Decoder : {len(output.past_key_values)}")
print(f"Cached attention state of Key(K) & Value(V) for each attention head  : layers past_key_values[layer=0] : {len(output.past_key_values[0])}")
print(f"Key(K) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : {output.past_key_values[0][0].shape}")
print(f"Value(V) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : {output.past_key_values[0][1].shape}")
############################
print("-"*100)
print("-"*100)
print("Predicted token after the forward pass")
# For every input position, take the highest-scoring vocabulary entry (greedy choice) and
# print it on a coloured background, cycling through colors_list.
for idx,logits_value in enumerate(output.logits[0]):
    token_id = torch.argmax(logits_value,dim=-1)
    print(f'\x1b[0;30;48;2;{colors_list[idx % len(colors_list)]}m' +tokenizer.decode(token_id) +'\x1b[0m',end=' ')

logits shape ([batch_sz,seq_len,vocab_sz]) :  torch.Size([1, 29, 32000])
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
No. of Decoder : 32
Cached attention state of Key(K) & Value(V) for each attention head  : layers past_key_values[layer=0] : 2
Key(K) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : torch.Size([1, 8, 29, 128])
Value(V) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : torch.Size([1, 8, 29, 128])
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Predicted token after the forward pass
[0;30;48;2;102;194;165mQ[0m [0;30;48;2;252;141;98m:[0m [0;30

#### Forward pass of input through model except the lm_head then passing through lm_head

In [None]:
# Same computation as the full forward pass, split in two: first the backbone (model.model),
# then the language-model head applied to its last hidden state.
# Run under torch.no_grad() because this is inspection only; otherwise an autograd graph
# for the whole pass would be built and kept alive through these variables.
with torch.no_grad():
    model_output = model.model(tokens.input_ids)

print("last_hidden_state shape ([batch_sz,seq_len,hidden_sz]) : ",model_output[0].shape)
############################
print("-"*100)
print("-"*100)
# One past_key_values entry per decoder layer, each a (key, value) pair.
print(f"No. of Decoder : {len(model_output.past_key_values)}")
print(f"Cached attention state of Key(K) & Value(V) for each attention head  : layers past_key_values[layer=0] : {len(model_output.past_key_values[0])}")
print(f"Key(K) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : {model_output.past_key_values[0][0].shape}")
print(f"Value(V) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : {model_output.past_key_values[0][1].shape}")
############################
print("-"*100)
print("-"*100)
# Project the last hidden state onto the vocabulary to obtain the logits.
with torch.no_grad():
    lm_head_output = model.lm_head(model_output[0])
print("lm_head/logits shape ([batch_sz,seq_len,vocab_sz]) : ",lm_head_output.shape)

last_hidden_state shape ([batch_sz,seq_len,hidden_sz]) :  torch.Size([1, 29, 4096])
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
No. of Decoder : 32
Cached attention state of Key(K) & Value(V) for each attention head  : layers past_key_values[layer=0] : 2
Key(K) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : torch.Size([1, 8, 29, 128])
Value(V) [batch_size, num_attention_heads, seq_len, head_dim (hidden_size / num_attention_heads =  4096 / 32 = 128)] : torch.Size([1, 8, 29, 128])
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
lm_head/logits shape ([batch_sz,seq_len,vocab_sz]) :  torch.Size([1, 29, 32000])


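    Theoretically, the logits from the full forward pass and the logits obtained by applying `lm_head` to the backbone output should match.
    If an equality check between the two returns False, small numerical differences exist due to normalization, dropout, or caching.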



##### **Why Are the Values Different?**
1. **Dropout (During Training Mode)**
   - If the model is in **training mode (`model.train()`)**, dropout is applied, leading to slightly different hidden states.
   - Try running in **evaluation mode**:
     ```python
     model.eval()
     ```

2. **LayerNorm / Residual Connection Effects**
   - Some architectures (e.g., GPT) apply **Layer Normalization and residual connections differently** inside `forward()`.
   - If you extract `model.model(input_ids)`, it **may not be identical** to the forward method.

3. **Past Key-Value Caching (for Decoding)**
   - When using `model(**tokens)`, **past key-value states** are handled inside the model, which can slightly alter values.
   - Try disabling caching:
     ```python
     output = model(**tokens, use_cache=False)
     ```


### Manual inference

In [None]:
import torch

# Hand-written greedy decoding: repeatedly run the model on the sequence so far, pick the
# most likely next token and append it.
num_tokens_to_generate = 50  # Number of tokens to predict

tokens = tokenizer(prompt, return_tensors="pt").to("cuda:0")
input_ids = tokens.input_ids  # Initial tokenized input

for token_idx in range(num_tokens_to_generate):
    with torch.no_grad():  # No need to compute gradients during inference
        # The whole, growing sequence is fed through the backbone on every iteration
        # (no cached keys/values are passed in).
        model_output = model.model(input_ids)  # Get hidden states
        # NOTE(review): the .to("cuda:0") presumably guards against the head living on a
        # different device under device_map="auto" - confirm.
        lm_head_output = model.lm_head(model_output[0]).to("cuda:0")  # Get logits

    # Get the predicted token ID (argmax over vocabulary)
    # Only the logits of the last position are used: they score the token that follows.
    next_token_id = lm_head_output[:, -1, :].argmax(dim=-1)
    # Debug output for one arbitrary step (index 35) to show the tensor shapes involved.
    if token_idx==35:
        print(f"Generated Token no : {token_idx}")
        print(f"lm_head_output [batch_sz,updated_seq_len,vocab_sz] : {lm_head_output.shape}")
        print(f"lm_head_output[:, -1, :] = [batch_sz,vocab_sz] : { lm_head_output[:, -1, :].shape}")
        print(f"next_token_id [predicted_seq_len] : {next_token_id.shape}")
        print(f"next_token_id.unsqueeze(-1) [batch_sz,predicted_seq_len] : {next_token_id.unsqueeze(-1).shape}")
        print("-"*100)
        print("-"*100)

    # Append predicted token to input_ids
    # unsqueeze(-1) adds the sequence dimension so the new id can be concatenated.
    input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)

# Decode the generated sequence
# input_ids now holds the prompt followed by the 50 generated tokens; the loop never stops
# early, so generation continues even if an end-of-sequence token is produced.
predicted_text = tokenizer.decode(input_ids[0])

print("Generated Text:", predicted_text)


Generated Token no : 35
lm_head_output [batch_sz,updated_seq_len,vocab_sz] : torch.Size([1, 64, 32000])
lm_head_output[:, -1, :] = [batch_sz,vocab_sz] : torch.Size([1, 32000])
next_token_id [predicted_seq_len] : torch.Size([1])
next_token_id.unsqueeze(-1) [batch_sz,predicted_seq_len] : torch.Size([1, 1])
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Generated Text: <s> Qwen2.5 vs Phi-3-mini which one is better for what purpose ? Tell me technical usecase comparison</s>R: Both Qwen2.5 and Phi-3-mini are single-board computers (SBCs) that can be used for various purposes, such as prototyping, IoT projects, and education. Here is


In [None]:
import gc

import torch

# Collect unreachable Python objects first, so that any CUDA tensors they were keeping
# alive are released back to PyTorch's caching allocator ...
collected = gc.collect()
# ... and only then return the now-unused cached blocks to the GPU driver. Calling
# empty_cache() before the collection would leave that freshly freed memory cached.
torch.cuda.empty_cache()

# Show how many objects the collector found (the cell displayed this value before too).
collected

0

## Backward pass output

##### **Why No `loss.backward()` or `optimizer.step()` in Token Prediction?**  

The key reason is that we are **only performing inference (prediction)** and **not training**.  
Since we are only **generating tokens**, we don't need loss computation or gradient updates.

    The sequence length is 1 less because we shift the logits for next-token prediction alignment.
    The first token does not have a previous token, so it’s ignored for loss computation.
    This is expected behavior in causal language modeling (like GPT).




In [None]:
import torch
import torch.nn.functional as F


# Drop gradients left over from an earlier run of this cell. backward() accumulates into
# .grad, so without this a re-run would change the gradient norms printed by the next cell.
model.zero_grad(set_to_none=True)

# Forward pass (gradients are enabled on purpose: this cell demonstrates backpropagation)
model_output = model.model(input_ids)  # Hidden states before lm_head
lm_head_output = model.lm_head(model_output[0])  # Logits from the language model head

# Shift input_ids and logits for loss calculation:
# the logits at position t are scored against the token at position t+1.
shift_logits = lm_head_output[:, :-1, :].contiguous().to("cuda:0") # [batch_sz,seq_len-1,vocab_sz]
shift_labels = input_ids[:, 1:].contiguous().to("cuda:0")          # [batch_sz,seq_len-1]

# Compute loss
# pred :  shift_logits.view(-1, vocab_sz) = [seq_len-1,vocab_sz]
# actual : shift_labels.view(-1) = [seq_len-1]
loss_fn = torch.nn.CrossEntropyLoss()
loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

# Backward pass (single) - use retain_graph=True if you want to run multiple backward passes (it might raise CUDA OutOfMemory)
loss.backward(retain_graph=False)


In [None]:
# List the norm of the gradient of every parameter that received one in the backward pass;
# parameters whose .grad is still None are skipped.
print("Gradients computed for model parameters:")
for name, param in model.named_parameters():
    if param.grad is None:
        continue
    print(f"layer : {name} --> grad : {param.grad.norm().item()}")

Gradients computed for model parameters:
layer : model.embed_tokens.weight --> grad : 41.375
layer : model.layers.0.input_layernorm.weight --> grad : 0.43701171875
layer : model.layers.0.post_attention_layernorm.weight --> grad : 0.2274169921875
layer : model.layers.1.input_layernorm.weight --> grad : 0.155029296875
layer : model.layers.1.post_attention_layernorm.weight --> grad : 0.1331787109375
layer : model.layers.2.input_layernorm.weight --> grad : 0.1600341796875
layer : model.layers.2.post_attention_layernorm.weight --> grad : 0.1279296875
layer : model.layers.3.input_layernorm.weight --> grad : 0.0419921875
layer : model.layers.3.post_attention_layernorm.weight --> grad : 0.09185791015625
layer : model.layers.4.input_layernorm.weight --> grad : 0.13671875
layer : model.layers.4.post_attention_layernorm.weight --> grad : 0.08013916015625
layer : model.layers.5.input_layernorm.weight --> grad : 0.08551025390625
layer : model.layers.5.post_attention_layernorm.weight --> grad : 0.05

In [None]:
import gc

import torch

# Collect unreachable Python objects first, so that any CUDA tensors they were keeping
# alive are released back to PyTorch's caching allocator ...
collected = gc.collect()
# ... and only then return the now-unused cached blocks to the GPU driver. Calling
# empty_cache() before the collection would leave that freshly freed memory cached.
torch.cuda.empty_cache()

# Show how many objects the collector found (the cell displayed this value before too).
collected

0

# Mixtral Model Architecture

```python

MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear(in_features=4096, out_features=8, bias=False)
          (experts): ModuleList(
            (0-7): 8 x MixtralBlockSparseTop2MLP(
              (w1): Linear(in_features=4096, out_features=14336, bias=False)
              (w2): Linear(in_features=14336, out_features=4096, bias=False)
              (w3): Linear(in_features=4096, out_features=14336, bias=False)
              (act_fn): SiLU()
            )
          )
        )
        (input_layernorm): MixtralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MixtralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MixtralRMSNorm((4096,), eps=1e-05)
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)


```

### Model Config

```python

MixtralConfig {
  "_name_or_path": "mistralai/Mixtral-8x7B-v0.1",
  "architectures": [
    "MixtralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mixtral",
  "num_attention_heads": 32,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "num_local_experts": 8,
  "output_router_logits": false,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.02,
  "router_jitter_noise": 0.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 32000
}

```

## These are the most Important Parts of the Model


1. **Embedding Layer**: This converts token IDs to embeddings.
2. **Self-Attention Layer**: This performs the self-attention mechanism.
3. **Block Sparse MoE Experts**: This applies the Mixture of Experts (MoE) mechanism.
4. **Post-Attention LayerNorm**: This normalizes the output after the attention mechanism.
5. **Final Norm Layer**: This normalizes the final output of the model.
6. **Language Model Head**: This converts the final hidden states to logits.


In [None]:
# Gather (element count, trainable?) for every parameter in a single pass over the model.
# NOTE(review): with 4-bit quantized weights numel() may count packed storage elements
# rather than logical weights, which would make the total look smaller than the model's
# nominal size - confirm before quoting these numbers.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)

# Express the counts in billions for readability.
BILLION = 1e9
total_params_billions = total_params / BILLION
trainable_params_billions = trainable_params / BILLION
non_trainable_params_billions = (total_params - trainable_params) / BILLION

# Report each figure rounded to two decimal places.
print(f"Total Parameters: {total_params_billions:.2f}B")
print(f"Trainable Parameters: {trainable_params_billions:.2f}B")
print(f"Non-Trainable Parameters: {non_trainable_params_billions:.2f}B")

Total Parameters: 23.48B
Trainable Parameters: 0.26B
Non-Trainable Parameters: 23.22B


## Define a dictionary to store the outputs




In [None]:

# Names under which the forward hooks store what they capture. The first six cover the main
# stages of the model; the remaining six are the additional hooks (input layernorm, the
# q/k/v/o projections of self-attention, and the MoE gate).
_captured_layer_names = (
    "embed_tokens",
    "self_attn_layer_1",
    "block_sparse_moe_experts",
    "post_attention_layernorm",
    "norm",
    "lm_head",
    "input_layernorm",
    "self_attn_q_proj",
    "self_attn_k_proj",
    "self_attn_v_proj",
    "self_attn_o_proj",
    "block_sparse_moe_gate",
)

# Every slot starts out as None and is filled in when the matching hook fires.
outputs = dict.fromkeys(_captured_layer_names)


## Define & Register hooks

In [None]:
# Define the hook functions
def hook_fn(name):
    """Build a forward hook that records a module's output in ``outputs[name]``.

    Args:
        name: Key of the ``outputs`` dictionary under which the output is stored.

    Returns:
        A callable taking ``(module, input, output)`` that stores the third argument
        in ``outputs[name]`` (overwriting any earlier value) and returns ``None``.
    """
    def _record(_module, _inputs, result):
        # Keep a reference to whatever the module produced; nothing is copied or modified.
        outputs[name] = result
    return _record

# Register hooks
# Each hook stores the output of one module under the matching key of `outputs`.
# All per-layer hooks are attached to the first decoder layer (layers[0]), and the expert
# hook is attached to the first expert (experts[0]) of that layer only.
# NOTE(review): the handles returned by register_forward_hook are not kept, so these hooks
# cannot be removed later, and running this cell again presumably adds a second set.
model.model.embed_tokens.register_forward_hook(hook_fn("embed_tokens"))
model.model.layers[0].self_attn.register_forward_hook(hook_fn("self_attn_layer_1"))
model.model.layers[0].block_sparse_moe.experts[0].register_forward_hook(hook_fn("block_sparse_moe_experts"))
model.model.layers[0].post_attention_layernorm.register_forward_hook(hook_fn("post_attention_layernorm"))
model.model.norm.register_forward_hook(hook_fn("norm"))
model.lm_head.register_forward_hook(hook_fn("lm_head"))

# Additional hooks
# Finer-grained capture inside the first decoder layer: the input norm, the four attention
# projections and the MoE gate.
model.model.layers[0].input_layernorm.register_forward_hook(hook_fn("input_layernorm"))
model.model.layers[0].self_attn.q_proj.register_forward_hook(hook_fn("self_attn_q_proj"))
model.model.layers[0].self_attn.k_proj.register_forward_hook(hook_fn("self_attn_k_proj"))
model.model.layers[0].self_attn.v_proj.register_forward_hook(hook_fn("self_attn_v_proj"))
model.model.layers[0].self_attn.o_proj.register_forward_hook(hook_fn("self_attn_o_proj"))
model.model.layers[0].block_sparse_moe.gate.register_forward_hook(hook_fn("block_sparse_moe_gate"))

<torch.utils.hooks.RemovableHandle at 0x7f5af66b7eb0>

## Forward Pass

In [None]:
# Short sample sentence used to drive the hooked forward pass.
input_text = "The quick brown fox jumps over the lazy dog !"

# Tokenize the input text
# The tensors are not moved to a device here.
# NOTE(review): other cells call .to("cuda") on the tokenizer output - confirm that leaving
# these inputs where the tokenizer created them is intended.
inputs = tokenizer(input_text, return_tensors="pt")
print("Tokenized inputs {'input_ids','attention_mask'} - ",inputs)
# Decoding the ids again shows the special tokens the tokenizer added around the sentence.
print("Decoded tokens : ",tokenizer.decode(inputs['input_ids'][0]))

Tokenized inputs {'input_ids','attention_mask'} -  {'input_ids': tensor([[    1,   415,  2936,  9060,   285,  1142,   461, 10575,   754,   272,
         17898,  3914,   918,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded tokens :  <s> The quick brown fox jumps over the lazy dog !</s>


In [None]:

# Run one forward pass without gradient tracking. Besides producing model_output, this
# fires the registered forward hooks, which fill the `outputs` dictionary.
with torch.no_grad():
    model_output = model(**inputs)

In [None]:
# for layer, output in outputs.items():
#     print(f"Output at {layer}: ")
#     if isinstance(output, torch.Tensor):
#         print(output.shape, type(output))
#     elif isinstance(output, tuple):
#         for i, o in enumerate(output):
#             print(f"Output {i}: {o.shape if isinstance(o, torch.Tensor) else type(o)}")
#     else:
#         print(type(output))
#     print("-" * 100)



| Layer Name                   | Shape Format                      | Dimensions | Notes                                                      |
|------------------------------|------------------------------------|------------|------------------------------------------------------------|
| `embed_tokens`               | `(batch_size, seq_len, embed_dim)` | `[1, 13, 4096]` | Embedding tokens from vocabulary.                           |
| `self_attn_layer_1`           | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Output of first attention layer.                            |
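| `block_sparse_moe_experts`    | `(tokens_routed_to_expert_0, hidden_dim)` | `[3, 4096]` | Output of expert 0 only (the hook is on `experts[0]`); the first dimension is the number of tokens routed to that expert, not the number of experts. |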
| `post_attention_layernorm`    | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Layer norm after attention.                                 |
| `norm`                       | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Final normalization layer.                                  |
| `lm_head`                    | `(batch_size, seq_len, vocab_size)`| `[1, 13, 32000]` | Logits for each token over the vocabulary.                  |
| `input_layernorm`             | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Input layer normalization.                                  |
| `self_attn_q_proj`            | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Query projection in self-attention.                         |
| `self_attn_k_proj`            | `(batch_size, seq_len, key_dim)`   | `[1, 13, 1024]` | Key projection in self-attention.                           |
| `self_attn_v_proj`            | `(batch_size, seq_len, value_dim)` | `[1, 13, 1024]` | Value projection in self-attention.                         |
| `self_attn_o_proj`            | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Output projection after attention.                          |
| `block_sparse_moe_gate`       | `(seq_len, num_experts)`           | `[13, 8]`   | Gating decisions for the mixture of experts.                |



In [None]:
#