# Finetuning Llama 3.2 Model into Embedding Model

## Goal

While LLaMA 3.2 is a powerful large language model (LLM) pre-trained on diverse datasets, its application to specific downstream tasks—such as semantic search, document retrieval, or natural language understanding—requires adapting the model to effectively generate dense vector representations (embeddings). In this tutorial, we will demonstrate how to finetune this model and convert it into a state-of-the-art embedding model for retrieval-augmented generation (RAG) tasks.

The key architectural change involves modifying the LLaMA model to optimize its performance in generating embeddings by replacing causal attention with bidirectional attention. This change enables the decoder-only model to create embeddings that are contextually relevant, semantically rich, and capable of improving the efficiency and accuracy of tasks like information retrieval, clustering, and text classification.

Our primary goals for this tutorial are as follows:

 * Demonstrate the ease of automatically converting the model with essential architectural changes for embedding model training
 * Improve the model's performance and accuracy in generating dense vector representations (embeddings)
 * Provide guidelines for finetuning embedding models, including hyperparameter choices.


# NeMo Tools and Resources

* [NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)

# Software Requirements

* Access to the latest NeMo Framework NGC containers


# Hardware Requirements

* This playbook has been tested on the following hardware: Single A6000, Single H100, 2xA6000, 8xH100. It can be scaled to multiple GPUs as well as multiple nodes by modifying the appropriate parameters.


#### Launch the NeMo Framework container as follows: 

Depending on the number of GPUs available, adjust the `--gpus` argument accordingly:
```
docker run -it -p 8080:8080 -p 8088:8088 --rm --gpus '"device=0,1"' --ipc=host --network host -v $(pwd):/workspace nvcr.io/nvidia/nemo:25.02
```

#### Launch Jupyter Notebook as follows: 
```
jupyter notebook --allow-root --ip 0.0.0.0 --port 8088 --no-browser --NotebookApp.token=''

```

In [None]:
import os
from pathlib import Path
import torch
from typing import Literal, Optional, Union
from transformers import AutoModel, AutoTokenizer

In [None]:
class TextEmbeddingModelAdapter(torch.nn.Module):
    """Wrap a text embedding backbone with pooling and optional L2 normalization.

    The wrapped ``model`` is called with the tokenized inputs, its
    ``"last_hidden_state"`` output is cast to float32 and reduced to one vector
    per sequence by ``pooling_module``. The pooled vectors can optionally be
    truncated per row and L2-normalized.

    Args:
        model: Backbone whose output exposes ``"last_hidden_state"``.
        normalize: Whether to L2-normalize the pooled embeddings along dim 1.
        pooling_module: Module called as ``pooling_module(hidden_states,
            attention_mask)`` that returns the pooled embeddings.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        normalize: bool,
        pooling_module: torch.nn.Module,
    ) -> None:
        super().__init__()
        self.model = model
        self.normalize = normalize
        self.pooling_module = pooling_module

    @property
    def device(self) -> torch.device:
        """Device reported by the wrapped model."""
        return self.model.device

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        token_type_ids: Optional[torch.Tensor] = None,
        dimensions: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute pooled (and optionally truncated / normalized) embeddings.

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model and
                to the pooling module.
            token_type_ids: Optional segment ids; only forwarded when given.
            dimensions: Optional tensor with the number of leading embedding
                components to keep for each row (expected to hold one value
                per batch row, since it is broadcast against the embedding
                axis). Components at or beyond that count are filled with
                ``-inf`` and the result is cut to ``dimensions.max()`` columns.

        Returns:
            The pooled embeddings as a float32 tensor.

        Raises:
            ValueError: If any entry of ``dimensions`` is not positive.
        """
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        if token_type_ids is not None:
            inputs["token_type_ids"] = token_type_ids
        outputs = self.model(**inputs)
        # Pool in float32 regardless of the dtype the backbone runs in.
        hidden_states = outputs["last_hidden_state"].to(torch.float32)
        embeddings = self.pooling_module(hidden_states, attention_mask)

        if dimensions is not None:
            if not torch.all(dimensions > 0):
                raise ValueError("Dimensions must be positive")

            fill_value = torch.tensor(
                float("-inf"), dtype=embeddings.dtype, device=embeddings.device
            )

            # Requests larger than the embedding width are clipped to the width.
            clipped_dimensions = torch.where(
                dimensions < embeddings.shape[1],
                dimensions,
                torch.tensor(embeddings.shape[1], device=embeddings.device),
            )

            # Mask out every component index >= the per-row request, then trim
            # the tensor to the largest request in the batch.
            embeddings = embeddings.masked_fill(
                torch.arange(embeddings.shape[1], device=embeddings.device)
                >= clipped_dimensions.unsqueeze(-1),
                fill_value,
            )[:, : dimensions.max()]

        if self.normalize:
            # Use the fully qualified name: the file imports only `torch`, so the
            # conventional alias `F` is not defined here.
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings

In [None]:
class Pooling(torch.nn.Module):
    """Reduce per-token hidden states to a single vector per sequence.

    Supported ``pooling_mode`` values:
        * ``"avg"``: sum over the sequence axis divided by the mask sum.
        * ``"cls"``: position 0 (right-padded batches).
        * ``"cls__left"``: first unmasked position (left-padded batches).
        * ``"last"``: final position (left-padded batches).
        * ``"last__right"``: last unmasked position (right-padded batches).
    """

    def __init__(self, pooling_mode: str):
        super().__init__()
        self.pooling_mode = pooling_mode

    def forward(
        self, last_hidden_states: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Pool ``last_hidden_states`` according to ``self.pooling_mode``.

        Raises:
            ValueError: If the configured pooling mode is not supported.
        """
        # Zero the hidden states at masked-out positions before pooling.
        is_padding = ~attention_mask[..., None].bool()
        hidden = last_hidden_states.masked_fill(is_padding, 0.0)
        mode = self.pooling_mode

        if mode == "avg":
            # The small constant guards against division by zero.
            token_counts = attention_mask.sum(dim=1)[..., None]
            return hidden.sum(dim=1) / (token_counts + 1e-9)

        if mode == "cls":
            return hidden[:, 0]

        if mode == "cls__left":
            # With left padding, the number of pad tokens is the index of the
            # first real token.
            first_real = (1 - attention_mask).sum(dim=1)
            rows = torch.arange(hidden.shape[0], device=hidden.device)
            return hidden[rows, first_real]

        if mode == "last":
            return hidden[:, -1]

        if mode == "last__right":
            # With right padding, the last real token sits at (mask sum - 1).
            last_real = attention_mask.sum(dim=1) - 1
            rows = torch.arange(hidden.shape[0], device=hidden.device)
            return hidden[rows, last_real]

        raise ValueError(f"pool_type {mode} not supported")

In [None]:
def get_transformers_model(
    model_name_or_path: Union[str, os.PathLike[str]],
    normalize: bool,
    pooling_mode: Optional[Literal["avg", "cls", "last"]] = None,
    torch_dtype: Optional[Union[torch.dtype, str]] = None,
    trust_remote_code: bool = False,
):
    """Load a Hugging Face model and tokenizer and wrap the model for embeddings.

    Args:
        model_name_or_path: Identifier or path passed to ``from_pretrained``.
        normalize: Whether the adapter should normalize its embeddings.
        pooling_mode: Requested pooling; ``None`` falls back to ``"avg"``.
        torch_dtype: Forwarded to ``AutoModel.from_pretrained``.
        trust_remote_code: Forwarded to both ``from_pretrained`` calls.

    Returns:
        A ``(adapted_model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path, trust_remote_code=trust_remote_code
    )

    # "last" and "cls" pooling depend on which side the tokenizer pads, so pick
    # the padding-aware variant when the default indexing would hit padding.
    requested_mode = pooling_mode or "avg"
    if requested_mode == "last" and tokenizer.padding_side == "right":
        resolved_mode = "last__right"
    elif requested_mode == "cls" and tokenizer.padding_side == "left":
        resolved_mode = "cls__left"
    else:
        resolved_mode = requested_mode

    model = AutoModel.from_pretrained(
        model_name_or_path, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code
    )
    model = model.eval()

    pooling_module = Pooling(pooling_mode=resolved_mode)

    # NV-Embed-v1 model has separate embedding model and a built-in pooling module
    is_nv_embed = model.__class__.__name__ == "NVEmbedModel" and all(
        hasattr(model, attr) for attr in ("latent_attention_model", "embedding_model")
    )
    if is_nv_embed:
        pooling_module = model.latent_attention_model
        model = model.embedding_model

    return (
        TextEmbeddingModelAdapter(
            model=model, normalize=normalize, pooling_module=pooling_module
        ),
        tokenizer,
    )

In [None]:
def EmbeddingModelAdapter(
    model_path: str | os.PathLike[str],
    normalize: bool,
    *args,
    pooling_mode: Optional[Literal["avg", "cls", "last"]] = None,
    trust_remote_code: bool = False,
    **kwargs,
) -> tuple[TextEmbeddingModelAdapter, object]:
    """Load a model from ``model_path`` and return it wrapped for embedding use.

    This is a thin wrapper around ``get_transformers_model``.

    Args:
        model_path: Path to the model.
        normalize: Whether or not to normalize embeddings.
        *args: Accepted for call compatibility; currently ignored.
        pooling_mode: Pooling to apply.
        trust_remote_code: Whether or not to run custom code.
        **kwargs: Accepted for call compatibility; currently ignored.

    Returns:
        A ``(model, tokenizer)`` tuple: the ``TextEmbeddingModelAdapter`` and
        the tokenizer loaded from the same path.
    """
    return get_transformers_model(
        model_path,
        normalize=normalize,
        pooling_mode=pooling_mode,
        trust_remote_code=trust_remote_code,
    )

In [None]:
# --- Input artifacts ---
hf_model_path = "/opt/checkpoints/llama_embedding_converted_hf"
quantization_calibration_data = "/opt/checkpoints/question_doc_pairs_500.json"

# --- Hugging Face adapter settings ---
normalize = False
pooling_mode = "last"

# --- ONNX export settings ---
onnx_export_path = "/opt/checkpoints/llama_embedding_onnx/"
opset = 17
export_dtype = "fp32"
use_dimension_arg = True  # expose the extra "dimensions" model input

# --- TensorRT build settings ---
trt_model_path = Path("/opt/checkpoints/llama_embedding_trt/model.plan")
profiling_verbosity = "layer_names_only"
override_layernorm_precision_to_fp32 = True
override_layers_to_fp32 = [
    "/model/norm/",
    "/pooling_module",
    "/ReduceL2",
    "/Div",
]

# --- Pipeline switches ---
quantize_model = False  # run PTQ before the ONNX export when True
quantization_type = "fp8"
export_to_trt = True  # build a TensorRT engine from the exported ONNX model

In [None]:
# Adapt the model first: load the Hugging Face checkpoint and wrap it with the
# configured pooling / normalization. The call returns both the wrapped model
# and the tokenizer loaded from the same path.
model, tokenizer = EmbeddingModelAdapter(
    model_path=hf_model_path,
    normalize=normalize,
    pooling_mode=pooling_mode,
    trust_remote_code=True,  # allow custom code shipped with the checkpoint to run
)

In [None]:
from nemo.export.onnx_llm_exporter import OnnxLLMExporter

# Inputs shared by every export; both are dynamic in batch size and sequence length.
input_names = ["input_ids", "attention_mask"]
dynamic_axes_input = {
    "input_ids": {0: "batch_size", 1: "seq_length"},
    "attention_mask": {0: "batch_size", 1: "seq_length"},
}

# Optionally expose the per-row embedding size as a third model input.
if use_dimension_arg:
    input_names.append("dimensions")
    dynamic_axes_input["dimensions"] = {0: "batch_size"}

output_names = ["embeddings"]
dynamic_axes_output = {"embeddings": {0: "batch_size", 1: "embedding_dim"}}

onnx_exporter = OnnxLLMExporter(
    onnx_model_dir=onnx_export_path,
    model=model,
    tokenizer=tokenizer,
)

# Post-training quantization, when enabled, runs before the export.
if quantize_model:
    onnx_exporter.ptq(
        calibration_data=quantization_calibration_data,
        quantization_type=quantization_type,
    )

onnx_exporter.export(
    input_names=input_names,
    output_names=output_names,
    opset=opset,
    dynamic_axes_input=dynamic_axes_input,
    dynamic_axes_output=dynamic_axes_output,
    # Honor the configured dtype instead of a hard-coded literal so that
    # changing `export_dtype` in the parameters cell actually takes effect.
    export_dtype=export_dtype,
)

In [None]:
if export_to_trt:
    # One optimization profile; each input lists three shapes
    # (presumably [min, opt, max] — confirm against export_onnx_to_trt).
    input_profiles = [
        {
            "input_ids": [[1, 3], [16, 128], [64, 256]],
            "attention_mask": [[1, 3], [16, 128], [64, 256]],
        }
    ]
    # The extra input only exists when the model was exported with it.
    if use_dimension_arg:
        input_profiles[0]["dimensions"] = [[1], [16], [64]]

    onnx_exporter.export_onnx_to_trt(
        trt_model_path=Path(trt_model_path),
        profiles=input_profiles,
        override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,
        override_layers_to_fp32=override_layers_to_fp32,
        profiling_verbosity=profiling_verbosity,
    )

In [None]:
# Smoke-test the exported model on a small batch of two prompts.
prompt = ["hello", "world"]

if use_dimension_arg:
    # Tokenize up front so the extra "dimensions" input can be attached to the
    # tokenized inputs before running inference.
    prompt = onnx_exporter.get_tokenizer(prompt)
    # NOTE(review): the ONNX export declares "dimensions" with a batch-sized
    # axis 0; confirm that [[2]] is the intended shape for a two-prompt batch.
    prompt["dimensions"] = [[2]]

# NOTE(review): when use_dimension_arg is False the raw list of strings is
# passed to forward() — confirm that forward() tokenizes list inputs itself.
print(onnx_exporter.forward(prompt))

In [None]:
#from nemo.export.tensorrt_lazy_compiler import TRTEngine
#engine = TRTEngine(plan_path="/opt/checkpoints/llama_embedding_trt/model.plan")