# Step 3. Domain-Adapted Retrieval Model

In [None]:
!pip install llama-index==0.10
!pip install lightning

## (1) Convert HF model to .nemo

In [None]:
# Root directories for data and model artifacts.
DATA_ROOT_DIR = "/work/Data"
MODEL_ROOT_DIR = "/work/Models"

# Hugging Face model identifiers converted to .nemo in the cells below:
# the encoder is the embedding model, the LLM is used to generate samples.
HF_ENCODER_MODEL = "intfloat/e5-small-unsupervised"
HF_LLM_MODEL = "meta-llama/Llama-3.1-8B"

In [None]:
embed_nemo_path = "/work/Models/e5-small.nemo"

!python /opt/NeMo/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py \
       --input_name_or_path $HF_ENCODER_MODEL \
       --output_path $embed_nemo_path \
       --mcore True \
       --precision bf16

In [None]:
llm_nemo_path = "/work/Models/llama3.nemo"
precision = "bf16"

# Convert HF Model to NeMo
!python /opt/NeMo/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py --input_name_or_path $HF_LLM_MODEL --output_path $llm_nemo_path --precision $precision --llama31 True 

## (2) Auto-Generated Domain-specific Retrieval Sample

In [None]:
vector_dir = f"{DATA_ROOT_DIR}/index" # save path for vectir database
data_dir = f"{DATA_ROOT_DIR}/docs" # your document directory for retrieve

!python /opt/NeMo/examples/nlp/rag/rag_indexing.py \
        trainer.devices=1 \
        trainer.precision='bf16-mixed' \
        indexing.embedder.model_path=$embed_nemo_path \
        indexing.embedder.embed_batch_size=128 \
        indexing.data.data_path=$data_dir \
        indexing.data.chunk_size=256 \
        indexing.data.chunk_overlap=10 \
        indexing.index_path=$vector_dir

In [None]:
data_dir = f"{DATA_ROOT_DIR}/retrieval" # save path for auto generated data

!python /opt/NeMo/tutorials/llm/llama-3/dapt/code/rag_auto_generate_sample.py \
    --config-path=/opt/NeMo/examples/nlp/rag/conf \
    --config-name=rag_generating \
    indexing.index_path=$vector_dir \
    indexing.embedder.model_path=$embed_nemo_path \
    generating.llm.model_path=$llm_nemo_path \
    ++generating.top_k=4 \
    ++generating.num_random=4 \
    ++generating.output_dir=$data_dir \
    ++generating.num_sample=3000 \
    ++generating.prefix="train" 

### (Optional) Prompt Engineering 

If the auto-generated samples are of poor quality for your model, use the cell below to debug and design a more suitable prompt, then update the prompt in `./code/rag_auto_generate_sample.py`.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
HF_LLM_MODEL = "meta-llama/Llama-3.1-8B"

# Pick the device first so the load dtype can be chosen to match it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# from_pretrained loads weights in float32 unless told otherwise. Use bf16 when
# the GPU supports it: this halves memory for the 8B model and matches the bf16
# precision used for the .nemo conversion. Fall back to float32 otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
load_dtype = torch.bfloat16 if use_bf16 else torch.float32

tokenizer = AutoTokenizer.from_pretrained(HF_LLM_MODEL)
model = AutoModelForCausalLM.from_pretrained(HF_LLM_MODEL, torch_dtype=load_dtype)

# Move the model to GPU if available
model.to(device)

# Prompt asking the model to write one query for a passage. The `{document}`
# placeholder is filled with str.format, and the text after the final
# "## Predict Query:" header is what the example code below keeps as the answer.
# NOTE: whitespace inside the template is part of the prompt; do not reformat it.
QUERY_PROMPT_TEMPLATE = """You will be provided with a document or a passage. Your task is to generate a single, highly relevant and natural language query that aligns perfectly with the content of the document.
The query should:
    1. Reflect the main idea or a key detail from the document.
    2. Be concise, specific, and written in natural language.
    3. Be something a user might naturally ask to retrieve this document.
    4. **Write only the answer, do not repeat the instructions or document.**
 
 ## Given Document:
 {document}
 
 ## Predict Query:
 """
 
# Prompt asking the model to judge whether a document is relevant to a query.
# The `{query}` and `{document}` placeholders are filled with str.format, and the
# text after the final "## Is Relevant (True or False):" header is kept as the answer.
FILTER_PROMPT_TEMPLATE = """You will be provided with a document and a query.
Your task is to evaluate whether the content of the document is relevant to answering the query. 

Return "True" if the document contains information directly related to the query, and "False" if it does not.
**Provide only the answer: "True" or "False", without repeating the instructions, document, or query.**

## Given Query:
{query}

## Given Document:
{document}

## Is Relevant (True or False):
"""

# Function to send a query and get a response
def get_response(query, max_length=512):
    """
    Sends a query to the Hugging Face model and receives a response.

    Args:
        query (str): The input text query (the full prompt).
        max_length (int): Maximum number of tokens to generate for the
            response. The prompt's own tokens do not count against this
            budget, so long prompts cannot starve or truncate the completion.

    Returns:
        str: The decoded first generated sequence with special tokens removed.
            The example code below splits this text on the prompt's final
            header to isolate the completion.
    """
    # Tokenize the input query and move tensors to the same device as the model
    inputs = tokenizer(query, return_tensors="pt").to(device)

    # Some tokenizers define no pad token; fall back to EOS so generate()
    # has an explicit padding id instead of guessing one.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate response
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # pass the mask explicitly
        max_new_tokens=max_length,  # budget applies to the response only
        num_return_sequences=1,
        do_sample=True,  # Set to True for sampling; False for deterministic output
        temperature=0.7,  # Controls randomness of predictions
        top_k=50,         # Top-k sampling
        top_p=0.9,        # Top-p sampling
        pad_token_id=pad_token_id,
    )

    # Decode the output tokens
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Sample passage used to exercise both prompt templates
document = (
    "NVIDIA NeMo Framework is a scalable and cloud-native generative AI framework built for researchers and "
    "PyTorch developers working on Large Language Models (LLMs), Multimodal Models (MMs), Automatic Speech Recognition (ASR), "
    "Text to Speech (TTS), and Computer Vision (CV) domains. It is designed to help you efficiently create, customize, "
    "and deploy new generative AI models by leveraging existing code and pre-trained model checkpoints."
)

# 1) Query generation: keep only the text after the last answer header
query = QUERY_PROMPT_TEMPLATE.format(document=document)
query_output = get_response(query)
response = query_output.rpartition("## Predict Query:")[2]
print(f"1 Response: {response}")

print("********")

# 2) Relevance filtering: keep only the text after the last answer header
query2 = FILTER_PROMPT_TEMPLATE.format(query="What is the NeMo?", document=document)
filter_output = get_response(query2)
response2 = filter_output.rpartition("## Is Relevant (True or False):")[2]
print(f"2 Response: {response2}")

In [None]:
!ls $data_dir

## (3) Finetuning Retrieval Model

In [None]:
data_file = data_dir + "/train_data.json"

!python /opt/NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \
    exp_manager.exp_dir=/work/log/retrieval \
    restore_from_path=$embed_nemo_path \
    trainer.devices=8 \
    trainer.precision=bf16 \
    trainer.max_epochs=1 \
    trainer.max_steps=-1 \
    trainer.val_check_interval=2 \
    trainer.limit_val_batches=8 \
    trainer.limit_test_batches=8 \
    model.micro_batch_size=1 \
    model.global_batch_size=64 \
    model.data.data_impl=jsonl \
    model.hidden_size=384 \
    model.num_layers=12 \
    model.ffn_hidden_size=1536 \
    model.data.data_train=$data_file