In [10]:
from vllm import LLM, SamplingParams
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from typing import List, Optional,Tuple
import pandas as pd

In [11]:
# Read the QA set from disk and keep its first 7900 rows as the validation split.
train_data = pd.read_json(path_or_buf="../qa/wikipedia-train.json")
validation_data = train_data.iloc[0:7900]

In [None]:
# Reader LLM checkpoint served through vLLM.
# Alternative checkpoint tried previously:
#   "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
READER_MODEL_NAME = "AMead10/Llama-3.2-3B-Instruct-AWQ"

# Build the vLLM engine. `quantization` is deliberately left unset (an explicit
# quantization="awq" was tried and disabled).
# NOTE(review): gpu_memory_utilization=1.0 lets vLLM claim the whole GPU, while
# the embedding model is also placed on CUDA later in this notebook — confirm
# there is enough headroom on the target device.
model = LLM(
    model=READER_MODEL_NAME,
    max_model_len=4096,  # upper bound on prompt + completion tokens per request
    tensor_parallel_size=1,
    gpu_memory_utilization=1.0,
    enforce_eager=True,
    trust_remote_code=True,
    disable_log_stats=True,
)

INFO 04-06 21:53:58 [config.py:585] This model supports multiple tasks: {'classify', 'reward', 'score', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 04-06 21:53:58 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-06 21:54:03 [__init__.py:239] Automatically detected platform cuda.
INFO 04-06 21:54:04 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:04<00:00,  4.12s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:04<00:00,  4.12s/it]



INFO 04-06 21:54:11 [loader.py:447] Loading weights took 4.38 seconds
INFO 04-06 21:54:12 [gpu_model_runner.py:1186] Model loading took 3.4213 GB and 6.436726 seconds
INFO 04-06 21:54:18 [kv_cache_utils.py:566] GPU KV cache size: 5,760 tokens
INFO 04-06 21:54:18 [kv_cache_utils.py:569] Maximum concurrency for 4,096 tokens per request: 1.41x
INFO 04-06 21:54:19 [core.py:151] init engine (profile, create kv cache, warmup model) took 7.00 seconds


In [18]:
# Build the prompt side of the RAG system: the reader's tokenizer, the chat
# messages, and the rendered prompt template.

# Tokenizer of the reader model; used below only to render the chat template.
tokenizer = model.get_tokenizer()

# Chat messages for the reader:
#   - system: answer-format instructions followed by five worked Q/A examples;
#   - user:   the retrieved context and the question, left as `{context}` and
#             `{question}` placeholders.
# The placeholders are filled later with `str.format` (see `read`), so any
# literal curly brace added to these strings would have to be doubled.
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Answer the question with only one word or the simplest possible response (e.g., a single number or a single word).
Do NOT generate sentences, explanations, or additional context.
Stop immediately after providing the answer. Do not generate any further words or tokens.
If the context does not provide any useful information, answer the question based on your own knowledge.
I am going to provide you five examples:

Question: What is the capital of Kenya?
Answer: Nairobi
---
Question: What was the name of the pig leader in George Orwell's Animal Farm?
Answer: Napoleon
---
Question: Which artist created the Katzenjammer Kids?
Answer: Rudolph Dirks
---
Question: Who was Geena Davis's husband when they made the loss-maker Cutthroat Island?
Answer: Renny Harlin
---
Question: Who was married to Spandau Ballet's Gary Kemp and later to Jude Law?
Answer: Sadie Frost

"""
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}"""
    },
]

# Render the messages once into a single prompt string. `tokenize=False` asks
# for text rather than token ids; `add_generation_prompt=True` asks the chat
# template to end the prompt at the start of the assistant's turn.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)

# Decoding settings for the reader: a single, short completion per prompt.
# NOTE(review): temperature=0 requests greedy decoding, so top_p presumably has
# no effect here — confirm against the vLLM SamplingParams documentation. Also
# check whether repetition_penalty discourages copying an answer verbatim from
# the retrieved context.
sampling_params = SamplingParams(
    max_tokens=10,  # hard cap on generated tokens; longer answers are cut off
    temperature=0,
    top_p=0.9,
    repetition_penalty=1.2,
    n=1,
)

# Embedding model used to turn questions into vectors for retrieval.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cuda"},
    # Normalized embeddings, so that similarity behaves like cosine similarity.
    encode_kwargs={"normalize_embeddings": True},
    multi_process=True,
)

# Load the prebuilt FAISS index from a local path.
# NOTE(review): allow_dangerous_deserialization=True permits pickle-based
# loading, which can execute arbitrary code — only load index files that were
# produced by a trusted source.
faiss_index_path = "knowledge_vector_database-validation"
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
    faiss_index_path,
    embedding_model,
    allow_dangerous_deserialization=True,
)

def retrieve(
    question: str,
    embedding: List[float],
    knowledge_index,
    num_docs_final: int = 3,
    printing: bool = True,
) -> Tuple[str, List[str]]:
    """
    Look up the documents closest to a precomputed question embedding.

    Args:
        question: The question text. Only used for the progress printout.
        embedding: The embedding vector of the question.
        knowledge_index: Vector store exposing `similarity_search_by_vector`.
        num_docs_final: Number of documents to request from the index.
        printing: When True, print the question and a short preview of each hit.

    Returns:
        A `(context, documents)` pair: `context` is the retrieved texts joined
        into a single string for the LLM prompt (documents are numbered from 0
        there), and `documents` is the list of raw retrieved texts.
    """
    if printing:
        print("\n" + "=" * 50)
        print(f"=> Retrieving documents for question:\n{question}\n")

    # Vector search with the embedding supplied by the caller; keep text only.
    hits = knowledge_index.similarity_search_by_vector(embedding, k=num_docs_final)
    texts = [hit.page_content for hit in hits]

    if printing:
        print(f"=> Retrieved {len(texts)} documents:")
        for number, text in enumerate(texts, start=1):
            # One-line preview: first 200 characters, newlines flattened.
            suffix = "..." if len(text) > 200 else ""
            preview = text[:200].replace("\n", " ") + suffix
            print(f"Document {number}: {preview}")

    # Assemble the context block that is inserted into the prompt.
    sections = []
    for position, text in enumerate(texts):
        sections.append(f"Document {position}:::\n{text}\n")
    context = "\nExtracted documents:\n" + "".join(sections)

    return context, texts

def read(llm, sampling_params, prompt_template, contexts, questions):
    """
    Fill the prompt template for every question/context pair and run the LLM.

    `prompt_template` must contain `{question}` and `{context}` placeholders;
    questions and contexts are paired positionally. Returns the text of the
    first completion for each prompt, in the order `llm.generate` returns them.
    """
    # One fully formatted prompt per (question, context) pair.
    filled_prompts = []
    for query, ctx in zip(questions, contexts):
        filled_prompts.append(prompt_template.format(question=query, context=ctx))

    # Single batched generation call.
    generations = llm.generate(filled_prompts, sampling_params)

    # Keep only the text of the first candidate of each request.
    return [generation.outputs[0].text for generation in generations]

# Main function to answer questions using RAG
def answer_with_rag(
    questions: List[str],
    llm,
    prompt_template,
    sampling_params: "SamplingParams",
    knowledge_index,
    embedding_model,
    num_docs_final: int = 3,
    printing: bool = True,
) -> Tuple[List[str], List[List[str]]]:
    """
    Answer one or more questions with a Retrieval-Augmented Generation pipeline.

    Every question is embedded, its nearest documents are retrieved from
    `knowledge_index`, and the LLM is prompted with the question plus the
    retrieved context.

    Args:
        questions: A list of questions. A single string is also accepted and is
            treated as a one-element list.
        llm: The generation model, passed through to `read`.
        prompt_template: Prompt string with `{question}` and `{context}`
            placeholders.
        sampling_params: Decoding settings forwarded to the LLM.
        knowledge_index: Vector store searched by `retrieve`.
        embedding_model: Object exposing `embed_documents(list_of_texts)`.
        num_docs_final: Number of documents retrieved per question.
        printing: When True, print progress information.

    Returns:
        A tuple `(answers, relevant_docs_list)`: one answer string per question
        and, in the same order, the list of document texts retrieved for it.
        Both lists are empty when no question is given.
    """
    # A bare string is one question, not a sequence of characters.
    if isinstance(questions, str):
        questions = [questions]
    else:
        questions = list(questions)

    # Nothing to do: skip the embedding model and the LLM entirely.
    if not questions:
        return [], []

    # Embed all questions in a single batched call.
    embeddings = embedding_model.embed_documents(questions)

    # Retrieve the context for each question from its precomputed embedding.
    contexts = []
    relevant_docs_list = []
    for question, embedding in zip(questions, embeddings):
        context, relevant_docs = retrieve(
            question,
            embedding=embedding,
            knowledge_index=knowledge_index,
            num_docs_final=num_docs_final,
            printing=printing,
        )
        contexts.append(context)
        relevant_docs_list.append(relevant_docs)

    # Generate all answers in one batched LLM call.
    if printing:
        print("=> Generating answers...")
    answers = read(llm, sampling_params, prompt_template, contexts, questions)

    return answers, relevant_docs_list

In [14]:
# Smoke-test the whole pipeline on a single hand-written question.
question = "Where was born the Queen Elizabeth II?"

answers, relevant_docs_list = answer_with_rag(
    questions=question,
    llm=model,
    sampling_params=sampling_params,
    prompt_template=RAG_PROMPT_TEMPLATE,
    embedding_model=embedding_model,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
    num_docs_final=3,
    printing=True,
)

# A single question was asked, so only the first entry of each result is shown.
print(f"Question:  {question}")
print(f"Answer:  {answers[0]}")
print(f"Relevant Docs:  {relevant_docs_list[0]}")

=> Retrieving documents...
=> Generating answers...


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s, est. speed input: 2453.44 toks/s, output: 5.50 toks/s]

Question:  Where was born the Queen Elizabeth II?
Answer:  Greenwich
Relevant Docs:  ["Elizabeth II (Elizabeth Alexandra Mary; born 21 April 1926) is, and has been since her accession in 1952, Queen of the United Kingdom, Canada, Australia, and New Zealand, and Head of the Commonwealth. She is also Queen of 12 countries that have become independent since her accession: Jamaica, Barbados, the Bahamas, Grenada, Papua New Guinea, Solomon Islands, Tuvalu, Saint Lucia, Saint Vincent and the Grenadines, Belize, Antigua and Barbuda, and Saint Kitts and Nevis.\n\nElizabeth was born in London to the Duke and Duchess of York, later King George VI and Queen Elizabeth, and was the elder of their two daughters. She was educated privately at home. Her father acceded to the throne on the abdication of his brother Edward VIII in 1936, from which time she was the heir presumptive. She began to undertake public duties during the Second World War, serving in the Auxiliary Territorial Service. In 1947, sh




In [21]:
# Run the pipeline over the first 500 validation rows and print every answer.
validation_data_batch = validation_data.iloc[:500]
questions = [record["Question"] for record in validation_data_batch["Data"]]

# The retrieved documents are not needed here, so they are discarded.
answers, _ = answer_with_rag(
    questions=questions,
    llm=model,
    sampling_params=sampling_params,
    prompt_template=RAG_PROMPT_TEMPLATE,
    embedding_model=embedding_model,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
    num_docs_final=5,
    printing=False,
)

for question_text, answer_text in zip(questions, answers):
    print(f"Question:  {question_text}")
    print(f"Answer:  {answer_text}")

Processed prompts: 100%|██████████| 500/500 [04:48<00:00,  1.73it/s, est. speed input: 4161.80 toks/s, output: 5.92 toks/s]

Question:  Where in England was Dame Judi Dench born?
Answer:  Heworth
Question:  From which country did Angola achieve independence in 1975?
Answer:  Portugal
Question:  Which city does David Soul come from?
Answer:  Chicago
Question:  Who won Super Bowl XX?
Answer:  Bears
Question:  Which was the first European country to abolish capital punishment?
Answer:  Italy
Question:  In which country did he widespread use of ISDN begin in 1988?
Answer:  Japan
Question:  What is Bruce Willis' real first name?
Answer:  Bruce
Question:  Which William wrote the novel Lord Of The Flies?
Answer:  Golding
Question:  How is Joan Molinsky better known?
Answer:  Joan
Question:  In which branch of the arts is Patricia Neary famous?
Answer:  Dance
Question:  Which country is Europe's largest silk producer?
Answer:  No data provided
Question:  The VS-300 was a type of what?
Answer:  Helicopter
Question:  At which university did Joseph Goebbels become a doctor of philosophy?
Answer:  University of Heidelbe


