In [1]:
import json
import os
from typing import List,Tuple

import pandas as pd
import torch
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from vllm import LLM, SamplingParams

INFO 04-14 15:52:31 [__init__.py:239] Automatically detected platform cuda.


In [2]:
# Read the TriviaQA Wikipedia dev-split JSON file into a pandas object.
TEST_DATA_PATH = "../qa/wikipedia-dev.json"
test_data = pd.read_json(TEST_DATA_PATH)

In [3]:
# Reader LLM served through vLLM (an AWQ checkpoint of Llama 3.2 3B Instruct).
READER_MODEL_NAME = "AMead10/Llama-3.2-3B-Instruct-AWQ"

# No explicit `quantization=` argument is passed; the engine log printed by
# this cell reports that the awq_marlin kernel is selected at runtime.
# NOTE(review): gpu_memory_utilization=1.0 asks vLLM for the entire GPU, while
# the embedding model created later also targets "cuda" -- confirm that enough
# headroom remains on the device.
model = LLM(
    model=READER_MODEL_NAME,
    max_model_len=4096,
    gpu_memory_utilization=1.0,
    tensor_parallel_size=1,
    enforce_eager=True,
    trust_remote_code=True,
    disable_log_stats=True,
)

INFO 04-14 15:52:38 [config.py:585] This model supports multiple tasks: {'embed', 'classify', 'reward', 'generate', 'score'}. Defaulting to 'generate'.
INFO 04-14 15:52:39 [awq_marlin.py:114] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 04-14 15:52:39 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-14 15:52:40 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='AMead10/Llama-3.2-3B-Instruct-AWQ', speculative_config=None, tokenizer='AMead10/Llama-3.2-3B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(gu

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-14 15:52:44 [loader.py:447] Loading weights took 2.09 seconds
INFO 04-14 15:52:44 [gpu_model_runner.py:1186] Model loading took 2.1364 GB and 3.284608 seconds
INFO 04-14 15:52:46 [kv_cache_utils.py:566] GPU KV cache size: 11,424 tokens
INFO 04-14 15:52:46 [kv_cache_utils.py:569] Maximum concurrency for 4,096 tokens per request: 2.79x
INFO 04-14 15:52:46 [core.py:151] init engine (profile, create kv cache, warmup model) took 1.89 seconds


In [4]:
# Load the components of the RAG system

tokenizer = model.get_tokenizer()

# System turn: answering rules followed by five worked question/answer pairs.
RAG_SYSTEM_PROMPT = """Answer the question with only one word or the simplest possible response (e.g., a single number or a single word).
Do NOT generate sentences, explanations, or additional context.
Stop immediately after providing the answer. Do not generate any further words or tokens.
If the context does not provide any useful information, answer the question based on your own knowledge.
I am going to provide you five examples:

Question: What is the capital of Kenya?
Answer: Nairobi
---
Question: What was the name of the pig leader in George Orwell's Animal Farm?
Answer: Napoleon
---
Question: Which artist created the Katzenjammer Kids?
Answer: Rudolph Dirks
---
Question: Who was Geena Davis's husband when they made the loss-maker Cutthroat Island?
Answer: Renny Harlin
---
Question: Who was married to Spandau Ballet's Gary Kemp and later to Jude Law?
Answer: Sadie Frost

"""

# User turn: `{context}` and `{question}` are placeholders that `read` fills
# in later through `str.format`.
RAG_USER_PROMPT = """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}"""

prompt_in_chat_format = [
    {"role": "system", "content": RAG_SYSTEM_PROMPT},
    {"role": "user", "content": RAG_USER_PROMPT},
]

# Render the two turns with the model's chat template as plain text (not token
# ids), ending with the generation prompt so the model continues as assistant.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    add_generation_prompt=True,
    tokenize=False,
)

# Decoding configuration: a single completion per prompt at temperature 0,
# limited to 5 generated tokens because only very short answers are wanted.
# NOTE(review): with temperature=0 decoding is presumably greedy, in which case
# top_p would have no effect -- confirm against the vLLM SamplingParams docs.
# NOTE(review): a repetition_penalty above 1 may also penalise tokens that
# occur in the prompt, which would discourage copying the answer out of the
# retrieved context -- confirm this is intended for a RAG reader.
sampling_params = SamplingParams(
    temperature=0,
    top_p=0.9,
    repetition_penalty=1.2,
    max_tokens=5,
    n=1,
)

# Embedding model used to encode the questions for retrieval.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# HuggingFace embedding wrapper running on the GPU with multi-process encoding.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cuda"},
    # Normalised embeddings, as needed for cosine similarity.
    encode_kwargs={"normalize_embeddings": True},
    multi_process=True,
)

# Directory holding the previously saved FAISS index.
faiss_index_path = "test_data_saved/Faiss"

# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# serialized Python objects from disk; only point this at an index that was
# produced locally or comes from a trusted source.
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
    faiss_index_path,
    embedding_model,
    allow_dangerous_deserialization=True,
)

def retrieve(
    question: str,
    embedding: List[float],
    knowledge_index,
    num_docs_final: int = 3,
    printing: bool = True,
) -> Tuple[str, List[str]]:
    """
    Look up the passages closest to a precomputed question embedding.

    Args:
        question: Question text; only used for the progress printout.
        embedding: Vector representation of the question.
        knowledge_index: Object exposing `similarity_search_by_vector(embedding, k=...)`
            whose results carry a `page_content` attribute.
        num_docs_final: Number of passages requested from the index.
        printing: When True, print the question and a short preview of each passage.

    Returns:
        A `(context, passages)` pair: the context string handed to the reader
        model and the list of raw passage texts.
    """
    if printing:
        print("\n" + "="*50)
        print(f"=> Retrieving documents for question:\n{question}\n")

    # Query the index with the vector directly; keep only the text of each hit.
    hits = knowledge_index.similarity_search_by_vector(embedding, k=num_docs_final)
    passages = [hit.page_content for hit in hits]

    if printing:
        print(f"=> Retrieved {len(passages)} documents:")
        for rank, passage in enumerate(passages, start=1):
            # One-line preview: first 200 characters, newlines flattened,
            # with an ellipsis when the passage was cut.
            preview = passage[:200].replace("\n", " ")
            if len(passage) > 200:
                preview += "..."
            print(f"Document {rank}: {preview}")

    # Assemble the context block; passages are numbered from 0 here.
    sections = []
    for position, passage in enumerate(passages):
        sections.append(f"Document {position}:::\n{passage}\n")
    context = "\nExtracted documents:\n" + "".join(sections)

    return context, passages

def read(llm, sampling_params, prompt_template, contexts, questions):
    """
    Fill the prompt template for every question/context pair and run the LLM.

    `prompt_template` is formatted with the keyword fields `question` and
    `context`; questions and contexts are paired positionally. The text of the
    first completion of each result is returned, in the order produced by
    `llm.generate`.
    """
    # One prompt per (question, context) pair.
    prompts = []
    for question, context in zip(questions, contexts):
        prompts.append(prompt_template.format(question=question, context=context))

    # A single batched generation call for all prompts.
    completions = llm.generate(prompts, sampling_params)

    # Keep only the text of the first candidate of each result.
    return [completion.outputs[0].text for completion in completions]

# Main function to answer questions using RAG
def answer_with_rag(
    questions: List[str],
    llm,
    prompt_template,
    sampling_params: "SamplingParams",
    knowledge_index,
    embedding_model,
    num_docs_final: int = 3,
    printing: bool = True,
) -> Tuple[List[str], List[List[str]]]:
    """
    Answer questions with a Retrieval-Augmented Generation (RAG) pipeline.

    All questions are embedded in one call, the closest passages are retrieved
    for each question, and the reader LLM then generates every answer in a
    single batch.

    Args:
        questions: A list (or other iterable) of question strings; a bare
            string is treated as one question.
        llm: Reader model, forwarded to `read`.
        prompt_template: Prompt string with `{question}` / `{context}` fields.
        sampling_params: Generation settings, forwarded to `read`.
        knowledge_index: Vector index, forwarded to `retrieve`.
        embedding_model: Object exposing `embed_documents(list_of_str)`.
        num_docs_final: Number of passages retrieved per question.
        printing: When True, print progress information.

    Returns:
        A `(answers, relevant_docs_list)` tuple: one answer string per question
        and, aligned with it, the list of passages retrieved for that question.
        Both lists are empty when no questions are given.
    """
    # Normalise the input: wrap a single string, materialise any other
    # iterable so it can be indexed and traversed more than once.
    if isinstance(questions, str):
        questions = [questions]
    else:
        questions = list(questions)

    # Nothing to do: skip the embedding and generation calls entirely.
    if not questions:
        return [], []

    # Calculate all embeddings for the questions at once
    embeddings = embedding_model.embed_documents(questions)

    # Retrieve contexts for each question
    contexts = []
    relevant_docs_list = []

    for idx, question in enumerate(questions):
        # Use the precomputed embedding for each question
        context, relevant_docs = retrieve(
            question,
            embedding=embeddings[idx],
            knowledge_index=knowledge_index,
            num_docs_final=num_docs_final,
            printing=printing,
        )
        contexts.append(context)
        relevant_docs_list.append(relevant_docs)

    # Generate answers using the LLM model
    if printing:
        print("=> Generating answers...")
    answers = read(llm, sampling_params, prompt_template, contexts, questions)

    return answers, relevant_docs_list

In [5]:
def perform_inference_p(instances, printing: bool = True):
    """
    Generate an answer for every instance in a batch.

    Uses the notebook-level `model`, `RAG_PROMPT_TEMPLATE`, `sampling_params`,
    `KNOWLEDGE_VECTOR_DATABASE` and `embedding_model` objects.

    Args:
        instances: Iterable of mappings that each provide the keys
            "Question" and "QuestionId".
        printing: Forwarded to `answer_with_rag`; set to False to suppress the
            per-question retrieval printout on large batches. Defaults to True,
            which matches the previous hard-coded behaviour.

    Returns:
        A list of {"QuestionId": ..., "Answer": ...} dicts, one per instance,
        in input order.
    """
    # Collect questions and ids in a single pass over the instances.
    questions = []
    question_ids = []
    for instance in instances:
        questions.append(instance["Question"])
        question_ids.append(instance["QuestionId"])

    # Generates answers using `answer_with_rag`; retrieved passages are not needed here.
    answers, _ = answer_with_rag(
        questions=questions,
        llm=model,
        prompt_template=RAG_PROMPT_TEMPLATE,
        sampling_params=sampling_params,
        knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
        embedding_model=embedding_model,
        printing=printing,
    )

    # Release cached GPU memory held by PyTorch once generation is done.
    torch.cuda.empty_cache()

    # Associates the answers with their respective IDs
    return [
        {"QuestionId": qid, "Answer": answer}
        for qid, answer in zip(question_ids, answers)
    ]

def parallel_inference(validation_data):
    """
    Run inference on `validation_data["Data"]` and package the results.

    Despite its name, this function itself does nothing in parallel: it makes
    one call to `perform_inference_p` for the whole set of instances.

    Returns:
        A `(predictions, triviaqa_instances)` pair. `predictions` maps each
        question id to its generated answer; `triviaqa_instances` is a dict
        with the keys "Data", "Domain", "VerifiedEval" and "Version" whose
        "Data" list holds the original instances.
    """
    instances = validation_data["Data"]

    # Generate one answer record per instance.
    results = perform_inference_p(instances)

    # Index the answers by question id.
    predictions = {record["QuestionId"]: record["Answer"] for record in results}

    # Wrap the evaluated instances together with the dataset metadata fields.
    triviaqa_instances = {
        "Data": list(instances),
        "Domain": "Wikipedia",
        "VerifiedEval": False,
        "Version": 1.0,
    }

    return predictions, triviaqa_instances

In [6]:
# For the sake of simplicity, only the first `sample_size` examples are used
# as evaluation data.
sample_size = 500
data = test_data.head(sample_size)

# Retrieve and generate for every selected question.
predictions, triviaqa_instances = parallel_inference(data)


=> Retrieving documents for question:
Which Lloyd Webber musical premiered in the US on 10th December 1993?

=> Retrieved 3 documents:
Document 1: Andrew Lloyd Webber, Baron Lloyd-Webber   (born 22 March 1948) is an English composer and impresario of musical theatre.   Several of his musicals have run for more than a decade both in the West End ...
Document 2: Aspects of Love followed in 1989, a musical based on the story by David Garnett. The lyrics were by Don Black and Charles Hart and the original production was directed by Trevor Nunn. Aspects had a ru...
Document 3: Lloyd Webber started writing his own music at a young age, a suite of six pieces at the age of nine. He also put on "productions" with Julian and his Aunt Viola in his toy theatre (which he built at V...

=> Retrieving documents for question:
Who was the next British Prime Minister after Arthur Balfour?

=> Retrieved 3 documents:
Document 1: Andrew Bonar Law (16 September 1858 – 30 October 1923), commonly called Bona

Processed prompts: 100%|██████████| 500/500 [01:20<00:00,  6.18it/s, est. speed input: 9508.30 toks/s, output: 21.60 toks/s]


In [7]:
# Define file output paths
file_path_instances = 'test_results/triviaqa_test_instances_500.json'

# Save TriviaQA-formatted instances
with open(file_path_instances, 'w') as f:
    json.dump(triviaqa_instances, f, indent=4)

In [8]:
# Save the question-id -> answer predictions.
# NOTE(review): this file name is spelled "trivaqa" and has no "_500" suffix,
# unlike the "triviaqa_..._500" instances file; it is left unchanged here in
# case other tooling already reads this exact path -- confirm before renaming.
file_path_predictions = 'test_results/trivaqa_test_predictions.json'

# Create the output directory if needed, so that writing does not fail with
# FileNotFoundError when the directory is missing.
predictions_dir = os.path.dirname(file_path_predictions)
if predictions_dir:
    os.makedirs(predictions_dir, exist_ok=True)

with open(file_path_predictions, 'w') as f:
    json.dump(predictions, f, indent=4)