In [1]:
import json
from pathlib import Path

import pandas as pd
import torch
from vllm import LLM, SamplingParams


INFO 04-14 15:00:50 [__init__.py:239] Automatically detected platform cuda.


In [2]:
# Read the QA dev set from disk and keep its first `sample_size` rows as the
# evaluation subset consumed by the inference cell further down.
DEV_SET_PATH = "../qa/wikipedia-dev.json"
sample_size = 500

test_data = pd.read_json(DEV_SET_PATH)
data = test_data.iloc[0:sample_size]

In [3]:
# Build the vLLM engine around the AWQ reader checkpoint.
READER_MODEL_NAME = "AMead10/Llama-3.2-3B-Instruct-AWQ"

# All engine options live in one mapping so they can be inspected or tweaked
# in one place before the engine is started.
# NOTE(review): gpu_memory_utilization=1.0 presumably asks vLLM to claim the
# whole GPU, and trust_remote_code=True looks like it allows code shipped with
# the checkpoint to run locally — confirm both are really needed here.
READER_ENGINE_OPTIONS = {
    "model": READER_MODEL_NAME,
    "tensor_parallel_size": 1,
    "gpu_memory_utilization": 1.0,
    "trust_remote_code": True,
    "enforce_eager": True,
    "disable_log_stats": True,
    "max_model_len": 4096,
}
model = LLM(**READER_ENGINE_OPTIONS)

INFO 04-14 15:01:36 [config.py:585] This model supports multiple tasks: {'generate', 'embed', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 04-14 15:01:37 [awq_marlin.py:114] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 04-14 15:01:37 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-14 15:01:37 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='AMead10/Llama-3.2-3B-Instruct-AWQ', speculative_config=None, tokenizer='AMead10/Llama-3.2-3B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(gu

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-14 15:01:42 [loader.py:447] Loading weights took 2.90 seconds
INFO 04-14 15:01:43 [gpu_model_runner.py:1186] Model loading took 2.1364 GB and 4.133809 seconds
INFO 04-14 15:01:45 [kv_cache_utils.py:566] GPU KV cache size: 11,424 tokens
INFO 04-14 15:01:45 [kv_cache_utils.py:569] Maximum concurrency for 4,096 tokens per request: 2.79x
INFO 04-14 15:01:45 [core.py:151] init engine (profile, create kv cache, warmup model) took 2.03 seconds


In [4]:
# Chat-style prompt for answering questions without any retrieved context.
# The system turn carries the instructions plus five worked examples. The user
# turn keeps a literal `{question}` placeholder: it is still present in the
# rendered template string and is filled in per question with str.format, so
# the prompt text must not contain any other literal curly braces.
NO_CONTEXT_SYSTEM_PROMPT = """Answer the question with only one word or the simplest possible response (e.g., a single number or a single word).
Do NOT generate sentences, explanations, or additional context.
Stop immediately after providing the answer. Do not generate any further words or tokens.
If the context does not provide any useful information, answer the question based on your own knowledge.
I am going to provide you five examples:

Question: What is the capital of Kenya?
Answer: Nairobi
---
Question: What was the name of the pig leader in George Orwell's Animal Farm?
Answer: Napoleon
---
Question: Which artist created the Katzenjammer Kids?
Answer: Rudolph Dirks
---
Question: Who was Geena Davis's husband when they made the loss-maker Cutthroat Island?
Answer: Renny Harlin
---
Question: Who was married to Spandau Ballet's Gary Kemp and later to Jude Law?
Answer: Sadie Frost

"""

NO_CONTEXT_USER_PROMPT = """Now here is the question you need to answer.

Question: {question}"""

prompt_in_chat_format_no_context = [
    {"role": "system", "content": NO_CONTEXT_SYSTEM_PROMPT},
    {"role": "user", "content": NO_CONTEXT_USER_PROMPT},
]

# Render the messages to a plain string (not token ids) that ends with the
# generation prompt, using the tokenizer that belongs to the loaded engine.
tokenizer = model.get_tokenizer()
PLAIN_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format_no_context,
    tokenize=False,
    add_generation_prompt=True,
)

# Decoding configuration for the short-answer task: one completion per prompt,
# capped at five generated tokens.
# NOTE(review): with temperature=0 the top_p value presumably has no effect
# (greedy decoding) — confirm against the vLLM SamplingParams documentation.
sampling_params = SamplingParams(
    temperature=0,
    top_p=0.9,
    repetition_penalty=1.2,
    max_tokens=5,
    n=1,
)

In [5]:
# Inference functions (no context used)
def read_plain(llm, sampling_params, prompt_template, questions):
    """Generate one answer string per question, without retrieved context.

    Args:
        llm: Engine object exposing ``generate(prompts, sampling_params)``.
        sampling_params: Passed through unchanged to ``llm.generate``.
        prompt_template: Format string containing a ``{question}`` field.
        questions: Iterable of question strings.

    Returns:
        A list with, for every result returned by ``llm.generate``, the text
        of its first candidate (``result.outputs[0].text``).
    """
    # Fill the template once per question; the engine receives a plain list.
    filled_prompts = []
    for question in questions:
        filled_prompts.append(prompt_template.format(question=question))

    generations = llm.generate(filled_prompts, sampling_params)

    # Only the first candidate of each generation is kept.
    return [generation.outputs[0].text for generation in generations]

def answer_without_rag(instances):
    """Answer a batch of question instances using the model alone.

    Relies on the module-level ``model``, ``sampling_params`` and
    ``PLAIN_PROMPT_TEMPLATE`` objects.

    Args:
        instances: Iterable of mappings, each providing a ``"Question"`` and
            a ``"QuestionId"`` entry.

    Returns:
        A list of ``{"QuestionId": ..., "Answer": ...}`` dicts, pairing the
        ids with the generated answers position by position.
    """
    # Two separate passes: first the question texts, then their ids.
    questions = []
    for instance in instances:
        questions.append(instance["Question"])

    question_ids = []
    for instance in instances:
        question_ids.append(instance["QuestionId"])

    print("=> Generating answers without context...")
    generated = read_plain(model, sampling_params, PLAIN_PROMPT_TEMPLATE, questions)

    # Release cached GPU memory held by PyTorch once generation is done.
    torch.cuda.empty_cache()

    results = []
    for qid, answer in zip(question_ids, generated):
        results.append({"QuestionId": qid, "Answer": answer})
    return results

def plain_inference(validation_data):
    """Run no-context inference over ``validation_data["Data"]``.

    Args:
        validation_data: Object whose ``"Data"`` entry is an iterable of
            question instances accepted by ``answer_without_rag``.

    Returns:
        A ``(predictions, triviaqa_instances)`` tuple. ``predictions`` maps
        each ``QuestionId`` to its generated answer text.
        ``triviaqa_instances`` is a dict with the fixed ``"Domain"``,
        ``"VerifiedEval"`` and ``"Version"`` fields set below and a ``"Data"``
        list holding the evaluated instances.
    """
    answered = answer_without_rag(validation_data["Data"])

    # If an id occurs more than once, the last answer for it is kept.
    predictions = {entry["QuestionId"]: entry["Answer"] for entry in answered}

    triviaqa_instances = {
        "Data": list(validation_data["Data"]),
        "Domain": "Wikipedia",
        "VerifiedEval": False,
        "Version": 1.0,
    }
    return predictions, triviaqa_instances

In [6]:
# Run plain inference on the sampled subset.
predictions_no_context, triviaqa_instances_no_context = plain_inference(data)

# Save files. The output directory is created first so that a fresh checkout
# (where `test_results/` does not exist yet) does not raise FileNotFoundError
# after the inference above has already been run.
RESULTS_DIR = Path("test_results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# The evaluated instances, wrapped in the dict built by plain_inference.
with open(RESULTS_DIR / "triviaqa_test_instances_500_nocontext.json", "w") as f:
    json.dump(triviaqa_instances_no_context, f, indent=4)

# The QuestionId -> answer mapping.
with open(RESULTS_DIR / "triviaqa_test_predictions_nocontext.json", "w") as f:
    json.dump(predictions_no_context, f, indent=4)

=> Generating answers without context...


Processed prompts: 100%|██████████| 500/500 [00:02<00:00, 248.63it/s, est. speed input: 62064.30 toks/s, output: 869.26 toks/s]
