In [None]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd


def template_dataset(example):
    """Turn one record's chat messages into a single prompt string.

    Relies on a module-level ``tokenizer`` that must be bound before this
    function is called (it is not passed in as an argument).

    Args:
        example: Mapping with a ``"messages"`` entry, forwarded unchanged to
            ``tokenizer.apply_chat_template``.

    Returns:
        A dict with one key, ``"prompt"``, holding whatever
        ``apply_chat_template`` returns when called with ``tokenize=False``
        and ``add_generation_prompt=True``.
    """
    rendered_prompt = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=True,
    )
    return {"prompt": rendered_prompt}

if __name__ == "__main__":
    # Locations of the base checkpoint and of the LoRA adapter applied on top of it.
    base_model_path = "base_model/Llama-3.1-8B-Instruct-nf4"
    adapter_path = "adapter/Zip-Llama-aligned/policy"

    # The engine is created with LoRA enabled so that an adapter can be
    # attached to the generate() call further down.
    llm = LLM(
        model=base_model_path,
        dtype=torch.bfloat16,
        trust_remote_code=True,
        max_model_len=32768,
        gpu_memory_utilization=0.4,
        enable_lora=True,
        max_lora_rank=64,
    )

    # One set of sampling settings is shared by every prompt in the batch.
    sampling_params = SamplingParams(temperature=0.7, max_tokens=128)

    # `tokenizer` has to be bound at module level: template_dataset() looks it
    # up as a global rather than receiving it as an argument.
    tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)

    # Load the records, then swap the raw "messages" column for a rendered
    # "prompt" column; the remaining columns are left in place.
    inference_data = load_dataset(
        "json",
        data_files="data/inference_data.json",
        split="train",
    ).map(template_dataset, remove_columns=["messages"])
    prompts = inference_data["prompt"]

    # Generate for all prompts in a single call, routed through the adapter.
    output = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(
            lora_name="adapter",
            lora_int_id=1,
            lora_path=adapter_path,
        ),
    )

  from .autonotebook import tqdm as notebook_tqdm


INFO 11-19 21:43:23 [utils.py:253] non-default args: {'trust_remote_code': True, 'dtype': torch.bfloat16, 'max_model_len': 32768, 'gpu_memory_utilization': 0.4, 'disable_log_stats': True, 'enable_lora': True, 'max_lora_rank': 64, 'model': 'base_model/Llama-3.1-8B-Instruct-nf4'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-19 21:43:23 [model.py:631] Resolved architecture: LlamaForCausalLM
INFO 11-19 21:43:23 [model.py:1745] Using max model len 32768


2025-11-19 21:43:24,204	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 11-19 21:43:24 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=16384.
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:25 [core.py:93] Initializing a V1 LLM engine (v0.11.1) with config: model='base_model/Llama-3.1-8B-Instruct-nf4', speculative_config=None, tokenizer='base_model/Llama-3.1-8B-Instruct-nf4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=bitsandbytes, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=Obse

Traceback (most recent call last):
  File "/root/miniconda3/envs/LLM/lib/python3.12/site-packages/tvm_ffi/utils/_build_optional_torch_c_dlpack.py", line 836, in <module>
    main()
  File "/root/miniconda3/envs/LLM/lib/python3.12/site-packages/tvm_ffi/utils/_build_optional_torch_c_dlpack.py", line 829, in main
    build_ninja(build_dir=str(build_dir))
  File "/root/miniconda3/envs/LLM/lib/python3.12/site-packages/tvm_ffi/cpp/extension.py", line 353, in build_ninja
    raise RuntimeError("\n".join(msg))
RuntimeError: ninja exited with status 1
stdout:
[1/2] c++ -MMD -MF main.o.d -std=c++17 -fPIC -O3 -DBUILD_WITH_CUDA -D_GLIBCXX_USE_CXX11_ABI=1 -I/root/miniconda3/envs/LLM/lib/python3.12/site-packages/tvm_ffi/include -I/root/miniconda3/envs/LLM/include/python3.12 -I/root/miniconda3/envs/LLM/lib/python3.12/site-packages/torch/include -I/root/miniconda3/envs/LLM/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.3/include -c /tmp/tvm-ffi-torch-c-dlpack-3b

[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:33 [cuda.py:418] Valid backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:33 [cuda.py:427] Using FLASH_ATTN backend.
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:33 [bitsandbytes_loader.py:791] Loading weights with BitsAndBytes quantization. May take a while ...


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.63it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00,  3.24it/s]
[1;36m(EngineCore_DP0 pid=3430614)[0;0m 
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:05<00:05,  5.60s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:06<00:00,  2.91s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:06<00:00,  3.31s/it]
[1;36m(EngineCore_DP0 pid=3430614)[0;0m 


[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:40 [punica_selector.py:20] Using PunicaWrapperGPU.
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:41 [gpu_model_runner.py:3334] Model loading took 6.0187 GiB memory and 12.855393 seconds
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:56 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/9b5f756d58/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:56 [backends.py:647] Dynamo bytecode transform time: 13.92 s
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:43:57 [backends.py:251] Cache the graph for dynamic shape for later use
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:44:03 [backends.py:282] Compiling a graph for dynamic shape takes 6.07 s
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:44:06 [monitor.py:34] torch.compile takes 19.98 s in total
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/102 [00:00<?, ?it/s]



Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 102/102 [00:22<00:00,  4.61it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 102/102 [00:18<00:00,  5.49it/s]


[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:44:50 [gpu_model_runner.py:4240] Graph capturing finished in 42 secs, took 3.21 GiB
[1;36m(EngineCore_DP0 pid=3430614)[0;0m INFO 11-19 21:44:50 [core.py:250] init engine (profile, create kv cache, warmup model) took 69.25 seconds
INFO 11-19 21:44:52 [llm.py:352] Supported tasks: ['generate']


Adding requests:   0%|          | 0/10000 [00:00<?, ?it/s]



Adding requests: 100%|██████████| 10000/10000 [01:27<00:00, 114.56it/s]
Processed prompts: 100%|██████████| 10000/10000 [29:41<00:00,  5.61it/s, est. speed input: 16383.17 toks/s, output: 689.60 toks/s]   


In [13]:
# Pair every generation with the subject_id of the record it came from.
# Assumes `output` is in the same order as the prompts that were submitted,
# which were read from the same dataset as the ids — TODO confirm against the
# vLLM `LLM.generate` documentation.
subject_ids = inference_data["subject_id"]

# zip() stops at the shorter input, which would silently drop rows; a count
# mismatch is reported explicitly instead.
if len(subject_ids) != len(output):
    raise ValueError(
        f"Expected one generation per record, got {len(output)} generations "
        f"for {len(subject_ids)} subject_ids"
    )

# The loop variable has its own name so the id column is never rebound to a
# single element (the previous code reused `idx` for both), and the
# comprehension keeps the loop variables out of the notebook namespace.
data = [
    {
        "subject_id": subject_id,
        # Only the first candidate of each request is kept, with surrounding
        # whitespace stripped.
        "generated_text": gen.outputs[0].text.strip(),
    }
    for subject_id, gen in zip(subject_ids, output)
]

In [None]:
# Build a table from the collected records (one row per dict in `data`).
df = pd.DataFrame(data)

# Write the table to disk without the index column, encoded as "utf-8-sig".
df.to_csv(
    "data/inference_result.csv",
    encoding="utf-8-sig",
    index=False,
)