# Running vllm - Model Inference and Comparing Performance


### Some notes on vllm github repo structure: 
--> vllm/ → main Python API & high-level orchestration engine/ — request scheduling, batching logic

--> model_executor/ — core model execution logic

--> layers/ — includes paged_attention.py

--> kv_cache/ — memory management for attention cache

--> worker/ — GPU worker processes, distributed logic

--> csrc/ → custom CUDA kernels (PagedAttention CUDA implementation lives here)

--> examples/ → ready-to-run scripts

--> benchmarks/ — perf testing scripts

https://github.com/vllm-project/vllm/tree/main
https://docs.nvidia.com/cutlass/media/docs/cpp/efficient_gemm.html
https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/


In [1]:
!nvidia-smi
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

Tue Aug 12 17:45:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070        Off |   00000000:41:00.0 Off |                  N/A |
|  0%   36C    P8             10W /  215W |      36MiB /  12282MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
##known warning from HuggingFace tokenizers when using multiprocessing 
#(like in Jupyter or multiprocessing environments). It doesn’t break your code but can be silenced with setting this flag
os.environ["TOKENIZERS_PARALLELISM"] = "false"


from vllm import LLM, SamplingParams
import time
import matplotlib.pyplot as plt
import pynvml

##Initialize NVML to query GPU info
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming single GPU

##Query GPU properties
gpu_name = pynvml.nvmlDeviceGetName(handle)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
mem_bandwidth_bytes = 288 * 1e9  # RTX 4070 approx. bandwidth in bytes/sec

print(f"GPU: {gpu_name}")
print(f"Total memory: {mem_info.total / 1e9:.2f} GB")
print(f"Memory bandwidth: {mem_bandwidth_bytes/1e9:.1f}  GB/s")

  from .autonotebook import tqdm as notebook_tqdm


INFO 08-12 17:45:20 [__init__.py:235] Automatically detected platform cuda.
GPU: NVIDIA GeForce RTX 4070
Total memory: 12.88 GB
Memory bandwidth: 288.0  GB/s


In [3]:
# Here we do an example model serving with vllm
# note I have 12GB on my gpu so cannot run bigger models. 
#https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    trust_remote_code=True,
    gpu_memory_utilization=0.6  # adjust this parameter--
)
params = SamplingParams(temperature=0.7, max_tokens=50)
outputs = llm.generate(["Explain TMA in hopper gpus"], params)

for output in outputs:
    print(output.outputs[0].text)
del llm  # delete your vLLM model object
torch.cuda.empty_cache()  # clear PyTorch GPU cache

INFO 08-12 17:45:27 [config.py:1604] Using max model len 32768


2025-08-12 17:45:28,898	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 08-12 17:45:28 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-12 17:45:32 [__init__.py:235] Automatically detected platform cuda.
INFO 08-12 17:45:33 [core.py:572] Waiting for init message from front-end.
INFO 08-12 17:45:33 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observabili

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.24it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.23it/s]



INFO 08-12 17:45:36 [default_loader.py:262] Loading weights took 0.53 seconds
INFO 08-12 17:45:37 [gpu_model_runner.py:1892] Model loading took 2.8876 GiB and 1.157818 seconds
INFO 08-12 17:45:42 [backends.py:530] Using cache directory: /home/hshahzad/.cache/vllm/torch_compile_cache/28339d7bf3/rank_0_0/backbone for vLLM's torch.compile
INFO 08-12 17:45:42 [backends.py:541] Dynamo bytecode transform time: 5.33 s
INFO 08-12 17:45:46 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.046 s
INFO 08-12 17:45:47 [monitor.py:34] torch.compile takes 5.33 s in total
INFO 08-12 17:45:48 [gpu_worker.py:255] Available KV cache memory: 2.59 GiB
INFO 08-12 17:45:48 [kv_cache_utils.py:833] GPU KV cache size: 97,120 tokens
INFO 08-12 17:45:48 [kv_cache_utils.py:837] Maximum concurrency for 32,768 tokens per request: 2.96x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:01<00:00, 35.56it/s]


INFO 08-12 17:45:51 [gpu_model_runner.py:2485] Graph capturing finished in 2 secs, took 0.46 GiB
INFO 08-12 17:45:51 [core.py:193] init engine (profile, create kv cache, warmup model) took 13.98 seconds


Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 853.19it/s]
Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.52it/s, est. speed input: 22.83 toks/s, output: 126.83 toks/s]


. TMA is a new instruction set for GPUs, designed to optimize the performance of machine learning and deep learning tasks. It stands for Tensor Math Accelerator and is inspired by the popular Tensorflow instruction set.
TMA is optimized for the use of




In [None]:
## Measure Performance of the model with its quantized version (Quantized using AWQ- model downloaded from internet)

def measure_latency(model_name, prompt, max_tokens=50):
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        gpu_memory_utilization=0.6,
    )
    params = SamplingParams(temperature=0.7, max_tokens=max_tokens)

    start_time = time.time()
    outputs = llm.generate([prompt], params)
    end_time = time.time()

    for output in outputs:
        print(output.outputs[0].text)

    llm_time = end_time - start_time
    print(f"Model {model_name} inference time: {llm_time:.3f} seconds")

    del llm
    import torch
    torch.cuda.empty_cache()

    return llm_time

prompt = "Explain TMA in Hopper GPUs"


unquantized_model = "Qwen/Qwen2.5-1.5B-Instruct" 
quantized_model = "Qwen/Qwen2.5-1.5B-Instruct-AWQ"

unquantized_time = measure_latency(unquantized_model, prompt)
quantized_time = measure_latency(quantized_model, prompt)

print(f"Unquantized model time: {unquantized_time:.3f}s")
print(f"Quantized model time: {quantized_time:.3f}s")


INFO 08-12 17:45:53 [config.py:1604] Using max model len 32768
INFO 08-12 17:45:53 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-12 17:45:57 [__init__.py:235] Automatically detected platform cuda.
INFO 08-12 17:45:58 [core.py:572] Waiting for init message from front-end.
INFO 08-12 17:45:58 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.22it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.22it/s]



INFO 08-12 17:46:00 [default_loader.py:262] Loading weights took 0.53 seconds
INFO 08-12 17:46:01 [gpu_model_runner.py:1892] Model loading took 2.8876 GiB and 0.798807 seconds
INFO 08-12 17:46:06 [backends.py:530] Using cache directory: /home/hshahzad/.cache/vllm/torch_compile_cache/28339d7bf3/rank_0_0/backbone for vLLM's torch.compile
INFO 08-12 17:46:06 [backends.py:541] Dynamo bytecode transform time: 5.41 s
INFO 08-12 17:46:11 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.118 s
INFO 08-12 17:46:11 [monitor.py:34] torch.compile takes 5.41 s in total
INFO 08-12 17:46:12 [gpu_worker.py:255] Available KV cache memory: 2.59 GiB
INFO 08-12 17:46:13 [kv_cache_utils.py:833] GPU KV cache size: 97,120 tokens
INFO 08-12 17:46:13 [kv_cache_utils.py:837] Maximum concurrency for 32,768 tokens per request: 2.96x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:01<00:00, 35.45it/s]


INFO 08-12 17:46:15 [gpu_model_runner.py:2485] Graph capturing finished in 2 secs, took 0.46 GiB
INFO 08-12 17:46:15 [core.py:193] init engine (profile, create kv cache, warmup model) took 14.13 seconds


Adding requests: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1463.98it/s]
Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.55it/s, est. speed input: 20.52 toks/s, output: 128.22 toks/s]


. TMA is a new instruction set for Hopper GPUs that provides a more efficient way to handle data in parallel. Instead of storing data in a linear fashion, TMA stores data in a tree-like structure, which allows for more efficient sharing of
Model Qwen/Qwen2.5-1.5B-Instruct inference time: 0.396 seconds




INFO 08-12 17:46:17 [config.py:1604] Using max model len 32768
INFO 08-12 17:46:19 [awq_marlin.py:116] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 08-12 17:46:19 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-12 17:46:22 [__init__.py:235] Automatically detected platform cuda.
INFO 08-12 17:46:24 [core.py:572] Waiting for init message from front-end.
INFO 08-12 17:46:24 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='Qwen/Qwen2.5-1.5B-Instruct-AWQ', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=aut

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.39it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.38it/s]



INFO 08-12 17:46:26 [default_loader.py:262] Loading weights took 0.26 seconds
INFO 08-12 17:46:26 [gpu_model_runner.py:1892] Model loading took 1.1002 GiB and 0.659654 seconds
INFO 08-12 17:46:33 [backends.py:530] Using cache directory: /home/hshahzad/.cache/vllm/torch_compile_cache/adf68c476f/rank_0_0/backbone for vLLM's torch.compile
INFO 08-12 17:46:33 [backends.py:541] Dynamo bytecode transform time: 6.11 s
INFO 08-12 17:46:38 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.532 s
INFO 08-12 17:46:39 [monitor.py:34] torch.compile takes 6.11 s in total
INFO 08-12 17:46:40 [gpu_worker.py:255] Available KV cache memory: 4.38 GiB
INFO 08-12 17:46:40 [kv_cache_utils.py:833] GPU KV cache size: 164,048 tokens
INFO 08-12 17:46:40 [kv_cache_utils.py:837] Maximum concurrency for 32,768 tokens per request: 5.01x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:01<00:00, 36.20it/s]


INFO 08-12 17:46:42 [gpu_model_runner.py:2485] Graph capturing finished in 2 secs, took 0.48 GiB
INFO 08-12 17:46:42 [core.py:193] init engine (profile, create kv cache, warmup model) took 15.95 seconds


Adding requests: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2075.36it/s]
Processed prompts:   0%|                                                                                                                       | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

## Some Notes:

So when I run the above, I notice my VRAM on GPU is still occupied. Running another command gives error that not enough memory. This happens because:
--> Memory caching / pooling:Frameworks (like PyTorch underneath vLLM) keep allocated VRAM to speed up subsequent operations. They don't free GPU memory immediately to avoid costly reallocations later.
--> Persistent model & buffers:
Your loaded model, KV caches, buffers, CUDA contexts all remain in GPU memory until you explicitly delete or exit.
-->Background processes:
vLLM's engine might be running background workers or processes keeping memory reserved.

This is why we have added del llm and torch.cuda.empty_cache(). You can keep checking using nvidia-smi if any processes runninga nd using up memory

## Performance Monitoring with Nvidia Nsight

(env_vllm) hshahzad@codes-1:~/GPU_work/Tests$ nsys profile --output ./<name>.nsys-rep --force-overwrite true vllm serve Qwen/Qwen2.5-1.5B-Instruct-AWQ

In a separate terminal (no venv needed):
$ curl -X POST http://localhost:8000/v1/completions   -H "Content-Type: application/json"   -d '{
        "model": "Qwen/Qwen2.5-1.5B-Instruct-AWQ",
        "prompt": "Explain TMA in hopper gpus",
        "max_tokens": 50,
        "temperature": 0.7
      }'

--> once response recieved, terminate vllm server using Ctrl+C (ONCE) in first terminal
--> Wait for report generation to complete

--> run nsys stats <name>.nsys-rep
--> Or save all info in textfile: nsys stats <name>.nsys-rep > quant.txt

--> On ur desktop u can run Nvidia Nsight Systems application(its in Applications-> Programming) Once launched go to: ->File->Open-><name>.nsys rep
--> this way you can visualize the timeline

In [None]:
## Measuring Latency and Throughput with Python Profiler
## Query GPU memory usage before and after running inference using nvidia-smi programmatically or like in this case, PyTorch's CUDA API:
## What we measure here:
#-->load_time_s → Model load time (sec)
#-->gpu_mem_mb → Actual GPU memory usage (MB) after load
#-->gen_time_s → Time to generate output for given prompt
#-->tokens_generated → Output length in tokens
#-->throughput_tok_s → Tokens per second during generation


import time
import pynvml
import pandas as pd
from vllm import LLM, SamplingParams

## --** GPU Memory Utility **--
def get_gpu_memory_mb(gpu_index=0):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return info.used / 1024**2  # MB

## --** Benchmark Function **--
def benchmark_model(model_path, prompt, quantized=False, runs=3):
    print(f"\n Loading model: {model_path} {'(Quantized)' if quantized else '(Unquantized)'}")
    start_load = time.time()
    llm = LLM(model=model_path)
    load_time = time.time() - start_load
    mem_after_load = get_gpu_memory_mb()

    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

    gen_times = []
    throughputs = []
    tokens_counts = []

    for i in range(runs):
        start_gen = time.time()
        output = llm.generate([prompt], sampling_params=sampling_params)
        gen_time = time.time() - start_gen

        tokens_generated = len(output[0].outputs[0].token_ids)
        throughput = tokens_generated / gen_time

        gen_times.append(gen_time)
        throughputs.append(throughput)
        tokens_counts.append(tokens_generated)

    return {
        "model": model_path,
        "quantized": quantized,
        "load_time_s": load_time,
        "gpu_mem_mb": mem_after_load,
        "avg_gen_time_s": sum(gen_times) / runs,
        "avg_tokens_generated": sum(tokens_counts) / runs,
        "avg_throughput_tok_s": sum(throughputs) / runs
    }

# --** Test Settings **--
prompt = "Explain architecture of Blackwell GPUs."
runs_per_model = 5

# --** Run Benchmarks **--
results = []
results.append(benchmark_model("Qwen/Qwen2.5-1.5B-Instruct", prompt, quantized=False, runs=runs_per_model))
results.append(benchmark_model("Qwen/Qwen2.5-1.5B-Instruct-AWQ", prompt, quantized=True, runs=runs_per_model))

# --** Show Results **--
df = pd.DataFrame(results)
print("\n Final Benchmark Results (Averaged over", runs_per_model, "runs)")
print(df)
