# 06-05: vLLM Offline Inference

In [1]:
import mlrun
from IPython.display import Markdown, display

## 1. Configuration

In [2]:
#MODEL_NAME = "Tiny-LLM"
MODEL_NAME = "Mistral-7B-Instruct-v0.2-AWQ"
project_name = "test-vllm-integration" # the project name

### 1.1 Load The Project

In [3]:
project = mlrun.get_or_create_project(
    name=project_name,
    user_project=False)

# Display the current project name
project_name = project.metadata.name
print(f'Full project name: {project_name}')

> 2025-08-12 16:53:21,829 [info] Project loaded successfully: {"project_name":"test-vllm-integration"}
Full project name: test-vllm-integration


## 2. Offine Inference

In [4]:
# the prompts to use for inference
prompts = [
    "What is the capital of France?",
    "Explain the theory of relativity in simple terms.",
    "What are the main differences between Python and Java?",
    "How does a neural network work?",
    "What is the significance of the Turing test in AI?"
]

# Sampling parameters for the model
sampling_params = {
    "temperature": 0.8,
    "top_p": 0.95,
    "max_tokens": 25 
}

In [5]:
# create the function
fn_inference = project.set_function(
    name="vllm-offile-inference",
    func="../../src/functions/vllm_model_server.py",
    kind="job",
    handler="offline_inference_handler",
    image="registry-service.mlrun.svc.cluster.local/foulds/mlrun-vllm:0.10.0"
)

fn_inference.with_limits(gpus=1)

# create the task to run the function
task = mlrun.new_task(
    name="vllm-offline-inference-task",
    project=project_name,

)    

# run the function
run_output = fn_inference.run(
    task=task,
    params={
        "model_name": MODEL_NAME,
        "prompts": prompts,
        "sampling_params": sampling_params,
    },
    local=False)


> 2025-08-12 16:53:22,032 [info] Storing function: {"db":"http://dragon.local:30070","name":"vllm-offile-inference-offline-inference-handler","uid":"c1fc392995274cd089c3c6c565d9afe5"}
> 2025-08-12 16:53:22,137 [info] Job is running in the background, pod: vllm-offile-inference-offline-inference-handler-trpg8
INFO 08-12 14:53:26 [__init__.py:235] Automatically detected platform cuda.
> 2025-08-12 14:53:27,443 [info] Running offline inference for model Mistral-7B-Instruct-v0.2-AWQ with 5 prompts.
> 2025-08-12 14:53:27,443 [info] Running offline inference...
> 2025-08-12 14:53:27,448 [info] Project loaded successfully: {"project_name":"test-vllm-integration"}
> 2025-08-12 14:53:27,451 [info] Downloading tokenizer for model Mistral-7B-Instruct-v0.2-AWQ
> 2025-08-12 14:53:27,451 [info] Downloading tokenizer files to temporary directory: /tmp/vllm_tokenizer_cq5o8w6g
> 2025-08-12 14:53:27,455 [info] Project loaded successfully: {"project_name":"test-vllm-integration"}
> 2025-08-12 14:53:27,53

project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results
test-vllm-integration,...d9afe5,0,Aug 12 14:53:23,2025-08-12 14:54:48.988728+00:00,completed,run,vllm-offile-inference-offline-inference-handler,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.10.18host=vllm-offile-inference-offline-inference-handler-trpg8,,"model_name=Mistral-7B-Instruct-v0.2-AWQprompts=['What is the capital of France?', 'Explain the theory of relativity in simple terms.', 'What are the main differences between Python and Java?', 'How does a neural network work?', 'What is the significance of the Turing test in AI?']sampling_params={'temperature': 0.8, 'top_p': 0.95, 'max_tokens': 25}","outputs=[{'prompt': 'What is the capital of France?', 'response': '\n\nThe capital city of France is Paris. It is the most populous city in France, and it is also one'}, {'prompt': 'Explain the theory of relativity in simple terms.', 'response': '\n\nThe theory of relativity is a set of scientific ideas developed by Albert Einstein in the early 1900'}, {'prompt': 'What are the main differences between Python and Java?', 'response': ' Python and Java are two of the most popular programming languages today, each with its own strengths and weaknesses. While they share'}, {'prompt': 'How does a neural network work?', 'response': '\n\nA neural network is a type of machine learning model that is inspired by the human brain. It consists of interconnected'}, {'prompt': 'What is the significance of the Turing test in AI?', 'response': ""\n\nThe Turing test is a measure of a machine's ability to exhibit intelligent behavior equivalent to, or indist""}]"





> 2025-08-12 16:54:53,379 [info] Run execution finished: {"name":"vllm-offile-inference-offline-inference-handler","status":"completed"}


In [6]:
print(f"Run ID: {run_output.metadata.uid}")
print(f"run_output state: {run_output.status.state}")
print(f"run_output results: {run_output.status.results}")

Run ID: c1fc392995274cd089c3c6c565d9afe5
run_output state: completed
run_output results: {'outputs': [{'prompt': 'What is the capital of France?', 'response': '\n\nThe capital city of France is Paris. It is the most populous city in France, and it is also one'}, {'prompt': 'Explain the theory of relativity in simple terms.', 'response': '\n\nThe theory of relativity is a set of scientific ideas developed by Albert Einstein in the early 1900'}, {'prompt': 'What are the main differences between Python and Java?', 'response': ' Python and Java are two of the most popular programming languages today, each with its own strengths and weaknesses. While they share'}, {'prompt': 'How does a neural network work?', 'response': '\n\nA neural network is a type of machine learning model that is inspired by the human brain. It consists of interconnected'}, {'prompt': 'What is the significance of the Turing test in AI?', 'response': "\n\nThe Turing test is a measure of a machine's ability to exhibit i