# 06-05 : Download Model

Download an LLM from Hugging Face and store it as an artifact in MLRun.

In [1]:
import os
import sys

In [2]:
# Make the project's local sources importable; the path is relative to the
# notebook's working directory. The membership check keeps the cell idempotent,
# so re-running it does not pile up duplicate sys.path entries.
if "../../src" not in sys.path:
    sys.path.append("../../src")

In [3]:
import mlrun
from vllm import LLM, SamplingParams

from functions.vllm_model_server import VLLMModelServer

INFO 08-11 20:12:50 [__init__.py:235] Automatically detected platform cuda.


In [4]:
# Show the API server URL: the cell displays the run DB client MLRun is using,
# whose repr carries the URL (recorded output: HTTPRunDB('http://dragon.local:30070')).
mlrun.get_run_db()

HTTPRunDB('http://dragon.local:30070')

## 1. Configuration

In [5]:
# Short model name; also used as the server name and as the artifact key
# when the model artifact is looked up in the project.
MODEL_NAME = "deepseek-r1-distill-qwen-14b-awq"
# Namespaced model identifier (passed to VLLMModelServer as model_name).
MODEL_ID = f"casperhansen/{MODEL_NAME}"
# Local filesystem path passed to VLLMModelServer as model_path.
# A stray leading apostrophe used to be embedded in this string, which turned
# the absolute path into a bogus relative one ("'/data/...").
MODEL_PATH = f"/data/.cache/huggingface/{MODEL_NAME}"

project_name = "test-vllm-integration"  # the project name

### 1.1 Create The Project

In [6]:
# Load the MLRun project named in the configuration cell (get_or_create_project
# is asked for it by name, with user_project disabled).
project = mlrun.get_or_create_project(name=project_name, user_project=False)

# Read the name back from the project's metadata and report it
project_name = project.metadata.name
print(f"Full project name: {project_name}")

> 2025-08-11 20:12:51,026 [info] Project loaded successfully: {"project_name":"test-vllm-integration"}
Full project name: test-vllm-integration


## 2. Download Model

In [None]:
# Obtain the MLRun execution context. The project name is passed as the first
# positional argument here.
# NOTE(review): that argument looks like the context *name*, not the project —
# confirm the context ends up attached to the intended project.
context = mlrun.get_or_create_ctx(project_name)

# Build the model server helper from the configuration constants
vllm = VLLMModelServer(
    name=MODEL_NAME,
    model_name=MODEL_ID,
    model_path=MODEL_PATH,
    context=context,
)

# Download the model (handled by VLLMModelServer.store_model)
vllm.store_model()

# Look the model artifact up by its key and report where it lives
model_artifact = project.get_artifact(MODEL_NAME)
print(f"Model Artifact Uri: {model_artifact.uri}")
print(f"Model Artifact S3: {model_artifact.target_path}")

## 2. vLLM Test

### 2.1 CLI Test

Start the vLLM server to serve the model:

```bash
vllm serve s3://mlrun/projects/test-vllm-integration/artifacts/deepseek-r1-distill-qwen-14b-awq/ --load-format runai_streamer --max-model-len 32768
```

Simple Prompt:

```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
    "prompt": "The chees moon?",
    "max_tokens": 50
}'
```

### 2.2 Code Test

In [7]:
# Pick the artifact used for the code test. The lookup of the configured model
# (MODEL_NAME) is kept below but disabled; the "Tiny-LLM" artifact is used instead.
#model_artifact = project.get_artifact(MODEL_NAME)
model_artifact = project.get_artifact("Tiny-LLM")
# target_path is the artifact's storage location (an s3:// URI in the recorded output)
model_uri = model_artifact.target_path

print(f"Model URI: {model_uri}")

Model URI: s3://mlrun/projects/test-vllm-integration/artifacts/Tiny-LLM/


In [None]:
# IPython help lookup for the imported `LLM` class (development aid only;
# nothing later in the notebook depends on this cell).
?LLM

In [None]:
# Set the VLLM_CI_USE_S3 flag for this process before any engine is created
# (presumably lets vLLM resolve s3:// model locations — TODO confirm).
os.environ.update(VLLM_CI_USE_S3="1")

# Engine construction is left disabled. The output recorded for this cell ends in
# huggingface_hub's HFValidationError (the temporary directory '/tmp/tmpipxqny24'
# was rejected as a repo id), followed by
# "RuntimeError: Engine core initialization failed".
#
# llm = LLM(
#     model=model_uri,
#     tokenizer=model_uri,
#     hf_config_path=model_uri,
#     # max_model_len=32768,
#     max_model_len=1024,
#     trust_remote_code=True,
#     load_format="runai_streamer",
#     enable_chunked_prefill=False,
# )

INFO 08-11 20:46:22 [config.py:3440] Downcasting torch.float32 to torch.bfloat16.
INFO 08-11 20:46:22 [config.py:1604] Using max model len 1024
INFO 08-11 20:46:22 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-11 20:46:22 [core.py:572] Waiting for init message from front-end.
INFO 08-11 20:46:22 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='/tmp/tmpipxqny24', speculative_config=None, tokenizer='/tmp/tmpipxqny24', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=runai_streamer, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_proper

Loading safetensors using Runai Model Streamer:   0% Completed | 0/12 [00:00<?, ?it/s]


[RunAI Streamer] Overall time to stream 24.8 MiB of all files: 0.09s, 276.3 MiB/s
INFO 08-11 20:46:25 [gpu_model_runner.py:1892] Model loading took 0.0247 GiB and 0.202080 seconds
INFO 08-11 20:46:25 [backends.py:530] Using cache directory: /home/johnny/.cache/vllm/torch_compile_cache/6842510834/rank_0_0/backbone for vLLM's torch.compile
INFO 08-11 20:46:25 [backends.py:541] Dynamo bytecode transform time: 0.54 s
INFO 08-11 20:46:27 [backends.py:194] Cache the graph for dynamic shape for later use
INFO 08-11 20:46:28 [backends.py:215] Compiling a graph for dynamic shape takes 2.38 s
INFO 08-11 20:46:28 [monitor.py:34] torch.compile takes 2.92 s in total
INFO 08-11 20:46:29 [gpu_worker.py:255] Available KV cache memory: 20.96 GiB
INFO 08-11 20:46:29 [kv_cache_utils.py:833] GPU KV cache size: 58,609,536 tokens
INFO 08-11 20:46:29 [kv_cache_utils.py:837] Maximum concurrency for 1,024 tokens per request: 57235.88x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:00<00:00, 658.66it/s]


INFO 08-11 20:46:30 [gpu_model_runner.py:2485] Graph capturing finished in 0 secs, took 0.08 GiB
INFO 08-11 20:46:30 [core.py:193] init engine (profile, create kv cache, warmup model) took 4.94 seconds
ERROR 08-11 20:46:30 [core.py:632] EngineCore failed to start.
ERROR 08-11 20:46:30 [core.py:632] Traceback (most recent call last):
ERROR 08-11 20:46:30 [core.py:632]   File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/site-packages/transformers/utils/hub.py", line 479, in cached_files
ERROR 08-11 20:46:30 [core.py:632]     hf_hub_download(
ERROR 08-11 20:46:30 [core.py:632]   File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
ERROR 08-11 20:46:30 [core.py:632]     validate_repo_id(arg_value)
ERROR 08-11 20:46:30 [core.py:632]   File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
ERR

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/site-packages/transformers/utils/hub.py", line 479, in cached_files
    hf_hub_download(
  File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
    validate_repo_id(arg_value)
  File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
    raise HFValidationError(
huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/tmp/tmpipxqny24'. Use `repo_type` argument if needed.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/johnny/swan/miniconda3/envs/local-mlrun/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/johnny/

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}