# 06-05 : Download Model

Download an LLM from Hugging Face and store it as a model artifact in MLRun.

In [1]:
import os
import sys

In [2]:
# Make the repository's `src` directory importable (presumably where the
# `functions` package imported below lives — confirm against the repo layout).
# The path is resolved to an absolute one so it keeps working if the working
# directory changes later, and the membership check stops re-runs of this cell
# from appending duplicate entries.
SRC_DIR = os.path.abspath("../../src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [32]:
import tempfile

import mlrun
from IPython.display import Markdown, display
from vllm import LLM, SamplingParams

from functions.vllm_model_server import VLLMModelServer

In [4]:
# Display the MLRun run DB object this notebook is connected to
# (its repr includes the API server URL).
mlrun.get_run_db()

HTTPRunDB('http://dragon.local:30070')

## 1. Configuration

In [5]:
# Alternative model configuration (currently disabled; the values from the next cell are used):
#MODEL_NAME = "deepseek-r1-distill-qwen-14b-awq"
#MODEL_ID = f"casperhansen/{MODEL_NAME}"

In [6]:
# Model to download: repository name and full Hugging Face model id.
MODEL_NAME = "Mistral-7B-Instruct-v0.2-AWQ"
MODEL_ID = f"TheBloke/{MODEL_NAME}"
# Local directory the model files are downloaded to before being logged.
# NOTE: the previous value started with a stray single quote, which turned it
# into a relative path beginning with "'" instead of the absolute cache path.
MODEL_PATH = f"/data/.cache/huggingface/{MODEL_NAME}"

project_name = "test-vllm-integration"  # the project name

### 1.1 Create The Project

In [7]:
# Load the MLRun project, creating it if it does not exist yet.
project = mlrun.get_or_create_project(name=project_name, user_project=False)

# Re-read the name from the project's metadata and show it
project_name = project.metadata.name
print(f'Full project name: {project_name}')

> 2025-08-11 23:31:30,245 [info] Project loaded successfully: {"project_name":"test-vllm-integration"}
Full project name: test-vllm-integration


## 2. Download Model

In [8]:
# get the context
context = mlrun.get_or_create_ctx(project_name)

# create the model server function
vllm = VLLMModelServer(
    context=context,
    name=MODEL_NAME,
    model_path=MODEL_PATH,
    model_name=MODEL_ID,
)

# download the model
vllm.store_model()

# get the model artifact
model_artifact = project.get_artifact(MODEL_NAME)
print(f"Model Artifact Uri: {model_artifact.uri}")
print(f"Model Artifact S3: {model_artifact.target_path}")

> 2025-08-11 23:31:30,259 [info] Logging run results to: http://dragon.local:30070
> 2025-08-11 23:31:30,279 [info] Storing model TheBloke/Mistral-7B-Instruct-v0.2-AWQ in project test-vllm-integration
> 2025-08-11 23:31:30,279 [info] Downloading model TheBloke/Mistral-7B-Instruct-v0.2-AWQ to '/data/.cache/huggingface/Mistral-7B-Instruct-v0.2-AWQ


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

quant_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

> 2025-08-11 23:38:35,612 [info] Deleting cache directory: '/data/.cache/huggingface/Mistral-7B-Instruct-v0.2-AWQ/.cache
> 2025-08-11 23:38:35,614 [info] Model TheBloke/Mistral-7B-Instruct-v0.2-AWQ downloaded successfully to: '/data/.cache/huggingface/Mistral-7B-Instruct-v0.2-AWQ
> 2025-08-11 23:38:35,745 [info] Project loaded successfully: {"project_name":"test-vllm-integration"}
> 2025-08-11 23:38:35,746 [info] Logging model TheBloke/Mistral-7B-Instruct-v0.2-AWQ to project test-vllm-integration
> 2025-08-11 23:38:54,477 [info] Model TheBloke/Mistral-7B-Instruct-v0.2-AWQ logged successfully to: store://artifacts/test-vllm-integration/Mistral-7B-Instruct-v0.2-AWQ#0@1dce1e31-27eb-471f-a8c4-46a2d3bcf0ec^89e1a5c62d8ff3d26e4fa12d7b4e4918be9b0c55
> 2025-08-11 23:38:54,478 [info] Deleting local model files at '/data/.cache/huggingface/Mistral-7B-Instruct-v0.2-AWQ
> 2025-08-11 23:38:55,063 [info] Model TheBloke/Mistral-7B-Instruct-v0.2-AWQ stored successfully.
Model Artifact Uri: store://arti

## 2. vLLM Test

### 2.1 CLI Test

Start the vLLM server to serve the model:

```bash
vllm serve s3://mlrun/projects/test-vllm-integration/artifacts/Mistral-7B-Instruct-v0.2-AWQ/ --load-format runai_streamer --max-model-len 32768
```

Simple prompt:

```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
    "prompt": "The chees moon?",
    "max_tokens": 50
}'
```

### 2.2 Code Test

#### 2.2.1 Get The Model Artifact

In [9]:
# Fetch the model artifact by name and keep its storage location for vLLM.
# (Alternative artifact, currently disabled: project.get_artifact("Tiny-LLM"))
model_artifact = project.get_artifact(MODEL_NAME)
model_uri = model_artifact.target_path

print(f"Model URI: {model_uri}")

Model URI: s3://mlrun/projects/test-vllm-integration/artifacts/Mistral-7B-Instruct-v0.2-AWQ/


#### 2.2.2 Download The Tokenizer

In [10]:
# get the data item
model_data_item = mlrun.get_dataitem(model_artifact.uri)
print(model_data_item.listdir())

['.gitattributes', 'README.md', 'config.json', 'generation_config.json', 'model.safetensors', 'quant_config.json', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer.model', 'tokenizer_config.json']


In [11]:
def download_tokenizer(model_artifact) -> str:
    """
    Download the tokenizer-related files of a model artifact to a local directory.

    Args:
        model_artifact: Model artifact whose ``uri`` can be resolved with
            ``mlrun.get_dataitem`` to a data item that supports ``listdir()``.

    Returns:
        Path of a newly created temporary directory containing the downloaded
        files. The directory is not removed by this function; the caller is
        responsible for cleaning it up.
    """
    # tokenizer files normally needed:
    tokenizer_files = {
        "tokenizer.json",
        "tokenizer.model",         # some models have one or the other
        "tokenizer_config.json",
        "special_tokens_map.json",
        "config.json",              # often needed too
        "generation_config.json",   # optional, if exists
    }

    # create a temporary directory to store the tokenizer files
    temp_dir = tempfile.mkdtemp(prefix="vllm_tokenizer_")

    # get the files in the data item
    data_item = mlrun.get_dataitem(model_artifact.uri)

    # Build file URLs from this artifact's own data item rather than from a
    # notebook-level variable, so the function works for any artifact passed in
    # and does not depend on other cells having run. Normalising the trailing
    # slash makes the join correct whether or not the URL ends with "/".
    base_url = data_item.url.rstrip("/")

    # download tokenizer-related files
    for filename in data_item.listdir():
        # if the file is not in the tokenizer files, skip it
        if filename not in tokenizer_files:
            continue

        # download the file to the temporary directory
        print(f"Downloading {filename} to {temp_dir}")
        data_item_file = mlrun.get_dataitem(f"{base_url}/{filename}")
        data_item_file.download(target_path=f"{temp_dir}/{filename}")

    return temp_dir

# Fetch the tokenizer files for the model artifact into a local directory
tokenizer_dir = download_tokenizer(model_artifact)
print(f"Tokenizer files downloaded to: {tokenizer_dir}")

Downloading config.json to /tmp/vllm_tokenizer_lzpe2bat
Downloading generation_config.json to /tmp/vllm_tokenizer_lzpe2bat
Downloading special_tokens_map.json to /tmp/vllm_tokenizer_lzpe2bat
Downloading tokenizer.json to /tmp/vllm_tokenizer_lzpe2bat
Downloading tokenizer.model to /tmp/vllm_tokenizer_lzpe2bat
Downloading tokenizer_config.json to /tmp/vllm_tokenizer_lzpe2bat
Tokenizer files downloaded to: /tmp/vllm_tokenizer_lzpe2bat


In [30]:
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=512)

In [13]:
# Build the vLLM engine. The weights are read from `model_uri` with the
# `runai_streamer` load format, while the tokenizer and the HF config are taken
# from the local `tokenizer_dir`.
# Options tried earlier and left disabled:
#   max_model_len=32768, max_model_len=1024, enable_chunked_prefill=False
llm = LLM(
    model=model_uri,
    load_format="runai_streamer",
    quantization="AWQ",
    tokenizer=tokenizer_dir,
    hf_config_path=tokenizer_dir,
    trust_remote_code=True,
)

INFO 08-11 23:38:59 [config.py:1604] Using max model len 32768
INFO 08-11 23:39:00 [awq_marlin.py:120] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 08-11 23:39:01 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-11 23:39:01 [core.py:572] Waiting for init message from front-end.
INFO 08-11 23:39:01 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='/tmp/tmpqlxcaiif', speculative_config=None, tokenizer='/tmp/vllm_tokenizer_lzpe2bat', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=runai_streamer, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto,  device_conf

Loading safetensors using Runai Model Streamer:   0% Completed | 0/739 [00:00<?, ?it/s]


[RunAI Streamer] Overall time to stream 3.9 GiB of all files: 1.95s, 2.0 GiB/s
Read throughput is 2.25 GB per second 
INFO 08-11 23:39:05 [gpu_model_runner.py:1892] Model loading took 3.8812 GiB and 2.226387 seconds
INFO 08-11 23:39:10 [backends.py:530] Using cache directory: /home/johnny/.cache/vllm/torch_compile_cache/79a36ed55d/rank_0_0/backbone for vLLM's torch.compile
INFO 08-11 23:39:10 [backends.py:541] Dynamo bytecode transform time: 4.74 s
INFO 08-11 23:39:12 [backends.py:194] Cache the graph for dynamic shape for later use
INFO 08-11 23:39:30 [backends.py:215] Compiling a graph for dynamic shape takes 19.44 s
INFO 08-11 23:39:37 [monitor.py:34] torch.compile takes 24.18 s in total
INFO 08-11 23:39:38 [gpu_worker.py:255] Available KV cache memory: 16.38 GiB
INFO 08-11 23:39:38 [kv_cache_utils.py:833] GPU KV cache size: 134,144 tokens
INFO 08-11 23:39:38 [kv_cache_utils.py:837] Maximum concurrency for 32,768 tokens per request: 4.09x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:07<00:00,  8.87it/s]


INFO 08-11 23:39:46 [gpu_model_runner.py:2485] Graph capturing finished in 8 secs, took 0.83 GiB
INFO 08-11 23:39:46 [core.py:193] init engine (profile, create kv cache, warmup model) took 41.08 seconds


In [38]:
# Run a single prompt through the engine with the sampling settings defined above.
outputs = llm.generate(
    "What is the capital of Congo?",
    sampling_params=sampling_params,
)

# Render each result as Markdown: the prompt as a heading, the first
# completion as the body.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    rendered = Markdown(f"## {prompt}\n\n{generated_text}")
    display(rendered)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

## What is the capital of Congo?

 The capital city of the Republic of the Congo is Brazzaville. The Democratic Republic of Congo, also known as DRC or Zaire, has its capital in Kinshasa. These two countries are often confused due to their similar names and shared history. It is important to note that they are separate countries with distinct capitals. Brazzaville is located on the Congo River in the west of the country, while Kinshasa is situated on the Congo River in the southeast of DRC.