## Deploying DeepSeek-R1-1.5B - A Python SDK Experience

Learn how to deploy the <code>DeepSeek-R1-1.5B</code> model via vLLM and an AML Online Endpoint.

This notebook follows a similar approach to Jihua Liu's [AML-DeepSeek](https://github.com/liougehooa/azureml-deepseek/tree/main) [notebook](https://github.com/liougehooa/azureml-deepseek/blob/main/distilled/deepseek_aml_vllm_1.5b.ipynb), but streamlines the process and includes API concurrency evaluations at the end.

You can either run this notebook locally or run on an <code>AML CPU Compute Standard_D13_v2</code> with Kernel type <code>Python 3.10 - SDK v2</code>. 

Note that you need at least one <code>Standard_NC6s_v3</code> GPU instance (a V100 with 16 GB of GPU memory) available in order to run this demo successfully.

He Zhang, Mar. 2025

In [None]:
# Uncomment and run once if any of these packages is missing from the kernel environment.
# Use the %pip magic (not a bare `pip`) so the install targets this kernel's environment.
#%pip install azure-ai-ml
#%pip install azure-identity
#%pip install openai
#%pip install azureml-core  # SDK v1 package; needed for the `azureml.core` import in the next cell

In [None]:
# import required libraries
import asyncio
import os
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from time import perf_counter

import numpy as np
import pandas as pd
import requests
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Environment,
    BuildContext, 
    OnlineRequestSettings,
    ProbeSettings
)
from azure.identity import DefaultAzureCredential
from azureml.core.workspace import Workspace
from openai import OpenAI

In [None]:
# enter details of your Azure Machine Learning workspace
# (replace each "xxx" placeholder with your own value before running the cells below)
subscription_id = "xxx"  # Azure subscription ID
resource_group = "xxx"  # resource group that contains the workspace
workspace = "xxx"  # Azure Machine Learning workspace name

In [None]:
# Connect to the workspace; DefaultAzureCredential picks up whichever Azure login is available.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

## Deploying DeepSeek as an AML Online Endpoint / API

In [None]:
# create docker folder
# (pure Python instead of a `!mkdir -p` shell-out, so the cell behaves the same on
# Linux, macOS and Windows and is safe to re-run)
os.makedirs("docker", exist_ok=True)

In [None]:
%%writefile docker/Dockerfile

## vllm cuda12.1, azure image cuda vision 18.x
# NOTE(review): ":latest" is a moving tag - consider pinning a specific vLLM release
# so that rebuilding the environment later produces the same image.
FROM vllm/vllm-openai:latest
# Default model served by the container (key=value form; the space-separated ENV form is legacy syntax).
ENV MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
# Shell form is kept so $MODEL_NAME and $VLLM_ARGS are expanded when the container starts;
# `exec` makes the server replace the shell process so it receives stop signals directly.
ENTRYPOINT exec python3 -m vllm.entrypoints.openai.api_server --model $MODEL_NAME $VLLM_ARGS

In [None]:
# Build the serving environment from the local Docker context in ./docker.
# All routes target port 8000 (presumably the vLLM server's listening port - confirm):
# /health backs both probes and scoring traffic is forwarded to the server root.
vllm_server_port = 8000
health_route = {"port": vllm_server_port, "path": "/health"}

env_docker_context = Environment(
    name="deepseek-r1-1p5b-qwen",
    description="Environment created from a Docker context for vLLM: DeepSeek-R1-Distill-Qwen-1.5B.",
    build=BuildContext(path="docker", dockerfile_path="Dockerfile"),
    inference_config={
        "liveness_route": dict(health_route),
        "readiness_route": dict(health_route),
        "scoring_route": {"port": vllm_server_port, "path": "/"},
    },
)

# Register (or update) the environment in the workspace.
ml_client.environments.create_or_update(env_docker_context)

In [None]:
# define an endpoint name
endpoint_name_prefix = "vllms-deepseek-r1-1p5b-qwen"
endpoint_name = f'{endpoint_name_prefix}-ep'

# define the "blue" Managed Online Deployment that serves the model behind the endpoint
blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=endpoint_name,
    environment=env_docker_context,
    environment_variables={
        # consumed by the container ENTRYPOINT: model to serve and extra vLLM CLI flags
        "MODEL_NAME": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "VLLM_ARGS": "--max-model-len 8000 --max-num-seqs 16 --enforce-eager --dtype float16 --trust-remote-code",
        # Read the Hugging Face token from the local environment instead of pasting it
        # into the notebook; the "xxx" placeholder is kept as the fallback value.
        "HUGGING_FACE_HUB_TOKEN": os.environ.get("HUGGING_FACE_HUB_TOKEN", "xxx"),
    },
    instance_type="Standard_NC6s_v3",
    instance_count=1,
    request_settings=OnlineRequestSettings(
        max_concurrent_requests_per_instance=10,
        request_timeout_ms=120000,
        max_queue_wait_ms=240000,
    ),
    # generous initial delay / failure threshold: the container has to fetch and load
    # the model before /health can answer
    liveness_probe=ProbeSettings(initial_delay=200,
                                 period=30,
                                 timeout=10,
                                 success_threshold=1,
                                 failure_threshold=30),
    readiness_probe=ProbeSettings(initial_delay=200,
                                  period=10,
                                  timeout=2,
                                  success_threshold=1,
                                  failure_threshold=30)
)

# NOTE(review): the displayed object may include environment_variables (and therefore the
# token) - clear this cell's output before sharing the notebook.
blue_deployment

In [None]:
# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    description="this is an endpoint for vllms: DeepSeek-R1-Distill-Qwen-1.5B.",
    auth_mode="key"
)

# begin_create_or_update only *starts* a long-running operation and returns a poller.
# Wait for it to finish so the endpoint really exists before it is fetched below and
# before a deployment is attached to it in the next cell.
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
ml_client.online_endpoints.get(name=endpoint_name)

In [None]:
# Attach the "blue" deployment to the endpoint and block until provisioning has finished
# (this will usually take 5 to 15 minutes).
deployment_poller = ml_client.online_deployments.begin_create_or_update(blue_deployment)
deployment_poller.result()

In [None]:
# list all endpoints
# (a dedicated loop variable is used so the notebook-level `endpoint` object created
# earlier is not silently replaced by whichever endpoint happens to be listed last)
for indx, listed_endpoint in enumerate(ml_client.online_endpoints.list(), start=1):
    print("Endpoint:", indx)
    print("  Kind:", listed_endpoint.kind)
    print("  Location:", listed_endpoint.location)
    print("  Name:", listed_endpoint.name)
    print("  Endpoint:", listed_endpoint.scoring_uri)

In [None]:
# Fetch the last 500 log lines of the "blue" deployment to verify that the server started.
logs = ml_client.online_deployments.get_logs(
    name="blue",
    endpoint_name=endpoint_name,
    lines=500,
)
print(logs)

In [None]:
# Route 100% of the endpoint's traffic to the "blue" deployment
# (with a second deployment the split could be e.g. {"blue": 100, "green": 0}).
endpoint_remote = ml_client.online_endpoints.get(name=endpoint_name)
endpoint_remote.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint_remote).result()

In [None]:
# get endpoint url
# (normalized to end with exactly one "/" because later cells append "v1/..." to it)
endpoint = ml_client.online_endpoints.get(name=endpoint_name)
endpoint_url = endpoint.scoring_uri.rstrip("/") + "/"
print("endpoint_url:", endpoint_url)

# get endpoint key
keys = ml_client.online_endpoints.get_keys(name=endpoint_name)
endpoint_key = keys.primary_key
# never echo the full secret into saved notebook output; the last characters are
# enough to confirm that a key was retrieved
print("endpoint_key:", "****" + endpoint_key[-4:])

In [None]:
%%time
# call the API endpoint using HTTP Requests Completion
api_url = endpoint_url + "v1/completions"
api_key = endpoint_key

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

data = {
    "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "prompt": "The city of Shanghai is",
    "max_tokens": 300,
    "temperature": 0.6
}

# bound the wait so a stalled endpoint cannot hang the cell forever;
# 120 s mirrors the deployment's request_timeout_ms of 120000
response = requests.post(api_url, headers=headers, json=data, timeout=120)
# surface HTTP errors (401, 404, 5xx, ...) right here instead of a confusing KeyError below
response.raise_for_status()

# parse the body once and reuse it
completion_json = response.json()
print(completion_json["choices"][0]["text"])
print("\n", completion_json["usage"])

In [None]:
%%time
# Same text-completion request as above, this time through the OpenAI client,
# which talks to the endpoint's OpenAI-compatible "v1" routes.
openai_base_url = endpoint_url + "v1"
openai_api_key = endpoint_key

client = OpenAI(
    base_url=openai_base_url,
    api_key=openai_api_key,
)

completion = client.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    prompt="The city of Shanghai is",
    temperature=0.6,
    max_tokens=300,
)

# generated text first, then the token accounting reported by the server
print(completion.choices[0].text)
print("\n", completion.usage)

In [None]:
%%time
# Chat-style request through the OpenAI client (chat completions route).
openai_base_url = endpoint_url + "v1"
openai_api_key = endpoint_key

client = OpenAI(
    base_url=openai_base_url,
    api_key=openai_api_key,
)

chat_messages = [
    {"role": "user", "content": "Tell me something about Shanghai China."}
]

chat_completion = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=chat_messages,
    temperature=0.6,
    max_tokens=300,
)

# assistant reply first, then the token accounting reported by the server
print(chat_completion.choices[0].message.content)
print("\n", chat_completion.usage)

## Testing API Performance in Batches

### Evaluate API Sequential Calls

In [None]:
# define a function to do sequential calls to the API 
def call_llm_sequential(
    model_path: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    num_infers: int = 5,
    aoai_client: object = None,
    messages: list = None,
    max_tokens: int = 500,
    temperature: float = 0.6) -> dict:
    """Evaluate LLM performance in sequential calls.

    Sends one warm-up chat completion, then `num_infers` chat completions one
    after another, timing each call individually and the run as a whole.

    Args:
        model_path: Model name passed as `model` to the chat completions API.
        num_infers: Number of timed requests sent after the warm-up call.
        aoai_client: Object exposing `chat.completions.create(...)`. When None,
            the notebook-level `client` is looked up at call time.
        messages: Chat messages sent with every timed request. When None, a
            single user message "Hello." is used.
        max_tokens: `max_tokens` value for every timed request.
        temperature: `temperature` value for every timed request.

    Returns:
        A dict with:
            "total_duration_sec": duration of all timed requests, rounded to 2 decimals
                (the warm-up call is excluded),
            "latencies_sec": list with the latency of each timed request,
            "chat_completions": list with the response object of each timed request,
            "warmup_latency_sec": latency of the warm-up request.
    """
    # Resolve defaults at call time: a default bound in the signature would be evaluated
    # once at definition time (stale client, shared mutable message list).
    if aoai_client is None:
        aoai_client = client
    if messages is None:
        messages = [{"role": "user", "content": "Hello."}]

    print("=== Measuring latency ===")
    print(f"model_path={model_path}, num_infers={num_infers}, max_tokens={max_tokens}, temperature={temperature}")

    # warm up (not part of the reported latencies or total duration)
    warmup_start = perf_counter()
    _ = aoai_client.chat.completions.create(
        model=model_path,
        messages=[{"role": "user", "content": "Please just say: Warm-up is done."}],
        max_tokens=100,
        temperature=0.6)

    warmup_latency = perf_counter() - warmup_start
    print("\nWarm-up is done! (it takes {:.2f} seconds.)".format(warmup_latency))

    # test serial calls; perf_counter is monotonic, so one clock is used for both
    # the per-request latencies and the total duration
    total_test_start = perf_counter()
    latencies_sec = []
    chat_completions = []
    for curr_infer in range(num_infers):
        # progress line on every 5th request, numbered from 1
        if (curr_infer % 5) == 0:
            print(f"\nCalling API: request {curr_infer + 1} of {num_infers} ...")

        single_test_start = perf_counter()
        chat_completion = aoai_client.chat.completions.create(
            model=model_path,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature)

        latencies_sec.append(perf_counter() - single_test_start)
        chat_completions.append(chat_completion)

    total_dur_sec = np.round(perf_counter() - total_test_start, 2)

    # output metrics in a dict.
    return {
        "total_duration_sec": total_dur_sec,
        "latencies_sec": latencies_sec,
        "chat_completions": chat_completions,
        "warmup_latency_sec": warmup_latency,
    }

In [None]:
%%time
# Sequential benchmark: the same prompt is sent `num_infers` times, one request at a time.
model_path = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
num_infers = 5
aoai_client = client
max_tokens = 1000
temperature = 0.6
messages = [
    {
        "role": "user",
        "content": "What is Shanghai China famous for? Give me top 3 points only.",
    }
]

results = call_llm_sequential(
    model_path=model_path,
    num_infers=num_infers,
    aoai_client=aoai_client,
    messages=messages,
    max_tokens=max_tokens,
    temperature=temperature,
)

In [None]:
# Summarize the per-request latencies of the sequential run (all values rounded to 2 decimals).
latencies = np.asarray(results["latencies_sec"])

latency_avg_sec = np.round(latencies.mean(), 2)
latency_std_sec = np.round(latencies.std(), 2)
latency_p95_sec = np.round(np.percentile(latencies, 95), 2)
latency_p99_sec = np.round(np.percentile(latencies, 99), 2)
total_duration_sec = results["total_duration_sec"]

for metric_name, metric_value in (
    ("latency_avg_sec", latency_avg_sec),
    ("latency_std_sec", latency_std_sec),
    ("latency_p95_sec", latency_p95_sec),
    ("latency_p99_sec", latency_p99_sec),
    ("total_duration_sec", total_duration_sec),
):
    print(f"{metric_name}:", metric_value)

In [None]:
# Throughput of the sequential run: generated (completion) tokens per second of total duration.
sequential_calls_completion_tokens = [
    completion.usage.completion_tokens
    for completion in results["chat_completions"]
]
avg_tokens_per_sec = np.sum(sequential_calls_completion_tokens) / total_duration_sec
print("avg_tokens_per_sec:", np.round(avg_tokens_per_sec, 2))

In [None]:
# Print the text of every sequential completion, numbered from 1.
for position, completion in enumerate(results["chat_completions"], start=1):
    reply_text = completion.choices[0].message.content
    print("Completion =", position, "\n--------------\n", reply_text, "\n")

### Evaluate API Concurrent Calls

In [None]:
# define a function to evaluate API concurrent calls 
async def call_llm_concurrent(aoai_client, model_path, concurrent_requests, messages, max_tokens, temperature):
    """Run multiple concurrent OpenAI API requests.

    The synchronous `aoai_client.chat.completions.create` call is executed
    `concurrent_requests` times, each in its own worker thread, and all
    responses are awaited together.

    Args:
        aoai_client: Object exposing a blocking `chat.completions.create(...)`.
        model_path: Model name passed as `model` to every request.
        concurrent_requests: Number of identical requests to issue at once.
        messages: Chat messages sent with every request.
        max_tokens: `max_tokens` value for every request.
        temperature: `temperature` value for every request.

    Returns:
        List of response objects, one per request, in submission order.
        An empty list when `concurrent_requests` is zero or negative.

    Raises:
        Whatever exception a request raises is propagated by `asyncio.gather`.
    """
    if concurrent_requests <= 0:
        return []

    # one ready-to-call request; every worker thread sends the same payload
    send_request = partial(
        aoai_client.chat.completions.create,
        model=model_path,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature)

    # A dedicated pool sized to the request count guarantees that every request is in
    # flight at the same time. The event loop's default executor is capped at
    # min(32, cpu_count + 4) threads, which would silently queue the excess requests
    # and distort the concurrency measurement.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=concurrent_requests) as pool:
        pending = [loop.run_in_executor(pool, send_request) for _ in range(concurrent_requests)]
        results = await asyncio.gather(*pending)

    return results

In [None]:
# test API in concurrent way
model_path = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
concurrent_requests = 5
aoai_client = client
max_tokens = 1000
temperature = 0.6
messages = [{"role": "user", 
             "content": "What is Shanghai China famous for? Give me top 3 points only."}]

# asyncio.run() raises RuntimeError when the calling thread already has a running event
# loop, which is the case inside a Jupyter kernel. Running it in a short-lived worker
# thread gives the coroutine its own event loop, so the cell works both in a notebook
# and as a plain script.
begin_time = perf_counter()  # monotonic clock, same as the sequential benchmark's per-call timing
with ThreadPoolExecutor(max_workers=1) as loop_runner:
    results = loop_runner.submit(
        asyncio.run,
        call_llm_concurrent(aoai_client, model_path, concurrent_requests, messages, max_tokens, temperature),
    ).result()
end_time = perf_counter()
total_dur_sec = np.round((end_time - begin_time), 2)
print("Total time in running {} concurrent API calls: {:.2f}".format(concurrent_requests, total_dur_sec))

In [None]:
# calculate token metrics
concurrent_calls_completion_tokens = [result.usage.completion_tokens for result in results]
# throughput of the concurrent run: its own completion tokens divided by its own
# wall time (total_dur_sec), not the figures left over from the sequential benchmark
avg_tokens_per_sec = np.sum(concurrent_calls_completion_tokens) / total_dur_sec
print("avg_tokens_per_sec:", np.round(avg_tokens_per_sec, 2))

In [None]:
# Print the text of every concurrent completion, numbered from 1.
for position, completion in enumerate(results, start=1):
    reply_text = completion.choices[0].message.content
    print("Completion =", position, "\n--------------\n", reply_text, "\n")