# 06-01 : LLM Serving

A basic test that serves an LLM as a function.

## References

- [Deploying an LLM using MLRun](https://docs.mlrun.org/en/v1.7.2/tutorials/genai_01_basic_tutorial.html)

In [None]:
import mlrun

In [None]:
# Show the API server URL.
# The run-DB client is left as the cell's last bare expression so that the
# notebook displays it; presumably its repr includes the API server URL —
# confirm against the rendered output.
mlrun.get_run_db()

## 1. Configuration

In [None]:
# Model ID to serve; it is handed to the serving function through the
# MODEL_ID environment variable further down.
MODEL_ID = "microsoft/phi-2"
# Base project name. Note that this variable is rebound to the project's
# stored name (project.metadata.name) right after the project is created.
project_name = "llm-serving"

### 1.1 Create The Project

In [None]:
# Load the project when it already exists, otherwise create it.
# user_project=True is passed through to MLRun (per the MLRun docs this makes
# the project name user-specific, which is why the name is re-read below).
project = mlrun.get_or_create_project(name=project_name, user_project=True)

# Rebind project_name to the name stored on the project and report it
project_name = project.metadata.name
print(f"Full project name: {project_name}")

### 1.2 Model Cache Directory

In [None]:
# Cache directory for the model, derived from the configured artifact path:
#   1. rewrite the "v3io://" scheme prefix to the "/v3io" path prefix,
#   2. fill the "{{run.project}}" placeholder with this project's name,
#   3. append a "cache" sub-directory.
CACHE_DIR = (
    mlrun.mlconf.artifact_path
    .replace("v3io://", "/v3io")
    .replace("{{run.project}}", project.name)
    + "/cache"
)
print(f"Cache directory: {CACHE_DIR}")

## 2. Serving Function

In [None]:
# Extra Python packages requested for the function
requirements = ["transformers==4.41.2", "tensorflow==2.16.1", "torch"]

# Register the Nuclio function that serves the model (handler `invoke_llm`
# in src/06-01_serving.py) and apply MLRun's auto-mount modifier to it
serve_func = project.set_function(
    func="src/06-01_serving.py",
    name="serve-llm",
    kind="nuclio",
    handler="invoke_llm",
    image="dragon:6500/mlrun/mlrun-gpu:1.7.2",
    requirements=requirements,
).apply(mlrun.auto_mount())

# Request one GPU for the function.
# NOTE(review): the previous comment here claimed the function runs on CPU
# only, yet a GPU image and a GPU limit are requested — confirm which one is
# intended.
serve_func.with_limits(gpus=1)

# Hand the model ID and the cache directory to the handler as env vars
serve_func.set_envs(env_vars={"MODEL_ID": MODEL_ID, "CACHE_DIR": CACHE_DIR})

# The model is held in memory, so pin the function to exactly one replica
# with a single HTTP worker
serve_func.spec.min_replicas = serve_func.spec.max_replicas = 1

# Inference may take around a minute, so the worker and gateway timeouts are
# raised; the readiness timeout is raised as well (presumably to cover model
# download/loading at start-up — confirm)
serve_func.with_http(workers=1, worker_timeout=120, gateway_timeout=150)
serve_func.set_config("spec.readinessTimeoutSeconds", 1200)

In [None]:
# Deploy the function registered in the project under the name "serve-llm".
# NOTE(review): `serve_func` is rebound here to whatever deploy_function
# returns; per the MLRun docs that is a deployment-status object rather than
# the function itself — confirm before calling function methods on it later.
serve_func = project.deploy_function(function="serve-llm")