[App] Cold start proxy in autoscaler #16094

Merged
merged 32 commits on Dec 20, 2022
Changes from 16 commits
Commits
32 commits
4bafc59
wip clean up autoscaler ui
akihironitta Dec 15, 2022
1cefa05
Revert "wip clean up autoscaler ui"
akihironitta Dec 15, 2022
1e69092
Apply sherin's suggestion
akihironitta Dec 15, 2022
f9406cc
update example
akihironitta Dec 15, 2022
694627f
print endpoint in the log
akihironitta Dec 15, 2022
96b77ea
Fix import
akihironitta Dec 15, 2022
44cbec2
revert irrelevant change
akihironitta Dec 15, 2022
5d8af44
Merge branch 'master' into feat/autoscaler-ui
Dec 16, 2022
82fef89
Update src/lightning_app/components/auto_scaler.py
Dec 16, 2022
d8f4778
clean up
Dec 16, 2022
c7443b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 16, 2022
26f5f4b
test rename
Dec 16, 2022
1e7199c
Merge branch 'feat/autoscaler-ui' of github.com:Lightning-AI/lightnin…
Dec 16, 2022
4f3365c
Changelog
Dec 16, 2022
8501bd4
cold start proxy
Dec 16, 2022
0e2cee0
cold-start-proxy
Dec 16, 2022
7254e99
master
Dec 19, 2022
85e0205
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2022
5d04a21
Merge branch 'master' into cold-start-proxy
Dec 19, 2022
08643b2
Merge branch 'master' into cold-start-proxy
Dec 19, 2022
d62dd8b
merge
Dec 19, 2022
8dad987
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2022
901402a
Merge branch 'master' into cold-start-proxy
Dec 19, 2022
4d2596d
Merge branch 'master' into cold-start-proxy
Dec 20, 2022
eda6be5
docs and tests
Dec 20, 2022
b05c9aa
Merge branch 'master' into cold-start-proxy
Dec 20, 2022
5d727e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2022
21d325d
[App] Fixing auto batching in Autoscaler (#16110)
Dec 20, 2022
0af707f
Update src/lightning_app/components/serve/auto_scaler.py
Dec 20, 2022
77db466
changelog
Dec 20, 2022
6167943
better-doc
Dec 20, 2022
c1f7d70
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2022
14 changes: 5 additions & 9 deletions examples/app_server_with_auto_scaler/app.py
@@ -1,5 +1,5 @@
# ! pip install torch torchvision
from typing import Any, List
from typing import List

import torch
import torchvision
@@ -8,16 +8,12 @@
import lightning as L


class RequestModel(BaseModel):
image: str # bytecode


class BatchRequestModel(BaseModel):
inputs: List[RequestModel]
inputs: List[L.app.components.Image]


class BatchResponse(BaseModel):
outputs: List[Any]
outputs: List[L.app.components.Number]


class PyTorchServer(L.app.components.PythonServer):
@@ -81,8 +77,8 @@ def scale(self, replicas: int, metrics: dict) -> int:
max_replicas=4,
autoscale_interval=10,
endpoint="predict",
input_type=RequestModel,
output_type=Any,
input_type=L.app.components.Image,
output_type=L.app.components.Number,
timeout_batching=1,
max_batch_size=8,
)
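A minimal client sketch for the updated example app, to be run against a locally served copy of it. The host/port (127.0.0.1:7501) and the assumption that the built-in `L.app.components.Image` model carries a single base64-encoded `image` field are illustrative, not part of this diff:

import base64

import requests

# Encode a local image as base64, matching the assumed schema of L.app.components.Image.
with open("image.png", "rb") as f:
    payload = {"image": base64.b64encode(f.read()).decode("utf-8")}

# POST a single request; the load balancer batches up to max_batch_size=8 requests server-side.
response = requests.post("http://127.0.0.1:7501/predict", json=payload, timeout=30)
print(response.json())  # e.g. a numeric prediction, per the assumed Number output type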
2 changes: 2 additions & 0 deletions src/lightning_app/CHANGELOG.md
@@ -15,6 +15,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added partial support for fastapi `Request` annotation in `configure_api` handlers ([#16047](https://github.com/Lightning-AI/lightning/pull/16047))

- Added a nicer UI with URL and examples for the autoscaler component ([#16063](https://github.com/Lightning-AI/lightning/pull/16063))


### Changed

3 changes: 1 addition & 2 deletions src/lightning_app/components/__init__.py
@@ -1,4 +1,3 @@
from lightning_app.components.auto_scaler import AutoScaler
from lightning_app.components.database.client import DatabaseClient
from lightning_app.components.database.server import Database
from lightning_app.components.multi_node import (
@@ -10,7 +9,7 @@
from lightning_app.components.python.popen import PopenPythonScript
from lightning_app.components.python.tracer import Code, TracerPythonScript
from lightning_app.components.serve.gradio import ServeGradio
from lightning_app.components.serve.python_server import Image, Number, PythonServer
from lightning_app.components.serve.python_server import AutoScaler, Image, Number, PythonServer
from lightning_app.components.serve.serve import ModelInferenceAPI
from lightning_app.components.serve.streamlit import ServeStreamlit
from lightning_app.components.training import LightningTrainerScript, PyTorchLightningScriptRunner
4 changes: 2 additions & 2 deletions src/lightning_app/components/serve/__init__.py
@@ -1,5 +1,5 @@
from lightning_app.components.serve.gradio import ServeGradio
from lightning_app.components.serve.python_server import Image, Number, PythonServer
from lightning_app.components.serve.python_server import AutoScaler, Image, Number, PythonServer
from lightning_app.components.serve.streamlit import ServeStreamlit

__all__ = ["ServeGradio", "ServeStreamlit", "PythonServer", "Image", "Number"]
__all__ = ["ServeGradio", "ServeStreamlit", "PythonServer", "Image", "Number", "AutoScaler"]
src/lightning_app/components/{auto_scaler.py → serve/auto_scaler.py}
@@ -1,12 +1,13 @@
import asyncio
import inspect
import logging
import os
import secrets
import time
import uuid
from base64 import b64encode
from itertools import cycle
from typing import Any, Dict, List, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type, Union

import requests
import uvicorn
@@ -15,11 +16,13 @@
from fastapi.responses import RedirectResponse
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from pydantic import BaseModel
from starlette.staticfiles import StaticFiles
from starlette.status import HTTP_401_UNAUTHORIZED

from lightning_app.core.flow import LightningFlow
from lightning_app.core.work import LightningWork
from lightning_app.utilities.app_helpers import Logger
from lightning_app.utilities.cloud import is_running_in_cloud
from lightning_app.utilities.imports import _is_aiohttp_available, requires
from lightning_app.utilities.packaging.cloud_compute import CloudCompute

@@ -30,7 +33,32 @@
logger = Logger(__name__)


def _raise_granular_exception(exception: Exception) -> None:
class ColdStartProxy:
"""Proxy incoming requests to an external endpoint while none of the autoscaled replicas can serve them."""

def __init__(self, proxy_url: str):
self.proxy_url = proxy_url
self.proxy_timeout = 50
if not inspect.iscoroutinefunction(self.handle_request):
raise TypeError("handle_request must be an `async` function")

async def handle_request(self, request: BaseModel) -> Any:
try:
async with aiohttp.ClientSession() as session:
headers = {
"accept": "application/json",
"Content-Type": "application/json",
}
async with session.post(
self.proxy_url,
json=request.dict(),
timeout=self.proxy_timeout,
headers=headers,
) as response:
return await response.json()
except Exception as ex:
raise HTTPException(status_code=500, detail=f"Error in proxy: {ex}")
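A minimal sketch of customising the proxy above by subclassing; the canned response and the placeholder URL are assumptions for illustration, not part of this diff, and `ColdStartProxy` refers to the class defined just above:

from typing import Any

from pydantic import BaseModel


class WarmupProxy(ColdStartProxy):
    """Return a canned payload instead of forwarding while replicas warm up."""

    def __init__(self):
        # The base class still expects a URL, even though handle_request below never uses it.
        super().__init__(proxy_url="https://example.com/fallback")

    # Must remain `async`: the base __init__ raises TypeError for non-coroutine overrides.
    async def handle_request(self, request: BaseModel) -> Any:
        return {"status": "replicas warming up, please retry", "echo": request.dict()}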


def _maybe_raise_granular_exception(exception: Exception) -> None:
"""Handle an exception from hitting the model servers."""
if not isinstance(exception, Exception):
return
@@ -114,20 +142,22 @@ class _LoadBalancer(LightningWork):
requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached.
timeout_keep_alive: The number of seconds until it closes Keep-Alive connections if no new data is received.
timeout_inference_request: The number of seconds to wait for inference.
\**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc.
**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc.
"""

@requires(["aiohttp"])
def __init__(
self,
input_type: BaseModel,
output_type: BaseModel,
input_type: Type[BaseModel],
output_type: Type[BaseModel],
endpoint: str,
max_batch_size: int = 8,
# all timeout args are in seconds
timeout_batching: int = 1,
timeout_batching: float = 1,
timeout_keep_alive: int = 60,
timeout_inference_request: int = 60,
work_name: Optional[str] = "API", # used for displaying the name in the UI
cold_start_proxy: Union[ColdStartProxy, str, None] = None,
**kwargs: Any,
) -> None:
super().__init__(cloud_compute=CloudCompute("default"), **kwargs)
@@ -142,12 +172,24 @@ def __init__(
self._batch = []
self._responses = {} # {request_id: response}
self._last_batch_sent = 0
self._work_name = work_name

if not endpoint.startswith("/"):
endpoint = "/" + endpoint

self.endpoint = endpoint

self._fastapi_app = None

self._cold_start_proxy = None
if cold_start_proxy:
if isinstance(cold_start_proxy, str):
self._cold_start_proxy = ColdStartProxy(proxy_url=cold_start_proxy)
elif isinstance(cold_start_proxy, ColdStartProxy):
self._cold_start_proxy = cold_start_proxy
else:
raise ValueError("cold_start_proxy must be of type ColdStartProxy or str")

async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]]):
server = next(self._iter) # round-robin
request_data: List[_LoadBalancer._input_type] = [b[1] for b in batch]
@@ -193,22 +235,40 @@ async def consumer(self):
self._last_batch_sent = time.time()

async def process_request(self, data: BaseModel):
if not self.servers:
if not self.servers and not self._cold_start_proxy:
raise HTTPException(500, "None of the workers are healthy!")

request_id = uuid.uuid4().hex
request: Tuple = (request_id, data)
self._batch.append(request)

# if no servers are available, send the request to the cold start proxy handler
if not self.servers:
return await self._cold_start_proxy.handle_request(data)

# if the servers are at capacity, send the request to the cold start proxy handler
if not self._has_processing_capacity():
return await self._cold_start_proxy.handle_request(data)

# if we have capacity, process the request
self._batch.append((request_id, data))
while True:
await asyncio.sleep(0.05)

if request_id in self._responses:
result = self._responses[request_id]
del self._responses[request_id]
_raise_granular_exception(result)
_maybe_raise_granular_exception(result)
return result

def _has_processing_capacity(self):
""" this function checks if currently have processing capacity for one more request or not. Depends on
the value from here, we decide whether we should proxy the request or not
"""
if not self._fastapi_app:
return False
active_server_count = len(self.servers)
max_processable = self.max_batch_size * active_server_count
current_req_count = self._fastapi_app.num_current_requests
return current_req_count < max_processable
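To make the threshold concrete, a short sketch of the same arithmetic with hypothetical numbers (two healthy replicas and the default max_batch_size of 8):

# Hypothetical counts illustrating _has_processing_capacity above.
max_batch_size = 8
active_server_count = 2
max_processable = max_batch_size * active_server_count  # at most 16 requests in flight

for current_req_count in (15, 16, 17):
    has_capacity = current_req_count < max_processable
    print(current_req_count, "->", "process in-cluster" if has_capacity else "hand to cold start proxy")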

def run(self):
logger.info(f"servers: {self.servers}")
lock = asyncio.Lock()
@@ -219,6 +279,7 @@ def run(self):
fastapi_app = _create_fastapi("Load Balancer")
security = HTTPBasic()
fastapi_app.SEND_TASK = None
self._fastapi_app = fastapi_app

@fastapi_app.middleware("http")
async def current_request_counter(request: Request, call_next):
@@ -280,6 +341,14 @@ async def update_servers(servers: List[str], authenticated: bool = Depends(authe
async def balance_api(inputs: self._input_type):
return await self.process_request(inputs)

endpoint_info_page = self._get_endpoint_info_page()
if endpoint_info_page:
fastapi_app.mount(
"/endpoint-info", StaticFiles(directory=endpoint_info_page.serve_dir, html=True), name="static"
)

logger.info(f"Your load balancer has started. The endpoint is 'http://{self.host}:{self.port}{self.endpoint}'")

uvicorn.run(
fastapi_app,
host=self.host,
@@ -332,6 +401,56 @@ def send_request_to_update_servers(self, servers: List[str]):
response = requests.put(f"{self.url}/system/update-servers", json=servers, headers=headers, timeout=10)
response.raise_for_status()

@staticmethod
def _get_sample_dict_from_datatype(datatype: Any) -> dict:
if hasattr(datatype, "_get_sample_data"):
return datatype._get_sample_data()

datatype_props = datatype.schema()["properties"]
out: Dict[str, Any] = {}
lut = {"string": "data string", "number": 0.0, "integer": 0, "boolean": False}
for k, v in datatype_props.items():
if v["type"] not in lut:
raise TypeError("Unsupported type")
out[k] = lut[v["type"]]
return out
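As a concrete illustration of the lookup table above, a hypothetical two-field request model (not part of this diff) maps to a sample payload like this:

from pydantic import BaseModel


class EchoRequest(BaseModel):
    # Hypothetical model used only to illustrate _get_sample_dict_from_datatype.
    text: str
    count: int


# Per the `lut` above: "string" -> "data string", "integer" -> 0, so
# _get_sample_dict_from_datatype(EchoRequest) == {"text": "data string", "count": 0}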

def get_code_sample(self, url: str) -> Optional[str]:
input_type: Any = self._input_type
output_type: Any = self._output_type

if not (hasattr(input_type, "request_code_sample") and hasattr(output_type, "response_code_sample")):
return None
return f"{input_type.request_code_sample(url)}\n{output_type.response_code_sample()}"

def _get_endpoint_info_page(self) -> Optional["APIAccessFrontend"]: # noqa: F821
try:
from lightning_api_access import APIAccessFrontend
except ModuleNotFoundError:
logger.warn("APIAccessFrontend not found. Please install lightning-api-access to enable the UI")
return

if is_running_in_cloud():
url = f"{self._future_url}{self.endpoint}"
else:
url = f"http://localhost:{self.port}{self.endpoint}"

frontend_objects = {"name": self._work_name, "url": url, "method": "POST", "request": None, "response": None}
code_samples = self.get_code_sample(url)
if code_samples:
frontend_objects["code_samples"] = code_samples
# TODO also set request/response for JS UI
else:
try:
request = self._get_sample_dict_from_datatype(self._input_type)
response = self._get_sample_dict_from_datatype(self._output_type)
except TypeError:
return None
else:
frontend_objects["request"] = request
frontend_objects["response"] = response
return APIAccessFrontend(apis=[frontend_objects])


class AutoScaler(LightningFlow):
"""The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in
@@ -347,6 +466,7 @@ class AutoScaler(LightningFlow):
timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process.
input_type: Input type.
output_type: Output type.
cold_start_proxy: If provided, requests are forwarded to this proxy while the GPU machines are warming up.

.. testcode::

@@ -403,8 +523,9 @@ def __init__(
max_batch_size: int = 8,
timeout_batching: float = 1,
endpoint: str = "api/predict",
input_type: BaseModel = Dict,
output_type: BaseModel = Dict,
input_type: Type[BaseModel] = Dict,
output_type: Type[BaseModel] = Dict,
cold_start_proxy: Union[ColdStartProxy, str, None] = None,
*work_args: Any,
**work_kwargs: Any,
) -> None:
@@ -438,6 +559,8 @@ def __init__(
timeout_batching=timeout_batching,
cache_calls=True,
parallel=True,
work_name=self._work_cls.__name__,
cold_start_proxy=cold_start_proxy,
)
for _ in range(min_replicas):
work = self.create_work()
@@ -574,5 +697,8 @@ def autoscale(self) -> None:
self._last_autoscale = time.time()

def configure_layout(self):
tabs = [{"name": "Swagger", "content": self.load_balancer.url}]
tabs = [
{"name": "Endpoint Info", "content": f"{self.load_balancer}/endpoint-info"},
{"name": "Swagger", "content": self.load_balancer.url},
]
return tabs
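Tying the new argument into the existing example: a sketch that assumes `PyTorchServer` is the work class from examples/app_server_with_auto_scaler/app.py and uses a placeholder proxy URL; it is not taken verbatim from this PR:

import lightning as L

app = L.LightningApp(
    L.app.components.AutoScaler(
        PyTorchServer,  # work class from the example app above
        min_replicas=1,
        max_replicas=4,
        autoscale_interval=10,
        endpoint="predict",
        input_type=L.app.components.Image,
        output_type=L.app.components.Number,
        max_batch_size=8,
        timeout_batching=1,
        # Requests arriving before any replica is ready, or beyond current capacity,
        # are forwarded here instead of failing with a 500.
        cold_start_proxy="https://example.com/warm-fallback",  # placeholder URL
    )
)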