InternLM · AllentDan · May 27, 2024 · May 28, 2024 · May 29, 2024 · May 29, 2024
diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
@@ -84,14 +84,27 @@ def __init__(self,
             self.model_name = model_name
         self.pbar = None
 
-    def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
-                   stream_output: bool):
+    def _inference(self,
+                   req_queue: Queue,
+                   res_queue: Queue,
+                   session_id: int,
+                   stream_output: bool,
+                   image_url: str = None):
 
         stats = []
         client = APIClient(self.server_addr, api_key=self.api_key)
 
         for prompt, input_seqlen, output_seqlen in iter(
                 req_queue.get, [None, None, None]):
+            if image_url is not None:
+                prompt = [
+                    dict(role='user',
+                         content=[
+                             dict(type='text', text=prompt),
+                             dict(type='image_url',
+                                  image_url=dict(url=image_url))
+                         ])
+                ]
             timestamps = []
             timestamps.append(time.perf_counter())
             for output in client.chat_completions_v1(
@@ -123,7 +136,8 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
     def process_request(self,
                         requests,
                         concurrency: int = 1,
-                        stream_output: bool = False):
+                        stream_output: bool = False,
+                        img_hw: str = None):
         res_queue = Queue()
         req_queue = Queue()
         threads = []
@@ -138,10 +152,26 @@ def process_request(self,
 
         start = time.time()
 
+        if img_hw is not None:
+            import PIL
+
+            from lmdeploy.vl.utils import encode_image_base64
+            h, w = [int(s) for s in img_hw.split('x')]
+            data = np.random.randint(low=0,
+                                     high=255,
+                                     size=h * w * 3,
+                                     dtype=np.uint8)
+            data = data.reshape(h, w, 3)
+            img = PIL.Image.fromarray(data, 'RGB')
+            encoded = encode_image_base64(img)
+            image_url = f'data:image/jpeg;base64,{encoded}'
+        else:
+            image_url = None
         # start threads
         for i in range(concurrency):
             t = Thread(target=self._inference,
-                       args=(req_queue, res_queue, i, stream_output))
+                       args=(req_queue, res_queue, i, stream_output,
+                             image_url))
             t.start()
             threads.append(t)
 
@@ -223,7 +253,8 @@ def main(server_addr: str,
          temperature: float = 1.0,
          stream_output: bool = False,
          csv: str = './profile_api_server.csv',
-         seed: int = 0):
+         seed: int = 0,
+         img_hw: str = None):
     """Benchmark the request througput of api server.
 
     Args:
@@ -241,6 +272,8 @@ def main(server_addr: str,
         stream_output (bool, optional): Indicator for streaming output. Defaults to False.
         csv (str, optional): The path to save the result.
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
+        img_hw (str, optional): The image size to benchmark vl serving, such as '512x512'.
+            Default to None, which means to benchmark language model only.
     """    # noqa
     if not server_addr.startswith('http://'):
         print(f'[WARNING] server_addr of the api_server should '
@@ -259,7 +292,7 @@ def main(server_addr: str,
 
     requests = sample_requests(dataset, num_prompts, engine.tokenizer)
 
-    engine.process_request(requests, concurrency, stream_output)
+    engine.process_request(requests, concurrency, stream_output, img_hw)
 
 
 if __name__ == '__main__':