[python] Support non-gpu models for huggingface (deepjavalibrary#772)
frankfliu authored and KexinFeng committed Aug 16, 2023
1 parent b7fab1e commit 97594a4
Showing 2 changed files with 21 additions and 21 deletions.
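In short, the commit makes two kinds of changes: encode_decode.py now matches content types by substring rather than exact equality (so parameterized values such as "application/json; charset=utf-8" are still recognized), and huggingface.py stores the configured device id so that CUDA is only touched when a GPU device is actually selected.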
22 changes: 12 additions & 10 deletions engines/python/setup/djl_python/encode_decode.py
@@ -51,35 +51,37 @@ def encode_csv(content): # type: (str) -> np.array


 def decode(inputs: Input, content_type: str):
-    if not content_type or content_type == "application/json":
+    if not content_type or "application/json" in content_type:
         return inputs.get_as_json()
-    elif content_type == "text/csv":
+    elif "text/csv" in content_type:
         return decode_csv(inputs)
-    elif content_type == "text/plain":
+    elif "text/plain" in content_type:
         return {"inputs": [inputs.get_as_string()]}
     if content_type.startswith("image/"):
         return {"inputs": inputs.get_as_image()}
     elif content_type.startswith("audio/"):
         return {"inputs": inputs.get_as_bytes()}
-    elif content_type == "tensor/npz":
+    elif "tensor/npz" in content_type:
         return {"inputs": inputs.get_as_npz()}
     elif content_type in {"tensor/ndlist", "application/x-npy"}:
         return {"inputs": inputs.get_as_numpy()}
     elif content_type == "application/x-www-form-urlencoded":
         return {"inputs": inputs.get_as_string()}
     else:
         # "application/octet-stream"
         return {"inputs": inputs.get_as_bytes()}


 def encode(outputs: Output, prediction, content_type: str):
-    if content_type == "application/json":
+    if not content_type or "application/json" in content_type:
         outputs.add_as_json(prediction)
-        outputs.add_property("Content-Type", content_type)
-    elif content_type == "text/csv":
+        outputs.add_property("Content-Type", "application/json")
+    elif "text/csv" in content_type:
         outputs.add_as_string(encode_csv(prediction))
-        outputs.add_property("Content-Type", content_type)
-    elif content_type == "tensor/npz":
+        outputs.add_property("Content-Type", "text/csv")
+    elif "tensor/npz" in content_type:
         outputs.add_as_npz(prediction)
-        outputs.add_property("Content-Type", content_type)
+        outputs.add_property("Content-Type", "tensor/npz")
     else:
         outputs.add_as_numpy(prediction)
         outputs.add_property("Content-Type", "tensor/ndlist")
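
For context on the matching change above: real-world Content-Type headers often carry parameters, which exact equality rejects but substring matching accepts. A minimal illustration (the header value is an example, not taken from the commit):

```python
# A typical header sent by HTTP clients: base type plus a charset parameter.
content_type = "application/json; charset=utf-8"

# Old check: exact equality fails as soon as a parameter is attached.
print(content_type == "application/json")   # False
# New check: substring matching still recognizes the base type.
print("application/json" in content_type)   # True
```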
20 changes: 9 additions & 11 deletions engines/python/setup/djl_python/huggingface.py
@@ -11,9 +11,7 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 
-import json
 import logging
-import os
 
 import torch
 from transformers import pipeline, Conversation, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
@@ -64,6 +62,7 @@ def __init__(self):
         self.initialized = False
         self.enable_streaming = False
         self.model = None
+        self.device_id = -1
         self.tokenizer = None
 
     def initialize(self, properties: dict):
@@ -72,7 +71,7 @@ def initialize(self, properties: dict):
         # Otherwise we assume model artifacts are in the model_dir
         model_id_or_path = properties.get("model_id") or properties.get(
             "model_dir")
-        device_id = int(properties.get("device_id", "-1"))
+        self.device_id = int(properties.get("device_id", "-1"))
         task = properties.get("task")
         tp_degree = int(properties.get("tensor_parallel_degree", "-1"))
         self.enable_streaming = properties.get("enable_streaming",
@@ -83,7 +82,7 @@
if "device_map" in properties:
kwargs["device_map"] = properties.get("device_map")
logging.info(f"Using device map {kwargs['device_map']}")
elif tp_degree > 0:
elif tp_degree > 0 and torch.cuda.device_count() > 0:
kwargs["device_map"] = "auto"
world_size = torch.cuda.device_count()
assert world_size == tp_degree, f"TP degree ({tp_degree}) doesn't match available GPUs ({world_size})"
@@ -109,7 +108,7 @@

         self.hf_pipeline = self.get_pipeline(task=task,
                                              model_id_or_path=model_id_or_path,
-                                             device=device_id,
+                                             device=self.device_id,
                                              kwargs=kwargs)

         self.initialized = True
@@ -121,7 +120,7 @@ def inference(self, inputs):
         if not accept:
             accept = content_type if content_type.startswith(
                 "tensor/") else "application/json"
-        elif accept == "*/*":
+        elif "*/*" in accept:
             accept = "application/json"
 
         input_map = decode(inputs, content_type)
@@ -229,15 +228,14 @@ def wrapped_pipeline(inputs, *args, **kwargs):

         return wrapped_pipeline
 
-    @staticmethod
-    def wrap_text_generation_pipeline(hf_pipeline):
+    def wrap_text_generation_pipeline(self, hf_pipeline):
 
         def wrapped_pipeline(inputs, *args, **kwargs):
             model = hf_pipeline.model
             tokenizer = hf_pipeline.tokenizer
-            input_tokens = tokenizer(inputs, padding=True,
-                                     return_tensors="pt").to(
-                                         torch.cuda.current_device())
+            input_tokens = tokenizer(inputs, padding=True, return_tensors="pt")
+            if self.device_id >= 0:
+                input_tokens.to(torch.cuda.current_device())
             with torch.no_grad():
                 output_tokens = model.generate(
                     *args,
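The tokenizer change above is the core of the non-GPU support: inputs are tokenized on CPU and only moved to CUDA when a GPU device id was configured. A standalone sketch of that pattern (the model name and device selection are illustrative assumptions, not code from this commit):

```python
import torch
from transformers import AutoTokenizer

# Mirrors the commit's convention: device_id stays -1 on CPU-only hosts,
# so no torch.cuda call is ever made there.
device_id = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

input_tokens = tokenizer(["a prompt", "another prompt"],
                         padding=True, return_tensors="pt")
if device_id >= 0:
    # BatchEncoding.to returns the moved encoding, so keep the result.
    input_tokens = input_tokens.to(torch.cuda.current_device())
```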
