From 9a6f20bee9dd5ec4916b056e507ba15df01a1c5d Mon Sep 17 00:00:00 2001 From: Frank Liu Date: Fri, 2 Jun 2023 14:36:23 -0700 Subject: [PATCH] [python] Fixes device id mismatch issue for multiple GPU case (#800) --- engines/python/setup/djl_python/huggingface.py | 8 +++----- .../src/main/java/ai/djl/python/engine/Connection.java | 5 ++++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/engines/python/setup/djl_python/huggingface.py b/engines/python/setup/djl_python/huggingface.py index 138f02d37..d8628b46c 100644 --- a/engines/python/setup/djl_python/huggingface.py +++ b/engines/python/setup/djl_python/huggingface.py @@ -119,7 +119,6 @@ def initialize(self, properties: dict): self.hf_pipeline = self.get_pipeline(task=task, model_id_or_path=model_id_or_path, - device=self.device_id, kwargs=kwargs) self.initialized = True @@ -158,8 +157,7 @@ def inference(self, inputs): return outputs - def get_pipeline(self, task: str, device: int, model_id_or_path: str, - kwargs): + def get_pipeline(self, task: str, model_id_or_path: str, kwargs): # define tokenizer or feature extractor as kwargs to load it the pipeline correctly if task in { "automatic-speech-recognition", @@ -186,7 +184,7 @@ def get_pipeline(self, task: str, device: int, model_id_or_path: str, else: hf_pipeline = pipeline(task=task, model=model_id_or_path, - device=device, + device=self.device_id, **kwargs) else: tokenizer = AutoTokenizer.from_pretrained(model_id_or_path) @@ -250,7 +248,7 @@ def wrapped_pipeline(inputs, *args, **kwargs): tokenizer = hf_pipeline.tokenizer input_tokens = tokenizer(inputs, padding=True, return_tensors="pt") if self.device_id >= 0: - input_tokens.to(torch.cuda.current_device()) + input_tokens.to(f"cuda:{self.device_id}") with torch.no_grad(): output_tokens = model.generate( *args, diff --git a/engines/python/src/main/java/ai/djl/python/engine/Connection.java b/engines/python/src/main/java/ai/djl/python/engine/Connection.java index c63fd1497..0f17e3e0e 100644 --- 
a/engines/python/src/main/java/ai/djl/python/engine/Connection.java +++ b/engines/python/src/main/java/ai/djl/python/engine/Connection.java @@ -151,12 +151,15 @@ static String[] getPythonStartCmd(PyEnv pyEnv, Model model, int workerId, int po // TP settings Device device = model.getNDManager().getDevice(); + String deviceId = String.valueOf(device.getDeviceId()); if (tensorParallelDegree > 0 && device.isGpu()) { + deviceId = "0"; String cudaDevices = getVisibleDevices(device.getDeviceId(), tensorParallelDegree); pyEnv.addEnv("CUDA_VISIBLE_DEVICES", cudaDevices); logger.info("Set CUDA_VISIBLE_DEVICES={}", cudaDevices); } if ("nc".equals(device.getDeviceType())) { + deviceId = "0"; String visibleCores; if (tensorParallelDegree > 0) { visibleCores = getNeuronVisibleCores(device.getDeviceId(), tensorParallelDegree); @@ -179,7 +182,7 @@ static String[] getPythonStartCmd(PyEnv pyEnv, Model model, int workerId, int po args[8] = "--entry-point"; args[9] = pyEnv.getEntryPoint(); args[10] = "--device-id"; - args[11] = String.valueOf(device.getDeviceId()); + args[11] = deviceId; return args; }