3 changes: 2 additions & 1 deletion .github/actionlint.yaml
@@ -1,4 +1,5 @@
 self-hosted-runner:
   # Labels of self-hosted runner in array of strings.
   labels:
-    - default
+    - default
+    - arc-runner-ucm
4 changes: 2 additions & 2 deletions .github/workflows/e2e_test.yml
@@ -4,7 +4,7 @@ on:

 jobs:
   offline-inference:
-    runs-on: [self-hosted,default] # corresponds to the LABELS above
+    runs-on: arc-runner-ucm
     steps:
       - run: pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
       - uses: actions/checkout@v4
@@ -14,7 +14,7 @@ jobs:
           docker run --rm \
             --gpus all \
             -v ${{ github.workspace }}:/workspace/unified-cache-management \
-            -v /home/models/Qwen2.5-1.5B-Instruct:/home/models/Qwen2.5-1.5B-Instruct \
+            -v /home_116/models/Qwen2.5-1.5B-Instruct:/home/models/Qwen2.5-1.5B-Instruct \
             -w /workspace/unified-cache-management \
             --entrypoint /bin/bash \
             vllm/vllm-openai:v0.9.2 \
2 changes: 1 addition & 1 deletion examples/offline_inference.py
@@ -48,7 +48,7 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
     llm_args = EngineArgs(
         model=model,
         kv_transfer_config=ktc,
-        max_model_len=32768,
+        max_model_len=5000,
         gpu_memory_utilization=0.8,
         max_num_batched_tokens=30000,
         block_size=128,
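
For context, the lowered max_model_len=5000 is consumed through vLLM's EngineArgs when the offline-inference example builds its engine; since max_model_len caps the context length the engine reserves KV-cache space for, lowering it reduces the memory needed at engine start. Below is a minimal, self-contained sketch of how such a configuration is typically constructed. The KVTransferConfig fields and connector name are illustrative assumptions, not values taken from this PR.

# Minimal sketch (assumed usage, not the PR's exact code): build a vLLM
# engine with the lowered max_model_len from this change.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# Assumed connector settings; the real example builds its config elsewhere.
ktc = KVTransferConfig(
    kv_connector="UnifiedCacheConnectorV1",  # hypothetical connector name
    kv_role="kv_both",
)

llm = LLM(
    model="/home/models/Qwen2.5-1.5B-Instruct",  # path mounted in the e2e container
    kv_transfer_config=ktc,
    max_model_len=5000,              # lowered from 32768 in this change
    gpu_memory_utilization=0.8,
    max_num_batched_tokens=30000,
    block_size=128,
)

outputs = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)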