InternRobotics · kew6688 · Oct 11, 2025 · Oct 11, 2025
diff --git a/challenge/onsite_competition/README.md b/challenge/onsite_competition/README.md
@@ -5,6 +5,11 @@ In this phase, participants’ models will be deployed on **a real robot** to ev
 
 ---
 
+## Robot
+The robot uses a wheeled chassis. The chassis and camera control code can be found in the provided SDK. The camera’s default resolution is 640×480, and the depth and color images are already aligned.
+
+The robot is a agilex (RANGER MINI 3.0)[https://www.agilex.ai/solutions/1] with RGB camera and a LiDAR sensor.
+
 ## ⚙️ Installation
 
 First, install the `InternNav` package:

diff --git a/challenge/onsite_competition/captures/rs_depth_mm.png b/challenge/onsite_competition/captures/rs_depth_mm.png
diff --git a/challenge/onsite_competition/captures/rs_depth_vis.png b/challenge/onsite_competition/captures/rs_depth_vis.png
diff --git a/challenge/onsite_competition/sdk/main.py b/challenge/onsite_competition/sdk/main.py
@@ -3,6 +3,7 @@
 import sys
 
 from real_world_env import RealWorldEnv
+from stream import app, start_env
 
 from internnav.agent.utils.client import AgentClient
 from internnav.configs.evaluator.default_config import get_config
@@ -21,6 +22,7 @@ def parse_args():
         type=str,
         help='current instruction to follow',
     )
+    parser.add_argument("--tag", type=str, help="tag for the run, saved by the tag name which is team-task-trail")
     return parser.parse_args()
 
 
@@ -32,6 +34,7 @@ def load_eval_cfg(config_path, attr_name='eval_cfg'):
     return getattr(config_module, attr_name)
 
 
+# TODO add logging for each step, saved by the tag name which is team-task-trail
 def main():
     args = parse_args()
     print("--- Loading config from:", args.config, "---")
@@ -43,16 +46,34 @@ def main():
     agent = AgentClient(cfg.agent)
 
     # initialize real world env
-    env = RealWorldEnv(args.instruction)
+    env = RealWorldEnv()
 
-    while True:
-        # print("get observation...")
-        # obs contains {rgb, depth, instruction}
-        obs = env.get_observation()
+    # start stream
+    start_env(env)
+    app.run(host="0.0.0.0", port=8080, threaded=True)
 
-        # print("agent step...")
-        # action is a integer in [0, 3], agent return [{'action': [int], 'ideal_flag': bool}] (same to internvla_n1 agent)
-        action = agent.step(obs)[0]['action'][0]  # only take the first env's action integer
+    try:
+        while True:
+            # print("get observation...")
+            # obs contains {rgb, depth, instruction}
+            obs = env.get_observation()
+            obs["instruction"] = args.instruction
 
-        # print("env step...")
-        env.step(action)
+            # print("agent step...")
+            # action is a integer in [0, 3], agent return [{'action': [int], 'ideal_flag': bool}] (same to internvla_n1 agent)
+            try:
+                action = agent.step(obs)[0]['action'][0]
+                print(f"agent step success, action is {action}")
+            except Exception as e:
+                print(f"agent step error {e}")
+                continue
+
+            # print("env step...")
+            try:
+                env.step(action)
+                print("env step success")
+            except Exception as e:
+                print(f"env step error {e}")
+                continue
+    finally:
+        env.close()
diff --git a/challenge/onsite_competition/sdk/real_world_env.py b/challenge/onsite_competition/sdk/real_world_env.py
@@ -1,27 +1,56 @@
+import threading
+import time
+
 from cam import AlignedRealSense
 from control import DiscreteRobotController
 
 from internnav.env import Env
 
 
 class RealWorldEnv(Env):
-    def __init__(self):
+    def __init__(self, fps: int = 30):
         self.node = DiscreteRobotController()
         self.cam = AlignedRealSense()
+        self.latest_obs = None
+        self.lock = threading.Lock()
+        self.stop_flag = threading.Event()
+        self.fps = fps
+
+        # 启动相机
+        self.cam.start()
+        # 启动采集线程
+        self.thread = threading.Thread(target=self._capture_loop, daemon=True)
+        self.thread.start()
+
+    def _capture_loop(self):
+        """keep capturing frames"""
+        interval = 1.0 / self.fps
+        while not self.stop_flag.is_set():
+            t0 = time.time()
+            try:
+                obs = self.cam.get_observation(timeout_ms=1000)
+                with self.lock:
+                    self.latest_obs = obs
+            except Exception as e:
+                print("Camera capture failed:", e)
+                time.sleep(0.05)
+            dt = time.time() - t0
+            if dt < interval:
+                time.sleep(interval - dt)
 
     def get_observation(self):
-        frame = self.cam.get_observation()
-        return frame
-
-    def step(self, action):
-
-        '''
-        action (int): Discrete action to apply:
-                    - 0: no movement (stand still)
-                    - 1: move forward
-                    - 2: rotate left
-                    - 3: rotate right
-        '''
+        """return most recent frame"""
+        with self.lock:
+            return self.latest_obs
+
+    def step(self, action: int):
+        """
+        action:
+            0: stand still
+            1: move forward
+            2: turn left
+            3: turn right
+        """
         if action == 0:
             self.node.stand_still()
         elif action == 1:
@@ -30,3 +59,8 @@ def step(self, action):
             self.node.turn_left()
         elif action == 3:
             self.node.turn_right()
+
+    def close(self):
+        self.stop_flag.set()
+        self.thread.join(timeout=1.0)
+        self.cam.stop()
diff --git a/challenge/onsite_competition/sdk/stream.py b/challenge/onsite_competition/sdk/stream.py
@@ -0,0 +1,38 @@
+# stream_server.py
+import time
+
+import cv2
+from flask import Flask, Response
+
+app = Flask(__name__)
+
+# 由主程序注入
+_env = None
+
+
+def set_env(env):
+    """set env from main to stream server"""
+    global _env
+    _env = env
+
+
+def _mjpeg_generator(jpeg_quality: int = 80):
+    boundary = b"--frame"
+    while True:
+        if _env is None:
+            time.sleep(0.1)
+            continue
+        obs = _env.get_observation()
+        if obs is None:
+            time.sleep(0.01)
+            continue
+        frame_bgr = obs["rgb"]
+        ok, jpg = cv2.imencode(".jpg", frame_bgr, [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality])
+        if not ok:
+            continue
+        yield (boundary + b"\r\n" b"Content-Type: image/jpeg\r\n\r\n" + jpg.tobytes() + b"\r\n")
+
+
+@app.route("/stream")
+def stream():
+    return Response(_mjpeg_generator(), mimetype="multipart/x-mixed-replace; boundary=frame")