Author: Layla Ashry Platform: Meta Quest (Android / Horizon OS)
This project implements a dual-camera capture and streaming pipeline on Meta Quest devices using the Meta Passthrough Camera API and Android’s Camera2 / MediaCodec stack. The application captures video from two passthrough cameras (camera IDs 50 and 51), displays live previews using Jetpack Compose, and streams encoded video frames together with structured metadata to an external host over TCP.
The system is composed of four primary layers:
-
UI & Lifecycle Layer (MainActivity / Compose UI)
- Handles Android lifecycle events and permission management
- Hosts Jetpack Compose UI for dual camera previews and controls
- Owns `TextureView` surfaces used by the camera pipeline
-
Camera & State Management Layer (ViewModel)
- Discovers available passthrough cameras
- Manages camera configuration, brightness, intrinsics, and extrinsics
- Coordinates camera start/stop behavior based on UI and lifecycle state
-
Encoding & Frame Preparation Layer
- Uses `MediaCodec` to encode camera frames
- Extracts encoded buffers and per-frame metadata
- Prepares frames for network transmission
-
Networking & Streaming Layer (Streamer)
- Maintains a TCP socket connection to a remote host
- Runs a dedicated background thread for network I/O
- Packages encoded video frames with JSON-serialized metadata
- Ensures backpressure via a bounded frame queue
- Entry point of the application
- Manages runtime permissions (Android camera + Meta/HZOS permissions)
- Coordinates surface readiness and camera startup
- Bridges Compose UI callbacks with ViewModel logic
- Ensures proper cleanup during lifecycle transitions
- Central state holder for camera configuration and streaming status
- Exposes observable UI state for Compose
- Starts and stops individual or dual camera streams
- Owns camera discovery results and parameter data
-
Low-level networking utility responsible for streaming data off-device
-
Uses TCP sockets for deterministic, ordered delivery
-
Sends each frame as:
- Metadata size (Int)
- Video frame size (Int)
- JSON metadata payload
- Encoded video bytes
This design allows external consumers (e.g., Python, C++, ROS nodes) to reconstruct frames and metadata reliably.
Each video frame is accompanied by structured metadata (e.g., timestamps, camera parameters). Metadata is serialized using kotlinx.serialization to JSON, enabling:
- Language-agnostic decoding
- Easy logging and debugging
- Direct integration with robotics and vision pipelines
- Requires the standard Android `CAMERA` permission
- Requires Meta/Horizon OS passthrough camera permissions
- Designed specifically for Meta Quest devices supporting the Passthrough Camera API
This project was developed with the support of the following tools and documentation:
-
Meta Passthrough Camera API Samples https://github.com/meta-quest/Meta-Passthrough-Camera-API-Samples
-
Meta Spatial SDK – Passthrough Camera API Overview https://developers.meta.com/horizon/documentation/spatial-sdk/spatial-sdk-pca-overview
-
Gemini — used for ideation, debugging assistance, documentation, and architectural reasoning
-
NVIDIA Documentation Referenced for MediaCodec usage patterns, video encoding concepts, and performance considerations
# File: src/quest_ros_streamer/quest_ros_streamer/tcp_server_node.py
from importlib import metadata
import rclpy
from rclpy.node import Node
import socket
import struct
import json
from sensor_msgs.msg import CompressedImage, CameraInfo
from geometry_msgs.msg import TransformStamped
from builtin_interfaces.msg import Time
import tf2_ros
import traceback
import threading
from rclpy.time import Time as RclpyTime
# Import the QoS classes
from rclpy.qos import QoSProfile, ReliabilityPolicy, HistoryPolicy, DurabilityPolicy
class QuestStreamReceiver(Node):
    """Receives an H.264 video stream plus per-frame JSON metadata from a Meta
    Quest headset over TCP and republishes it into ROS 2.

    Wire format per frame (integers big-endian, matching the `>II` header):
        uint32 metadata_size | uint32 frame_size | JSON metadata | encoded frame

    Publishes:
        /left/image_raw/compressed/from_tcp  (sensor_msgs/CompressedImage, h264)
        /left/camera_info                    (sensor_msgs/CameraInfo)
        TF transform: world -> quest_left_camera
    """

    def __init__(self):
        super().__init__('quest_stream_receiver')
        # Bind address/port are ROS parameters so the node can be launched
        # on any interface without code changes.
        self.declare_parameter('listen_ip', '0.0.0.0')
        self.declare_parameter('listen_port', 8888)
        self.ip = self.get_parameter('listen_ip').get_parameter_value().string_value
        self.port = self.get_parameter('listen_port').get_parameter_value().integer_value
        # Depth-1 best-effort QoS: for live video we always prefer the newest
        # frame over guaranteed delivery of stale ones.
        video_qos_profile = QoSProfile(
            reliability=ReliabilityPolicy.BEST_EFFORT,
            history=HistoryPolicy.KEEP_LAST,
            depth=1,
            durability=DurabilityPolicy.VOLATILE
        )
        self.image_pub = self.create_publisher(
            CompressedImage,
            '/left/image_raw/compressed/from_tcp',
            video_qos_profile)
        self.cam_info_pub = self.create_publisher(
            CameraInfo,
            '/left/camera_info',
            video_qos_profile)
        self.tf_broadcaster = tf2_ros.TransformBroadcaster(self)
        self.get_logger().info(f"TCP Server Node initialized. Listening on {self.ip}:{self.port}")

    def run_server(self):
        """Accept loop: serve one headset connection at a time until rclpy
        shuts down.

        Intended to run on a background thread (see main()). Errors are
        logged and the loop returns to accept() so a headset reconnect
        always succeeds.
        """
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            # SO_REUSEADDR lets the node restart without waiting out TIME_WAIT.
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            s.bind((self.ip, self.port))
            s.listen()
            while rclpy.ok():
                self.get_logger().info("Waiting for a connection from the Quest headset...")
                try:
                    conn, addr = s.accept()
                    with conn:
                        self.get_logger().info(f"Connection established with {addr}")
                        self.handle_connection(conn)
                except Exception as e:
                    self.get_logger().error(f"Server loop error: {e}\n{traceback.format_exc()}")
                self.get_logger().warn("Client disconnected. Waiting for new connection.")

    def handle_connection(self, conn):
        """Read framed messages off `conn` and publish them until EOF or error.

        Per-frame diagnostics are logged at DEBUG level to keep the hot loop
        quiet in normal operation.
        """
        self.get_logger().debug("handle_connection entered")
        while rclpy.ok():
            try:
                header_data = self._read_exact(conn, 8)
                if header_data is None:
                    self.get_logger().warn("Connection closed while reading header")
                    break
                metadata_size, frame_size = struct.unpack('>II', header_data)
                self.get_logger().debug(
                    f"Parsed header: metadata_size={metadata_size}, frame_size={frame_size}"
                )
                metadata_bytes = self._read_exact(conn, metadata_size)
                frame_bytes = self._read_exact(conn, frame_size)
                # Explicit None checks: a zero-length (but valid) payload must
                # not be mistaken for a closed connection, as the old
                # truthiness test did.
                if metadata_bytes is None or frame_bytes is None:
                    break
                # Named `meta` to avoid shadowing the module-level
                # importlib.metadata import.
                meta = json.loads(metadata_bytes.decode('utf-8'))
                # NOTE(review): the headset's capture timestamp is available in
                # the metadata but messages are stamped with local receive
                # time; consider converting meta['timestamp'] for tighter sync.
                self.get_logger().debug(f"Quest timestamp: {meta.get('timestamp')}")
                ros_time = self.get_clock().now()
                self.publish_image(ros_time, frame_bytes)
                self.publish_camera_info(ros_time, meta['intrinsics'])
                self.publish_transform(ros_time, meta['extrinsics'])
            except (ConnectionResetError, BrokenPipeError) as e:
                self.get_logger().warn(f"Connection lost: {e}")
                break
            except Exception as e:
                self.get_logger().error(f"Error handling connection: {e}\n{traceback.format_exc()}")
                break

    def _read_exact(self, conn, length):
        """Read exactly `length` bytes from `conn`.

        Returns the bytes on success, or None if the peer closes the
        connection before all bytes arrive.
        """
        data = bytearray()
        while len(data) < length:
            packet = conn.recv(length - len(data))
            if not packet:
                return None
            data.extend(packet)
        return bytes(data)

    def publish_image(self, timestamp, frame_data):
        """Publish one encoded frame as a CompressedImage stamped with `timestamp`."""
        msg = CompressedImage()
        msg.header.stamp = timestamp.to_msg()
        msg.header.frame_id = 'quest_left_camera'
        msg.format = "h264"
        # Assign raw bytes directly; converting to a Python list of ints
        # (the old code) boxed every byte into its own object per frame.
        msg.data = bytes(frame_data)
        self.image_pub.publish(msg)

    def publish_camera_info(self, timestamp, intrinsics):
        """Publish a CameraInfo built from the headset-supplied intrinsics dict."""
        msg = CameraInfo()
        msg.header.stamp = timestamp.to_msg()
        msg.header.frame_id = 'quest_left_camera'
        # NOTE(review): resolution is hard-coded; confirm it matches the
        # headset's configured capture size (or derive it from metadata).
        msg.width, msg.height = 1280, 720
        msg.k = [
            float(intrinsics.get('focalLengthX', 0.0)), 0.0, float(intrinsics.get('principalPointX', 0.0)),
            0.0, float(intrinsics.get('focalLengthY', 0.0)), float(intrinsics.get('principalPointY', 0.0)),
            0.0, 0.0, 1.0
        ]
        distortion = intrinsics.get('distortionCoefficients', [])
        if distortion:
            msg.d = [float(c) for c in distortion]
            msg.distortion_model = "plumb_bob"
        else:
            msg.distortion_model = ""
        # BUG FIX: the rectification matrix must be identity for a monocular
        # camera; the old `[1.0] * 9` filled the whole 3x3 with ones, which
        # is not a valid rotation matrix.
        msg.r = [1.0, 0.0, 0.0,
                 0.0, 1.0, 0.0,
                 0.0, 0.0, 1.0]
        msg.p = [msg.k[0], 0.0, msg.k[2], 0.0, 0.0, msg.k[4], msg.k[5], 0.0, 0.0, 0.0, 1.0, 0.0]
        self.cam_info_pub.publish(msg)

    def publish_transform(self, timestamp, extrinsics):
        """Broadcast the world -> quest_left_camera transform from extrinsics."""
        t = TransformStamped()
        t.header.stamp = timestamp.to_msg()
        t.header.frame_id = 'world'
        t.child_frame_id = 'quest_left_camera'
        translation = extrinsics.get('translationVector', [0.0] * 3)
        # Quaternion order assumed [x, y, z, w] — TODO confirm against the
        # headset's serializer.
        rotation = extrinsics.get('rotationQuaternion', [0.0, 0.0, 0.0, 1.0])
        t.transform.translation.x = float(translation[0])
        t.transform.translation.y = float(translation[1])
        t.transform.translation.z = float(translation[2])
        t.transform.rotation.x = float(rotation[0])
        t.transform.rotation.y = float(rotation[1])
        t.transform.rotation.z = float(rotation[2])
        t.transform.rotation.w = float(rotation[3])
        self.tf_broadcaster.sendTransform(t)
def main(args=None):
    """Entry point: spin the receiver node while its TCP server runs on a
    daemon thread (the thread dies automatically with the process)."""
    rclpy.init(args=args)
    receiver = QuestStreamReceiver()
    threading.Thread(target=receiver.run_server, daemon=True).start()
    try:
        rclpy.spin(receiver)
    except KeyboardInterrupt:
        pass
    finally:
        receiver.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()