From dbcd8158fe5d31dd2a7f87a75d9b4d59806a096c Mon Sep 17 00:00:00 2001 From: Kevin Edey Date: Fri, 29 May 2026 14:06:07 -0600 Subject: [PATCH 1/2] Test harness: in-process screenshot + scroll control over dist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add screenshot/3, scroll_info/1, scroll_to/3 NIFs (debug-only, iOS #if !MOB_RELEASE) surfaced as Mob.Test.screenshot/2, scroll_info/2, scroll_to/4, screenshot_tour/3. A remotely-connected agent gets pixels and deterministic scroll entirely over Erlang distribution — no adb/xcrun/idb — which is what Sloppy Joe and WireTap need. - iOS: UIGraphicsImageRenderer + drawViewHierarchy for capture; UIScrollView.contentOffset for scroll. Scroll views are tagged with the node :id as accessibilityIdentifier; since SwiftUI doesn't reliably propagate that onto the backing UIScrollView, the NIF falls back to the largest scroll view. - Android: PixelCopy against the activity window; an id-keyed Compose scroll registry (ScrollState/LazyListState) in MobBridge. kind is :pixel (UIScrollView / verticalScroll) or :index (LazyColumn). - Target resolution (:top/:bottom/{:page,n}/{x,y}) and tour paging are pure, unit-tested Elixir helpers. Verified end-to-end on iOS sim, Android device, and a physical iPhone. The Android Kotlin side lives in the mob_new MobBridge.kt.eex template. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 3 + android/jni/mob_nif.zig | 117 +++++++++ android/jni/mob_zig.zig | 7 +- ...2026-05-29-bridge-nif-screenshot-scroll.md | 69 +++++ ios/MobRootView.swift | 4 + ios/mob_nif.m | 189 ++++++++++++++ lib/mob/test.ex | 244 ++++++++++++++++++ src/mob_nif.erl | 17 ++ test/mob/test_test.exs | 99 +++++++ 9 files changed, 748 insertions(+), 1 deletion(-) create mode 100644 decisions/2026-05-29-bridge-nif-screenshot-scroll.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 02d1c13..5a4c79a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ Full module documentation: [hexdocs.pm/mob](https://hexdocs.pm/mob). ## [Unreleased] +### Added +- **In-process screenshot + scroll control over dist (no adb/xcrun).** Three test-harness NIFs (`screenshot/3`, `scroll_info/1`, `scroll_to/3`) surfaced as `Mob.Test.screenshot/2`, `scroll_info/2`, `scroll_to/4`, and `screenshot_tour/3`. A remotely-connected agent gets pixels and deterministic scroll entirely over Erlang distribution — the capability Sloppy Joe and WireTap need to drive a device an agent can only reach over dist. Capture is in-process (iOS `UIGraphicsImageRenderer` + `drawViewHierarchy`; Android `PixelCopy` against the activity window). Scroll views are addressed by their `:id` prop; `scroll_info` reports `kind: :pixel` (iOS `UIScrollView`, Android `verticalScroll`) or `:index` (Android `LazyColumn`, where y is an item index and viewport is the visible-item count). Captures the app's own surface only — `FLAG_SECURE`/secure fields render blank, and a backgrounded app returns `{:error, :no_window}`. The Android Kotlin side (`screenshot`/`scrollInfo`/`scrollTo`) lives in the `mob_new` `MobBridge.kt.eex` template; existing apps pick it up on regeneration. Debug-only (iOS `#if !MOB_RELEASE`). See `decisions/2026-05-29-bridge-nif-screenshot-scroll.md`. + ### Changed - **`Mob.Bt` extracted to standalone `mob_bluetooth` plugin.** See `plugin_extraction_plan.md` Wave 1. 
Session A moved the Elixir wrappers (`Mob.Bt`, `Mob.Bt.Hfp`, `Mob.Bt.Hid`, `Mob.Bt.Spp`) out of core into a separate repo as `MobBluetooth.*`; the Zig NIF (`android/jni/mob_nif.zig`) and the iOS stubs (`ios/mob_nif.m`) stay here until Session B promotes the plugin to tier-1. Apps that used `Mob.Bt.*` should add `{:mob_bluetooth, path: "..."}` and rename their references to `MobBluetooth.*` — there is intentionally no compatibility shim. diff --git a/android/jni/mob_nif.zig b/android/jni/mob_nif.zig index 127efbb..ec75c5f 100644 --- a/android/jni/mob_nif.zig +++ b/android/jni/mob_nif.zig @@ -295,6 +295,9 @@ pub const BridgeMethods = extern struct { clear_text: jni.JMethodID = null, long_press_xy: jni.JMethodID = null, swipe_xy: jni.JMethodID = null, + screenshot: jni.JMethodID = null, + scroll_info: jni.JMethodID = null, + scroll_to: jni.JMethodID = null, // ── Mob.Peripheral.VendorUsb ───────────────────────────────────────── // Each takes a pid as jlong (so Kotlin can echo it back when calling // mob_deliver_vendor_usb_*) plus the operation's typed payload. @@ -590,6 +593,114 @@ export fn nif_screen_info( return erts.makeMap(env, &keys, &vvals) orelse erts.atom(env, "error"); } +// ── In-process screenshot + scroll control (agent driving over dist) ───────── +// +// Mirrors the iOS NIFs. These delegate to MobBridge (PixelCopy for capture, +// Compose scroll state for scroll) so a remotely-connected agent gets pixels + +// deterministic scroll with no adb/xcrun. Bridge methods are optional: apps +// generated before these existed return {:error, :not_loaded}. + +// Copy an id binary into a NUL-terminated buffer; returns the C string and an +// optional heap pointer the caller must free. Mirrors the nif_tap/type_text idiom. 
const IdBuf = struct {
    // NUL-terminated copy of the id, suitable for jni.newStringUTF.
    cstr: [*:0]const u8,
    // Non-null only when the copy had to be heap-allocated; the caller must
    // jni.free it once the C string is no longer needed.
    heap: ?*anyopaque,
};

fn idCString(bin: erts.ErlNifBinary, stack_buf: []u8) ?IdBuf {
    // One extra byte is needed for the terminating NUL.
    const use_heap = bin.size + 1 > stack_buf.len;
    const heap_buf: ?*anyopaque = if (use_heap) jni.malloc(bin.size + 1) else null;
    // Allocation failure: report it to the caller instead of writing through null.
    if (use_heap and heap_buf == null) return null;
    const buf_ptr: [*]u8 = if (use_heap) @ptrCast(heap_buf) else stack_buf.ptr;
    @memcpy(buf_ptr[0..bin.size], bin.data[0..bin.size]);
    buf_ptr[bin.size] = 0;
    return .{ .cstr = @ptrCast(buf_ptr), .heap = heap_buf };
}

// nif_screenshot/3 — capture the activity window; returns PNG/JPEG bytes.
//
// argv[0] format atom (at most 7 characters, otherwise badarg), argv[1] integer
// quality, argv[2] numeric scale. A non-integer quality leaves the default of
// 90 in place and a non-numeric scale falls back to 1.0.
//
// Returns the encoded image as a binary, or:
//   * notLoaded(env) when the bridge has no screenshot method cached,
//   * the bare atom `error` when no JNI environment could be obtained,
//   * errorAtom "no_window" when the bridge returned null,
//   * errorAtom "alloc_failed" when the result binary could not be allocated.
export fn nif_screenshot(
    env: ?*erts.ErlNifEnv,
    argc: c_int,
    argv: [*]const erts.ERL_NIF_TERM,
) callconv(.c) erts.ERL_NIF_TERM {
    _ = argc;
    if (Bridge.screenshot == null) return notLoaded(env);

    var fmt: [8]u8 = @splat(0);
    if (erts.enif_get_atom(env, argv[0], &fmt, fmt.len, erts.ERL_NIF_LATIN1) == 0)
        return erts.badarg(env);
    var quality: c_int = 90;
    // Best-effort: on failure `quality` keeps its default.
    _ = erts.enif_get_int(env, argv[1], &quality);
    const scale = erts.getNumber(env, argv[2]) orelse 1.0;
    const fmt_cstr: [*:0]const u8 = @ptrCast(&fmt);

    var attached: c_int = 0;
    const jenv = get_jenv(&attached) orelse return erts.atom(env, "error");
    defer detachIfAttached(attached);

    const jfmt = jni.newStringUTF(jenv, fmt_cstr);
    const jbytes = jenv.*.CallStaticObjectMethod.?(jenv, Bridge.cls, Bridge.screenshot, jfmt, @as(jni.JInt, @intCast(quality)), @as(f64, scale));
    jni.deleteLocalRef(jenv, jfmt);
    if (jbytes == null) return errorAtom(env, "no_window");

    const len = jni.getArrayLength(jenv, jbytes);
    var bin: erts.ErlNifBinary = undefined;
    // enif_alloc_binary reports failure through its return value; without this
    // check `bin.data` would be undefined when the copy below writes into it.
    if (erts.enif_alloc_binary(@intCast(len), &bin) == 0) {
        jni.deleteLocalRef(jenv, jbytes);
        return errorAtom(env, "alloc_failed");
    }
    if (len > 0) jni.getByteArrayRegion(jenv, jbytes, 0, len, @ptrCast(bin.data));
    jni.deleteLocalRef(jenv, jbytes);
    return erts.enif_make_binary(env, &bin);
}

// nif_scroll_info/1 — read a scroll view's offset/extent (JSON string by :id).
// Hands the id to the bridge's scrollInfo method and returns the resulting
// Java string as a binary. Error replies: notLoaded(env) when the method is
// not cached, badarg when argv[0] is not a binary, the bare atom `error` when
// the id copy or the JNI environment is unavailable, and errorAtom
// "scroll_view_not_found" when the bridge returns null.
export fn nif_scroll_info(
    env: ?*erts.ErlNifEnv,
    argc: c_int,
    argv: [*]const erts.ERL_NIF_TERM,
) callconv(.c) erts.ERL_NIF_TERM {
    _ = argc;

    if (Bridge.scroll_info == null) return notLoaded(env);

    var id_bin: erts.ErlNifBinary = undefined;
    const inspected = erts.enif_inspect_binary(env, argv[0], &id_bin);
    if (inspected == 0) return erts.badarg(env);

    // Ids that fit (with their NUL) use this scratch buffer; longer ones are
    // heap-allocated by idCString and released by the defer below.
    var scratch: [256]u8 = undefined;
    const id_c = idCString(id_bin, &scratch) orelse return erts.atom(env, "error");
    defer if (id_c.heap) |heap_ptr| jni.free(heap_ptr);

    var did_attach: c_int = 0;
    const jenv = get_jenv(&did_attach) orelse return erts.atom(env, "error");
    defer detachIfAttached(did_attach);

    const jid = jni.newStringUTF(jenv, id_c.cstr);
    const jjson = jenv.*.CallStaticObjectMethod.?(jenv, Bridge.cls, Bridge.scroll_info, jid);
    jni.deleteLocalRef(jenv, jid);

    if (jjson != null) return jstringToBin(env, jenv, jjson); // releases jjson
    return errorAtom(env, "scroll_view_not_found");
}

// nif_scroll_to/3 — scroll a view (by :id) to absolute (x, y).
// Passes the id and both coordinates (as f64) to the bridge's scrollTo method.
// Replies `ok` when the bridge returns true and errorAtom
// "scroll_view_not_found" otherwise; argument and setup failures mirror
// nif_scroll_info (notLoaded, badarg, or the bare atom `error`).
export fn nif_scroll_to(
    env: ?*erts.ErlNifEnv,
    argc: c_int,
    argv: [*]const erts.ERL_NIF_TERM,
) callconv(.c) erts.ERL_NIF_TERM {
    _ = argc;

    if (Bridge.scroll_to == null) return notLoaded(env);

    var id_bin: erts.ErlNifBinary = undefined;
    const inspected = erts.enif_inspect_binary(env, argv[0], &id_bin);
    if (inspected == 0) return erts.badarg(env);

    const target_x = erts.getNumber(env, argv[1]) orelse return erts.badarg(env);
    const target_y = erts.getNumber(env, argv[2]) orelse return erts.badarg(env);

    // Ids that fit (with their NUL) use this scratch buffer; longer ones are
    // heap-allocated by idCString and released by the defer below.
    var scratch: [256]u8 = undefined;
    const id_c = idCString(id_bin, &scratch) orelse return erts.atom(env, "error");
    defer if (id_c.heap) |heap_ptr| jni.free(heap_ptr);

    var did_attach: c_int = 0;
    const jenv = get_jenv(&did_attach) orelse return erts.atom(env, "error");
    defer detachIfAttached(did_attach);

    const jid = jni.newStringUTF(jenv, id_c.cstr);
    const scrolled = jenv.*.CallStaticBooleanMethod.?(jenv, Bridge.cls, Bridge.scroll_to, jid, @as(f64, target_x), @as(f64, target_y));
    jni.deleteLocalRef(jenv, jid);

    if (scrolled == 0) return errorAtom(env, "scroll_view_not_found");
    return erts.ok(env);
}

// nif_ax_action/2 + nif_ax_action_at_xy/3 — Android stubs.
//
// Both are iOS-only today.
Compose semantics walker (the proper Android @@ -4886,6 +4997,9 @@ fn nifLoad(env: ?*erts.ErlNifEnv, priv: *?*anyopaque, info: erts.ERL_NIF_TERM) c cacheOptional(jenv, "uiTree", "()Ljava/lang/String;", &Bridge.ui_tree); cacheOptional(jenv, "uiViewTree", "()Ljava/lang/String;", &Bridge.ui_view_tree); cacheOptional(jenv, "screenInfo", "()[F", &Bridge.screen_info); + cacheOptional(jenv, "screenshot", "(Ljava/lang/String;ID)[B", &Bridge.screenshot); + cacheOptional(jenv, "scrollInfo", "(Ljava/lang/String;)Ljava/lang/String;", &Bridge.scroll_info); + cacheOptional(jenv, "scrollTo", "(Ljava/lang/String;DD)Z", &Bridge.scroll_to); cacheOptional(jenv, "tapXy", "(FF)Z", &Bridge.tap_xy); cacheOptional(jenv, "tapByLabel", "(Ljava/lang/String;)Z", &Bridge.tap_by_label); cacheOptional(jenv, "typeText", "(Ljava/lang/String;)Z", &Bridge.type_text); @@ -4921,6 +5035,9 @@ const nif_funcs = [_]erts.ErlNifFunc{ .{ .name = "clear_text", .arity = 0, .fptr = nif_clear_text, .flags = 0 }, .{ .name = "long_press_xy", .arity = 3, .fptr = nif_long_press_xy, .flags = 0 }, .{ .name = "swipe_xy", .arity = 4, .fptr = nif_swipe_xy, .flags = 0 }, + .{ .name = "screenshot", .arity = 3, .fptr = nif_screenshot, .flags = erts.ERL_NIF_DIRTY_JOB_CPU_BOUND }, + .{ .name = "scroll_info", .arity = 1, .fptr = nif_scroll_info, .flags = 0 }, + .{ .name = "scroll_to", .arity = 3, .fptr = nif_scroll_to, .flags = 0 }, // Core mob functions. 
.{ .name = "platform", .arity = 0, .fptr = nif_platform, .flags = 0 }, .{ .name = "color_scheme", .arity = 0, .fptr = nif_color_scheme, .flags = 0 }, diff --git a/android/jni/mob_zig.zig b/android/jni/mob_zig.zig index 39de4c6..390f10c 100644 --- a/android/jni/mob_zig.zig +++ b/android/jni/mob_zig.zig @@ -511,7 +511,8 @@ pub const JNINativeInterface = extern struct { ReleaseFloatArrayElements: ?*anyopaque, ReleaseDoubleArrayElements: ?*anyopaque, GetBooleanArrayRegion: ?*anyopaque, - GetByteArrayRegion: ?*anyopaque, + // Typed (used by nif_screenshot to read a Kotlin byte[] into a binary). + GetByteArrayRegion: ?*const fn (env: *JNIEnv, arr: JByteArray, start: JInt, len: JInt, buf: [*]JByte) callconv(.c) void, GetCharArrayRegion: ?*anyopaque, GetShortArrayRegion: ?*anyopaque, GetIntArrayRegion: ?*anyopaque, @@ -618,6 +619,10 @@ pub inline fn getFloatArrayRegion(env: *JNIEnv, arr: JObject, start: JInt, len: env.*.GetFloatArrayRegion.?(env, arr, start, len, buf); } +pub inline fn getByteArrayRegion(env: *JNIEnv, arr: JByteArray, start: JInt, len: JInt, buf: [*]JByte) void { + env.*.GetByteArrayRegion.?(env, arr, start, len, buf); +} + pub inline fn newByteArray(env: *JNIEnv, len: JSize) JByteArray { return env.*.NewByteArray.?(env, len); } diff --git a/decisions/2026-05-29-bridge-nif-screenshot-scroll.md b/decisions/2026-05-29-bridge-nif-screenshot-scroll.md new file mode 100644 index 0000000..f9c8d5b --- /dev/null +++ b/decisions/2026-05-29-bridge-nif-screenshot-scroll.md @@ -0,0 +1,69 @@ +# In-process screenshot + scroll control via the bridge NIF + +- Date: 2026-05-29 +- Status: accepted + +## Context + +`Mob.Test` already drives Mob apps fully over Erlang distribution (state reads, +taps, navigation, synthetic touches) with no adb/xcrun. The one remaining hard +dependency on external device tooling was the *observe-visually* half of the +agent loop: `PLAN.md`'s Layer 5 (Visual) is "MCP, external" — screenshots came +only from `xcrun simctl io` / `adb screencap`. 
There was also no way over dist +to read a scroll view's offset/extent or command it to a position (only the +imprecise `swipe_xy` and iOS-AX-only `ax_action :scroll_*`). + +This blocks Sloppy Joe and WireTap, which must be programmable by a remote agent +that can only reach the device over dist. The agent needs eyes and deterministic +scroll through the bridge NIF itself. + +## Decision + +Add three test-harness NIFs, surfaced on `Mob.Test`: + +- `screenshot/3` (format, quality, scale) → PNG/JPEG bytes, returned over dist. +- `scroll_info/1` (id) → flat JSON `{offset,content,viewport,max,kind}`. +- `scroll_to/3` (id, x, y) → absolute offset (clamped by the Elixir wrapper). + +`Mob.Test` adds `screenshot/2`, `scroll_info/2`, `scroll_to/4`, and +`screenshot_tour/3` (page top→bottom, capture each). Target resolution +(`:top`/`:bottom`/`{:page,n}`/`{x,y}`) and the tour paging are pure, unit-tested +helpers; the NIF stays a dumb absolute setter. + +Scroll views are addressed by their `:id` prop: + +- **iOS**: the SwiftUI renderer applies `node.nativeViewId` as the scroll view's + `accessibilityIdentifier`; the NIF walks `UIScrollView`s and matches it. In + practice SwiftUI does **not** reliably propagate `.accessibilityIdentifier` onto + the backing `UIScrollView` (verified on-device 2026-05-29), so the NIF falls back + to the largest scroll view (the main content scroller) when an explicit id does + not match — correct for the common one-scroll-per-screen case. Pixel units. +- **Android**: the Compose renderer registers each `:scroll`/lazy-list state in an + id-keyed registry in `MobBridge` (with the measured viewport for `ScrollState`, + which doesn't expose it). `kind` is `"pixel"` for `verticalScroll`/`ScrollState` + and `"index"` for `LazyColumn`/`LazyListState` (y is an item index, viewport is + the visible-item count). The `kind` field makes the asymmetry explicit so paging + stays coherent in either unit. 
+ +Capture is in-process: iOS `UIGraphicsImageRenderer` + `drawViewHierarchy`; +Android `PixelCopy` against the activity window (decor-view `draw` fallback +pre-API-26). Both are debug-only harness code (iOS `#if !MOB_RELEASE`). + +This is core test-harness work (same bucket as `ui_tree`/`tap_xy`), not a +plugin-shaped feature, so it lands under the current plugin-first hold. + +## Consequences + +- A remote agent gets pixels + deterministic scroll with zero adb/xcrun — the + capability `wiretap_screenshot` will build on. +- Capture is the app's own surface only; `FLAG_SECURE` (Android) and secure text + fields (iOS) render blank, and a backgrounded app has no window (returns + `{:error, :no_window}` / not_found). +- Cross-repo: the Android side spans `mob` (Zig NIF) and the `mob_new` + `MobBridge.kt.eex` template; existing apps pick it up on regeneration or a + manual `MobBridge.kt` patch. +- `:scroll` (ScrollState) is not persisted across BEAM re-renders the way lists + are; the registry holds the live state, which is current during a scroll→shot + tour. Persisting it by id is a possible follow-up. +- The Compose-semantics walker for arbitrary (non-Mob) apps remains deferred to + WireTap (see `future_developments.md`); this change covers Mob-rendered apps. diff --git a/ios/MobRootView.swift b/ios/MobRootView.swift index aa300c1..c061f7f 100644 --- a/ios/MobRootView.swift +++ b/ios/MobRootView.swift @@ -343,6 +343,9 @@ struct MobNodeView: View { .scrollDismissesKeyboard(.interactively) .padding(node.paddingEdgeInsets) .background(node.backgroundColor.map { Color($0) } ?? Color.clear) + // Expose the node :id on the backing UIScrollView so the test + // harness (Mob.Test.scroll_info/scroll_to) can address it by id. + .ifLet(node.nativeViewId) { view, id in view.accessibilityIdentifier(id) } // ── Batch 5 Tier 1: scroll position observation ── // SwiftUI's onScrollGeometryChange is iOS 18+. 
On older iOS // there's no clean SwiftUI API for raw offset; UIKit-backed @@ -405,6 +408,7 @@ struct MobNodeView: View { .frame(maxHeight: .infinity) .padding(node.paddingEdgeInsets) .background(node.backgroundColor.map { Color($0) } ?? Color.clear) + .ifLet(node.nativeViewId) { view, id in view.accessibilityIdentifier(id) } case .progress: let trackColor = node.color.map { Color($0) } ?? Color.accentColor diff --git a/ios/mob_nif.m b/ios/mob_nif.m index b50b233..7082a10 100644 --- a/ios/mob_nif.m +++ b/ios/mob_nif.m @@ -5764,6 +5764,192 @@ static ERL_NIF_TERM nif_swipe_xy(ErlNifEnv *env, int argc, const ERL_NIF_TERM ar #endif } +// ── In-process screenshot + scroll control (agent driving over dist) ───────── +// +// screenshot/3, scroll_info/1, scroll_to/3 give a remotely-connected agent +// pixels and deterministic scroll without adb/xcrun. They use only public +// UIKit APIs (UIGraphicsImageRenderer, UIScrollView.contentOffset) but live in +// the debug-only harness block alongside the other driving NIFs. + +// Recursively collect every UIScrollView under `view` into `acc`. +static void mob_collect_scroll_views(UIView *view, NSMutableArray *acc) { + if ([view isKindOfClass:[UIScrollView class]]) + [acc addObject:(UIScrollView *)view]; + for (UIView *sub in view.subviews) + mob_collect_scroll_views(sub, acc); +} + +// Find the scroll view addressed by `identifier` (the node's :id, which the +// SwiftUI renderer applies as accessibilityIdentifier). If `identifier` is +// empty or matches no scroll view, fall back to the largest scroll view — the +// main content scroller. Returns nil only when no visible window contains a +// scroll view. Main-thread only. 
+static UIScrollView *mob_find_scroll_view(NSString *identifier) { + NSMutableArray *all = [NSMutableArray array]; + for (UIScene *scene in [UIApplication sharedApplication].connectedScenes) { + if (![scene isKindOfClass:[UIWindowScene class]]) + continue; + for (UIWindow *win in [(UIWindowScene *)scene windows]) { + if (!win.isHidden) + mob_collect_scroll_views(win, all); + } + } + if (all.count == 0) + return nil; + + if (identifier.length > 0) { + for (UIScrollView *sv in all) { + if ([sv.accessibilityIdentifier isEqualToString:identifier]) + return sv; + } + // SwiftUI does not reliably propagate `.accessibilityIdentifier` onto the + // backing UIScrollView, so an explicit id may not match even when set on + // the Mob node. Fall through to the largest scroll view (the main content + // scroller) rather than failing — correct for the common one-scroll screen. + } + + UIScrollView *best = nil; + CGFloat bestArea = -1.0; + for (UIScrollView *sv in all) { + CGFloat area = sv.bounds.size.width * sv.bounds.size.height; + if (area > bestArea) { + bestArea = area; + best = sv; + } + } + return best; +} + +static ERL_NIF_TERM nif_screenshot(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + char fmt[8] = {0}; + int quality = 90; + double scale = 1.0; + if (!enif_get_atom(env, argv[0], fmt, sizeof(fmt), ERL_NIF_LATIN1) || + !enif_get_int(env, argv[1], &quality) || !enif_get_double(env, argv[2], &scale)) + return enif_make_badarg(env); + + BOOL jpeg = (strcmp(fmt, "jpeg") == 0); + if (scale <= 0.0) + scale = 1.0; + + __block NSData *imageData = nil; + dispatch_sync(dispatch_get_main_queue(), ^{ + UIWindow *window = nil; + for (UIScene *scene in [UIApplication sharedApplication].connectedScenes) { + if (![scene isKindOfClass:[UIWindowScene class]]) + continue; + for (UIWindow *win in [(UIWindowScene *)scene windows]) { + if (win.isHidden) + continue; + if (win.isKeyWindow) { + window = win; + break; + } + if (!window) + window = win; // first visible window as 
fallback + } + if (window.isKeyWindow) + break; + } + if (!window) + return; + + // `scale` is a multiplier of the native screen scale: 1.0 = crisp native + // resolution, 0.5 = half (smaller payload over dist). + UIGraphicsImageRendererFormat *rf = [UIGraphicsImageRendererFormat preferredFormat]; + rf.scale = [UIScreen mainScreen].scale * (CGFloat)scale; + rf.opaque = YES; + UIGraphicsImageRenderer *renderer = + [[UIGraphicsImageRenderer alloc] initWithSize:window.bounds.size format:rf]; + UIImage *img = [renderer imageWithActions:^(UIGraphicsImageRendererContext *_Nonnull ctx) { + (void)ctx; + [window drawViewHierarchyInRect:window.bounds afterScreenUpdates:YES]; + }]; + imageData = jpeg ? UIImageJPEGRepresentation(img, (CGFloat)quality / 100.0) + : UIImagePNGRepresentation(img); + }); + + if (!imageData) + return enif_make_tuple2(env, enif_make_atom(env, "error"), + enif_make_atom(env, "no_window")); + + ErlNifBinary bin; + enif_alloc_binary(imageData.length, &bin); + memcpy(bin.data, imageData.bytes, imageData.length); + return enif_make_binary(env, &bin); +} + +static ERL_NIF_TERM nif_scroll_info(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + ErlNifBinary idb; + if (!enif_inspect_binary(env, argv[0], &idb)) + return enif_make_badarg(env); + NSString *identifier = [[NSString alloc] initWithBytes:idb.data + length:idb.size + encoding:NSUTF8StringEncoding] + ?: @""; + + __block NSData *jsonData = nil; + dispatch_sync(dispatch_get_main_queue(), ^{ + UIScrollView *sv = mob_find_scroll_view(identifier); + if (!sv) + return; + + // Normalize so offset 0 == content top, regardless of inset. 
+ UIEdgeInsets in = sv.adjustedContentInset; + CGFloat vw = sv.bounds.size.width - in.left - in.right; + CGFloat vh = sv.bounds.size.height - in.top - in.bottom; + CGFloat cw = sv.contentSize.width; + CGFloat ch = sv.contentSize.height; + NSDictionary *d = @{ + @"offset_x" : @(sv.contentOffset.x + in.left), + @"offset_y" : @(sv.contentOffset.y + in.top), + @"content_w" : @(cw), + @"content_h" : @(ch), + @"viewport_w" : @(vw), + @"viewport_h" : @(vh), + @"max_x" : @(MAX(0.0, cw - vw)), + @"max_y" : @(MAX(0.0, ch - vh)), + @"kind" : @"pixel" + }; + jsonData = [NSJSONSerialization dataWithJSONObject:d options:0 error:nil]; + }); + + if (!jsonData) + return enif_make_tuple2(env, enif_make_atom(env, "error"), + enif_make_atom(env, "scroll_view_not_found")); + + ErlNifBinary bin; + enif_alloc_binary(jsonData.length, &bin); + memcpy(bin.data, jsonData.bytes, jsonData.length); + return enif_make_binary(env, &bin); +} + +static ERL_NIF_TERM nif_scroll_to(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + ErlNifBinary idb; + double x, y; + if (!enif_inspect_binary(env, argv[0], &idb) || !enif_get_double(env, argv[1], &x) || + !enif_get_double(env, argv[2], &y)) + return enif_make_badarg(env); + NSString *identifier = [[NSString alloc] initWithBytes:idb.data + length:idb.size + encoding:NSUTF8StringEncoding] + ?: @""; + + __block BOOL ok = NO; + dispatch_sync(dispatch_get_main_queue(), ^{ + UIScrollView *sv = mob_find_scroll_view(identifier); + if (!sv) + return; + // Caller works in normalized coords (0 == top); convert to offset space. + UIEdgeInsets in = sv.adjustedContentInset; + [sv setContentOffset:CGPointMake((CGFloat)x - in.left, (CGFloat)y - in.top) animated:NO]; + ok = YES; + }); + + return ok ? 
enif_make_atom(env, "ok") + : enif_make_tuple2(env, enif_make_atom(env, "error"), + enif_make_atom(env, "scroll_view_not_found")); +} + #endif // !MOB_RELEASE — end of test harness block (started near line 2780) // ── Storage ─────────────────────────────────────────────────────────────────── @@ -6612,6 +6798,9 @@ static ERL_NIF_TERM nif_bt_hid_subscribe_raw(ErlNifEnv *env, int argc, const ERL {"clear_text", 0, nif_clear_text, 0}, {"long_press_xy", 3, nif_long_press_xy, 0}, {"swipe_xy", 4, nif_swipe_xy, 0}, + {"screenshot", 3, nif_screenshot, ERL_NIF_DIRTY_JOB_CPU_BOUND}, + {"scroll_info", 1, nif_scroll_info, 0}, + {"scroll_to", 3, nif_scroll_to, 0}, #endif // ── Core mob functions ─────────────────────────────────────────────────── {"background_keep_alive", 0, nif_background_keep_alive, 0}, diff --git a/lib/mob/test.ex b/lib/mob/test.ex index dea8587..afa4f6d 100644 --- a/lib/mob/test.ex +++ b/lib/mob/test.ex @@ -28,6 +28,12 @@ defmodule Mob.Test do # Lists Mob.Test.select(node, :my_list, 0) # select first row + # Visual capture + scroll (in-process, over dist — no adb/xcrun) + {:ok, png} = Mob.Test.screenshot(node) + Mob.Test.scroll_info(node, "feed") # offset/content/viewport + Mob.Test.scroll_to(node, "feed", :bottom) + Mob.Test.screenshot_tour(node, "feed") # page top→bottom, capture each + # Device API simulation Mob.Test.send_message(node, {:permission, :camera, :granted}) Mob.Test.send_message(node, {:camera, :photo, %{path: "/tmp/photo.jpg", width: 1920, height: 1080}}) @@ -847,6 +853,244 @@ defmodule Mob.Test do end end + # ── In-process visual capture + scroll control ─────────────────────────────── + # + # Remote-driving primitives: a connected agent gets pixels and deterministic + # scroll over Erlang distribution, with no adb / xcrun / idb. These call + # mob_nif directly via RPC. screenshot returns the raw image bytes, which + # cross the dist boundary fine (the same path camera frames already take). 
+ + @doc """ + Capture the running app's own window in-process and return the image bytes. + + Returns `{:ok, binary}` (PNG or JPEG) or `{:error, reason}`. The bytes come + back over Erlang distribution — no `adb screencap` / `xcrun simctl io`, so it + works against a remote device an agent can only reach over dist. + + Options: + + * `:format` — `:png` (default) or `:jpeg` + * `:quality` — `0..100`, JPEG only (default `90`) + * `:scale` — output scale factor (default `1.0`); `0.5` halves resolution + + Captures only the app's own surface, not system layers or other processes. + Secure text fields (iOS) and `FLAG_SECURE` windows (Android) render blank by + OS policy. A backgrounded app has no live window, so this fails when the app + is not foregrounded. + + {:ok, png} = Mob.Test.screenshot(node) + File.write!("/tmp/shot.png", png) + + {:ok, jpg} = Mob.Test.screenshot(node, format: :jpeg, quality: 60, scale: 0.5) + """ + @spec screenshot(node(), keyword()) :: {:ok, binary()} | {:error, term()} + def screenshot(node, opts \\ []) do + %{format: format, quality: quality, scale: scale} = normalize_screenshot_opts(opts) + + case :rpc.call(node, :mob_nif, :screenshot, [format, quality, scale]) do + bin when is_binary(bin) -> {:ok, bin} + {:error, _} = err -> err + other -> {:error, other} + end + end + + @doc false + # Pure: keyword opts -> the {format, quality, scale} args the NIF expects. 
+ @spec normalize_screenshot_opts(keyword()) :: + %{format: :png | :jpeg, quality: 0..100, scale: float()} + def normalize_screenshot_opts(opts) do + format = + case Keyword.get(opts, :format, :png) do + f when f in [:png, :jpeg] -> + f + + other -> + raise ArgumentError, + "screenshot format must be :png or :jpeg, got: #{Kernel.inspect(other)}" + end + + quality = opts |> Keyword.get(:quality, 90) |> clamp_int(0, 100) + scale = opts |> Keyword.get(:scale, 1.0) |> Kernel.*(1.0) + %{format: format, quality: quality, scale: scale} + end + + @doc """ + Read a scroll view's current offset and extent, addressed by its `:id` prop + (the same `:id` you set on a `type: :scroll` or `type: :list` node). + + Returns a map, or `{:error, reason}`: + + %{ + offset: {x, y}, # current scroll position + content: {w, h}, # full scrollable content size + viewport: {w, h}, # visible area + max_offset: {x, y}, # offset at the bottom/right edge + kind: :pixel | :index + } + + `:kind` is `:pixel` for pixel-precise scroll views (iOS `UIScrollView`, + Android `verticalScroll`). It is `:index` for item-indexed lists (Android + `LazyColumn`), where the y components count items, not pixels, and `viewport` + height is the number of visible items. `scroll_to/4` and `screenshot_tour/3` + work in whichever unit `:kind` reports, so paging stays coherent either way. + + Mob.Test.scroll_info(node, "feed") + #=> %{offset: {0.0, 0.0}, content: {393.0, 2400.0}, viewport: {393.0, 756.0}, + # max_offset: {0.0, 1644.0}, kind: :pixel} + """ + @spec scroll_info(node(), String.t() | atom()) :: map() | {:error, term()} + def scroll_info(node, id) do + case :rpc.call(node, :mob_nif, :scroll_info, [to_string(id)]) do + json when is_binary(json) -> decode_scroll_info(json) + {:error, _} = err -> err + other -> {:error, other} + end + end + + # The NIF returns a flat JSON object on both platforms (iOS builds it via + # NSJSONSerialization, Android via the Kotlin bridge). Decode to the + # tuple-shaped public map. 
+ defp decode_scroll_info(json) do + m = :json.decode(json) + + %{ + offset: {f(m["offset_x"]), f(m["offset_y"])}, + content: {f(m["content_w"]), f(m["content_h"])}, + viewport: {f(m["viewport_w"]), f(m["viewport_h"])}, + max_offset: {f(m["max_x"]), f(m["max_y"])}, + kind: if(m["kind"] == "index", do: :index, else: :pixel) + } + end + + defp f(n) when is_number(n), do: n * 1.0 + defp f(_), do: 0.0 + + @doc """ + Scroll a view (by `:id`) to a target position. Reads `scroll_info/2` first to + resolve and clamp the absolute offset, then drives the native scroll view. + + `target`: + + * `{x, y}` — absolute offset (pixels, or item index on an `:index` list) + * `:top` / `:bottom` — the extremes + * `{:page, n}` — `n` viewport-heights down from the top (works on both + `:pixel` and `:index` views) + + Returns `:ok` or `{:error, reason}`. + + Mob.Test.scroll_to(node, "feed", :bottom) + Mob.Test.scroll_to(node, "feed", {:page, 2}) + Mob.Test.scroll_to(node, "feed", {0.0, 500.0}) + """ + @spec scroll_to(node(), String.t() | atom(), tuple() | atom(), keyword()) :: + :ok | {:error, term()} + def scroll_to(node, id, target, _opts \\ []) do + with %{} = info <- scroll_info(node, id), + {x, y} <- resolve_scroll_target(target, info) do + raw_scroll_to(node, id, x, y) + end + end + + defp raw_scroll_to(node, id, x, y) do + case :rpc.call(node, :mob_nif, :scroll_to, [to_string(id), x * 1.0, y * 1.0]) do + :ok -> :ok + {:error, _} = err -> err + other -> {:error, other} + end + end + + @doc false + # Pure: turn a target (:top | :bottom | {:page, n} | {x, y}) into an absolute + # {x, y} offset clamped to the scroll view's extent. A "page" is one viewport + # height in whatever unit `:kind` uses (pixels or item count). 
+ @spec resolve_scroll_target(tuple() | atom(), map()) :: {float(), float()} + def resolve_scroll_target(target, %{max_offset: {mx, my}, viewport: {_vw, vh}} = info) do + {ox, _oy} = Map.get(info, :offset, {0.0, 0.0}) + + {x, y} = + case target do + :top -> {0.0, 0.0} + :bottom -> {mx, my} + {:page, n} when is_number(n) -> {ox, n * vh} + {x, y} when is_number(x) and is_number(y) -> {x, y} + end + + {clamp(x * 1.0, 0.0, mx), clamp(y * 1.0, 0.0, my)} + end + + @doc """ + Walk a scroll view top→bottom, capturing a screenshot at each page. Returns a + list of `{offset, image_binary}` pairs — the agent's "see the whole long + screen" path, entirely over dist. + + Options: + + * `:format` / `:quality` / `:scale` — passed through to `screenshot/2` + * `:overlap` — `0.0..0.9`, fraction of a viewport to overlap between pages + (default `0.0`) + * `:settle_ms` — pause after each scroll before capturing (default `150`) + + pages = Mob.Test.screenshot_tour(node, "feed", format: :jpeg, quality: 60) + for {{_x, y}, bin} <- pages, do: File.write!("/tmp/page_\#{trunc(y)}.jpg", bin) + """ + @spec screenshot_tour(node(), String.t() | atom(), keyword()) :: + [{{float(), float()}, binary()}] | {:error, term()} + def screenshot_tour(node, id, opts \\ []) do + settle_ms = Keyword.get(opts, :settle_ms, 150) + shot_opts = Keyword.take(opts, [:format, :quality, :scale]) + + with %{} = info <- scroll_info(node, id) do + info + |> tour_offsets(opts) + |> Enum.reduce_while([], fn {x, y} = off, acc -> + case raw_scroll_to(node, id, x, y) do + :ok -> + Process.sleep(settle_ms) + + case screenshot(node, shot_opts) do + {:ok, bin} -> {:cont, [{off, bin} | acc]} + {:error, _} = err -> {:halt, err} + end + + {:error, _} = err -> + {:halt, err} + end + end) + |> case do + {:error, _} = err -> err + list when is_list(list) -> Enum.reverse(list) + end + end + end + + @doc false + # Pure: the list of {x, y} offsets a top→bottom tour should visit. 
Steps by + # one viewport (minus `:overlap`) and always pins a final page to the bottom. + @spec tour_offsets(map(), keyword()) :: [{float(), float()}] + def tour_offsets(%{max_offset: {_mx, my}, viewport: {_vw, vh}} = info, opts) do + {ox, _oy} = Map.get(info, :offset, {0.0, 0.0}) + overlap = opts |> Keyword.get(:overlap, 0.0) |> clamp(0.0, 0.9) + step = max(vh * (1.0 - overlap), 1.0) + + my + |> tour_ys(step) + |> Enum.map(fn y -> {ox, y} end) + end + + defp tour_ys(my, _step) when my <= 0.0, do: [0.0] + + defp tour_ys(my, step) do + count = ceil(my / step) + + 0..count + |> Enum.map(fn i -> min(i * step * 1.0, my * 1.0) end) + |> Enum.uniq() + end + + defp clamp(v, lo, hi), do: v |> max(lo) |> min(hi) + defp clamp_int(v, lo, hi) when is_integer(v), do: v |> max(lo) |> min(hi) + defp clamp_int(v, lo, hi), do: v |> trunc() |> max(lo) |> min(hi) + # ── Native UI (requires MCP tools) ─────────────────────────────────────────── @doc """ diff --git a/src/mob_nif.erl b/src/mob_nif.erl index 35ec007..2ce5997 100644 --- a/src/mob_nif.erl +++ b/src/mob_nif.erl @@ -103,6 +103,12 @@ clear_text/0, long_press_xy/3, swipe_xy/4, + %% Test harness — in-process visual capture and scroll control + %% (remote-driving: agent gets pixels + deterministic scroll over dist, + %% no adb/xcrun). See Mob.Test.screenshot/2, scroll_info/2, scroll_to/3. + screenshot/3, + scroll_info/1, + scroll_to/3, %% Peripheral.VendorUsb (Android USB host; iOS returns :unsupported) vendor_usb_list_devices/1, vendor_usb_request_permission/1, @@ -201,6 +207,9 @@ clear_text/0, long_press_xy/3, swipe_xy/4, + screenshot/3, + scroll_info/1, + scroll_to/3, %% Storage storage_dir/1, storage_save_to_photo_library/1, @@ -328,6 +337,14 @@ key_press(_Key) -> erlang:nif_error(not_loaded). clear_text() -> erlang:nif_error(not_loaded). long_press_xy(_X, _Y, _Ms) -> erlang:nif_error(not_loaded). swipe_xy(_X1, _Y1, _X2, _Y2) -> erlang:nif_error(not_loaded). +%% In-process visual capture + scroll control (see Mob.Test). 
+%% screenshot(Format, Quality, Scale) -> Binary (PNG/JPEG bytes) | {error, Reason}
+%%   Format :: png | jpeg, Quality :: 0..100 (jpeg), Scale :: float
+%% scroll_info(Id) -> #{offset, content, viewport, max_offset, kind} | {error, Reason}
+%%   NOTE(review): that map looks like the shape *after* Mob.Test decodes the
+%%   reply — the Elixir side runs :json.decode/1 over flat keys (offset_x,
+%%   offset_y, content_w, ..., kind), so the NIF itself appears to return a
+%%   JSON binary. Confirm and state the wire format here.
+%% scroll_to(Id, X, Y) -> ok | {error, Reason}
+screenshot(_Format, _Quality, _Scale) -> erlang:nif_error(not_loaded).
+scroll_info(_Id) -> erlang:nif_error(not_loaded).
+scroll_to(_Id, _X, _Y) -> erlang:nif_error(not_loaded).
 storage_dir(_Location) -> erlang:nif_error(not_loaded).
 storage_save_to_photo_library(_Path) -> erlang:nif_error(not_loaded).
 storage_save_to_media_store(_Path, _Type) -> erlang:nif_error(not_loaded).
diff --git a/test/mob/test_test.exs b/test/mob/test_test.exs
index 3018e78..7d8c720 100644
--- a/test/mob/test_test.exs
+++ b/test/mob/test_test.exs
@@ -158,4 +158,103 @@ defmodule Mob.TestTest do
       assert length(sample_tree().children) > 0
     end
   end
+
+  # ── screenshot + scroll pure helpers ──────────────────────────────────────
+
+  describe "normalize_screenshot_opts/1" do
+    test "defaults to png, quality 90, scale 1.0" do
+      assert %{format: :png, quality: 90, scale: 1.0} = M.normalize_screenshot_opts([])
+    end
+
+    test "passes jpeg through and clamps quality to 0..100" do
+      assert %{format: :jpeg, quality: 60} =
+               M.normalize_screenshot_opts(format: :jpeg, quality: 60)
+
+      assert %{quality: 100} = M.normalize_screenshot_opts(quality: 250)
+      assert %{quality: 0} = M.normalize_screenshot_opts(quality: -5)
+    end
+
+    test "floatifies an integer scale" do
+      assert %{scale: 2.0} = M.normalize_screenshot_opts(scale: 2)
+    end
+
+    test "raises on an unsupported format" do
+      assert_raise ArgumentError, ~r/:png or :jpeg/, fn ->
+        M.normalize_screenshot_opts(format: :gif)
+      end
+    end
+  end
+
+  describe "resolve_scroll_target/2" do
+    defp pixel_info do
+      %{
+        offset: {0.0, 200.0},
+        content: {393.0, 2400.0},
+        viewport: {393.0, 756.0},
+        max_offset: {0.0, 1644.0},
+        kind: :pixel
+      }
+    end
+
+    test ":top and :bottom resolve 
to the extremes" do + assert M.resolve_scroll_target(:top, pixel_info()) == {0.0, 0.0} + assert M.resolve_scroll_target(:bottom, pixel_info()) == {0.0, 1644.0} + end + + test "{:page, n} steps n viewport-heights from the top, keeping x" do + # 1 page = one viewport height (756) + assert M.resolve_scroll_target({:page, 1}, pixel_info()) == {0.0, 756.0} + # 3 pages would be 2268 but clamps to max_offset y (1644) + assert M.resolve_scroll_target({:page, 3}, pixel_info()) == {0.0, 1644.0} + end + + test "absolute {x, y} is clamped to the extent" do + assert M.resolve_scroll_target({0.0, 500.0}, pixel_info()) == {0.0, 500.0} + assert M.resolve_scroll_target({0.0, 9999.0}, pixel_info()) == {0.0, 1644.0} + assert M.resolve_scroll_target({0.0, -10.0}, pixel_info()) == {0.0, 0.0} + end + + test "works in item units for an :index list (page = visible item count)" do + index_info = %{ + offset: {0.0, 0.0}, + content: {0.0, 100.0}, + viewport: {0.0, 8.0}, + max_offset: {0.0, 92.0}, + kind: :index + } + + # one page = 8 items + assert M.resolve_scroll_target({:page, 1}, index_info) == {0.0, 8.0} + assert M.resolve_scroll_target(:bottom, index_info) == {0.0, 92.0} + end + end + + describe "tour_offsets/2" do + test "pages from 0 to max_offset by viewport height, pinning a final bottom page" do + offsets = M.tour_offsets(pixel_info(), []) + ys = Enum.map(offsets, fn {_x, y} -> y end) + + assert List.first(ys) == 0.0 + assert List.last(ys) == 1644.0 + # 1644 / 756 -> ceil 3 steps: 0, 756, 1512, 1644 + assert ys == [0.0, 756.0, 1512.0, 1644.0] + end + + test "overlap shrinks the step" do + ys = M.tour_offsets(pixel_info(), overlap: 0.5) |> Enum.map(fn {_x, y} -> y end) + # step = 756 * 0.5 = 378 + assert Enum.at(ys, 1) == 378.0 + assert List.last(ys) == 1644.0 + end + + test "keeps the current x offset across pages" do + info = %{pixel_info() | offset: {40.0, 0.0}} + assert Enum.all?(M.tour_offsets(info, []), fn {x, _y} -> x == 40.0 end) + end + + test "a non-scrollable view yields 
a single page at the top" do + info = %{pixel_info() | max_offset: {0.0, 0.0}} + assert M.tour_offsets(info, []) == [{0.0, 0.0}] + end + end end From 0bd54a26a5a75ed5a767d365de3ebdf6facae03d Mon Sep 17 00:00:00 2001 From: Kevin Edey Date: Fri, 29 May 2026 14:27:13 -0600 Subject: [PATCH 2/2] =?UTF-8?q?Test=20harness:=20element=20frames=20?= =?UTF-8?q?=E2=80=94=20positions=20by=20id,=20no=20screenshot?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add element_frames/0 NIF surfaced as Mob.Test.element_frames/1 (%{id => {x,y,w,h}}), frame/2, and tap_id/2. Any rendered node given an :id reports its live on-screen frame (logical points iOS / dp Android) to a registry the agent reads over dist — a compact structured map instead of image bytes, with no accessibility activation. Lets an agent locate and drive elements by id without screenshotting (which blows out session memory). - iOS: a MobFrameTracker ViewModifier on every node records frame(in: .global) via a GeometryReader background and sets accessibilityIdentifier when the node carries an :id; frames go to a C registry (mob_register_frame / g_element_frames in mob_nif.m), cleared on set_root. nif_element_frames is debug-only; the registry itself uses only public APIs. - Android: RenderNodeInner attaches Modifier.onGloballyPositioned + testTag for id'd nodes → elementFramesById in MobBridge (px→dp). (Kotlin side in the mob_new MobBridge.kt.eex template.) Opt-in per :id — untagged nodes get no tracking modifier, so zero cost. Verified on iOS sim, Android device, and a physical iPhone (41 frames each). 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 1 + android/jni/mob_nif.zig | 20 ++++++++++ ios/MobDemo-Bridging-Header.h | 6 +++ ios/MobRootView.swift | 30 +++++++++++++++ ios/mob_nif.m | 62 +++++++++++++++++++++++++++++++ lib/mob/test.ex | 69 +++++++++++++++++++++++++++++++++++ src/mob_nif.erl | 6 +++ 7 files changed, 194 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a4c79a..4676397 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Full module documentation: [hexdocs.pm/mob](https://hexdocs.pm/mob). ## [Unreleased] ### Added +- **Element positions without a screenshot.** `element_frames/0` NIF surfaced as `Mob.Test.element_frames/1` (`%{id => {x,y,w,h}}`), `frame/2`, and `tap_id/2` (drive by id at real coordinates). Any rendered node given an `:id` reports its live on-screen frame (logical points iOS / dp Android) to a registry the agent reads over dist — a compact structured map instead of image bytes, with no accessibility activation. The renderer also sets the `:id` as the element's accessibility identifier (iOS `accessibilityIdentifier`, Android Compose `testTag`), so the same tags are visible to XCUITest/Espresso. Opt-in per element: untagged nodes cost nothing (the tracking modifier only attaches when an `:id` is present). iOS records the full element frame via a `GeometryReader` background; Android via `Modifier.onGloballyPositioned`. Verified on iOS sim, Android device, and a physical iPhone. The Android Kotlin side lives in the `mob_new` `MobBridge.kt.eex` template. - **In-process screenshot + scroll control over dist (no adb/xcrun).** Three test-harness NIFs (`screenshot/3`, `scroll_info/1`, `scroll_to/3`) surfaced as `Mob.Test.screenshot/2`, `scroll_info/2`, `scroll_to/4`, and `screenshot_tour/3`. A remotely-connected agent gets pixels and deterministic scroll entirely over Erlang distribution — the capability Sloppy Joe and WireTap need to drive a device an agent can only reach over dist. 
Capture is in-process (iOS `UIGraphicsImageRenderer` + `drawViewHierarchy`; Android `PixelCopy` against the activity window). Scroll views are addressed by their `:id` prop; `scroll_info` reports `kind: :pixel` (iOS `UIScrollView`, Android `verticalScroll`) or `:index` (Android `LazyColumn`, where y is an item index and viewport is the visible-item count). Captures the app's own surface only — `FLAG_SECURE`/secure fields render blank, and a backgrounded app returns `{:error, :no_window}`. The Android Kotlin side (`screenshot`/`scrollInfo`/`scrollTo`) lives in the `mob_new` `MobBridge.kt.eex` template; existing apps pick it up on regeneration. Debug-only (iOS `#if !MOB_RELEASE`). See `decisions/2026-05-29-bridge-nif-screenshot-scroll.md`. ### Changed diff --git a/android/jni/mob_nif.zig b/android/jni/mob_nif.zig index ec75c5f..5f7cefb 100644 --- a/android/jni/mob_nif.zig +++ b/android/jni/mob_nif.zig @@ -298,6 +298,7 @@ pub const BridgeMethods = extern struct { screenshot: jni.JMethodID = null, scroll_info: jni.JMethodID = null, scroll_to: jni.JMethodID = null, + element_frames: jni.JMethodID = null, // ── Mob.Peripheral.VendorUsb ───────────────────────────────────────── // Each takes a pid as jlong (so Kotlin can echo it back when calling // mob_deliver_vendor_usb_*) plus the operation's typed payload. @@ -701,6 +702,23 @@ export fn nif_scroll_to( return if (ok != 0) erts.ok(env) else errorAtom(env, "scroll_view_not_found"); } +// nif_element_frames/0 — JSON {id:[x,y,w,h],...} of tagged element frames (dp). 
+export fn nif_element_frames(
+    env: ?*erts.ErlNifEnv,
+    argc: c_int,
+    argv: [*]const erts.ERL_NIF_TERM,
+) callconv(.c) erts.ERL_NIF_TERM {
+    // Arity-0 NIF: the argument vector is never read.
+    _ = argc;
+    _ = argv;
+
+    // The Kotlin bridge method is optional; report not-loaded when it is absent.
+    if (Bridge.element_frames == null) return notLoaded(env);
+
+    var did_attach: c_int = 0;
+    const jenv = get_jenv(&did_attach) orelse return erts.atom(env, "error");
+    // Runs after the return value below has been computed, i.e. in the same
+    // position as an explicit detach just before `return`.
+    defer detachIfAttached(did_attach);
+
+    const frames_jstr = jenv.*.CallStaticObjectMethod.?(jenv, Bridge.cls, Bridge.element_frames);
+    return jstringToBin(env, jenv, frames_jstr);
+}
+
 // nif_ax_action/2 + nif_ax_action_at_xy/3 — Android stubs.
 //
 // Both are iOS-only today. Compose semantics walker (the proper Android
@@ -5000,6 +5018,7 @@ fn nifLoad(env: ?*erts.ErlNifEnv, priv: *?*anyopaque, info: erts.ERL_NIF_TERM) c
     cacheOptional(jenv, "screenshot", "(Ljava/lang/String;ID)[B", &Bridge.screenshot);
     cacheOptional(jenv, "scrollInfo", "(Ljava/lang/String;)Ljava/lang/String;", &Bridge.scroll_info);
     cacheOptional(jenv, "scrollTo", "(Ljava/lang/String;DD)Z", &Bridge.scroll_to);
+    cacheOptional(jenv, "elementFrames", "()Ljava/lang/String;", &Bridge.element_frames);
     cacheOptional(jenv, "tapXy", "(FF)Z", &Bridge.tap_xy);
     cacheOptional(jenv, "tapByLabel", "(Ljava/lang/String;)Z", &Bridge.tap_by_label);
     cacheOptional(jenv, "typeText", "(Ljava/lang/String;)Z", &Bridge.type_text);
@@ -5038,6 +5057,7 @@ const nif_funcs = [_]erts.ErlNifFunc{
     .{ .name = "screenshot", .arity = 3, .fptr = nif_screenshot, .flags = erts.ERL_NIF_DIRTY_JOB_CPU_BOUND },
     .{ .name = "scroll_info", .arity = 1, .fptr = nif_scroll_info, .flags = 0 },
     .{ .name = "scroll_to", .arity = 3, .fptr = nif_scroll_to, .flags = 0 },
+    .{ .name = "element_frames", .arity = 0, .fptr = nif_element_frames, .flags = erts.ERL_NIF_DIRTY_JOB_CPU_BOUND },
     // Core mob functions. 
.{ .name = "platform", .arity = 0, .fptr = nif_platform, .flags = 0 }, .{ .name = "color_scheme", .arity = 0, .fptr = nif_color_scheme, .flags = 0 }, diff --git a/ios/MobDemo-Bridging-Header.h b/ios/MobDemo-Bridging-Header.h index 67bd6df..08f00f5 100644 --- a/ios/MobDemo-Bridging-Header.h +++ b/ios/MobDemo-Bridging-Header.h @@ -21,3 +21,9 @@ void mob_send_component_event(int handle, const char *event, const char *payload // the OS appearance toggles (light/dark). Dispatches to Mob.Device subscribers. // `scheme` is "light" or "dark". void mob_notify_color_scheme(const char *scheme); + +// Called from MobFrameTracker (SwiftUI) as a tagged element lays out, recording +// its on-screen frame (logical points) keyed by the element's :id. Read back via +// the element_frames NIF so an agent can locate/drive elements without a +// screenshot. Implemented in mob_nif.m. +void mob_register_frame(const char *id, double x, double y, double w, double h); diff --git a/ios/MobRootView.swift b/ios/MobRootView.swift index c061f7f..b9bdfbc 100644 --- a/ios/MobRootView.swift +++ b/ios/MobRootView.swift @@ -474,6 +474,36 @@ struct MobNodeView: View { // (0, 0) which is a no-op. Used by SquareTriangle's hexagonal // snowflake to position rings absolutely within a center-aligned box. .offset(x: CGFloat(node.offsetX), y: CGFloat(node.offsetY)) + // Record on-screen frame + set accessibilityIdentifier for any node + // carrying an :id, so the agent can read positions via the + // element_frames NIF without a screenshot. + .modifier(MobFrameTracker(node: node)) + } +} + +// MobFrameTracker — for any node with an :id, set it as the accessibility +// identifier and report the element's global frame (logical points) to the C +// registry as it lays out / moves. Untagged nodes pass through untouched, so +// there's no cost unless a dev opts an element in by giving it an :id. 
+private struct MobFrameTracker: ViewModifier {
+    // The rendered node; only its `nativeViewId` is read here.
+    let node: MobNode
+
+    func body(content: Content) -> some View {
+        if let id = node.nativeViewId {
+            content
+                .accessibilityIdentifier(id)
+                .background(
+                    // A clear background sized to the content, used only to
+                    // observe the element's frame in global coordinates.
+                    GeometryReader { geo in
+                        // Reports the frame once up front (`initial: true`) and
+                        // again whenever the observed frame value changes.
+                        // NOTE(review): the two-parameter closure form of
+                        // `onChange(of:initial:_:)` is an iOS 17+ API — confirm
+                        // the deployment target.
+                        // NOTE(review): nif_set_root clears the whole registry on
+                        // every new render tree. If a node keeps its SwiftUI
+                        // identity and its frame does not change, this callback
+                        // presumably does not fire again — confirm frames are
+                        // re-registered after a re-render that leaves layout
+                        // untouched.
+                        Color.clear.onChange(of: geo.frame(in: .global), initial: true) { _, frame in
+                            mob_register_frame(
+                                id, Double(frame.minX), Double(frame.minY),
+                                Double(frame.width), Double(frame.height))
+                        }
+                    }
+                )
+        } else {
+            // No :id — pass the content through without any tracking.
+            content
+        }
+    }
+}
 }
diff --git a/ios/mob_nif.m b/ios/mob_nif.m
index 7082a10..0f139ef 100644
--- a/ios/mob_nif.m
+++ b/ios/mob_nif.m
@@ -1796,12 +1796,19 @@ static ERL_NIF_TERM nif_set_theme(ErlNifEnv *env, int argc, const ERL_NIF_TERM a
     return enif_make_atom(env, "ok");
 }
 
+static NSMutableDictionary *mob_frame_registry(void); // both defined with the
+static void mob_clear_frames(void);                   // element frame registry below
+
 static ERL_NIF_TERM nif_set_root(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     ErlNifBinary bin;
     if (!enif_inspect_binary(env, argv[0], &bin) &&
         !enif_inspect_iolist_as_binary(env, argv[0], &bin))
         return enif_make_badarg(env);
 
+    // New render tree — drop stale element frames; MobFrameTracker repopulates
+    // on the next layout pass.
+    mob_clear_frames();
+
     NSData *data = [NSData dataWithBytes:bin.data length:bin.size];
     NSError *err = nil;
     id json = [NSJSONSerialization JSONObjectWithData:data options:0 error:&err];
@@ -5950,6 +5957,24 @@ static ERL_NIF_TERM nif_scroll_to(ErlNifEnv *env, int argc, const ERL_NIF_TERM a
                             enif_make_atom(env, "scroll_view_not_found"));
 }
 
+// nif_element_frames/0 — JSON {"id":[x,y,w,h],...} of tagged element frames
+// (logical points). Recorded by MobFrameTracker; see mob_register_frame. 
+static ERL_NIF_TERM nif_element_frames(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+    NSMutableDictionary *reg = mob_frame_registry();
+    NSData *jsonData = nil;
+    @synchronized(reg) {
+        // dataWithJSONObject: raises an exception (it does not return nil) when
+        // the object graph is not JSON-encodable, e.g. a NaN/Inf number. Validate
+        // first so a bad entry becomes {error, encode_failed} instead of
+        // unwinding through the NIF.
+        if ([NSJSONSerialization isValidJSONObject:reg])
+            jsonData = [NSJSONSerialization dataWithJSONObject:reg options:0 error:nil];
+    }
+    if (!jsonData)
+        return enif_make_tuple2(env, enif_make_atom(env, "error"),
+                                enif_make_atom(env, "encode_failed"));
+
+    ErlNifBinary bin;
+    // enif_alloc_binary returns false on allocation failure; bin is unusable then.
+    if (!enif_alloc_binary(jsonData.length, &bin))
+        return enif_make_tuple2(env, enif_make_atom(env, "error"),
+                                enif_make_atom(env, "alloc_failed"));
+    memcpy(bin.data, jsonData.bytes, jsonData.length);
+    return enif_make_binary(env, &bin);
+}
+
 #endif // !MOB_RELEASE — end of test harness block (started near line 2780)
 
 // ── Storage ───────────────────────────────────────────────────────────────────
@@ -6512,6 +6537,42 @@ void mob_send_component_event(int handle, const char *event, const char *payload
     enif_free_env(env);
 }
 
+// ── Element frame registry (positions without a screenshot) ──────────────────
+//
+// mob_register_frame is called from MobFrameTracker (SwiftUI) on the main thread
+// as a tagged element lays out; the element_frames NIF reads it from a NIF
+// thread. Both use only public APIs, so this is compiled unconditionally (the
+// reading NIF is still debug-gated). @synchronized guards the shared dictionary.
+//
+// Maps element :id -> @[x, y, w, h] (NSNumber doubles).
+static NSMutableDictionary<NSString *, NSArray<NSNumber *> *> *g_element_frames = nil;
+static dispatch_once_t g_element_frames_once;
+
+// Lazily create the registry exactly once and return it.
+static NSMutableDictionary *mob_frame_registry(void) {
+    dispatch_once(&g_element_frames_once, ^{
+        g_element_frames = [NSMutableDictionary dictionary];
+    });
+    return g_element_frames;
+}
+
+// Record (or overwrite) the frame for `id`. Calls with a NULL or non-UTF-8 id,
+// or with any non-finite component, are ignored.
+void mob_register_frame(const char *id, double x, double y, double w, double h) {
+    if (!id)
+        return;
+    // JSON has no NaN/Infinity, and NSJSONSerialization refuses to encode them;
+    // keep such values out of the registry so element_frames can always encode.
+    if (!isfinite(x) || !isfinite(y) || !isfinite(w) || !isfinite(h))
+        return;
+    NSString *key = [NSString stringWithUTF8String:id];
+    if (!key)
+        return;
+    NSMutableDictionary *reg = mob_frame_registry();
+    @synchronized(reg) {
+        reg[key] = @[ @(x), @(y), @(w), @(h) ];
+    }
+}
+
+// Drop stale frames when the render tree changes (called from nif_set_root). 
+static void mob_clear_frames(void) {
+    // Empty the shared registry under the same lock its writers and the
+    // element_frames reader take.
+    NSMutableDictionary *registry = mob_frame_registry();
+    @synchronized(registry) {
+        [registry removeAllObjects];
+    }
+}
+
 // ── Mob.Peripheral.VendorUsb (iOS stubs) ──────────────────────────────────────
 //
 // iOS exposes no public USB-host API equivalent to Android's UsbManager.
@@ -6801,6 +6862,7 @@ static ERL_NIF_TERM nif_bt_hid_subscribe_raw(ErlNifEnv *env, int argc, const ERL
     {"screenshot", 3, nif_screenshot, ERL_NIF_DIRTY_JOB_CPU_BOUND},
     {"scroll_info", 1, nif_scroll_info, 0},
     {"scroll_to", 3, nif_scroll_to, 0},
+    {"element_frames", 0, nif_element_frames, ERL_NIF_DIRTY_JOB_CPU_BOUND},
 #endif
     // ── Core mob functions ───────────────────────────────────────────────────
     {"background_keep_alive", 0, nif_background_keep_alive, 0},
diff --git a/lib/mob/test.ex b/lib/mob/test.ex
index afa4f6d..5b20157 100644
--- a/lib/mob/test.ex
+++ b/lib/mob/test.ex
@@ -34,6 +34,11 @@ defmodule Mob.Test do
       Mob.Test.scroll_to(node, "feed", :bottom)
       Mob.Test.screenshot_tour(node, "feed") # page top→bottom, capture each
 
+      # Element positions without a screenshot (elements need an :id)
+      Mob.Test.element_frames(node) # %{id => {x, y, w, h}}
+      Mob.Test.frame(node, "save") # {x, y, w, h}
+      Mob.Test.tap_id(node, "save") # drive by id at real coords
+
       # Device API simulation
       Mob.Test.send_message(node, {:permission, :camera, :granted})
       Mob.Test.send_message(node, {:camera, :photo, %{path: "/tmp/photo.jpg", width: 1920, height: 1080}})
@@ -1091,6 +1096,70 @@ defmodule Mob.Test do
   defp clamp_int(v, lo, hi) when is_integer(v), do: v |> max(lo) |> min(hi)
   defp clamp_int(v, lo, hi), do: v |> trunc() |> max(lo) |> min(hi)
 
+  # ── Element frames (positions without a screenshot) ─────────────────────────
+
+  @doc """
+  Return the on-screen frame of every rendered element that carries an `:id`,
+  as `%{id => {x, y, w, h}}` in logical units (points on iOS, dp on Android). 
+
+  This is the screenshot-free way for an agent to know *where* things are: give
+  the elements you want to inspect or drive an `:id`, and their live positions
+  come back as a small structured map — no image bytes, no accessibility
+  activation. The renderer also sets the `:id` as the element's accessibility
+  identifier, so the same tags are visible to external tools (XCUITest, etc.).
+
+  Pairs with `tap_id/2` to drive by id at real coordinates.
+
+  Returns `{:error, reason}` when the call fails or the reply cannot be decoded
+  (`{:error, {:bad_frames, detail}}`). Entries that are not a 4-number list are
+  left out of the map.
+
+      Mob.Test.element_frames(node)
+      #=> %{"save" => {24.0, 720.0, 327.0, 48.0}, "row_3" => {0.0, 300.0, 393.0, 56.0}}
+  """
+  @spec element_frames(node()) ::
+          %{optional(String.t()) => {float(), float(), float(), float()}} | {:error, term()}
+  def element_frames(node) do
+    case :rpc.call(node, :mob_nif, :element_frames, []) do
+      json when is_binary(json) -> decode_frames(json)
+      {:error, _} = err -> err
+      other -> {:error, other}
+    end
+  end
+
+  # Decode the JSON object `{"id": [x, y, w, h], ...}` into `%{id => {x, y, w, h}}`.
+  # Never raises: invalid JSON or a non-object payload becomes
+  # `{:error, {:bad_frames, detail}}`, and entries whose value is not a
+  # 4-element list are skipped by the generator pattern.
+  defp decode_frames(json) do
+    case :json.decode(json) do
+      %{} = frames ->
+        for {id, [x, y, w, h]} <- frames, into: %{}, do: {id, {f(x), f(y), f(w), f(h)}}
+
+      other ->
+        {:error, {:bad_frames, other}}
+    end
+  rescue
+    # :json.decode/1 signals malformed input with an Erlang error.
+    e in ErlangError -> {:error, {:bad_frames, e.original}}
+  end
+
+  @doc """
+  Frame `{x, y, w, h}` of the element with `id`, or `nil` if it has no tracked
+  position. See `element_frames/1`.
+
+      Mob.Test.frame(node, "save") #=> {24.0, 720.0, 327.0, 48.0}
+  """
+  @spec frame(node(), String.t() | atom()) ::
+          {float(), float(), float(), float()} | nil | {:error, term()}
+  def frame(node, id) do
+    case element_frames(node) do
+      # Registry keys are strings, so an atom id is stringified before lookup.
+      %{} = frames -> frames[to_string(id)]
+      {:error, _} = err -> err
+    end
+  end
+
+  @doc """
+  Tap the element with `id` at the center of its tracked frame — driving by id
+  without a screenshot or coordinate guess. The element must carry an `:id`
+  (see `element_frames/1`). 
+
+      Mob.Test.tap_id(node, "save")
+  """
+  @spec tap_id(node(), String.t() | atom()) :: :ok | {:error, term()}
+  def tap_id(node, id) do
+    case frame(node, id) do
+      # No tracked frame for this id.
+      nil ->
+        {:error, :not_found}
+
+      # Lookup itself failed — pass the error through unchanged.
+      {:error, _} = failure ->
+        failure
+
+      # Tap the midpoint of the element's frame.
+      {left, top, width, height} ->
+        tap_xy(node, left + width / 2, top + height / 2)
+    end
+  end
+
   # ── Native UI (requires MCP tools) ───────────────────────────────────────────
 
   @doc """
diff --git a/src/mob_nif.erl b/src/mob_nif.erl
index 2ce5997..82aa7dd 100644
--- a/src/mob_nif.erl
+++ b/src/mob_nif.erl
@@ -109,6 +109,7 @@
     screenshot/3,
     scroll_info/1,
     scroll_to/3,
+    element_frames/0,
     %% Peripheral.VendorUsb (Android USB host; iOS returns :unsupported)
     vendor_usb_list_devices/1,
     vendor_usb_request_permission/1,
@@ -210,6 +211,7 @@
     screenshot/3,
     scroll_info/1,
     scroll_to/3,
+    element_frames/0,
     %% Storage
     storage_dir/1,
     storage_save_to_photo_library/1,
@@ -345,6 +347,10 @@ swipe_xy(_X1, _Y1, _X2, _Y2) -> erlang:nif_error(not_loaded).
 screenshot(_Format, _Quality, _Scale) -> erlang:nif_error(not_loaded).
 scroll_info(_Id) -> erlang:nif_error(not_loaded).
 scroll_to(_Id, _X, _Y) -> erlang:nif_error(not_loaded).
+%% element_frames() -> JSON binary {"id":[x,y,w,h],...} of on-screen frames for
+%% every rendered node that carries an :id (logical points iOS / dp Android).
+%% Lets an agent locate + drive elements by id without a screenshot.
+element_frames() -> erlang:nif_error(not_loaded).
 storage_dir(_Location) -> erlang:nif_error(not_loaded).
 storage_save_to_photo_library(_Path) -> erlang:nif_error(not_loaded).
 storage_save_to_media_store(_Path, _Type) -> erlang:nif_error(not_loaded).