In [None]:
# Accounts for perspective warp: depending on the current lane, the pixel look-ahead search ray is offset from vertical by N1-N4 degrees.

In [1]:
#!/usr/bin/env python3
# Ultra-fast, batched pipeline with threaded I/O + per-frame timing & counts
# Renders ALL masks + triangles coloured by the mask sampled N px along a ray
# + Scout lines (render only)
# + Starburst lines from each triangle tip to Jake's lane point
# + Lane-bearing selector for Jake's triangle
# + Lane-aware angled sampling for mask lookup with per-ray degree labels

import os, glob, sys, time, math
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config
# =======================
# Filesystem layout: weights, input frames and overlay output are all resolved under the user's home.
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"

# SAVE HERE (the directory is created as a side effect of running this cell)
out_dir    = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "out_overlays6"
out_dir.mkdir(parents=True, exist_ok=True)

# RAIL_ID is the class id whose masks are treated as rails (LABELS[9] == "RAILS").
# IMG_SIZE / CONF / IOU / MAX_DET are forwarded to model.predict as imgsz / conf / iou / max_det.
RAIL_ID    = 9
IMG_SIZE   = 512
CONF, IOU  = 0.30, 0.45
MAX_DET    = 30

# Color/region filter
# A rail pixel counts as a colour hit when its squared BGR distance to any target is <= TOLERANCE**2;
# connected hit regions are kept only if area >= MIN_REGION_SIZE and bbox height >= MIN_REGION_HEIGHT.
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]
TOLERANCE          = 20.0
MIN_REGION_SIZE    = 30
MIN_REGION_HEIGHT  = 150

# Heat/triangle
# HEAT_BLUR_KSIZE: side of the square box-blur kernel; RED_SCORE_THRESH: minimum 0-255 score for a "dark" pixel.
# EXCLUDE_*_FRAC: fraction of frame rows zeroed at the top/bottom before blob search.
# MIN_DARK_RED_AREA / MIN_DARK_FRACTION: absolute and relative (share of all dark pixels) blob-area floors.
HEAT_BLUR_KSIZE     = 51
RED_SCORE_THRESH    = 220
EXCLUDE_TOP_FRAC    = 0.40
EXCLUDE_BOTTOM_FRAC = 0.15
MIN_DARK_RED_AREA   = 1200
MIN_DARK_FRACTION   = 0.15
TRI_SIZE_PX         = 18

# Sampling ray length (pixels in frame space)
SAMPLE_UP_PX        = 65

# ===== BEND PARAMETERS (degrees from vertical)
# The sampler offsets each tip by (sin(theta), -cos(theta)) * SAMPLE_UP_PX in image coordinates,
# so a NEGATIVE angle leans LEFT (smaller x) and a POSITIVE angle leans RIGHT (larger x).
# NOTE: this is the opposite sign to the bearings in LANE_TARGET_DEG below.
# If JAKE_POINT = LANE_LEFT, triangles to the RIGHT bend LEFT by N1
BEND_LEFT_STATE_RIGHT_DEG  = -20.0  # N1
# If JAKE_POINT = LANE_MID, triangles RIGHT of JakeTri bend LEFT by N2
BEND_MID_STATE_RIGHT_DEG   = -20.0  # N2
# If JAKE_POINT = LANE_MID, triangles LEFT of JakeTri bend RIGHT by N3
BEND_MID_STATE_LEFT_DEG    = +20.0  # N3
# If JAKE_POINT = LANE_RIGHT, triangles to the LEFT bend RIGHT by N4
BEND_RIGHT_STATE_LEFT_DEG  = +20.0  # N4

# Colours (BGR)
COLOR_GREEN  = (0, 255, 0)
COLOR_PINK   = (203, 192, 255)
COLOR_YELLOW = (0, 255, 255)
COLOR_RED    = (0, 0, 255)
COLOR_WHITE  = (255, 255, 255)
COLOR_CYAN   = (255, 255, 0)
COLOR_BLACK  = (0, 0, 0)

# Runtime
BATCH               = 1      # frames per model.predict call
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)  # worker threads used for image reads
SHOW_FIRST_N        = None   # None → all frames
RENDER_FIRST_N      = 300    # render overlays for first N frames only

# =======================
# Jake lane points
# =======================
# Fixed (x, y) frame coordinates standing in for Jake's position in each lane.
LANE_LEFT   = (300, 1340)
LANE_MID    = (490, 1340)
LANE_RIGHT  = (680, 1340)
JAKE_POINT  = LANE_MID  # pick one: LANE_LEFT / LANE_MID / LANE_RIGHT

# Lane target bearings (degrees from vertical; left=positive, right=negative)
# Compared against signed_degrees_from_vertical(tip - JAKE_POINT) to pick Jake's own triangle.
LANE_TARGET_DEG = {
    "left":  -10.7,
    "mid":   +1.5,
    "right": +15.0,
}

def lane_name_from_point(p):
    """Map a lane anchor point to its lane name.

    Returns "left", "mid" or "right" when *p* equals the matching LANE_*
    constant; any other point falls back to "mid".
    """
    known_lanes = (
        ("left", LANE_LEFT),
        ("mid", LANE_MID),
        ("right", LANE_RIGHT),
    )
    for lane, anchor in known_lanes:
        if p == anchor:
            return lane
    # Unknown point: behave as if Jake were in the middle lane.
    return "mid"

# =======================
# System/Backends
# =======================
# Enable OpenCV's optimised code paths and cap its thread pool at (cores - 1), minimum 1.
# The thread-count call is best effort: any failure is deliberately ignored.
cv2.setUseOptimized(True)
try: cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception: pass

# Device selection, in priority order: CUDA device 0 (half precision on), Apple MPS, then CPU.
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    # Best effort: ignored if this torch build rejects the call.
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Loading happens at import time; layer fusion is attempted but any failure is ignored.
model = YOLO(weights)
try: model.fuse()
except Exception: pass

# Warmup
# One prediction on an all-black IMG_SIZE x IMG_SIZE frame, using the same arguments as the real run.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE, device=device,
                  conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed
# =======================
# Target colours reordered from RGB to BGR (to match cv2 frames) as float32 rows, one per target.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
# Squared tolerance, so colour matching can compare squared distances without a sqrt.
TOL2            = TOLERANCE * TOLERANCE

# Per-class overlay colour; render_overlays blends these tuples straight into the BGR frame.
CLASS_COLOURS = {
    0:(255,255,0),1:(192,192,192),2:(0,128,255),3:(0,255,0),
    4:(255,0,255),5:(0,255,255),6:(255,128,0),7:(128,0,255),
    8:(0,0,128),9:(0,0,255),10:(128,128,0),11:(255,255,102)
}
# Class id -> label text drawn on each mask.
LABELS = {
    0:"BOOTS",1:"GREYTRAIN",2:"HIGHBARRIER1",3:"JUMP",4:"LOWBARRIER1",
    5:"LOWBARRIER2",6:"ORANGETRAIN",7:"PILLAR",8:"RAMP",9:"RAILS",
    10:"SIDEWALK",11:"YELLOWTRAIN"
}

# Class ids coloured green (rails, sidewalk) and yellow (barriers, jump, ramp) when sampled;
# class 0 (boots) is pink and every other class is red.
SAFE_GREEN = {9, 10}
WARN_YELLOW = {2,3,4,5,8}

# =======================
# Helpers
# =======================
def load_image_with_time(path: str):
    """Read *path* as a colour image and report how long the read took.

    Returns ``(image, elapsed_ms)``. ``image`` is whatever ``cv2.imread``
    returns, which is ``None`` when the file cannot be decoded.
    """
    started = time.perf_counter()
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    elapsed_s = time.perf_counter() - started
    return image, elapsed_s * 1000.0

def chunked(iterable, n):
    """Yield consecutive slices of *iterable*, each at most *n* items long.

    *iterable* must support ``len()`` and slicing; the last slice may be
    shorter than *n*.
    """
    starts = range(0, len(iterable), n)
    yield from (iterable[start:start + n] for start in starts)

def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Return a frame-sized bool mask of rail pixels that match the target colours.

    Work is restricted to the bounding box of *rail_mask*. A pixel is a hit
    when it lies inside the rail mask and its squared BGR distance to any row
    of TARGETS_BGR_F32 is at most TOL2. Hits are grouped into 8-connected
    components, and only components with area >= MIN_REGION_SIZE and
    bounding-box height >= MIN_REGION_HEIGHT survive.
    """
    frame_h, frame_w = img_bgr.shape[:2]
    result = np.zeros((frame_h, frame_w), dtype=bool)

    rows, cols = np.nonzero(rail_mask)
    if rows.size == 0:
        return result

    # Tight bounding box of the rail mask (end indices are exclusive).
    top, bottom = rows.min(), rows.max() + 1
    left, right = cols.min(), cols.max() + 1

    patch = img_bgr[top:bottom, left:right].astype(np.float32)
    # Broadcast (h, w, 1, 3) against (1, 1, targets, 3) -> per-target squared distance.
    delta = patch[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    near_target = (np.square(delta).sum(axis=-1) <= TOL2).any(axis=-1)

    candidate = np.logical_and(near_target, rail_mask[top:bottom, left:right])
    count, labels, stats, _ = cv2.connectedComponentsWithStats(candidate.astype(np.uint8), 8)
    if count <= 1:
        return result

    # Lookup table indexed by component label; label 0 (background) stays False.
    keep_lut = np.zeros(count, dtype=bool)
    big_enough = stats[1:, cv2.CC_STAT_AREA] >= MIN_REGION_SIZE
    tall_enough = stats[1:, cv2.CC_STAT_HEIGHT] >= MIN_REGION_HEIGHT
    keep_lut[1:] = big_enough & tall_enough

    result[top:bottom, left:right] = keep_lut[labels]
    return result

def red_vs_green_score(red_mask, green_mask):
    """Build a uint8 heat map of where red dominates green.

    Both masks are box-blurred with a HEAT_BLUR_KSIZE square kernel. Their
    difference is normalised by its largest absolute value, so 255 means
    strongest red, 0 strongest green and about 127 neutral.
    """
    kernel = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    red_blur = cv2.blur(red_mask.astype(np.float32), kernel)
    green_blur = cv2.blur(green_mask.astype(np.float32), kernel)

    balance = red_blur - green_blur
    # The epsilon keeps the division defined when both masks are empty.
    peak = float(np.abs(balance).max()) + 1e-6
    unit = balance / (2.0 * peak) + 0.5
    return np.clip(unit * 255.0, 0, 255.0).astype(np.uint8)

def purple_triangles(score, H):
    """Locate the top tip of every large "dark red" blob in *score*.

    Pixels with score >= RED_SCORE_THRESH are kept, except in the excluded
    top and bottom bands of the frame (*H* is the frame height). After a
    morphological open with a 5x9 rectangle, each 8-connected blob whose area
    is at least MIN_DARK_RED_AREA and at least MIN_DARK_FRACTION of all
    remaining dark pixels contributes one tip: (mean x of its top row, top y).

    Returns ``(tips, best)`` where ``best`` is the tip with the smallest y,
    or ``([], None)`` when nothing qualifies.
    """
    skip_top = int(H * EXCLUDE_TOP_FRAC)
    skip_bottom = int(H * EXCLUDE_BOTTOM_FRAC)

    hot = (score >= RED_SCORE_THRESH).astype(np.uint8)
    if skip_top:
        hot[:skip_top, :] = 0
    if skip_bottom:
        hot[-skip_bottom:, :] = 0

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9))
    hot = cv2.morphologyEx(hot, cv2.MORPH_OPEN, kernel, iterations=1)

    hot_total = int(hot.sum())
    if hot_total == 0:
        return [], None

    # A blob must clear both the absolute and the relative area floor.
    min_area = max(MIN_DARK_RED_AREA, int(np.ceil(MIN_DARK_FRACTION * hot_total)))
    count, labels, stats, _ = cv2.connectedComponentsWithStats(hot, 8)
    if count <= 1:
        return [], None

    tips = []
    for lbl in range(1, count):
        if stats[lbl, cv2.CC_STAT_AREA] < min_area:
            continue
        # The bounding-box top is the blob's first row; average its columns there.
        y_top = int(stats[lbl, cv2.CC_STAT_TOP])
        top_cols = np.flatnonzero(labels[y_top] == lbl)
        tips.append((int(top_cols.mean()), y_top))

    if not tips:
        return [], None
    return tips, min(tips, key=lambda tip: tip[1])

# ===== Angled-bend helper =====
def pick_bend_angle(jake_point, xt, x_ref, idx, best_idx):
    """Choose the sampling-ray angle (degrees from vertical) for one triangle.

    Sign convention as consumed by classify_triangles_at_sample_curved, which
    offsets the tip by ``sin(theta)`` in x: negative leans left in the image
    (smaller x), positive leans right.

    Jake's own triangle (``idx == best_idx``) always samples straight up.
    In the left lane only triangles right of *x_ref* bend (N1); in the right
    lane only triangles left of it bend (N4); otherwise (middle lane)
    triangles on either side bend (N2 / N3).
    """
    if idx == best_idx:
        return 0.0

    right_of_ref = xt > x_ref
    left_of_ref = xt < x_ref

    if jake_point == LANE_LEFT:
        return BEND_LEFT_STATE_RIGHT_DEG if right_of_ref else 0.0
    if jake_point == LANE_RIGHT:
        return BEND_RIGHT_STATE_LEFT_DEG if left_of_ref else 0.0

    # Middle lane (or any unrecognised point): bend depends on the side.
    if right_of_ref:
        return BEND_MID_STATE_RIGHT_DEG
    if left_of_ref:
        return BEND_MID_STATE_LEFT_DEG
    return 0.0

# ===== Bearing-based Jake triangle selection =====
def signed_degrees_from_vertical(dx, dy):
    """Return the bearing of the offset (dx, dy) in degrees from image-up.

    Image y grows downward, so "up" is -dy. Offsets to the left (dx < 0) give
    positive angles and offsets to the right give negative ones. A zero offset
    returns 0.0.
    """
    if not (dx or dy):
        return 0.0
    bearing_rad = math.atan2(dx, -dy)
    return -math.degrees(bearing_rad)

def select_triangle_by_bearing(tri_positions, jx, jy, target_deg, min_dy=6):
    """Pick the triangle whose bearing from (jx, jy) is closest to *target_deg*.

    Only triangles more than *min_dy* pixels above the reference point are
    considered. Returns ``(index, bearing_deg, abs_error)``; when no triangle
    qualifies the result is ``(-1, None, None)``. Ties keep the earliest index.
    """
    candidates = []
    for i, (xt, yt) in enumerate(tri_positions):
        rise = yt - jy
        if rise >= -min_dy:  # not far enough above the reference point
            continue
        bearing = signed_degrees_from_vertical(xt - jx, rise)
        candidates.append((abs(bearing - target_deg), i, bearing))

    if not candidates:
        return -1, None, None

    # min() returns the first of equal minima, matching a strict "<" scan.
    err, index, bearing = min(candidates, key=lambda cand: cand[0])
    return index, bearing, err

# ===== UPDATED: triangle classification using angled sampling =====
def classify_triangles_at_sample_curved(
    tri_positions, masks_np, classes_np, frame_H, frame_W,
    jake_point, x_ref, best_idx, sample_px=SAMPLE_UP_PX
):
    """Colour each triangle by the mask class found along a lane-aware ray.

    For every tip (x, y) the bend angle from pick_bend_angle is turned into a
    unit direction and the point *sample_px* pixels along it is looked up in
    the masks via _colour_for_point (the same lookup the renderer uses).

    Args:
        tri_positions: sequence of (x, y) triangle tips in frame pixels.
        masks_np: mask stack indexed as [n, h, w], or None.
        classes_np: class id per mask, or None.
        frame_H, frame_W: frame size in pixels, used to clamp the sample
            point and to scale it into mask space.
        jake_point, x_ref, best_idx: forwarded to pick_bend_angle.
        sample_px: ray length in frame pixels.

    Returns:
        (colours, rays): one BGR colour per triangle, and one
        ((x0, y0), (x1, y1), theta_deg) tuple per triangle for debug drawing.
        Both lists are empty when masks, classes or triangles are missing.
    """
    if masks_np is None or classes_np is None or len(tri_positions) == 0:
        return [], []

    colours, rays = [], []

    for idx, (x, y) in enumerate(tri_positions):
        # Jake's triangle samples straight up; the rest follow the lane rules.
        theta = pick_bend_angle(jake_point, x, x_ref, idx, best_idx)

        # Rotate the "up" vector (0, -1) by theta. Image x grows to the right,
        # so a negative theta moves the sample LEFT and a positive one RIGHT.
        rad = math.radians(theta)
        dx = math.sin(rad)
        dy = -math.cos(rad)

        xs = int(round(x + dx * sample_px))
        ys = int(round(y + dy * sample_px))

        # Keep the sample point inside the frame.
        xs = max(0, min(frame_W - 1, xs))
        ys = max(0, min(frame_H - 1, ys))

        colours.append(_colour_for_point(xs, ys, masks_np, classes_np, frame_H, frame_W))
        rays.append(((int(x), int(y)), (xs, ys), float(theta)))

    return colours, rays

# Returns: (..., tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)
def process_frame_post(frame_bgr, yolo_res):
    """Turn one YOLO segmentation result into triangles, colours and timings.

    Returns a 15-tuple:
        (tri_best, tri_count, mask_count, to_cpu_ms, post_ms, masks_np,
         classes_np, rail_mask, green_mask, tri_positions, tri_colours,
         tri_rays, best_idx, best_deg, x_ref)

    When the result has no masks, no classes, or no rail-class mask, the
    triangle fields are empty/None and post_ms is 0.0.
    """
    H, W = frame_bgr.shape[:2]

    def _no_triangles(mask_count=0, to_cpu_ms=0.0, masks_np=None, classes_np=None):
        # Shared shape for every early exit.
        return (None, 0, mask_count, to_cpu_ms, 0.0, masks_np, classes_np,
                None, None, [], [], [], None, None, None)

    seg = yolo_res.masks
    if seg is None:
        return _no_triangles()

    # --- device -> host copy (timed separately)
    copy_started = time.perf_counter()
    masks_np = seg.data.cpu().numpy()  # [n,h,w]
    mask_count = int(masks_np.shape[0])
    mask_cls = getattr(seg, "cls", None)
    cls_source = mask_cls if mask_cls is not None else yolo_res.boxes.cls
    classes_np = cls_source.cpu().numpy().astype(int)
    to_cpu_ms = (time.perf_counter() - copy_started) * 1000.0

    if mask_count == 0 or classes_np.size == 0:
        return _no_triangles(mask_count, to_cpu_ms, masks_np, classes_np)

    is_rail = classes_np == RAIL_ID
    if not is_rail.any():
        return _no_triangles(mask_count, to_cpu_ms, masks_np, classes_np)

    # --- post-processing (timed)
    post_started = time.perf_counter()

    # Union of all rail masks, scaled up to frame resolution.
    rail_union = np.any(masks_np[is_rail].astype(bool), axis=0).astype(np.uint8)  # [h,w]
    rail_mask = cv2.resize(rail_union, (W, H), interpolation=cv2.INTER_NEAREST).astype(bool)

    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red = np.logical_and(rail_mask, np.logical_not(green))
    score = red_vs_green_score(red, green)
    tri_positions, tri_best = purple_triangles(score, H)

    # Jake's triangle: the one whose bearing from JAKE_POINT is nearest the lane target.
    lane_name = lane_name_from_point(JAKE_POINT)
    xj, yj = JAKE_POINT
    best_idx, best_deg, _ = select_triangle_by_bearing(
        tri_positions, xj, yj, LANE_TARGET_DEG[lane_name], min_dy=6
    )

    # Bend divider: in the mid lane use Jake's triangle x when one was found,
    # otherwise (and for side lanes) use Jake's own x.
    found_jake_tri = best_idx is not None and 0 <= best_idx < len(tri_positions)
    x_ref = tri_positions[best_idx][0] if (lane_name == "mid" and found_jake_tri) else xj

    tri_colours, tri_rays = classify_triangles_at_sample_curved(
        tri_positions, masks_np, classes_np, H, W, JAKE_POINT, x_ref, best_idx, SAMPLE_UP_PX
    )

    post_ms = (time.perf_counter() - post_started) * 1000.0

    return (tri_best, len(tri_positions), mask_count, to_cpu_ms, post_ms,
            masks_np, classes_np, rail_mask, green,
            tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)

def _colour_for_point(x, y, masks_np, classes_np, frame_H, frame_W):
    """Return the BGR status colour for frame pixel (x, y).

    The point is scaled into mask space and the first mask whose value there
    exceeds 0.5 decides the class. No hit or a SAFE_GREEN class gives green,
    class 0 pink, WARN_YELLOW classes yellow, anything else red. Missing or
    empty masks give green.
    """
    if masks_np is None or classes_np is None or masks_np.size == 0:
        return COLOR_GREEN

    mask_h, mask_w = masks_np.shape[1], masks_np.shape[2]
    scale_x = (mask_w - 1) / max(1, (frame_W - 1))
    scale_y = (mask_h - 1) / max(1, (frame_H - 1))

    # Frame -> mask coordinates, clamped to the mask bounds.
    col = min(max(int(round(x * scale_x)), 0), mask_w - 1)
    row = min(max(int(round(y * scale_y)), 0), mask_h - 1)

    hit = next(
        (int(cls_id) for mask, cls_id in zip(masks_np, classes_np) if mask[row, col] > 0.5),
        None,
    )

    if hit is None or hit in SAFE_GREEN:
        return COLOR_GREEN
    if hit == 0:
        return COLOR_PINK
    if hit in WARN_YELLOW:
        return COLOR_YELLOW
    return COLOR_RED

def draw_triangle(img, x, y, size=TRI_SIZE_PX, colour=COLOR_RED):
    """Draw a filled, black-outlined triangle on *img* with its tip at (x, y).

    The base sits int(size * 1.2) pixels below the tip and spans
    x - size .. x + size. *img* is modified in place.
    """
    outline = triangle_pts(x, y, size)
    cv2.fillConvexPoly(img, outline, colour)
    cv2.polylines(img, [outline.reshape(-1, 1, 2)], True, COLOR_BLACK, 1, cv2.LINE_AA)

def triangle_pts(x, y, size=TRI_SIZE_PX):
    """Return the three int32 vertices of a triangle whose tip is (x, y).

    Vertex order is tip, bottom-left, bottom-right; the base lies
    int(size * 1.2) pixels below the tip.
    """
    base_y = y + int(size * 1.2)
    tip = [x, y]
    bottom_left = [x - size, base_y]
    bottom_right = [x + size, base_y]
    return np.array([tip, bottom_left, bottom_right], np.int32)

# --- rendering (excluded from timing) ---
def _put_outlined_text(img, text, org, scale):
    """Draw *text* at *org* as thin white glyphs over a thicker black outline."""
    cv2.putText(img, text, org, cv2.FONT_HERSHEY_SIMPLEX, scale, COLOR_BLACK, 2, cv2.LINE_AA)
    cv2.putText(img, text, org, cv2.FONT_HERSHEY_SIMPLEX, scale, (255,255,255), 1, cv2.LINE_AA)


def render_overlays(frame_bgr, masks_np, classes_np, rail_mask, green_mask,
                    tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref):
    """Return a copy of *frame_bgr* with all debug overlays drawn on it.

    Layers, in drawing order: tinted class masks with labels, rail tint and
    green rail highlight, vertical scout lines above each triangle, starburst
    lines from JAKE_POINT to each tip with their bearing, the angled sampling
    rays with their bend angle, the triangles, and a cyan outline plus tag on
    Jake's triangle.

    Args:
        frame_bgr: source frame; it is not modified.
        masks_np, classes_np: mask stack and class ids, or None.
        rail_mask, green_mask: frame-sized bool masks, or None.
        tri_positions, tri_colours, tri_rays: per-triangle outputs of
            process_frame_post (may be empty).
        best_idx: index of Jake's triangle, or None / -1 when there is none.
        best_deg, x_ref: accepted for call compatibility; not drawn.

    Labels use " deg" rather than the degree sign because the Hershey fonts
    used by cv2.putText cannot render non-ASCII symbols.
    """
    out = frame_bgr.copy()
    H, W = out.shape[:2]
    alpha = 0.45

    # masks
    if masks_np is not None and classes_np is not None and masks_np.size:
        for m, c in zip(masks_np, classes_np):
            # Always convert to a bool mask: a float mask that already has the
            # frame shape would otherwise be used directly as an index array.
            m_u8 = m.astype(np.uint8)
            if m_u8.shape != (H, W):
                m_u8 = cv2.resize(m_u8, (W, H), interpolation=cv2.INTER_NEAREST)
            m_full = m_u8.astype(bool)

            color = CLASS_COLOURS.get(int(c), (255,255,255))
            out[m_full] = (np.array(color, dtype=np.uint8) * alpha + out[m_full] * (1 - alpha)).astype(np.uint8)
            ys, xs = np.where(m_full)
            if xs.size:
                # Label near the mask centroid, kept away from the frame edge.
                xc, yc = int(xs.mean()), int(ys.mean())
                label = LABELS.get(int(c), f"C{int(c)}")
                _put_outlined_text(out, label, (max(5, xc-40), max(20, yc)), 0.6)

    if rail_mask is not None:
        tint = out.copy()
        tint[rail_mask] = (0, 0, 255)
        out = cv2.addWeighted(tint, 0.30, out, 0.70, 0)
    if green_mask is not None:
        out[green_mask] = (0, 255, 0)

    # Scout lines (tiny): one pixel column above each tip, coloured per sampled class.
    if tri_positions:
        for (x, y) in tri_positions:
            y_end = max(0, y - SAMPLE_UP_PX)
            for yy in range(y, y_end - 1, -1):
                out[yy, x] = _colour_for_point(x, yy, masks_np, classes_np, H, W)

    # Starburst lines to Jake with degree labels
    xj, yj = JAKE_POINT
    for idx, (xt, yt) in enumerate(tri_positions):
        xt = max(0, min(W-1, int(xt)))
        yt = max(0, min(H-1, int(yt)))
        deg_signed = signed_degrees_from_vertical(xt - xj, yt - yj)
        line_color = COLOR_CYAN if idx == best_idx else COLOR_WHITE
        cv2.line(out, (xj, yj), (xt, yt), line_color, 2, cv2.LINE_AA)
        mid_x = int((xj + xt) / 2); mid_y = int((yj + yt) / 2)
        _put_outlined_text(out, f"{deg_signed:.1f} deg", (mid_x, mid_y), 0.55)

    # Draw angled sampling rays + bend-degree labels (JakeTri shows 0.0)
    for (p0, p1, theta) in tri_rays:
        cv2.line(out, p0, p1, (255,255,255), 2, cv2.LINE_AA)
        mx = (p0[0] + p1[0]) // 2
        my = (p0[1] + p1[1]) // 2
        _put_outlined_text(out, f"{theta:+.1f} deg", (mx, my), 0.55)

    # Triangles after lines
    for (x, y), col in zip(tri_positions, tri_colours):
        draw_triangle(out, x, y, colour=col)

    # Emphasize selected triangle + tag
    lane_name = lane_name_from_point(JAKE_POINT)
    target_deg = LANE_TARGET_DEG[lane_name]
    if best_idx is not None and 0 <= best_idx < len(tri_positions):
        xt, yt = tri_positions[best_idx]
        pts = triangle_pts(int(xt), int(yt), size=TRI_SIZE_PX)
        cv2.polylines(out, [pts.reshape(-1,1,2)], True, COLOR_CYAN, 3, cv2.LINE_AA)
        tag = f"JAKE_TRI ({lane_name}: target {target_deg:.1f} deg)"
        _put_outlined_text(out, tag, (max(5, int(xt)-70), max(20, int(yt)-16)), 0.55)

    return out

# =======================
# Batched execution with prints; overlays saved for first N
# =======================
def run_pipeline_with_prints_and_overlays():
    """Run detection + post-processing over every frame and save overlays.

    Frames are the .jpg/.png files in ``frames_dir`` (sorted, de-duplicated,
    optionally truncated to SHOW_FIRST_N). They are read in batches of BATCH
    using a thread pool, passed through the model, post-processed, and a
    timing line is printed per frame. Overlays are written to ``out_dir`` for
    frames numbered up to RENDER_FIRST_N.

    Returns:
        A list with one entry per discovered path, in sorted path order: the
        best triangle (x, y) for that frame, or None when there was none or
        the file could not be read.

    Raises:
        FileNotFoundError: if no images are found in ``frames_dir``.
    """
    patterns = ("frame_*.jpg", "frame_*.png", "*.jpg", "*.png")
    paths = sorted({p for pattern in patterns for p in glob.glob(str(frames_dir / pattern))})
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")
    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    N = len(paths)
    results_triangle_xy = [None] * N

    def load_batch(batch_paths):
        """Read a batch concurrently; return (slot, path, image, read_ms) for readable files.

        ``slot`` is the file's position inside the requested batch, so callers
        can map results back to the original path list even when reads fail.
        """
        with ThreadPoolExecutor(max_workers=THREADS_IO) as ex:
            loaded = list(ex.map(load_image_with_time, batch_paths))
        return [
            (slot, path, img, read_ms)
            for slot, (path, (img, read_ms)) in enumerate(zip(batch_paths, loaded))
            if img is not None
        ]

    idx_global = 0
    for batch_paths in chunked(paths, BATCH):
        # Always advance by the number of paths requested, not the number
        # that loaded, so results stay aligned with `paths`.
        n_requested = len(batch_paths)
        readable = load_batch(batch_paths)
        if not readable:
            idx_global += n_requested
            continue

        slots, ok_paths, imgs_bgr, read_ms_list = (list(col) for col in zip(*readable))
        B = len(imgs_bgr)

        t0_inf = time.perf_counter()
        res_list = model.predict(
            imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
            conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET,
            batch=B
        )
        # Wait for the accelerator so the measured time covers the actual work.
        # Best effort: timing accuracy is not worth failing the run over.
        try:
            if device == 0 and torch.cuda.is_available():
                torch.cuda.synchronize()
            elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                torch.mps.synchronize()
        except Exception:
            pass
        t1_inf = time.perf_counter()
        infer_ms_share = ((t1_inf - t0_inf) * 1000.0) / B

        for slot, path, img, yres, read_ms in zip(slots, ok_paths, imgs_bgr, res_list, read_ms_list):
            (tri_best_xy, tri_count, mask_count, to_cpu_ms, post_ms,
             masks_np, classes_np, rail_mask, green_mask, tri_positions, tri_colours,
             tri_rays, best_idx, best_deg, x_ref) = process_frame_post(img, yres)

            position = idx_global + slot
            results_triangle_xy[position] = tri_best_xy
            proc_ms = infer_ms_share + to_cpu_ms + post_ms
            fname = os.path.basename(path)
            frame_idx = position + 1

            print(f"[{frame_idx}/{N}] {fname}  "
                  f"read {read_ms:.1f} | infer {infer_ms_share:.1f} | "
                  f"to_cpu {to_cpu_ms:.1f} | post {post_ms:.1f} | "
                  f"masks {mask_count} | triangles {tri_count} "
                  f"=> proc {proc_ms:.1f} ms")

            # Rendering is excluded from the timings printed above.
            if frame_idx <= RENDER_FIRST_N:
                overlay = render_overlays(img, masks_np, classes_np, rail_mask, green_mask,
                                          tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)
                out_path = out_dir / f"overlay_{frame_idx:04d}_{fname}"
                cv2.imwrite(str(out_path), overlay)

        idx_global += n_requested

    return results_triangle_xy

# =======================
# Entry
# =======================
# Run the whole pipeline when this cell/script is executed directly.
# The returned per-frame triangle list is intentionally discarded.
if __name__ == "__main__":
    _ = run_pipeline_with_prints_and_overlays()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs
[1/312] frame_00000.png  read 37.5 | infer 2179.2 | to_cpu 0.7 | post 0.0 | masks 1 | triangles 0 => proc 2179.9 ms
[2/312] frame_00001.png  read 36.6 | infer 197.2 | to_cpu 1.0 | post 78.1 | masks 3 | triangles 1 => proc 276.3 ms
[3/312] frame_00002.png  read 36.3 | infer 76.3 | to_cpu 0.6 | post 119.5 | masks 3 | triangles 3 => proc 196.4 ms
[4/312] frame_00003.png  read 52.7 | infer 84.2 | to_cpu 0.8 | post 130.0 | masks 3 | triangles 2 => proc 215.0 ms
[5/312] frame_00004.png  read 48.5 | infer 206.7 | to_cpu 3.5 | post 1259.1 | masks 4 | triangles 1 => proc 1469.3 ms
[6/312] frame_00005.png  read 51.9 | infer 545.2 | to_cpu 1.1 | post 302.6 | masks 5 | triangles 1 => proc 848.9 ms
[7/312] frame_00006.png  read 73.1 | infer 118.9 | to_cpu 0.8 | post 240.1 | masks 5 | triangles 2 => proc 359.8 ms
[8/312] frame_00007.png  read 41.5 | infer 123.2 | to_cpu 1.2 | post 285.7 | masks 4 | triangles 2 =>

In [None]:
# Speed-optimised version of the code above. Best-performing version as of 10-08-2025, 12:31 am.

In [7]:
#!/usr/bin/env python3
# Ultra-fast overlays + lane-aware curved sampling (optimized postproc)

import os, glob, time, math
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config
# =======================
# Filesystem layout: weights, input frames and overlay output are all resolved under the user's home.
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"

# SAVE HERE (the directory is created as a side effect of running this cell)
out_dir    = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "out_overlays6"
out_dir.mkdir(parents=True, exist_ok=True)

# RAIL_ID matches LABELS[9] == "RAILS".
# IMG_SIZE / CONF / IOU / MAX_DET are forwarded to model.predict as imgsz / conf / iou / max_det.
RAIL_ID    = 9
IMG_SIZE   = 512
CONF, IOU  = 0.30, 0.45
MAX_DET    = 30

# Color/region filter
# A rail pixel counts as a colour hit when its squared BGR distance to any target is <= TOLERANCE**2;
# connected hit regions are kept only if area >= MIN_REGION_SIZE and bbox height >= MIN_REGION_HEIGHT.
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]
TOLERANCE          = 20.0
MIN_REGION_SIZE    = 30
MIN_REGION_HEIGHT  = 150

# Heat/triangle
# HEAT_BLUR_KSIZE: side of the square box-blur kernel; RED_SCORE_THRESH: minimum 0-255 score for a "dark" pixel.
# EXCLUDE_*_FRAC: fraction of frame rows zeroed at the top/bottom before blob search.
# MIN_DARK_RED_AREA / MIN_DARK_FRACTION: absolute and relative (share of all dark pixels) blob-area floors.
HEAT_BLUR_KSIZE     = 51
RED_SCORE_THRESH    = 220
EXCLUDE_TOP_FRAC    = 0.40
EXCLUDE_BOTTOM_FRAC = 0.15
MIN_DARK_RED_AREA   = 1200
MIN_DARK_FRACTION   = 0.15
TRI_SIZE_PX         = 18

# Sampling ray length
SAMPLE_UP_PX        = 180

# ===== Bend degrees (tune here) =====
# Degrees from vertical. TRIG_TABLE maps each angle to (sin, -cos) and the sampler adds that
# offset in image coordinates, so NEGATIVE leans LEFT (smaller x) and POSITIVE leans RIGHT.
# TRIG_TABLE is built once from these values; re-run the cell after changing them.
BEND_LEFT_STATE_RIGHT_DEG  = -20.0  # N1
BEND_MID_STATE_RIGHT_DEG   = -20.0  # N2
BEND_MID_STATE_LEFT_DEG    = +20.0  # N3
BEND_RIGHT_STATE_LEFT_DEG  = +20.0  # N4

# Colours (BGR)
COLOR_GREEN  = (0, 255, 0)
COLOR_PINK   = (203, 192, 255)
COLOR_YELLOW = (0, 255, 255)
COLOR_RED    = (0, 0, 255)
COLOR_WHITE  = (255, 255, 255)
COLOR_CYAN   = (255, 255, 0)
COLOR_BLACK  = (0, 0, 0)

# Runtime
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)
SHOW_FIRST_N        = None
RENDER_FIRST_N      = 300

# =======================
# Jake lane points
# =======================
# Fixed (x, y) frame coordinates standing in for Jake's position in each lane.
LANE_LEFT   = (300, 1340)
LANE_MID    = (490, 1340)
LANE_RIGHT  = (680, 1340)
JAKE_POINT  = LANE_RIGHT  # pick: LANE_LEFT / LANE_MID / LANE_RIGHT

# Target bearing per lane, in the convention of signed_degrees_from_vertical
# (offsets to the left are positive, to the right negative).
LANE_TARGET_DEG = {"left": -10.7, "mid": +1.5, "right": +15.0}

def lane_name_from_point(p):
    """Map a lane anchor point to its lane name.

    Returns "left", "mid" or "right" when *p* equals the matching LANE_*
    constant; any other point falls back to "mid".
    """
    lane_anchors = (
        (LANE_LEFT, "left"),
        (LANE_MID, "mid"),
        (LANE_RIGHT, "right"),
    )
    for anchor, lane in lane_anchors:
        if p == anchor:
            return lane
    # Unknown point: behave as if Jake were in the middle lane.
    return "mid"

# =======================
# System/Backends
# =======================
# Enable OpenCV's optimised code paths and cap its thread pool at (cores - 1), minimum 1.
# The thread-count call is best effort: any failure is deliberately ignored.
cv2.setUseOptimized(True)
try: cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception: pass

# Device selection, in priority order: CUDA device 0 (half precision on), Apple MPS, then CPU.
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    # Best effort: ignored if this torch build rejects the call.
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Loading happens at import time; layer fusion is attempted but any failure is ignored.
model = YOLO(weights)
try: model.fuse()
except Exception: pass

# Warm-up: one prediction on an all-black IMG_SIZE x IMG_SIZE frame, same arguments as the real run.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE, device=device,
                  conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed
# =======================
# Target colours reordered from RGB to BGR (to match cv2 frames) as float32 rows, one per target.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
# Squared tolerance, so colour matching can compare squared distances without a sqrt.
TOL2            = TOLERANCE * TOLERANCE
# Class ids coloured green (rails, sidewalk) and yellow (barriers, jump, ramp) when sampled;
# class 0 (boots) is pink and every other class is red.
SAFE_GREEN      = {9, 10}
WARN_YELLOW     = {2,3,4,5,8}

# Per-class colour tuples. NOTE(review): presumably BGR like the COLOR_* constants; confirm where they are drawn.
CLASS_COLOURS = {
    0:(255,255,0),1:(192,192,192),2:(0,128,255),3:(0,255,0),
    4:(255,0,255),5:(0,255,255),6:(255,128,0),7:(128,0,255),
    8:(0,0,128),9:(0,0,255),10:(128,128,0),11:(255,255,102)
}
# Class id -> human-readable label.
LABELS = {
    0:"BOOTS",1:"GREYTRAIN",2:"HIGHBARRIER1",3:"JUMP",4:"LOWBARRIER1",
    5:"LOWBARRIER2",6:"ORANGETRAIN",7:"PILLAR",8:"RAMP",9:"RAILS",
    10:"SIDEWALK",11:"YELLOWTRAIN"
}

# ====== tiny helpers ======
def _clampi(v, lo, hi):
    """Clamp *v* to the closed range [lo, hi]; the lower bound is checked first."""
    if v < lo:
        return lo
    if v > hi:
        return hi
    return v

def load_image_with_time(path: str):
    """Read *path* as a colour image and report how long the read took.

    Returns ``(image, elapsed_ms)``. ``image`` is whatever ``cv2.imread``
    returns, which is ``None`` when the file cannot be decoded.
    """
    started = time.perf_counter()
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    elapsed_s = time.perf_counter() - started
    return image, elapsed_s * 1000.0

def chunked(iterable, n):
    """Yield consecutive slices of *iterable*, each at most *n* items long.

    *iterable* must support ``len()`` and slicing; the last slice may be
    shorter than *n*.
    """
    starts = range(0, len(iterable), n)
    yield from (iterable[start:start + n] for start in starts)

# =======================
# Fast rails green finder
# =======================
def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Return a frame-sized bool mask of rail pixels that match the target colours.

    *rail_mask* is expected to be a bool array of shape (H, W). Work is
    restricted to its bounding rectangle. A pixel is a hit when it lies inside
    the rail mask and its squared BGR distance to any row of TARGETS_BGR_F32
    is at most TOL2. Hits are grouped into 8-connected components, and only
    components with area >= MIN_REGION_SIZE and bounding-box height >=
    MIN_REGION_HEIGHT survive.
    """
    frame_h, frame_w = rail_mask.shape
    result = np.zeros((frame_h, frame_w), dtype=bool)
    if not rail_mask.any():
        return result

    # Reinterpret the bool buffer as uint8 (0/1) and scale to 0/255 for boundingRect.
    rail_u8 = rail_mask.view(dtype=np.uint8) * 255
    left, top, box_w, box_h = cv2.boundingRect(rail_u8)
    bottom, right = top + box_h, left + box_w

    patch = img_bgr[top:bottom, left:right].astype(np.float32, copy=False)
    inside_rail = rail_u8[top:bottom, left:right].astype(bool)

    # Broadcast (h, w, 1, 3) against (1, 1, targets, 3) -> per-target squared distance.
    delta = patch[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    near_target = ((delta * delta).sum(-1) <= TOL2).any(-1)

    candidate = np.logical_and(near_target, inside_rail)
    count, labels, stats, _ = cv2.connectedComponentsWithStats(candidate.astype(np.uint8), 8)
    if count <= 1:
        return result

    # Lookup table indexed by component label; label 0 (background) stays False.
    keep_lut = np.zeros(count, dtype=bool)
    big_enough = stats[1:, cv2.CC_STAT_AREA] >= MIN_REGION_SIZE
    tall_enough = stats[1:, cv2.CC_STAT_HEIGHT] >= MIN_REGION_HEIGHT
    keep_lut[1:] = big_enough & tall_enough

    result[top:bottom, left:right] = keep_lut[labels]
    return result

def red_vs_green_score(red_mask, green_mask):
    """Build a uint8 heat map of where red dominates green.

    Both masks are box-blurred with a HEAT_BLUR_KSIZE square kernel. Their
    difference is normalised by its largest absolute value, so 255 means
    strongest red, 0 strongest green and about 127 neutral.
    """
    kernel = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    red_blur = cv2.blur(red_mask.astype(np.float32, copy=False), kernel)
    green_blur = cv2.blur(green_mask.astype(np.float32, copy=False), kernel)

    balance = red_blur - green_blur
    # The epsilon keeps the division defined when both masks are empty.
    peak = float(np.abs(balance).max()) + 1e-6
    unit = balance / (2.0 * peak) + 0.5
    return np.clip(unit * 255.0, 0, 255.0).astype(np.uint8, copy=False)

def purple_triangles(score, H):
    """Locate the top tip of every large "dark red" blob in *score*.

    Pixels with score >= RED_SCORE_THRESH are kept, except in the excluded
    top and bottom bands of the frame (*H* is the frame height). After a
    morphological open with a 5x9 rectangle, each 8-connected blob whose area
    is at least MIN_DARK_RED_AREA and at least MIN_DARK_FRACTION of all
    remaining dark pixels contributes one tip: (mean x of its top row, top y).

    Returns ``(tips, best)`` where ``best`` is the tip with the smallest y,
    or ``([], None)`` when nothing qualifies.
    """
    skip_top = int(H * EXCLUDE_TOP_FRAC)
    skip_bottom = int(H * EXCLUDE_BOTTOM_FRAC)

    hot = (score >= RED_SCORE_THRESH).astype(np.uint8, copy=False)
    if skip_top:
        hot[:skip_top, :] = 0
    if skip_bottom:
        hot[-skip_bottom:, :] = 0

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9))
    hot = cv2.morphologyEx(hot, cv2.MORPH_OPEN, kernel, iterations=1)

    hot_total = int(hot.sum())
    if hot_total == 0:
        return [], None

    # A blob must clear both the absolute and the relative area floor.
    min_area = max(MIN_DARK_RED_AREA, int(np.ceil(MIN_DARK_FRACTION * hot_total)))
    count, labels, stats, _ = cv2.connectedComponentsWithStats(hot, 8)
    if count <= 1:
        return [], None

    tips = []
    for lbl in range(1, count):
        if stats[lbl, cv2.CC_STAT_AREA] < min_area:
            continue
        # The bounding-box top is the blob's first row; average its columns there.
        y_top = int(stats[lbl, cv2.CC_STAT_TOP])
        top_cols = np.flatnonzero(labels[y_top] == lbl)
        tips.append((int(top_cols.mean()), y_top))

    if not tips:
        return [], None
    return tips, min(tips, key=lambda tip: tip[1])

# ===== Bearing-based Jake triangle selection =====
def signed_degrees_from_vertical(dx, dy):
    """Return the bearing of the offset (dx, dy) in degrees from image-up.

    Image y grows downward, so "up" is -dy. Offsets to the left (dx < 0) give
    positive angles and offsets to the right give negative ones. A zero offset
    returns 0.0.
    """
    if not (dx or dy):
        return 0.0
    bearing_rad = math.atan2(dx, -dy)
    return -math.degrees(bearing_rad)

def select_triangle_by_bearing(tri_positions, jx, jy, target_deg, min_dy=6):
    """Pick the triangle whose bearing from (jx, jy) is closest to *target_deg*.

    Only triangles more than *min_dy* pixels above the reference point are
    considered. Returns ``(index, bearing_deg, abs_error)``; when no triangle
    qualifies the result is ``(-1, None, None)``. Ties keep the earliest index.
    """
    candidates = []
    for i, (xt, yt) in enumerate(tri_positions):
        rise = yt - jy
        if rise >= -min_dy:  # not far enough above the reference point
            continue
        bearing = signed_degrees_from_vertical(xt - jx, rise)
        candidates.append((abs(bearing - target_deg), i, bearing))

    if not candidates:
        return -1, None, None

    # min() returns the first of equal minima, matching a strict "<" scan.
    err, index, bearing = min(candidates, key=lambda cand: cand[0])
    return index, bearing, err

# ===== Lane-aware curved sampling (precompute sin/cos) =====
def _precompute_trig():
    """Build ``{angle_deg: (sin, -cos)}`` for 0 degrees and the four bend angles.

    Each value is the unit ray direction (dx, dy) in image coordinates, where
    up is -y. Duplicate angles collapse into a single entry.
    """
    bend_angles = {
        0.0,
        BEND_LEFT_STATE_RIGHT_DEG,
        BEND_MID_STATE_RIGHT_DEG,
        BEND_MID_STATE_LEFT_DEG,
        BEND_RIGHT_STATE_LEFT_DEG,
    }
    return {
        angle: (math.sin(math.radians(angle)), -math.cos(math.radians(angle)))
        for angle in sorted(bend_angles)
    }
TRIG_TABLE = _precompute_trig()

def pick_bend_angle(jake_point, xt, x_ref, idx, best_idx):
    """Choose the probe-ray bend angle (degrees) for the triangle at index idx.

    The triangle selected as Jake's (idx == best_idx) always gets 0.0.
    Otherwise the angle depends on the lane in jake_point and on whether the
    triangle's x (xt) lies left or right of x_ref:
      * LANE_LEFT:  right of x_ref -> BEND_LEFT_STATE_RIGHT_DEG, else 0.0
      * LANE_RIGHT: left of x_ref  -> BEND_RIGHT_STATE_LEFT_DEG, else 0.0
      * otherwise:  right -> BEND_MID_STATE_RIGHT_DEG,
                    left  -> BEND_MID_STATE_LEFT_DEG, equal -> 0.0
    """
    if idx == best_idx:
        return 0.0

    is_right = xt > x_ref
    is_left = xt < x_ref

    if jake_point == LANE_LEFT:
        return BEND_LEFT_STATE_RIGHT_DEG if is_right else 0.0
    if jake_point == LANE_RIGHT:
        return BEND_RIGHT_STATE_LEFT_DEG if is_left else 0.0

    # any other lane point is treated as the middle lane
    if is_right:
        return BEND_MID_STATE_RIGHT_DEG
    if is_left:
        return BEND_MID_STATE_LEFT_DEG
    return 0.0

def classify_triangles_at_sample_curved(
    tri_positions, masks_np, classes_np, H, W,
    jake_point, x_ref, best_idx, sample_px=SAMPLE_UP_PX
):
    """Colour each triangle by the mask found at the end of its bent probe ray.

    For every (x, y) tip the ray direction comes from pick_bend_angle /
    TRIG_TABLE; the point sample_px along that ray (clamped to the frame) is
    scaled into mask resolution and the first mask (in masks_np order) whose
    value there exceeds 0.5 decides the colour.

    Returns:
        (colours, rays): one colour per triangle, and one
        ((x0, y0), (x1, y1), theta_deg) tuple per triangle for drawing.
        Both are empty when masks/classes are None or there are no triangles.
    """
    if masks_np is None or classes_np is None or len(tri_positions) == 0:
        return [], []

    mask_h, mask_w = masks_np.shape[1], masks_np.shape[2]
    scale_x = (mask_w - 1) / max(1, (W - 1))
    scale_y = (mask_h - 1) / max(1, (H - 1))

    def colour_of(cls_id):
        # no mask hit and "safe" classes both read as green
        if cls_id is None or cls_id in SAFE_GREEN:
            return COLOR_GREEN
        if cls_id == 0:
            return COLOR_PINK
        if cls_id in WARN_YELLOW:
            return COLOR_YELLOW
        return COLOR_RED

    colours, rays = [], []
    for idx, (tip_x, tip_y) in enumerate(tri_positions):
        theta = pick_bend_angle(jake_point, tip_x, x_ref, idx, best_idx)
        ux, uy = TRIG_TABLE[theta]  # unit direction of the probe ray

        end_x = _clampi(int(round(tip_x + ux * sample_px)), 0, W - 1)
        end_y = _clampi(int(round(tip_y + uy * sample_px)), 0, H - 1)

        col = _clampi(int(round(end_x * scale_x)), 0, mask_w - 1)
        row = _clampi(int(round(end_y * scale_y)), 0, mask_h - 1)

        cls_here = next(
            (int(c) for m, c in zip(masks_np, classes_np) if m[row, col] > 0.5),
            None,
        )

        colours.append(colour_of(cls_here))
        rays.append(((int(tip_x), int(tip_y)), (end_x, end_y), float(theta)))

    return colours, rays

# =======================
# Frame post-processing
# =======================
def process_frame_post(frame_bgr, yolo_res):
    """Turn one YOLO segmentation result into triangle positions, colours and rays.

    Returns a 15-tuple, always in this order:
        (tri_best, tri_count, mask_count, to_cpu_ms, post_ms,
         masks_np, classes_np, rail_mask, green_mask,
         tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)

    Early exits (no masks, zero masks/classes, no rail mask) return the same
    shape with None / 0 / [] placeholders for whatever was not computed.
    """
    H, W = frame_bgr.shape[:2]
    if yolo_res.masks is None:
        return None, 0, 0, 0.0, 0.0, None, None, None, None, [], [], [], None, None, None

    # --- timed section 1: move masks and class ids to host memory ---
    t0 = time.perf_counter()
    masks_np = yolo_res.masks.data.detach().cpu().numpy()  # [n,h,w]
    # prefer per-mask class ids when the result exposes them, else fall back to box classes
    if hasattr(yolo_res.masks, "cls") and yolo_res.masks.cls is not None:
        classes_np = yolo_res.masks.cls.detach().cpu().numpy().astype(int)
    else:
        classes_np = yolo_res.boxes.cls.detach().cpu().numpy().astype(int)
    to_cpu_ms = (time.perf_counter() - t0) * 1000.0
    mask_count = int(masks_np.shape[0])
    if mask_count == 0 or classes_np.size == 0:
        return None, 0, mask_count, to_cpu_ms, 0.0, masks_np, classes_np, None, None, [], [], [], None, None, None

    # everything below needs at least one rail mask
    rail_sel = (classes_np == RAIL_ID)
    if not np.any(rail_sel):
        return None, 0, mask_count, to_cpu_ms, 0.0, masks_np, classes_np, None, None, [], [], [], None, None, None

    # --- timed section 2: rail analysis, triangle detection, probe classification ---
    t1 = time.perf_counter()
    rail_masks = masks_np[rail_sel].astype(bool, copy=False)
    # union of all rail masks, then upscale to frame resolution
    union = np.any(rail_masks, axis=0).astype(np.uint8, copy=False)
    rail_mask = cv2.resize(union, (W, H), interpolation=cv2.INTER_NEAREST).astype(bool, copy=False)

    # "green" = rail pixels matching the target colours; "red" = the remaining rail pixels
    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red   = np.logical_and(rail_mask, np.logical_not(green))
    score = red_vs_green_score(red, green)
    tri_positions, tri_best = purple_triangles(score, H)

    # Jake triangle by bearing
    lane_name = lane_name_from_point(JAKE_POINT)
    target_deg = LANE_TARGET_DEG[lane_name]
    xj, yj = JAKE_POINT
    # best_idx is -1 (not None) when no triangle qualifies; the range check below covers that
    best_idx, best_deg, _ = select_triangle_by_bearing(tri_positions, xj, yj, target_deg, min_dy=6)

    # x_ref for bending
    # mid lane: split left/right at Jake's triangle; other lanes: split at Jake's x
    if lane_name == "mid" and (best_idx is not None) and (0 <= best_idx < len(tri_positions)):
        x_ref = tri_positions[best_idx][0]
    else:
        x_ref = xj

    tri_colours, tri_rays = classify_triangles_at_sample_curved(
        tri_positions, masks_np, classes_np, H, W, JAKE_POINT, x_ref, best_idx, SAMPLE_UP_PX
    )
    post_ms = (time.perf_counter() - t1) * 1000.0

    return tri_best, len(tri_positions), mask_count, to_cpu_ms, post_ms, masks_np, classes_np, rail_mask, green, tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref

# =======================
# Viz helpers
# =======================
def _colour_for_point(x, y, masks_np, classes_np, H, W):
    """Return the display colour for frame pixel (x, y) based on the mask covering it.

    The point is scaled from frame size (W, H) to mask resolution; the first
    mask (in masks_np order) whose value there exceeds 0.5 supplies the class.
    No masks, no hit, or a SAFE_GREEN class all give COLOR_GREEN.
    """
    if masks_np is None or classes_np is None or masks_np.size == 0:
        return COLOR_GREEN

    mask_h, mask_w = masks_np.shape[1], masks_np.shape[2]
    scale_x = (mask_w - 1) / max(1, (W - 1))
    scale_y = (mask_h - 1) / max(1, (H - 1))
    col = _clampi(int(round(x * scale_x)), 0, mask_w - 1)
    row = _clampi(int(round(y * scale_y)), 0, mask_h - 1)

    cls_here = next(
        (int(c) for m, c in zip(masks_np, classes_np) if m[row, col] > 0.5),
        None,
    )

    if cls_here is None or cls_here in SAFE_GREEN:
        return COLOR_GREEN
    if cls_here == 0:
        return COLOR_PINK
    if cls_here in WARN_YELLOW:
        return COLOR_YELLOW
    return COLOR_RED

def draw_triangle(img, x, y, size=TRI_SIZE_PX, colour=COLOR_RED):
    """Draw a filled, black-outlined triangle on img (in place) with its tip at (x, y).

    The base lies int(size * 1.2) pixels below the tip and spans x ± size.
    """
    base_y = y + int(size * 1.2)
    vertices = np.array([[x, y], [x - size, base_y], [x + size, base_y]], np.int32)
    cv2.fillConvexPoly(img, vertices, colour)
    outline = vertices.reshape(-1, 1, 2)
    cv2.polylines(img, [outline], True, COLOR_BLACK, 1, cv2.LINE_AA)

def triangle_pts(x, y, size=TRI_SIZE_PX):
    """Return the three int32 vertices (tip, base-left, base-right) of the triangle at (x, y)."""
    base_y = y + int(size * 1.2)
    vertices = [[x, y], [x - size, base_y], [x + size, base_y]]
    return np.array(vertices, np.int32)

def render_overlays(frame_bgr, masks_np, classes_np, rail_mask, green_mask,
                    tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref):
    """Draw every debug layer on a copy of frame_bgr and return the copy.

    Draw order: class masks with labels, rail tint, green rail pixels,
    vertical scout lines, starburst lines from JAKE_POINT to each triangle,
    probe rays with their bend angle, the triangles, and finally the
    highlight + tag for Jake's triangle (best_idx).

    best_deg and x_ref are accepted so the call signature matches
    process_frame_post's output; they are not drawn.
    """
    out = frame_bgr.copy()
    H, W = out.shape[:2]
    alpha = 0.45
    font = cv2.FONT_HERSHEY_SIMPLEX

    def put_label(text, org, scale):
        # black 2px stroke under a white 1px stroke
        cv2.putText(out, text, org, font, scale, COLOR_BLACK, 2, cv2.LINE_AA)
        cv2.putText(out, text, org, font, scale, (255, 255, 255), 1, cv2.LINE_AA)

    if masks_np is not None and classes_np is not None and masks_np.size:
        for m, c in zip(masks_np, classes_np):
            # Always binarise: a float mask that already has the frame shape
            # must not be used as an index (NumPy rejects float index arrays).
            # 0.5 is the same threshold the sampling code uses.
            m_u8 = (m > 0.5).astype(np.uint8)
            if m_u8.shape != (H, W):
                m_u8 = cv2.resize(m_u8, (W, H), interpolation=cv2.INTER_NEAREST)
            m_full = m_u8.astype(bool)

            color = CLASS_COLOURS.get(int(c), (255, 255, 255))
            out[m_full] = (np.array(color, dtype=np.uint8) * alpha + out[m_full] * (1 - alpha)).astype(np.uint8)
            ys, xs = np.where(m_full)
            if xs.size:
                xc, yc = int(xs.mean()), int(ys.mean())
                label = LABELS.get(int(c), f"C{int(c)}")
                put_label(label, (max(5, xc - 40), max(20, yc)), 0.6)

    if rail_mask is not None:
        tint = out.copy()
        tint[rail_mask] = (0, 0, 255)
        out = cv2.addWeighted(tint, 0.30, out, 0.70, 0)
    if green_mask is not None:
        out[green_mask] = (0, 255, 0)

    # tiny scout lines: straight up from each tip, coloured per pixel by the mask underneath
    if tri_positions:
        for (x, y) in tri_positions:
            y_end = max(0, y - SAMPLE_UP_PX)
            for yy in range(y, y_end - 1, -1):
                out[yy, x] = _colour_for_point(x, yy, masks_np, classes_np, H, W)

    # starburst to Jake
    xj, yj = JAKE_POINT
    for idx, (xt, yt) in enumerate(tri_positions):
        xt = _clampi(int(xt), 0, W - 1)
        yt = _clampi(int(yt), 0, H - 1)
        deg_signed = signed_degrees_from_vertical(xt - xj, yt - yj)
        cv2.line(out, (xj, yj), (xt, yt),
                 COLOR_CYAN if idx == best_idx else COLOR_WHITE, 2, cv2.LINE_AA)
        put_label(f"{deg_signed:.1f}°", ((xj + xt) // 2, (yj + yt) // 2), 0.55)

    # curved sampling rays
    for (p0, p1, theta) in tri_rays:
        cv2.line(out, p0, p1, (255, 255, 255), 2, cv2.LINE_AA)
        put_label(f"{theta:+.1f}°", ((p0[0] + p1[0]) // 2, (p0[1] + p1[1]) // 2), 0.55)

    for (x, y), col in zip(tri_positions, tri_colours):
        draw_triangle(out, int(x), int(y), colour=col)

    lane_name = lane_name_from_point(JAKE_POINT)
    target_deg = LANE_TARGET_DEG[lane_name]
    # best_idx may be None or -1 when no triangle was selected
    if best_idx is not None and 0 <= best_idx < len(tri_positions):
        xt, yt = tri_positions[best_idx]
        pts = triangle_pts(int(xt), int(yt), size=TRI_SIZE_PX)
        cv2.polylines(out, [pts.reshape(-1, 1, 2)], True, COLOR_CYAN, 3, cv2.LINE_AA)
        tag = f"JAKE_TRI ({lane_name}: target {target_deg:.1f}°)"
        put_label(tag, (max(5, int(xt) - 70), max(20, int(yt) - 16)), 0.55)
    return out

# =======================
# Run
# =======================
def run_pipeline_with_prints_and_overlays():
    """Run detection + post-processing over every frame, print timings, save overlays.

    Frames are gathered from frames_dir (*.jpg / *.png), de-duplicated and
    sorted, optionally truncated to SHOW_FIRST_N, then processed one image per
    batch.  One timing line is printed per frame, and an overlay image is
    written to out_dir for frames whose 1-based index is <= RENDER_FIRST_N.

    Raises:
        FileNotFoundError: if frames_dir contains no matching images.
    """
    paths = (
        glob.glob(str(frames_dir/"frame_*.jpg")) +
        glob.glob(str(frames_dir/"frame_*.png")) +
        glob.glob(str(frames_dir/"*.jpg")) +
        glob.glob(str(frames_dir/"*.png"))
    )
    # the patterns overlap, so de-duplicate before sorting
    paths = sorted(set(paths))
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")
    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    N = len(paths)

    def load_batch(batch_paths):
        """Read images concurrently; return (paths, images, read_ms) for readable files only."""
        imgs, read_ms = [None]*len(batch_paths), [0.0]*len(batch_paths)
        with ThreadPoolExecutor(max_workers=THREADS_IO) as ex:
            fut2i = {ex.submit(load_image_with_time, p): i for i, p in enumerate(batch_paths)}
            for fut in as_completed(fut2i):
                i = fut2i[fut]
                im, r = fut.result()
                imgs[i] = im; read_ms[i] = r
        ok = [(p, im, rm) for p, im, rm in zip(batch_paths, imgs, read_ms) if im is not None]
        if not ok: return [], [], []
        b_paths, b_imgs, b_read = zip(*ok)
        return list(b_paths), list(b_imgs), list(b_read)

    idx_global = 0
    for requested_paths in chunked(paths, 1):
        # Keep the requested list separate from what actually loaded: the
        # global frame counter must advance by the number of paths consumed,
        # even when an image could not be read.
        batch_paths, imgs_bgr, read_ms_list = load_batch(requested_paths)
        if not imgs_bgr:
            idx_global += len(requested_paths)
            continue

        t0_inf = time.perf_counter()
        res_list = model.predict(
            imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
            conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET, batch=1
        )
        # Best-effort device sync so the inference timing covers queued work;
        # a failure here only affects the reported number, so it is ignored.
        try:
            if device == 0 and torch.cuda.is_available(): torch.cuda.synchronize()
            elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                torch.mps.synchronize()
        except Exception: pass
        infer_ms_share = (time.perf_counter() - t0_inf) * 1000.0

        for j, (img, yres, read_ms) in enumerate(zip(imgs_bgr, res_list, read_ms_list)):
            (tri_best_xy, tri_count, mask_count, to_cpu_ms, post_ms,
             masks_np, classes_np, rail_mask, green_mask, tri_positions, tri_colours,
             tri_rays, best_idx, best_deg, x_ref) = process_frame_post(img, yres)

            proc_ms = infer_ms_share + to_cpu_ms + post_ms
            fname = os.path.basename(batch_paths[j])
            frame_idx = idx_global + j + 1

            print(f"[{frame_idx}/{N}] {fname}  "
                  f"read {read_ms:.1f} | infer {infer_ms_share:.1f} | "
                  f"to_cpu {to_cpu_ms:.1f} | post {post_ms:.1f} | "
                  f"masks {mask_count} | triangles {tri_count} "
                  f"=> proc {proc_ms:.1f} ms")

            if frame_idx <= RENDER_FIRST_N:
                overlay = render_overlays(img, masks_np, classes_np, rail_mask, green_mask,
                                          tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)
                out_path = out_dir / f"overlay_{frame_idx:04d}_{fname}"
                cv2.imwrite(str(out_path), overlay)

        idx_global += len(requested_paths)

# =======================
# Entry
# =======================
# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    run_pipeline_with_prints_and_overlays()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs
[1/312] frame_00000.png  read 42.6 | infer 111.3 | to_cpu 1.2 | post 0.0 | masks 1 | triangles 0 => proc 112.6 ms
[2/312] frame_00001.png  read 43.0 | infer 50.9 | to_cpu 0.9 | post 109.6 | masks 3 | triangles 1 => proc 161.4 ms
[3/312] frame_00002.png  read 42.9 | infer 56.0 | to_cpu 0.8 | post 148.7 | masks 3 | triangles 3 => proc 205.5 ms
[4/312] frame_00003.png  read 50.3 | infer 66.2 | to_cpu 1.2 | post 133.2 | masks 3 | triangles 2 => proc 200.5 ms
[5/312] frame_00004.png  read 47.3 | infer 47.0 | to_cpu 0.8 | post 187.5 | masks 4 | triangles 1 => proc 235.3 ms
[6/312] frame_00005.png  read 68.6 | infer 232.2 | to_cpu 2.4 | post 343.4 | masks 5 | triangles 1 => proc 578.0 ms
[7/312] frame_00006.png  read 84.1 | infer 100.3 | to_cpu 1.3 | post 339.9 | masks 5 | triangles 2 => proc 441.6 ms
[8/312] frame_00007.png  read 47.2 | infer 54.5 | to_cpu 0.9 | post 198.5 | masks 4 | triangles 2 => proc 

In [8]:
#Upgraded order logic and mask scanning 

In [12]:
#!/usr/bin/env python3
# Ultra-fast overlays + lane-aware curved sampling (optimized postproc)
# Now: walk the bent probe ray in RAY_STEP_PX steps (first step with a hit wins; per step: RED→YELLOW→BOOTS, else GREEN)

import os, glob, time, math
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config
# =======================
# Input/output locations, all rooted at the current user's home directory.
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"

# SAVE HERE (created at import time if missing)
out_dir    = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "out_overlays7"
out_dir.mkdir(parents=True, exist_ok=True)

# Detection settings: RAIL_ID is the class id compared against the per-mask
# class array; the others are passed straight to model.predict().
RAIL_ID    = 9
IMG_SIZE   = 512
CONF, IOU  = 0.30, 0.45
MAX_DET    = 30

# Color/region filter
# Rail pixels within TOLERANCE (Euclidean distance in colour space) of any
# target colour count as a hit; hit regions smaller/shorter than the minimums
# below are discarded.
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]
TOLERANCE          = 20.0
MIN_REGION_SIZE    = 30
MIN_REGION_HEIGHT  = 150

# Heat/triangle
# Box-blur kernel size for the red-vs-green score, the score threshold, the
# fractions of frame height ignored at top/bottom, and the minimum area /
# minimum share of all thresholded pixels a region needs to yield a triangle.
HEAT_BLUR_KSIZE     = 51
RED_SCORE_THRESH    = 220
EXCLUDE_TOP_FRAC    = 0.40
EXCLUDE_BOTTOM_FRAC = 0.15
MIN_DARK_RED_AREA   = 1200
MIN_DARK_FRACTION   = 0.15
TRI_SIZE_PX         = 18

# Sampling ray length
SAMPLE_UP_PX        = 180
RAY_STEP_PX         = 20   # << distance in px between consecutive samples along the ray

# ===== Bend degrees (tune here) =====
# With the (sin, -cos) ray table, a negative angle tilts the ray toward -x
# (left in the image) and a positive angle toward +x (right).
BEND_LEFT_STATE_RIGHT_DEG  = -20.0  # N1
BEND_MID_STATE_RIGHT_DEG   = -20.0  # N2
BEND_MID_STATE_LEFT_DEG    = +20.0  # N3
BEND_RIGHT_STATE_LEFT_DEG  = +20.0  # N4

# Colours (BGR)
COLOR_GREEN  = (0, 255, 0)
COLOR_PINK   = (203, 192, 255)
COLOR_YELLOW = (0, 255, 255)
COLOR_RED    = (0, 0, 255)
COLOR_WHITE  = (255, 255, 255)
COLOR_CYAN   = (255, 255, 0)
COLOR_BLACK  = (0, 0, 0)

# Runtime
# THREADS_IO: worker threads for image reading (half the CPU count, at least 2).
# SHOW_FIRST_N: if not None, only the first N sorted frame paths are processed.
# RENDER_FIRST_N: presumably the number of leading frames that get an overlay
# image written — confirm against the run loop.
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)
SHOW_FIRST_N        = None
RENDER_FIRST_N      = 300

# =======================
# Jake lane points
# =======================
# (x, y) pixel coordinates of Jake's anchor point for each lane.
LANE_LEFT   = (300, 1340)
LANE_MID    = (490, 1340)
LANE_RIGHT  = (680, 1340)
JAKE_POINT  = LANE_LEFT  # pick: LANE_LEFT / LANE_MID / LANE_RIGHT

# Target bearing per lane, in the signed_degrees_from_vertical convention
# (negative = right of vertical, positive = left of vertical).
LANE_TARGET_DEG = {"left": -10.7, "mid": +1.5, "right": +15.0}

def lane_name_from_point(p):
    """Map a lane anchor point to "left", "mid" or "right"; unknown points map to "mid"."""
    known_lanes = (
        (LANE_LEFT, "left"),
        (LANE_MID, "mid"),
        (LANE_RIGHT, "right"),
    )
    for anchor, name in known_lanes:
        if p == anchor:
            return name
    return "mid"

# =======================
# System/Backends
# =======================
# Enable OpenCV's optimised code paths and let it use all but one CPU core.
cv2.setUseOptimized(True)
# best-effort: keep OpenCV's default thread count if this call fails
try: cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception: pass

# Pick the inference device in order of preference: CUDA (with half
# precision), then Apple MPS, then CPU.  `device` and `half` are passed to
# model.predict() below.
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    # best-effort: ignore if this torch build does not support the setting
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Load the segmentation model from `weights` at import time.
model = YOLO(weights)
# best-effort layer fusion; continue unfused if it is unavailable
try: model.fuse()
except Exception: pass

# One throw-away prediction on a black frame, with the same settings as the
# real run, so first-call setup cost is presumably paid here rather than on
# the first real frame.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE, device=device,
                  conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed
# =======================
# Target colours reordered from RGB to BGR (to match BGR frames) as float32,
# and the squared tolerance so distance checks can skip the square root.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
TOL2            = TOLERANCE * TOLERANCE

# Class buckets for probe classification
# Class ids listed in none of these sets (9 and 10 in LABELS) match no bucket.
DANGER_RED   = {1, 6, 7, 11}
WARN_YELLOW  = {2, 3, 4, 5, 8}
BOOTS_PINK   = {0}

# Per-class overlay colour, looked up by class id when masks are drawn.
CLASS_COLOURS = {
    0:(255,255,0),1:(192,192,192),2:(0,128,255),3:(0,255,0),
    4:(255,0,255),5:(0,255,255),6:(255,128,0),7:(128,0,255),
    8:(0,0,128),9:(0,0,255),10:(128,128,0),11:(255,255,102)
}
# Human-readable name for each class id, used for the mask labels.
LABELS = {
    0:"BOOTS",1:"GREYTRAIN",2:"HIGHBARRIER1",3:"JUMP",4:"LOWBARRIER1",
    5:"LOWBARRIER2",6:"ORANGETRAIN",7:"PILLAR",8:"RAMP",9:"RAILS",
    10:"SIDEWALK",11:"YELLOWTRAIN"
}

# ====== tiny helpers ======
def _clampi(v, lo, hi):
    return lo if v < lo else (hi if v > hi else v)

def load_image_with_time(path: str):
    """Read an image as 3-channel colour and return (image, elapsed_ms).

    The image is whatever cv2.imread returns, which is None when the file
    cannot be read.
    """
    started = time.perf_counter()
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    return image, elapsed_ms

def chunked(iterable, n):
    """Yield consecutive slices of at most n items from a sliceable sequence."""
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        yield iterable[start:stop]

# =======================
# Fast rails green finder
# =======================
def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Return a full-frame bool mask of rail pixels whose colour matches a target.

    Work is limited to the bounding rectangle of rail_mask.  A pixel counts
    when it lies on the rail mask and its squared distance to any row of
    TARGETS_BGR_F32 is <= TOL2.  Connected regions (8-connectivity) are kept
    only if their area >= MIN_REGION_SIZE and their bounding-box height
    >= MIN_REGION_HEIGHT.  rail_mask is expected to be a boolean array.
    """
    H, W = rail_mask.shape
    result = np.zeros((H, W), dtype=bool)
    if not rail_mask.any():
        return result

    # reinterpret the bool mask as 0/255 uint8 for OpenCV
    rail_u8 = rail_mask.view(dtype=np.uint8) * 255
    x, y, w, h = cv2.boundingRect(rail_u8)
    roi = (slice(y, y + h), slice(x, x + w))

    pixels = img_bgr[roi].astype(np.float32, copy=False)
    # broadcast (h, w, 1, 3) against (1, 1, n_targets, 3)
    delta = pixels[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    colour_hit = ((delta * delta).sum(-1) <= TOL2).any(-1)

    combined = np.logical_and(colour_hit, rail_u8[roi].astype(bool))

    n, lbls, stats, _ = cv2.connectedComponentsWithStats(combined.astype(np.uint8), 8)
    if n <= 1:
        return result

    # row 0 of stats is the background component, hence the +1 label offset
    areas = stats[1:, cv2.CC_STAT_AREA]
    heights = stats[1:, cv2.CC_STAT_HEIGHT]
    keep = np.flatnonzero((areas >= MIN_REGION_SIZE) & (heights >= MIN_REGION_HEIGHT)) + 1

    result[roi] = np.isin(lbls, keep)
    return result

def red_vs_green_score(red_mask, green_mask):
    """Return a uint8 map where 255 means locally all-red and 0 locally all-green.

    Both masks are box-blurred with a HEAT_BLUR_KSIZE square kernel; the
    difference (red - green) is normalised by its largest absolute value so
    that zero difference lands at the midpoint of the 0..255 range.
    """
    ksize = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    red_density = cv2.blur(red_mask.astype(np.float32, copy=False), ksize)
    green_density = cv2.blur(green_mask.astype(np.float32, copy=False), ksize)

    balance = red_density - green_density
    # small epsilon avoids dividing by zero when both maps are identical
    peak = float(np.max(np.abs(balance))) + 1e-6
    centred = (balance / (2.0 * peak) + 0.5)
    return np.clip(centred * 255.0, 0, 255.0).astype(np.uint8, copy=False)

def purple_triangles(score, H):
    """Locate triangle tips at the top of each large high-score region.

    The score map is thresholded at RED_SCORE_THRESH, the top/bottom bands
    (EXCLUDE_TOP_FRAC / EXCLUDE_BOTTOM_FRAC of H) are cleared, and a 5x9
    morphological opening is applied.  Each remaining connected region whose
    area is at least MIN_DARK_RED_AREA and at least MIN_DARK_FRACTION of all
    remaining pixels contributes one tip: the mean x of its topmost row.

    Returns:
        (tips, best): list of (x, y) tips and the tip with the smallest y,
        or ([], None) when nothing qualifies.
    """
    dark = (score >= RED_SCORE_THRESH).astype(np.uint8, copy=False)

    top_rows = int(H * EXCLUDE_TOP_FRAC)
    bottom_rows = int(H * EXCLUDE_BOTTOM_FRAC)
    if top_rows:
        dark[:top_rows, :] = 0
    if bottom_rows:  # guard: dark[-0:] would select the whole array
        dark[-bottom_rows:, :] = 0

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9))
    dark = cv2.morphologyEx(dark, cv2.MORPH_OPEN, kernel, iterations=1)

    dark_total = int(dark.sum())
    if dark_total == 0:
        return [], None

    min_share = int(np.ceil(MIN_DARK_FRACTION * dark_total))
    n_lbl, lbls, stats, _ = cv2.connectedComponentsWithStats(dark, 8)
    if n_lbl <= 1:
        return [], None

    tips = []
    for lbl in range(1, n_lbl):  # label 0 is the background
        area = stats[lbl, cv2.CC_STAT_AREA]
        if not (area >= MIN_DARK_RED_AREA and area >= min_share):
            continue
        rows, cols = np.where(lbls == lbl)
        if rows.size == 0:
            continue
        top_row = rows.min()
        tip_x = int(cols[rows == top_row].mean())
        tips.append((tip_x, int(top_row)))

    if not tips:
        return [], None
    highest = min(tips, key=lambda tip: tip[1])
    return tips, highest

# ===== Bearing-based Jake triangle selection =====
def signed_degrees_from_vertical(dx, dy):
    """Return the signed angle (degrees) between the offset (dx, dy) and "up".

    "Up" is the -y direction.  Offsets pointing right (dx > 0) give a
    negative angle, offsets pointing left give a positive one.  A zero
    offset returns 0.0.
    """
    if not (dx or dy):
        return 0.0
    clockwise_rad = math.atan2(dx, -dy)
    return -math.degrees(clockwise_rad)

def select_triangle_by_bearing(tri_positions, jx, jy, target_deg, min_dy=6):
    """Pick the triangle whose bearing from (jx, jy) is closest to target_deg.

    Triangles that are not more than min_dy pixels above (jx, jy) are ignored.

    Returns:
        (index, bearing_deg, abs_error) of the winner, or (-1, None, None)
        when no triangle qualifies.  Equal errors resolve to the earliest
        triangle in tri_positions.
    """
    candidates = []
    for i, (xt, yt) in enumerate(tri_positions):
        rise = yt - jy
        if rise >= -min_dy:  # not far enough above the reference point
            continue
        bearing = signed_degrees_from_vertical(xt - jx, rise)
        candidates.append((i, bearing, abs(bearing - target_deg)))

    if not candidates:
        return -1, None, None
    # min() keeps the first of several equal errors, like a strict "<" scan
    return min(candidates, key=lambda cand: cand[2])

# ===== Lane-aware curved sampling (precompute sin/cos) =====
def _precompute_trig():
    """Build the {angle_deg: (dx, dy)} unit-ray table for 0° and the four bend angles.

    dy is -cos(angle), so a 0° ray points straight up in image coordinates
    (up = -y); dx is sin(angle).  Equal angles collapse to a single entry.
    """
    bend_angles = {
        0.0,
        BEND_LEFT_STATE_RIGHT_DEG,
        BEND_MID_STATE_RIGHT_DEG,
        BEND_MID_STATE_LEFT_DEG,
        BEND_RIGHT_STATE_LEFT_DEG,
    }
    lookup = {}
    for deg in sorted(bend_angles):
        rad = math.radians(deg)
        lookup[deg] = (math.sin(rad), -math.cos(rad))
    return lookup


# Computed once at import time; keyed by the exact float values of the bend constants.
TRIG_TABLE = _precompute_trig()

def pick_bend_angle(jake_point, xt, x_ref, idx, best_idx):
    """Choose the probe-ray bend angle (degrees) for the triangle at index idx.

    The triangle selected as Jake's (idx == best_idx) always gets 0.0.
    Otherwise the angle depends on the lane in jake_point and on whether the
    triangle's x (xt) lies left or right of x_ref:
      * LANE_LEFT:  right of x_ref -> BEND_LEFT_STATE_RIGHT_DEG, else 0.0
      * LANE_RIGHT: left of x_ref  -> BEND_RIGHT_STATE_LEFT_DEG, else 0.0
      * otherwise:  right -> BEND_MID_STATE_RIGHT_DEG,
                    left  -> BEND_MID_STATE_LEFT_DEG, equal -> 0.0
    """
    if idx == best_idx:
        return 0.0

    is_right = xt > x_ref
    is_left = xt < x_ref

    if jake_point == LANE_LEFT:
        return BEND_LEFT_STATE_RIGHT_DEG if is_right else 0.0
    if jake_point == LANE_RIGHT:
        return BEND_RIGHT_STATE_LEFT_DEG if is_left else 0.0

    # any other lane point is treated as the middle lane
    if is_right:
        return BEND_MID_STATE_RIGHT_DEG
    if is_left:
        return BEND_MID_STATE_LEFT_DEG
    return 0.0

# --------- NEW: walk-the-ray classifier (RAY_STEP_PX steps, first-hit wins) ----------
def classify_triangles_at_sample_curved(
    tri_positions, masks_np, classes_np, H, W,
    jake_point, x_ref, best_idx, sample_px=SAMPLE_UP_PX, step_px=RAY_STEP_PX
):
    """Colour each triangle by the first obstacle met while walking its bent probe ray.

    For every (x, y) tip the ray direction comes from pick_bend_angle /
    TRIG_TABLE.  The ray is sampled every step_px pixels, starting one step
    from the tip, for sample_px // step_px steps (at least one).  At each
    sample the masks are checked in priority order DANGER_RED, WARN_YELLOW,
    BOOTS_PINK; the first sample with any hit (mask value > 0.5) decides the
    colour.  A ray with no hit is COLOR_GREEN.

    Args:
        tri_positions: sequence of (x, y) triangle tips in frame pixels.
        masks_np: mask stack indexed as [mask, row, col], or None.
        classes_np: class id per mask, or None.
        H, W: frame height and width in pixels.
        jake_point, x_ref, best_idx: forwarded to pick_bend_angle.
        sample_px: ray length in frame pixels.
        step_px: sampling stride in frame pixels; values below 1 are treated as 1.

    Returns:
        (colours, rays): one colour per triangle, and one
        ((x0, y0), (x1, y1), theta_deg) tuple per triangle where (x1, y1) is
        the full-length ray end (clamped to the frame) for drawing.
        Both are empty when masks/classes are None or there are no triangles.
    """
    if masks_np is None or classes_np is None or len(tri_positions) == 0:
        return [], []

    mh, mw = masks_np.shape[1], masks_np.shape[2]
    sx = (mw - 1) / max(1, (W - 1))
    sy = (mh - 1) / max(1, (H - 1))

    # (colour, mask indices) in priority order, built once per frame
    buckets = tuple(
        (colour, [i for i, c in enumerate(classes_np) if int(c) in class_ids])
        for colour, class_ids in (
            (COLOR_RED, DANGER_RED),
            (COLOR_YELLOW, WARN_YELLOW),
            (COLOR_PINK, BOOTS_PINK),
        )
    )

    # Normalise the stride once: a zero/negative step would otherwise keep
    # re-sampling the tip, and float inputs would break range().
    step = max(1, int(step_px))
    n_steps = max(1, int(sample_px) // step)

    colours, rays = [], []
    for idx, (x0, y0) in enumerate(tri_positions):
        theta = pick_bend_angle(jake_point, x0, x_ref, idx, best_idx)
        dx1, dy1 = TRIG_TABLE[theta]

        hit_colour = COLOR_GREEN  # default unless something is hit

        # march along the ray (tip -> outward), first-hit wins
        for k in range(1, n_steps + 1):
            t = k * step
            xs = _clampi(int(round(x0 + dx1 * t)), 0, W - 1)
            ys = _clampi(int(round(y0 + dy1 * t)), 0, H - 1)
            mx = _clampi(int(round(xs * sx)), 0, mw - 1)
            my = _clampi(int(round(ys * sy)), 0, mh - 1)

            step_hit = next(
                (colour for colour, indices in buckets
                 if any(masks_np[i][my, mx] > 0.5 for i in indices)),
                None,
            )
            if step_hit is not None:
                hit_colour = step_hit
                break

        # end point for viz = full length along bend
        x1 = _clampi(int(round(x0 + dx1 * sample_px)), 0, W - 1)
        y1 = _clampi(int(round(y0 + dy1 * sample_px)), 0, H - 1)

        colours.append(hit_colour)
        rays.append(((int(x0), int(y0)), (x1, y1), float(theta)))

    return colours, rays
# ------------------------------------------------------------------------------

# =======================
# Frame post-processing
# =======================
def process_frame_post(frame_bgr, yolo_res):
    """Turn one YOLO segmentation result into triangle positions, colours and rays.

    Returns a 15-tuple, always in this order:
        (tri_best, tri_count, mask_count, to_cpu_ms, post_ms,
         masks_np, classes_np, rail_mask, green_mask,
         tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)

    Early exits (no masks, zero masks/classes, no rail mask) return the same
    shape with None / 0 / [] placeholders for whatever was not computed.
    """
    H, W = frame_bgr.shape[:2]
    if yolo_res.masks is None:
        return None, 0, 0, 0.0, 0.0, None, None, None, None, [], [], [], None, None, None

    # --- timed section 1: move masks and class ids to host memory ---
    t0 = time.perf_counter()
    masks_np = yolo_res.masks.data.detach().cpu().numpy()  # [n,h,w]
    # prefer per-mask class ids when the result exposes them, else fall back to box classes
    if hasattr(yolo_res.masks, "cls") and yolo_res.masks.cls is not None:
        classes_np = yolo_res.masks.cls.detach().cpu().numpy().astype(int)
    else:
        classes_np = yolo_res.boxes.cls.detach().cpu().numpy().astype(int)
    to_cpu_ms = (time.perf_counter() - t0) * 1000.0
    mask_count = int(masks_np.shape[0])
    if mask_count == 0 or classes_np.size == 0:
        return None, 0, mask_count, to_cpu_ms, 0.0, masks_np, classes_np, None, None, [], [], [], None, None, None

    # everything below needs at least one rail mask
    rail_sel = (classes_np == RAIL_ID)
    if not np.any(rail_sel):
        return None, 0, mask_count, to_cpu_ms, 0.0, masks_np, classes_np, None, None, [], [], [], None, None, None

    # --- timed section 2: rail analysis, triangle detection, probe classification ---
    t1 = time.perf_counter()
    rail_masks = masks_np[rail_sel].astype(bool, copy=False)
    # union of all rail masks, then upscale to frame resolution
    union = np.any(rail_masks, axis=0).astype(np.uint8, copy=False)
    rail_mask = cv2.resize(union, (W, H), interpolation=cv2.INTER_NEAREST).astype(bool, copy=False)

    # "green" = rail pixels matching the target colours; "red" = the remaining rail pixels
    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red   = np.logical_and(rail_mask, np.logical_not(green))
    score = red_vs_green_score(red, green)
    tri_positions, tri_best = purple_triangles(score, H)

    # Jake triangle by bearing
    lane_name = lane_name_from_point(JAKE_POINT)
    target_deg = LANE_TARGET_DEG[lane_name]
    xj, yj = JAKE_POINT
    # best_idx is -1 (not None) when no triangle qualifies; the range check below covers that
    best_idx, best_deg, _ = select_triangle_by_bearing(tri_positions, xj, yj, target_deg, min_dy=6)

    # x_ref for bending
    # mid lane: split left/right at Jake's triangle; other lanes: split at Jake's x
    if lane_name == "mid" and (best_idx is not None) and (0 <= best_idx < len(tri_positions)):
        x_ref = tri_positions[best_idx][0]
    else:
        x_ref = xj

    # walk each probe ray in RAY_STEP_PX steps up to SAMPLE_UP_PX
    tri_colours, tri_rays = classify_triangles_at_sample_curved(
        tri_positions, masks_np, classes_np, H, W, JAKE_POINT, x_ref, best_idx,
        SAMPLE_UP_PX, RAY_STEP_PX
    )
    post_ms = (time.perf_counter() - t1) * 1000.0

    return tri_best, len(tri_positions), mask_count, to_cpu_ms, post_ms, masks_np, classes_np, rail_mask, green, tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref

# =======================
# Viz helpers
# =======================
def _colour_for_point(x, y, masks_np, classes_np, H, W):
    """Return the display colour for frame pixel (x, y) based on the mask covering it.

    The point is scaled from frame size (W, H) to mask resolution; the first
    mask (in masks_np order) whose value there exceeds 0.5 supplies the class.
    DANGER_RED -> red, WARN_YELLOW -> yellow, BOOTS_PINK -> pink; no masks,
    no hit, or any other class gives COLOR_GREEN.
    """
    if masks_np is None or classes_np is None or masks_np.size == 0:
        return COLOR_GREEN

    mask_h, mask_w = masks_np.shape[1], masks_np.shape[2]
    scale_x = (mask_w - 1) / max(1, (W - 1))
    scale_y = (mask_h - 1) / max(1, (H - 1))
    col = _clampi(int(round(x * scale_x)), 0, mask_w - 1)
    row = _clampi(int(round(y * scale_y)), 0, mask_h - 1)

    cls_here = next(
        (int(c) for m, c in zip(masks_np, classes_np) if m[row, col] > 0.5),
        None,
    )

    palette = (
        (DANGER_RED, COLOR_RED),
        (WARN_YELLOW, COLOR_YELLOW),
        (BOOTS_PINK, COLOR_PINK),
    )
    for bucket, colour in palette:
        if cls_here in bucket:
            return colour
    return COLOR_GREEN

def draw_triangle(img, x, y, size=TRI_SIZE_PX, colour=COLOR_RED):
    """Draw a filled, black-outlined triangle on img (in place) with its tip at (x, y).

    The base lies int(size * 1.2) pixels below the tip and spans x ± size.
    """
    base_y = y + int(size * 1.2)
    vertices = np.array([[x, y], [x - size, base_y], [x + size, base_y]], np.int32)
    cv2.fillConvexPoly(img, vertices, colour)
    outline = vertices.reshape(-1, 1, 2)
    cv2.polylines(img, [outline], True, COLOR_BLACK, 1, cv2.LINE_AA)

def triangle_pts(x, y, size=TRI_SIZE_PX):
    """Return the three int32 vertices (tip, base-left, base-right) of the triangle at (x, y)."""
    base_y = y + int(size * 1.2)
    vertices = [[x, y], [x - size, base_y], [x + size, base_y]]
    return np.array(vertices, np.int32)

def render_overlays(frame_bgr, masks_np, classes_np, rail_mask, green_mask,
                    tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref):
    """Draw every debug layer on a copy of frame_bgr and return the copy.

    Draw order: class masks with labels, rail tint, green rail pixels,
    vertical scout lines, starburst lines from JAKE_POINT to each triangle,
    probe rays with their bend angle, the triangles, and finally the
    highlight + tag for Jake's triangle (best_idx).

    best_deg and x_ref are accepted so the call signature matches
    process_frame_post's output; they are not drawn.
    """
    out = frame_bgr.copy()
    H, W = out.shape[:2]
    alpha = 0.45
    font = cv2.FONT_HERSHEY_SIMPLEX

    def put_label(text, org, scale):
        # black 2px stroke under a white 1px stroke
        cv2.putText(out, text, org, font, scale, COLOR_BLACK, 2, cv2.LINE_AA)
        cv2.putText(out, text, org, font, scale, (255, 255, 255), 1, cv2.LINE_AA)

    if masks_np is not None and classes_np is not None and masks_np.size:
        for m, c in zip(masks_np, classes_np):
            # Always binarise: a float mask that already has the frame shape
            # must not be used as an index (NumPy rejects float index arrays).
            # 0.5 is the same threshold the sampling code uses.
            m_u8 = (m > 0.5).astype(np.uint8)
            if m_u8.shape != (H, W):
                m_u8 = cv2.resize(m_u8, (W, H), interpolation=cv2.INTER_NEAREST)
            m_full = m_u8.astype(bool)

            color = CLASS_COLOURS.get(int(c), (255, 255, 255))
            out[m_full] = (np.array(color, dtype=np.uint8) * alpha + out[m_full] * (1 - alpha)).astype(np.uint8)
            ys, xs = np.where(m_full)
            if xs.size:
                xc, yc = int(xs.mean()), int(ys.mean())
                label = LABELS.get(int(c), f"C{int(c)}")
                put_label(label, (max(5, xc - 40), max(20, yc)), 0.6)

    if rail_mask is not None:
        tint = out.copy()
        tint[rail_mask] = (0, 0, 255)
        out = cv2.addWeighted(tint, 0.30, out, 0.70, 0)
    if green_mask is not None:
        out[green_mask] = (0, 255, 0)

    # tiny scout lines (viz only): straight up from each tip, coloured per pixel
    if tri_positions:
        for (x, y) in tri_positions:
            y_end = max(0, y - SAMPLE_UP_PX)
            for yy in range(y, y_end - 1, -1):
                out[yy, x] = _colour_for_point(x, yy, masks_np, classes_np, H, W)

    # starburst to Jake
    xj, yj = JAKE_POINT
    for idx, (xt, yt) in enumerate(tri_positions):
        xt = _clampi(int(xt), 0, W - 1)
        yt = _clampi(int(yt), 0, H - 1)
        deg_signed = signed_degrees_from_vertical(xt - xj, yt - yj)
        cv2.line(out, (xj, yj), (xt, yt),
                 COLOR_CYAN if idx == best_idx else COLOR_WHITE, 2, cv2.LINE_AA)
        put_label(f"{deg_signed:.1f}°", ((xj + xt) // 2, (yj + yt) // 2), 0.55)

    # curved sampling rays (viz)
    for (p0, p1, theta) in tri_rays:
        cv2.line(out, p0, p1, (255, 255, 255), 2, cv2.LINE_AA)
        put_label(f"{theta:+.1f}°", ((p0[0] + p1[0]) // 2, (p0[1] + p1[1]) // 2), 0.55)

    for (x, y), col in zip(tri_positions, tri_colours):
        draw_triangle(out, int(x), int(y), colour=col)

    lane_name = lane_name_from_point(JAKE_POINT)
    target_deg = LANE_TARGET_DEG[lane_name]
    # best_idx may be None or -1 when no triangle was selected
    if best_idx is not None and 0 <= best_idx < len(tri_positions):
        xt, yt = tri_positions[best_idx]
        pts = triangle_pts(int(xt), int(yt), size=TRI_SIZE_PX)
        cv2.polylines(out, [pts.reshape(-1, 1, 2)], True, COLOR_CYAN, 3, cv2.LINE_AA)
        tag = f"JAKE_TRI ({lane_name}: target {target_deg:.1f}°)"
        put_label(tag, (max(5, int(xt) - 70), max(20, int(yt) - 16)), 0.55)
    return out

# =======================
# Run
# =======================
def run_pipeline_with_prints_and_overlays():
    """Run the full per-frame pipeline over every image in ``frames_dir``.

    For each frame this loads the image (threaded I/O), runs segmentation
    inference, post-processes the result with ``process_frame_post``, prints
    one timing/count line, and - for the first ``RENDER_FIRST_N`` frames -
    writes an overlay image into ``out_dir``.

    Frame numbers in the log and in overlay filenames are 1-based positions
    in the sorted path list, so an unreadable image leaves a gap instead of
    shifting the numbering of every later frame.

    Raises:
        FileNotFoundError: if ``frames_dir`` contains no ``.jpg``/``.png`` files.
    """
    # "frame_*.ext" is a subset of "*.ext", so two globs cover every file the
    # original four patterns matched; the set union removes any duplicates.
    paths = sorted(
        set(glob.glob(str(frames_dir / "*.jpg")))
        | set(glob.glob(str(frames_dir / "*.png")))
    )
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")
    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    N = len(paths)

    def load_batch(batch_paths):
        """Read ``batch_paths`` concurrently; return (paths, images, read_ms) for readable ones."""
        imgs, read_ms = [None] * len(batch_paths), [0.0] * len(batch_paths)
        with ThreadPoolExecutor(max_workers=THREADS_IO) as ex:
            fut2i = {ex.submit(load_image_with_time, p): i for i, p in enumerate(batch_paths)}
            for fut in as_completed(fut2i):
                i = fut2i[fut]
                # Futures complete in arbitrary order; store by original slot.
                imgs[i], read_ms[i] = fut.result()
        ok = [(p, im, rm) for p, im, rm in zip(batch_paths, imgs, read_ms) if im is not None]
        if not ok:
            return [], [], []
        b_paths, b_imgs, b_read = zip(*ok)
        return list(b_paths), list(b_imgs), list(b_read)

    idx_global = 0
    for requested_paths in chunked(paths, 1):
        # Remember how many frames were asked for *before* load_batch filters
        # out unreadable ones; the global counter must advance by this amount
        # even when nothing could be loaded.
        n_requested = len(requested_paths)
        batch_paths, imgs_bgr, read_ms_list = load_batch(requested_paths)
        if not imgs_bgr:
            idx_global += n_requested
            continue

        t0_inf = time.perf_counter()
        res_list = model.predict(
            imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
            conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET, batch=1
        )
        # Best-effort device sync so the measured time includes queued work.
        # A failure here only affects timing accuracy, so it is ignored.
        try:
            if device == 0 and torch.cuda.is_available():
                torch.cuda.synchronize()
            elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                torch.mps.synchronize()
        except Exception:
            pass
        infer_ms_share = (time.perf_counter() - t0_inf) * 1000.0

        for j, (img, yres, read_ms) in enumerate(zip(imgs_bgr, res_list, read_ms_list)):
            (tri_best_xy, tri_count, mask_count, to_cpu_ms, post_ms,
             masks_np, classes_np, rail_mask, green_mask, tri_positions, tri_colours,
             tri_rays, best_idx, best_deg, x_ref) = process_frame_post(img, yres)

            proc_ms = infer_ms_share + to_cpu_ms + post_ms
            fname = os.path.basename(batch_paths[j])
            frame_idx = idx_global + j + 1

            print(f"[{frame_idx}/{N}] {fname}  "
                  f"read {read_ms:.1f} | infer {infer_ms_share:.1f} | "
                  f"to_cpu {to_cpu_ms:.1f} | post {post_ms:.1f} | "
                  f"masks {mask_count} | triangles {tri_count} "
                  f"=> proc {proc_ms:.1f} ms")

            if frame_idx <= RENDER_FIRST_N:
                overlay = render_overlays(img, masks_np, classes_np, rail_mask, green_mask,
                                          tri_positions, tri_colours, tri_rays, best_idx, best_deg, x_ref)
                out_path = out_dir / f"overlay_{frame_idx:04d}_{fname}"
                # cv2.imwrite signals failure through its return value, not an
                # exception, so surface it instead of silently losing the overlay.
                if not cv2.imwrite(str(out_path), overlay):
                    print(f"[warn] failed to write overlay: {out_path}", file=sys.stderr)

        idx_global += n_requested

# =======================
# Entry
# =======================
if __name__ == "__main__":
    # Start the full pipeline only when this file is executed directly,
    # not when it is imported as a module.
    run_pipeline_with_prints_and_overlays()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs
[1/312] frame_00000.png  read 29.0 | infer 94.1 | to_cpu 0.6 | post 0.0 | masks 1 | triangles 0 => proc 94.7 ms
[2/312] frame_00001.png  read 29.3 | infer 38.1 | to_cpu 0.7 | post 65.8 | masks 3 | triangles 1 => proc 104.6 ms
[3/312] frame_00002.png  read 29.5 | infer 33.2 | to_cpu 0.6 | post 76.3 | masks 3 | triangles 3 => proc 110.1 ms
[4/312] frame_00003.png  read 34.3 | infer 36.2 | to_cpu 0.7 | post 87.4 | masks 3 | triangles 2 => proc 124.3 ms
[5/312] frame_00004.png  read 32.4 | infer 38.5 | to_cpu 0.7 | post 119.3 | masks 4 | triangles 1 => proc 158.5 ms
[6/312] frame_00005.png  read 32.5 | infer 38.4 | to_cpu 0.8 | post 117.9 | masks 5 | triangles 1 => proc 157.1 ms
[7/312] frame_00006.png  read 30.0 | infer 115.0 | to_cpu 5.4 | post 130.7 | masks 5 | triangles 2 => proc 251.1 ms
[8/312] frame_00007.png  read 29.3 | infer 34.6 | to_cpu 0.7 | post 125.4 | masks 4 | triangles 2 => proc 160.7 