In [1]:
#!/usr/bin/env python3
# Ultra-fast, no-UI, no-prints, batched pipeline with threaded I/O

import os, glob, sys, time
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config (tweak for your box)
# =======================
# Filesystem locations: model weights and the directory holding the frames.
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"

# RAIL_ID is the class id whose masks are unioned in process_frame_post.
# IMG_SIZE, CONF, IOU and MAX_DET are forwarded to model.predict as
# imgsz / conf / iou / max_det.
RAIL_ID    = 9
IMG_SIZE   = 512
CONF, IOU  = 0.30, 0.45
MAX_DET    = 30

# Color/region filter
# TARGET_COLORS_RGB are RGB triples (converted to BGR further down);
# TOLERANCE is the maximum Euclidean colour distance that still counts as a hit.
# MIN_REGION_SIZE / MIN_REGION_HEIGHT are the minimum area and bounding-box
# height (pixels) of a connected component kept by the colour filter.
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]
TOLERANCE          = 20.0
MIN_REGION_SIZE    = 30
MIN_REGION_HEIGHT  = 150

# Heat/triangle
# HEAT_BLUR_KSIZE: side length of the square box-blur kernel.
# RED_SCORE_THRESH: threshold applied to the uint8 red-vs-green score map.
# EXCLUDE_*_FRAC: fractions of the frame height zeroed at the top / bottom.
# MIN_DARK_RED_AREA / MIN_DARK_FRACTION: absolute and relative (share of all
# thresholded pixels) minimum size of a component to be a triangle candidate.
# TRI_SIZE_PX is not referenced by any code in this cell.
HEAT_BLUR_KSIZE     = 51
RED_SCORE_THRESH    = 220
EXCLUDE_TOP_FRAC    = 0.40
EXCLUDE_BOTTOM_FRAC = 0.15
MIN_DARK_RED_AREA   = 1200
MIN_DARK_FRACTION   = 0.15
TRI_SIZE_PX         = 18

# Runtime
BATCH               = 16           # adjust to saturate your GPU
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)  # file I/O overlap
SHOW_FIRST_N        = None         # None → all frames
RETURN_TIMINGS      = False        # True if you want per-frame timing dicts returned

# =======================
# System/Backends
# =======================
cv2.setUseOptimized(True)
# Best effort: leave one core free for OpenCV's thread pool; failures are ignored.
try:
    cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception:
    pass

# Device preference: CUDA (device index 0, half precision) → MPS → CPU.
# Half precision is only enabled on the CUDA branch.
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Loading happens at import/cell-execution time; fuse() is attempted and any
# failure is ignored.
model = YOLO(weights)
try: model.fuse()
except Exception: pass

# Warmup (avoids first-batch penalty; keep on device)
# One prediction on an all-black IMG_SIZE x IMG_SIZE frame with the same
# arguments the pipeline uses later; the result is discarded.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE,
                  device=device, conf=CONF, iou=IOU,
                  verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed constants
# =======================
# Target colours reversed from RGB to BGR channel order as float32, and the
# squared tolerance so the colour test can compare squared distances directly.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
TOL2            = TOLERANCE * TOLERANCE

# =======================
# Helpers (vectorized/fast)
# =======================
def load_image(path: str):
    """Read the image at ``path`` with ``cv2.imread`` in colour mode.

    Returns whatever ``cv2.imread`` returns; the caller treats a ``None``
    result as an unreadable file.
    """
    read_flags = cv2.IMREAD_COLOR
    image = cv2.imread(path, read_flags)
    return image

def chunked(iterable, n):
    """Yield consecutive slices of ``iterable``, each at most ``n`` items long.

    ``iterable`` must support ``len()`` and slicing; the last slice may be
    shorter than ``n``.
    """
    yield from (iterable[start:start + n] for start in range(0, len(iterable), n))

def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Return the rail pixels whose colour matches one of the target colours.

    The colour test runs only inside the bounding box of ``rail_mask``. Pixels
    that are both inside the mask and within ``TOLERANCE`` (Euclidean distance,
    compared in squared form against ``TOL2``) of any row of
    ``TARGETS_BGR_F32`` are grouped into 8-connected components, and only
    components with area >= ``MIN_REGION_SIZE`` and bounding-box height >=
    ``MIN_REGION_HEIGHT`` survive.

    Args:
        img_bgr: Colour frame indexed ``[row, col, channel]``.
        rail_mask: Mask with the same height/width as ``img_bgr`` (the caller
            in this file passes a boolean array).

    Returns:
        Boolean array of shape ``(H, W)``, True on the surviving pixels.
    """
    H, W = img_bgr.shape[:2]
    full = np.zeros((H, W), dtype=bool)
    if not rail_mask.any():
        return full

    # Bounding box of the mask: everything below works on this ROI only.
    ys, xs = np.where(rail_mask)
    y0, y1 = ys.min(), ys.max() + 1
    x0, x1 = xs.min(), xs.max() + 1

    img_f = img_bgr[y0:y1, x0:x1].astype(np.float32)
    mask_roi = rail_mask[y0:y1, x0:x1]

    # [h,w,1,3] - [1,1,K,3] → [h,w,K,3]; squared distance to every target.
    diff = img_f[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    dist2 = np.sum(diff * diff, axis=-1)                        # [h,w,K]
    colour_hit = np.any(dist2 <= TOL2, axis=-1)                 # [h,w]

    combined = np.logical_and(colour_hit, mask_roi)
    n, lbls, stats, _ = cv2.connectedComponentsWithStats(combined.astype(np.uint8), 8)
    if n <= 1:
        # Only the background label exists.
        return full

    # One boolean per label (index 0 = background, never kept); a single
    # gather then builds the mask instead of one full-array compare per label.
    keep_lut = np.zeros(n, dtype=bool)
    keep_lut[1:] = (
        (stats[1:, cv2.CC_STAT_AREA] >= MIN_REGION_SIZE)
        & (stats[1:, cv2.CC_STAT_HEIGHT] >= MIN_REGION_HEIGHT)
    )
    full[y0:y1, x0:x1] = keep_lut[lbls]
    return full

def red_vs_green_score(red_mask, green_mask):
    """Build a uint8 map of local red-versus-green dominance.

    Both masks are box-blurred with a ``HEAT_BLUR_KSIZE`` square kernel, the
    blurred green map is subtracted from the blurred red one, and the
    difference is rescaled by its own largest magnitude so that zero
    difference lands mid-range and positive (red) differences land above it.
    The scaling is therefore relative to each call's own extreme value.
    """
    kernel = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    red_blur = cv2.blur(red_mask.astype(np.float32), kernel)
    green_blur = cv2.blur(green_mask.astype(np.float32), kernel)

    delta = red_blur - green_blur
    # The epsilon keeps the division defined when both masks are empty.
    peak = float(np.max(np.abs(delta))) + 1e-6
    unit = (delta / (2.0 * peak) + 0.5)
    scaled = unit * 255.0
    return np.clip(scaled, 0, 255).astype(np.uint8)

def purple_triangle_from_dark_map(score, H):
    """Locate the marker point on the strongest-red region of a score map.

    Pixels with ``score >= RED_SCORE_THRESH`` are kept, bands at the top and
    bottom of the frame are cleared, the result is opened with a (5, 9)
    rectangular kernel, and 8-connected components are extracted. A component
    is a candidate when its area is at least ``MIN_DARK_RED_AREA`` and at
    least ``MIN_DARK_FRACTION`` of all remaining pixels. Among candidates the
    one whose top edge is highest in the image wins (lowest label on ties).

    Args:
        score: uint8 score map as produced by ``red_vs_green_score``.
        H: Height used to size the excluded bands (the caller passes the
            frame height).

    Returns:
        ``(x, y)`` where ``y`` is the winning component's top row and ``x`` the
        truncated mean column of its pixels on that row, or ``None`` when no
        component qualifies.
    """
    top_ex = int(H * EXCLUDE_TOP_FRAC)
    bot_ex = int(H * EXCLUDE_BOTTOM_FRAC)

    dark = (score >= RED_SCORE_THRESH).astype(np.uint8)
    if top_ex:
        dark[:top_ex, :] = 0
    if bot_ex:
        # Guarded: a slice starting at -0 would select the whole array.
        dark[-bot_ex:, :] = 0

    dark = cv2.morphologyEx(
        dark, cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9)),
        iterations=1
    )
    total_dark = int(dark.sum())
    if total_dark == 0:
        return None  # no triangle

    frac_thresh = int(np.ceil(MIN_DARK_FRACTION * total_dark))

    n_lbl, lbls, stats, _ = cv2.connectedComponentsWithStats(dark, 8)
    if n_lbl <= 1:
        return None

    # Labels (1-based; 0 is background) meeting both size thresholds.
    areas = stats[1:, cv2.CC_STAT_AREA]
    candidates = np.flatnonzero((areas >= MIN_DARK_RED_AREA) & (areas >= frac_thresh)) + 1
    if candidates.size == 0:
        return None

    # The bounding-box top from the stats table is the component's topmost
    # row, so no per-component scan of the label image is needed.
    tops = stats[candidates, cv2.CC_STAT_TOP]
    best = int(candidates[int(np.argmin(tops))])
    y_top = int(stats[best, cv2.CC_STAT_TOP])
    x_mid = int(np.flatnonzero(lbls[y_top] == best).mean())
    return (x_mid, y_top)  # (x, y)

# =======================
# Core processing of one frame (expects YOLO result already on CPU)
# =======================
def process_frame_post(frame_bgr, yolo_res):
    """Turn one frame and its segmentation result into a triangle position.

    Steps: union every mask whose class equals ``RAIL_ID``, resize that union
    to the frame size (nearest neighbour), split the rail area into "green"
    (target-coloured, via ``highlight_rails_mask_only_fast``) and "red" (the
    rest), score red against green and pick the triangle point.

    Returns:
        ``(x, y)`` or ``None`` when there are no masks, no rail-class masks,
        or no qualifying region.

    NOTE(review): the mask union is resized straight to ``(W, H)``; this
    assumes ``masks.data`` covers exactly the frame area with no padding —
    confirm against the Ultralytics mask documentation.
    """
    frame_h, frame_w = frame_bgr.shape[:2]

    seg = yolo_res.masks
    if seg is None:
        return None  # nothing segmented in this frame

    masks_np = seg.data.cpu().numpy()                      # [n,h,w]

    # Class ids: taken from the masks object when it exposes them, otherwise
    # from the boxes.
    cls_src = getattr(seg, "cls", None)
    if cls_src is None:
        cls_src = yolo_res.boxes.cls
    classes_np = cls_src.cpu().numpy().astype(int)

    is_rail = classes_np == RAIL_ID
    if classes_np.size == 0 or not is_rail.any():
        return None

    # Union of the rail masks, then upsample to the frame size.
    merged = masks_np[is_rail].astype(bool).any(axis=0).astype(np.uint8)
    rail_mask = cv2.resize(
        merged, (frame_w, frame_h), interpolation=cv2.INTER_NEAREST
    ).astype(bool)

    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red = rail_mask & ~green

    heat = red_vs_green_score(red, green)
    return purple_triangle_from_dark_map(heat, frame_h)

# =======================
# Batched execution with overlapped disk I/O
# =======================
def run_pipeline():
    """Run inference and post-processing over every frame in ``frames_dir``.

    Frames are read in batches of ``BATCH`` on a thread pool, passed to
    ``model.predict`` as one batch, then post-processed one by one.

    Returns:
        A list with one entry per image path (sorted path order, truncated to
        ``SHOW_FIRST_N`` when set); each entry is ``(x, y)`` or ``None``.
        A frame that cannot be read keeps ``None`` at its own position, so
        positions always correspond to the sorted path list. When
        ``RETURN_TIMINGS`` is true the return value is ``(results, timings)``
        where ``timings`` holds a ``{"infer_ms", "post_ms"}`` dict per readable
        frame (batch time divided evenly over the batch).

    Raises:
        FileNotFoundError: no ``.jpg`` / ``.png`` file was found.
    """
    # The generic patterns also match the frame_* ones; the set removes dupes.
    patterns = ("frame_*.jpg", "frame_*.png", "*.jpg", "*.png")
    paths = sorted({p for pat in patterns for p in glob.glob(str(frames_dir / pat))})
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")

    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    results_triangle_xy = [None] * len(paths)
    timings = [None] * len(paths) if RETURN_TIMINGS else None

    # One pool for the whole run instead of a new one per batch.
    with ThreadPoolExecutor(max_workers=THREADS_IO) as pool:
        for batch_no, batch_paths in enumerate(chunked(paths, BATCH)):
            batch_start = batch_no * BATCH
            loaded = list(pool.map(load_image, batch_paths))  # order preserved

            # Remember where each readable frame sits in `paths`, so dropped
            # frames never shift the results of the frames that follow them.
            slots = [batch_start + k for k, im in enumerate(loaded) if im is not None]
            if not slots:
                continue
            imgs_bgr = [loaded[s - batch_start] for s in slots]

            # Inference (batched)
            t0 = time.perf_counter()
            res_list = model.predict(
                imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
                conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET,
                batch=len(imgs_bgr)
            )
            # Best-effort device sync so the inference timing excludes
            # post-processing work.
            try:
                if device == 0 and torch.cuda.is_available():
                    torch.cuda.synchronize()
                elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                    torch.mps.synchronize()
            except Exception:
                pass
            t1 = time.perf_counter()

            # Post-process each frame on the host.
            for slot, img, yres in zip(slots, imgs_bgr, res_list):
                results_triangle_xy[slot] = process_frame_post(img, yres)
            t2 = time.perf_counter()

            if RETURN_TIMINGS:
                # Same (averaged) timings for every frame of the batch.
                infer_ms = (t1 - t0) * 1000.0 / len(imgs_bgr)
                post_ms = (t2 - t1) * 1000.0 / len(imgs_bgr)
                for slot in slots:
                    timings[slot] = {"infer_ms": infer_ms, "post_ms": post_ms}

    return (results_triangle_xy, timings) if RETURN_TIMINGS else results_triangle_xy

# =======================
# Entry
# =======================
if __name__ == "__main__":
    # Entry point: run the whole pipeline. The result is bound to the
    # throwaway name `_` and not used further in this cell.
    _ = run_pipeline()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs


In [3]:
#!/usr/bin/env python3
# Ultra-fast, batched pipeline with threaded I/O + per-frame timing prints

import os, glob, sys, time
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config
# =======================
# Filesystem locations: model weights and the directory holding the frames.
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"

# RAIL_ID is the class id whose masks are unioned in process_frame_post.
# IMG_SIZE, CONF, IOU and MAX_DET are forwarded to model.predict as
# imgsz / conf / iou / max_det.
RAIL_ID    = 9
IMG_SIZE   = 512
CONF, IOU  = 0.30, 0.45
MAX_DET    = 30

# Color/region filter
# RGB target colours, maximum Euclidean colour distance for a hit, and the
# minimum area / bounding-box height of a kept connected component.
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]
TOLERANCE          = 20.0
MIN_REGION_SIZE    = 30
MIN_REGION_HEIGHT  = 150

# Heat/triangle
# Box-blur kernel side, score threshold, top/bottom exclusion fractions of the
# frame height, and absolute/relative minimum candidate component size.
# TRI_SIZE_PX is not referenced by any code in this cell.
HEAT_BLUR_KSIZE     = 51
RED_SCORE_THRESH    = 220
EXCLUDE_TOP_FRAC    = 0.40
EXCLUDE_BOTTOM_FRAC = 0.15
MIN_DARK_RED_AREA   = 1200
MIN_DARK_FRACTION   = 0.15
TRI_SIZE_PX         = 18

# Runtime
BATCH               = 1                            # tune to your GPU
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)
SHOW_FIRST_N        = None                          # None → all frames

# =======================
# System/Backends
# =======================
cv2.setUseOptimized(True)
# Best effort: leave one core free for OpenCV's thread pool; failures are ignored.
try: cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception: pass

# Device preference: CUDA (device index 0, half precision) → MPS → CPU.
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Loaded when the cell runs; fuse() is attempted and any failure is ignored.
model = YOLO(weights)
try: model.fuse()
except Exception: pass

# Warmup
# One prediction on an all-black frame with the pipeline's arguments; the
# result is discarded.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE, device=device,
                  conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed constants
# =======================
# Target colours in BGR channel order as float32, and the squared tolerance
# used for squared-distance comparisons.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
TOL2            = TOLERANCE * TOLERANCE

# =======================
# Helpers
# =======================
def load_image_with_time(path: str):
    """Read ``path`` with ``cv2.imread`` (colour mode) and time the call.

    Returns:
        ``(image, elapsed_ms)`` where ``image`` is whatever ``cv2.imread``
        returned and ``elapsed_ms`` is the wall-clock duration of the read in
        milliseconds.
    """
    started = time.perf_counter()
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    finished = time.perf_counter()
    elapsed_ms = (finished - started) * 1000.0
    return image, elapsed_ms

def chunked(iterable, n):
    """Generate successive slices of ``iterable`` holding up to ``n`` items.

    Requires ``len()`` and slice support; the final slice may be shorter.
    """
    total = len(iterable)
    for lo in range(0, total, n):
        hi = lo + n
        yield iterable[lo:hi]

def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Return the rail pixels whose colour matches one of the target colours.

    Inside the bounding box of ``rail_mask``, pixels that lie in the mask and
    within ``TOLERANCE`` (squared comparison against ``TOL2``) of any row of
    ``TARGETS_BGR_F32`` are grouped into 8-connected components; components
    with area >= ``MIN_REGION_SIZE`` and bounding-box height >=
    ``MIN_REGION_HEIGHT`` are kept.

    Args:
        img_bgr: Colour frame indexed ``[row, col, channel]``.
        rail_mask: Mask with the same height/width as ``img_bgr`` (the caller
            in this file passes a boolean array).

    Returns:
        Boolean ``(H, W)`` array, True on the kept pixels.
    """
    H, W = img_bgr.shape[:2]
    full = np.zeros((H, W), dtype=bool)
    if not rail_mask.any():
        return full

    # Restrict all work to the mask's bounding box.
    ys, xs = np.where(rail_mask)
    y0, y1 = ys.min(), ys.max() + 1
    x0, x1 = xs.min(), xs.max() + 1

    img_f = img_bgr[y0:y1, x0:x1].astype(np.float32)
    mask_roi = rail_mask[y0:y1, x0:x1]

    # Squared colour distance of every ROI pixel to every target: [h,w,K].
    diff = img_f[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    dist2 = np.sum(diff * diff, axis=-1)
    colour_hit = np.any(dist2 <= TOL2, axis=-1)

    combined = np.logical_and(colour_hit, mask_roi)
    n, lbls, stats, _ = cv2.connectedComponentsWithStats(combined.astype(np.uint8), 8)
    if n <= 1:
        # Only the background label exists.
        return full

    # Per-label keep flag (index 0 = background stays False); one gather
    # replaces a full-array comparison per kept label.
    keep_lut = np.zeros(n, dtype=bool)
    keep_lut[1:] = (
        (stats[1:, cv2.CC_STAT_AREA] >= MIN_REGION_SIZE)
        & (stats[1:, cv2.CC_STAT_HEIGHT] >= MIN_REGION_HEIGHT)
    )
    full[y0:y1, x0:x1] = keep_lut[lbls]
    return full

def red_vs_green_score(red_mask, green_mask):
    """Score local red-versus-green dominance as a uint8 map.

    Each mask is box-blurred (square kernel of side ``HEAT_BLUR_KSIZE``); the
    blurred difference red - green is divided by twice its largest magnitude
    and shifted by 0.5, so a zero difference sits mid-range and red-dominated
    areas sit above it. The scale is relative to this call's own extreme.
    """
    ksize = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    blurred_red = cv2.blur(red_mask.astype(np.float32), ksize)
    blurred_green = cv2.blur(green_mask.astype(np.float32), ksize)
    balance = blurred_red - blurred_green

    # Epsilon avoids a zero divisor when there is no red or green at all.
    largest = float(np.max(np.abs(balance))) + 1e-6
    normalised = (balance / (2.0 * largest) + 0.5)
    return np.clip(normalised * 255.0, 0, 255).astype(np.uint8)

def purple_triangle_from_dark_map(score, H):
    """Locate the marker point on the strongest-red region of a score map.

    Pixels with ``score >= RED_SCORE_THRESH`` are kept, bands at the top and
    bottom are cleared, the result is opened with a (5, 9) rectangular kernel
    and split into 8-connected components. Candidates need an area of at
    least ``MIN_DARK_RED_AREA`` and at least ``MIN_DARK_FRACTION`` of all
    remaining pixels; the candidate whose top edge is highest wins (lowest
    label on ties).

    Args:
        score: uint8 score map as produced by ``red_vs_green_score``.
        H: Height used to size the excluded bands (the caller passes the
            frame height).

    Returns:
        ``(x, y)`` — the winner's top row and the truncated mean column of its
        pixels on that row — or ``None`` when nothing qualifies.
    """
    top_ex = int(H * EXCLUDE_TOP_FRAC)
    bot_ex = int(H * EXCLUDE_BOTTOM_FRAC)

    dark = (score >= RED_SCORE_THRESH).astype(np.uint8)
    if top_ex:
        dark[:top_ex, :] = 0
    if bot_ex:
        # Guarded: a slice starting at -0 would select the whole array.
        dark[-bot_ex:, :] = 0

    dark = cv2.morphologyEx(
        dark, cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9)),
        iterations=1
    )
    total_dark = int(dark.sum())
    if total_dark == 0:
        return None

    frac_thresh = int(np.ceil(MIN_DARK_FRACTION * total_dark))
    n_lbl, lbls, stats, _ = cv2.connectedComponentsWithStats(dark, 8)
    if n_lbl <= 1:
        return None

    # Labels (1-based; 0 is background) meeting both size thresholds.
    areas = stats[1:, cv2.CC_STAT_AREA]
    candidates = np.flatnonzero((areas >= MIN_DARK_RED_AREA) & (areas >= frac_thresh)) + 1
    if candidates.size == 0:
        return None

    # The bounding-box top in the stats table is the component's topmost row,
    # so the label image only needs to be inspected on that single row.
    tops = stats[candidates, cv2.CC_STAT_TOP]
    best = int(candidates[int(np.argmin(tops))])
    y_top = int(stats[best, cv2.CC_STAT_TOP])
    x_mid = int(np.flatnonzero(lbls[y_top] == best).mean())
    return (x_mid, y_top)

# Return: tri, to_cpu_ms, post_ms (per-frame)
def process_frame_post(frame_bgr, yolo_res):
    """Post-process one frame and report how long each stage took.

    Rail-class masks (class == ``RAIL_ID``) are unioned and resized to the
    frame, split into target-coloured ("green") and remaining ("red") pixels,
    scored, and reduced to a triangle position.

    Returns:
        ``(tri, to_cpu_ms, post_ms)``: ``tri`` is ``(x, y)`` or ``None``;
        ``to_cpu_ms`` times the conversion of masks and class ids to NumPy;
        ``post_ms`` times the remaining work. Both are 0.0 for stages that
        were skipped.

    NOTE(review): the union is resized directly to ``(W, H)``, which assumes
    ``masks.data`` spans exactly the frame area without padding — confirm.
    """
    frame_h, frame_w = frame_bgr.shape[:2]

    seg = yolo_res.masks
    if seg is None:
        return None, 0.0, 0.0

    # --- transfer to host ---------------------------------------------------
    xfer_start = time.perf_counter()
    masks_np = seg.data.cpu().numpy()  # [n,h,w]
    cls_src = getattr(seg, "cls", None)
    if cls_src is None:
        cls_src = yolo_res.boxes.cls
    classes_np = cls_src.cpu().numpy().astype(int)
    to_cpu_ms = (time.perf_counter() - xfer_start) * 1000.0

    is_rail = classes_np == RAIL_ID
    if classes_np.size == 0 or not is_rail.any():
        return None, to_cpu_ms, 0.0

    # --- host-side post-processing -------------------------------------------
    post_start = time.perf_counter()
    merged = masks_np[is_rail].astype(bool).any(axis=0).astype(np.uint8)  # [h,w]
    rail_mask = cv2.resize(
        merged, (frame_w, frame_h), interpolation=cv2.INTER_NEAREST
    ).astype(bool)

    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red = rail_mask & ~green
    heat = red_vs_green_score(red, green)
    tri = purple_triangle_from_dark_map(heat, frame_h)
    post_ms = (time.perf_counter() - post_start) * 1000.0

    return tri, to_cpu_ms, post_ms

# =======================
# Batched execution with overlapped disk I/O + per-frame prints
# =======================
def run_pipeline_with_prints():
    """Run the pipeline over every frame and print one timing line per frame.

    Frames are read in batches of ``BATCH`` on a thread pool, passed to
    ``model.predict`` as one batch, then post-processed one by one.

    Returns:
        A list with one entry per image path (sorted path order, truncated to
        ``SHOW_FIRST_N`` when set); each entry is ``(x, y)`` or ``None``.
        An unreadable frame keeps ``None`` at its own position and prints no
        line, so list positions and the printed ``[i/N]`` index always refer
        to the sorted path list.

    Raises:
        FileNotFoundError: no ``.jpg`` / ``.png`` file was found.
    """
    # The generic patterns also match the frame_* ones; the set removes dupes.
    patterns = ("frame_*.jpg", "frame_*.png", "*.jpg", "*.png")
    paths = sorted({p for pat in patterns for p in glob.glob(str(frames_dir / pat))})
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")
    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    N = len(paths)
    results_triangle_xy = [None] * N

    # One pool for the whole run instead of a new one per batch.
    with ThreadPoolExecutor(max_workers=THREADS_IO) as pool:
        for batch_no, batch_paths in enumerate(chunked(paths, BATCH)):
            batch_start = batch_no * BATCH
            loaded = list(pool.map(load_image_with_time, batch_paths))  # order preserved

            # Keep each readable frame's position in `paths`, so dropped
            # frames never shift the results or labels of later frames.
            readable = [
                (batch_start + k, img, ms)
                for k, (img, ms) in enumerate(loaded)
                if img is not None
            ]
            if not readable:
                continue
            imgs_bgr = [img for _, img, _ in readable]
            B = len(imgs_bgr)

            # Batched inference
            t0_inf = time.perf_counter()
            res_list = model.predict(
                imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
                conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET,
                batch=B
            )
            # Best-effort device sync so the inference timing is clean.
            try:
                if device == 0 and torch.cuda.is_available():
                    torch.cuda.synchronize()
                elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                    torch.mps.synchronize()
            except Exception:
                pass
            t1_inf = time.perf_counter()
            infer_ms = ((t1_inf - t0_inf) * 1000.0) / B  # approx per-frame share

            # Per-frame post + prints
            for (slot, img, read_ms), yres in zip(readable, res_list):
                tri, to_cpu_ms, post_ms = process_frame_post(img, yres)
                results_triangle_xy[slot] = tri

                # read_ms is reported but not part of proc_ms.
                proc_ms = infer_ms + to_cpu_ms + post_ms

                fname = os.path.basename(paths[slot])
                frame_idx = slot + 1
                print(f"[{frame_idx}/{N}] {fname}  "
                      f"read {read_ms:.1f} | infer {infer_ms:.1f} | "
                      f"to_cpu {to_cpu_ms:.1f} | post {post_ms:.1f} "
                      f"=> proc {proc_ms:.1f} ms")

    return results_triangle_xy

# =======================
# Entry
# =======================
if __name__ == "__main__":
    # Entry point: run the pipeline with per-frame prints. The returned list
    # is bound to the throwaway name `_` and not used further in this cell.
    _ = run_pipeline_with_prints()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs
[1/79] frame_00000.png  read 40.0 | infer 675.7 | to_cpu 0.7 | post 151.9 => proc 828.2 ms
[2/79] frame_00001.png  read 38.8 | infer 49.6 | to_cpu 1.0 | post 160.2 => proc 210.8 ms
[3/79] frame_00002.png  read 36.4 | infer 59.7 | to_cpu 0.9 | post 172.2 => proc 232.7 ms
[4/79] frame_00003.png  read 36.4 | infer 37.7 | to_cpu 0.8 | post 170.3 => proc 208.8 ms
[5/79] frame_00004.png  read 36.0 | infer 42.0 | to_cpu 0.9 | post 143.9 => proc 186.9 ms
[6/79] frame_00005.png  read 43.0 | infer 40.7 | to_cpu 1.0 | post 125.9 => proc 167.6 ms
[7/79] frame_00006.png  read 39.8 | infer 41.0 | to_cpu 1.0 | post 122.9 => proc 164.9 ms
[8/79] frame_00007.png  read 37.4 | infer 42.4 | to_cpu 1.0 | post 147.6 => proc 191.1 ms
[9/79] frame_00008.png  read 37.1 | infer 41.7 | to_cpu 0.8 | post 143.9 => proc 186.4 ms
[10/79] frame_00009.png  read 38.1 | infer 40.7 | to_cpu 0.8 | post 156.8 => proc 198.3 ms
[11/79] fr

In [5]:
#!/usr/bin/env python3
# Ultra-fast, batched pipeline with threaded I/O + per-frame timing & counts

import os, glob, sys, time
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config
# =======================
# Filesystem locations: model weights and the directory holding the frames.
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"

# RAIL_ID is the class id whose masks are unioned in process_frame_post.
# IMG_SIZE, CONF, IOU and MAX_DET are forwarded to model.predict as
# imgsz / conf / iou / max_det.
RAIL_ID    = 9
IMG_SIZE   = 512
CONF, IOU  = 0.30, 0.45
MAX_DET    = 30

# Color/region filter
# RGB target colours, maximum Euclidean colour distance for a hit, and the
# minimum area / bounding-box height of a kept connected component.
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]
TOLERANCE          = 20.0
MIN_REGION_SIZE    = 30
MIN_REGION_HEIGHT  = 150

# Heat/triangle
# Box-blur kernel side, score threshold, top/bottom exclusion fractions of the
# frame height, and absolute/relative minimum candidate component size.
# TRI_SIZE_PX is not referenced by any code in this cell.
HEAT_BLUR_KSIZE     = 51
RED_SCORE_THRESH    = 220
EXCLUDE_TOP_FRAC    = 0.40
EXCLUDE_BOTTOM_FRAC = 0.15
MIN_DARK_RED_AREA   = 1200
MIN_DARK_FRACTION   = 0.15
TRI_SIZE_PX         = 18

# Runtime
BATCH               = 1
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)
SHOW_FIRST_N        = None  # None → all

# =======================
# System/Backends
# =======================
cv2.setUseOptimized(True)
# Best effort: leave one core free for OpenCV's thread pool; failures are ignored.
try: cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception: pass

# Device preference: CUDA (device index 0, half precision) → MPS → CPU.
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Loaded when the cell runs; fuse() is attempted and any failure is ignored.
model = YOLO(weights)
try: model.fuse()
except Exception: pass

# Warmup
# One prediction on an all-black frame with the pipeline's arguments; the
# result is discarded.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE, device=device,
                  conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed
# =======================
# Target colours in BGR channel order as float32, and the squared tolerance
# used for squared-distance comparisons.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
TOL2            = TOLERANCE * TOLERANCE

# =======================
# Helpers
# =======================
def load_image_with_time(path: str):
    """Load ``path`` via ``cv2.imread`` in colour mode, measuring the read.

    Returns:
        A 2-tuple: the ``cv2.imread`` result and the read duration in
        milliseconds (from ``time.perf_counter``).
    """
    clock = time.perf_counter
    begin = clock()
    frame = cv2.imread(path, cv2.IMREAD_COLOR)
    end = clock()
    return frame, (end - begin) * 1000.0

def chunked(iterable, n):
    """Split ``iterable`` lazily into back-to-back slices of length ``n``.

    Needs ``len()`` and slicing on ``iterable``; the last slice holds the
    remainder and may be shorter.
    """
    starts = range(0, len(iterable), n)
    for begin in starts:
        yield iterable[begin:begin + n]

def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Return the rail pixels whose colour matches one of the target colours.

    Inside the bounding box of ``rail_mask``, pixels that lie in the mask and
    within ``TOLERANCE`` (squared comparison against ``TOL2``) of any row of
    ``TARGETS_BGR_F32`` are grouped into 8-connected components; components
    with area >= ``MIN_REGION_SIZE`` and bounding-box height >=
    ``MIN_REGION_HEIGHT`` are kept.

    Args:
        img_bgr: Colour frame indexed ``[row, col, channel]``.
        rail_mask: Mask with the same height/width as ``img_bgr`` (the caller
            in this file passes a boolean array).

    Returns:
        Boolean ``(H, W)`` array, True on the kept pixels.
    """
    H, W = img_bgr.shape[:2]
    full = np.zeros((H, W), dtype=bool)
    if not rail_mask.any():
        return full

    # Restrict all work to the mask's bounding box.
    ys, xs = np.where(rail_mask)
    y0, y1 = ys.min(), ys.max() + 1
    x0, x1 = xs.min(), xs.max() + 1

    img_f = img_bgr[y0:y1, x0:x1].astype(np.float32)
    mask_roi = rail_mask[y0:y1, x0:x1]

    # Squared colour distance of every ROI pixel to every target: [h,w,K].
    diff = img_f[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    dist2 = np.sum(diff * diff, axis=-1)
    colour_hit = np.any(dist2 <= TOL2, axis=-1)

    combined = np.logical_and(colour_hit, mask_roi)
    n, lbls, stats, _ = cv2.connectedComponentsWithStats(combined.astype(np.uint8), 8)
    if n <= 1:
        # Only the background label exists.
        return full

    # Per-label keep flag (index 0 = background stays False); one gather
    # replaces a full-array comparison per kept label.
    keep_lut = np.zeros(n, dtype=bool)
    keep_lut[1:] = (
        (stats[1:, cv2.CC_STAT_AREA] >= MIN_REGION_SIZE)
        & (stats[1:, cv2.CC_STAT_HEIGHT] >= MIN_REGION_HEIGHT)
    )
    full[y0:y1, x0:x1] = keep_lut[lbls]
    return full

def red_vs_green_score(red_mask, green_mask):
    """Convert red/green masks into a uint8 red-dominance map.

    Both masks are smoothed with a square box blur of side
    ``HEAT_BLUR_KSIZE``. The smoothed difference (red minus green) is
    normalised by twice its peak magnitude and offset by 0.5, placing "no
    difference" mid-range and red-dominated areas above it; the scale is
    relative to the peak of this particular call.
    """
    box = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    smooth_red = cv2.blur(red_mask.astype(np.float32), box)
    smooth_green = cv2.blur(green_mask.astype(np.float32), box)
    gap = smooth_red - smooth_green

    # Epsilon keeps the divisor non-zero when the masks are empty.
    peak = float(np.max(np.abs(gap))) + 1e-6
    unit = (gap / (2.0 * peak) + 0.5)
    as_bytes = np.clip(unit * 255.0, 0, 255.0)
    return as_bytes.astype(np.uint8)

# Returns: (best_triangle_xy or None, triangle_count)
def purple_triangles(score, H):
    """Find triangle candidates on a score map and pick the topmost one.

    Pixels with ``score >= RED_SCORE_THRESH`` are kept, bands at the top and
    bottom are cleared, the result is opened with a (5, 9) rectangular kernel
    and split into 8-connected components. Candidates need an area of at
    least ``MIN_DARK_RED_AREA`` and at least ``MIN_DARK_FRACTION`` of all
    remaining pixels; the candidate whose top edge is highest wins (lowest
    label on ties).

    Args:
        score: uint8 score map as produced by ``red_vs_green_score``.
        H: Height used to size the excluded bands (the caller passes the
            frame height).

    Returns:
        ``(best_xy, count)``: ``best_xy`` is ``(x, y)`` — the winner's top row
        and the truncated mean column of its pixels on that row — or ``None``;
        ``count`` is the number of candidates (0 when there are none).
    """
    top_ex = int(H * EXCLUDE_TOP_FRAC)
    bot_ex = int(H * EXCLUDE_BOTTOM_FRAC)

    dark = (score >= RED_SCORE_THRESH).astype(np.uint8)
    if top_ex:
        dark[:top_ex, :] = 0
    if bot_ex:
        # Guarded: a slice starting at -0 would select the whole array.
        dark[-bot_ex:, :] = 0

    dark = cv2.morphologyEx(
        dark, cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9)),
        iterations=1
    )
    total_dark = int(dark.sum())
    if total_dark == 0:
        return None, 0

    frac_thresh = int(np.ceil(MIN_DARK_FRACTION * total_dark))
    n_lbl, lbls, stats, _ = cv2.connectedComponentsWithStats(dark, 8)
    if n_lbl <= 1:
        return None, 0

    # Labels (1-based; 0 is background) meeting both size thresholds.
    areas = stats[1:, cv2.CC_STAT_AREA]
    candidates = np.flatnonzero((areas >= MIN_DARK_RED_AREA) & (areas >= frac_thresh)) + 1
    tri_count = int(candidates.size)
    if tri_count == 0:
        return None, 0

    # The bounding-box top in the stats table is the component's topmost row,
    # so the label image only needs to be inspected on that single row.
    tops = stats[candidates, cv2.CC_STAT_TOP]
    best = int(candidates[int(np.argmin(tops))])
    y_top = int(stats[best, cv2.CC_STAT_TOP])
    x_mid = int(np.flatnonzero(lbls[y_top] == best).mean())
    return (x_mid, y_top), tri_count

# Returns: (tri_xy, tri_count, mask_count, to_cpu_ms, post_ms)
def process_frame_post(frame_bgr, yolo_res):
    """Post-process one frame; report the triangle, counts and stage timings.

    Rail-class masks (class == ``RAIL_ID``) are unioned and resized to the
    frame, split into target-coloured ("green") and remaining ("red") pixels,
    scored, and passed to ``purple_triangles``.

    Returns:
        ``(tri_xy, tri_count, mask_count, to_cpu_ms, post_ms)``. ``tri_xy`` is
        ``(x, y)`` or ``None``; ``mask_count`` is the number of masks of any
        class; ``to_cpu_ms`` times the conversion of masks and class ids to
        NumPy; ``post_ms`` times the remaining work. Skipped stages report
        0 / 0.0.

    NOTE(review): the union is resized directly to ``(W, H)``, which assumes
    ``masks.data`` spans exactly the frame area without padding — confirm.
    """
    frame_h, frame_w = frame_bgr.shape[:2]

    seg = yolo_res.masks
    if seg is None:
        return None, 0, 0, 0.0, 0.0

    # --- transfer to host ---------------------------------------------------
    xfer_start = time.perf_counter()
    masks_np = seg.data.cpu().numpy()  # [n,h,w]
    mask_count = int(masks_np.shape[0])
    cls_src = getattr(seg, "cls", None)
    if cls_src is None:
        cls_src = yolo_res.boxes.cls
    classes_np = cls_src.cpu().numpy().astype(int)
    to_cpu_ms = (time.perf_counter() - xfer_start) * 1000.0

    # Nothing to do without masks, class ids, or at least one rail instance.
    if mask_count == 0 or classes_np.size == 0:
        return None, 0, mask_count, to_cpu_ms, 0.0
    is_rail = classes_np == RAIL_ID
    if not is_rail.any():
        return None, 0, mask_count, to_cpu_ms, 0.0

    # --- host-side post-processing -------------------------------------------
    post_start = time.perf_counter()
    merged = masks_np[is_rail].astype(bool).any(axis=0).astype(np.uint8)  # [h,w]
    rail_mask = cv2.resize(
        merged, (frame_w, frame_h), interpolation=cv2.INTER_NEAREST
    ).astype(bool)

    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red = rail_mask & ~green
    heat = red_vs_green_score(red, green)
    tri_xy, tri_count = purple_triangles(heat, frame_h)
    post_ms = (time.perf_counter() - post_start) * 1000.0

    return tri_xy, tri_count, mask_count, to_cpu_ms, post_ms

# =======================
# Batched execution with prints
# =======================
def run_pipeline_with_prints():
    """Run the pipeline over every frame, printing timings and counts per frame.

    Frames are read in batches of ``BATCH`` on a thread pool, passed to
    ``model.predict`` as one batch, then post-processed one by one.

    Returns:
        A list with one entry per image path (sorted path order, truncated to
        ``SHOW_FIRST_N`` when set); each entry is ``(x, y)`` or ``None``.
        An unreadable frame keeps ``None`` at its own position and prints no
        line, so list positions and the printed ``[i/N]`` index always refer
        to the sorted path list.

    Raises:
        FileNotFoundError: no ``.jpg`` / ``.png`` file was found.
    """
    # The generic patterns also match the frame_* ones; the set removes dupes.
    patterns = ("frame_*.jpg", "frame_*.png", "*.jpg", "*.png")
    paths = sorted({p for pat in patterns for p in glob.glob(str(frames_dir / pat))})
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")
    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    N = len(paths)
    results_triangle_xy = [None] * N

    # One pool for the whole run instead of a new one per batch.
    with ThreadPoolExecutor(max_workers=THREADS_IO) as pool:
        for batch_no, batch_paths in enumerate(chunked(paths, BATCH)):
            batch_start = batch_no * BATCH
            loaded = list(pool.map(load_image_with_time, batch_paths))  # order preserved

            # Keep each readable frame's position in `paths`, so dropped
            # frames never shift the results or labels of later frames.
            readable = [
                (batch_start + k, img, ms)
                for k, (img, ms) in enumerate(loaded)
                if img is not None
            ]
            if not readable:
                continue
            imgs_bgr = [img for _, img, _ in readable]
            B = len(imgs_bgr)

            t0_inf = time.perf_counter()
            res_list = model.predict(
                imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
                conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET,
                batch=B
            )
            # Best-effort device sync so the inference timing is clean.
            try:
                if device == 0 and torch.cuda.is_available():
                    torch.cuda.synchronize()
                elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                    torch.mps.synchronize()
            except Exception:
                pass
            t1_inf = time.perf_counter()
            infer_ms_share = ((t1_inf - t0_inf) * 1000.0) / B  # per-frame share

            for (slot, img, read_ms), yres in zip(readable, res_list):
                tri_xy, tri_count, mask_count, to_cpu_ms, post_ms = process_frame_post(img, yres)
                results_triangle_xy[slot] = tri_xy

                # read_ms is reported but not part of proc_ms.
                proc_ms = infer_ms_share + to_cpu_ms + post_ms
                fname = os.path.basename(paths[slot])
                frame_idx = slot + 1

                print(f"[{frame_idx}/{N}] {fname}  "
                      f"read {read_ms:.1f} | infer {infer_ms_share:.1f} | "
                      f"to_cpu {to_cpu_ms:.1f} | post {post_ms:.1f} | "
                      f"masks {mask_count} | triangles {tri_count} "
                      f"=> proc {proc_ms:.1f} ms")

    return results_triangle_xy

# =======================
# Entry
# =======================
if __name__ == "__main__":
    # Entry point: run the pipeline with per-frame prints. The returned list
    # is bound to the throwaway name `_` and not used further in this cell.
    _ = run_pipeline_with_prints()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs
[1/79] frame_00000.png  read 49.8 | infer 131.7 | to_cpu 0.8 | post 157.2 | masks 4 | triangles 1 => proc 289.8 ms
[2/79] frame_00001.png  read 44.2 | infer 46.9 | to_cpu 1.2 | post 164.3 | masks 6 | triangles 1 => proc 212.4 ms
[3/79] frame_00002.png  read 37.2 | infer 55.9 | to_cpu 0.9 | post 316.5 | masks 3 | triangles 3 => proc 373.3 ms
[4/79] frame_00003.png  read 36.6 | infer 39.6 | to_cpu 0.7 | post 157.4 | masks 3 | triangles 2 => proc 197.7 ms
[5/79] frame_00004.png  read 45.6 | infer 45.6 | to_cpu 0.8 | post 187.1 | masks 5 | triangles 1 => proc 233.5 ms
[6/79] frame_00005.png  read 36.3 | infer 43.9 | to_cpu 0.9 | post 130.8 | masks 4 | triangles 1 => proc 175.6 ms
[7/79] frame_00006.png  read 37.1 | infer 44.3 | to_cpu 1.1 | post 122.8 | masks 5 | triangles 1 => proc 168.1 ms
[8/79] frame_00007.png  read 38.0 | infer 41.0 | to_cpu 1.0 | post 148.0 | masks 6 | triangles 2 => proc 190.0 ms

In [6]:
#!/usr/bin/env python3
# Ultra-fast, batched pipeline with threaded I/O + per-frame timing & counts
# Now also renders ALL masks + purple triangles onto ORIGINAL frames for the first RENDER_FIRST_N frames (excluded from timings)

import os, glob, sys, time
import cv2, torch, numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from ultralytics import YOLO

# =======================
# Config
# =======================
# Filesystem locations: model weights, input frames, and the overlay output
# folder (created on import if missing).
home       = os.path.expanduser("~")
weights    = f"{home}/models/jakes-loped/jakes-finder-mk1/1/weights.pt"
frames_dir = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "frames"
out_dir    = Path(home) / "Documents" / "GitHub" / "Ai-plays-SubwaySurfers" / "out_overlays"
out_dir.mkdir(parents=True, exist_ok=True)

# Detector settings, forwarded to model.predict().
RAIL_ID    = 9            # class id whose masks are treated as rails (LABELS[9] == "RAILS")
IMG_SIZE   = 512          # passed as imgsz
CONF, IOU  = 0.30, 0.45   # passed as conf / iou
MAX_DET    = 30           # passed as max_det

# Color/region filter (used by highlight_rails_mask_only_fast)
TARGET_COLORS_RGB  = [(119,104,67), (81,42,45)]  # reference colours, given as RGB
TOLERANCE          = 20.0   # max Euclidean distance in colour space to count as a hit
MIN_REGION_SIZE    = 30     # min connected-component area (pixels) to keep
MIN_REGION_HEIGHT  = 150    # min connected-component bounding-box height (pixels) to keep

# Heat/triangle (used by red_vs_green_score / purple_triangles / draw_triangle)
HEAT_BLUR_KSIZE     = 51    # side of the square box-blur kernel
RED_SCORE_THRESH    = 220   # score (0..255) at or above which a pixel counts as "dark red"
EXCLUDE_TOP_FRAC    = 0.40  # fraction of frame height ignored at the top
EXCLUDE_BOTTOM_FRAC = 0.15  # fraction of frame height ignored at the bottom
MIN_DARK_RED_AREA   = 1200  # min component area (pixels) to produce a triangle
MIN_DARK_FRACTION   = 0.15  # min share of all dark-red pixels a component must hold
TRI_SIZE_PX         = 18    # half-width of the drawn triangle; its height is 1.2x this
PURPLE              = (255, 0, 255)  # triangle fill colour

# Runtime
BATCH               = 1     # frames per model.predict() call
THREADS_IO          = max(2, (os.cpu_count() or 4) // 2)  # worker threads for image reads
SHOW_FIRST_N        = None  # None → all frames
RENDER_FIRST_N      = 5    # save overlays only for frames whose 1-based index is <= this

# =======================
# System/Backends
# =======================
# OpenCV: enable optimized code paths and leave one core free for other work.
# Setting the thread count is best-effort; a failure is ignored.
cv2.setUseOptimized(True)
try: cv2.setNumThreads(max(1, (os.cpu_count() or 1) - 1))
except Exception: pass

# Pick the inference device and whether to run in half precision:
# CUDA (device index 0, half=True), then Apple MPS, then CPU (both half=False).
if torch.cuda.is_available():
    device, half = 0, True
    torch.backends.cudnn.benchmark = True
    # Best-effort: a failure here is ignored.
    try: torch.set_float32_matmul_precision('high')
    except Exception: pass
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device, half = "mps", False
else:
    device, half = "cpu", False

# =======================
# Model
# =======================
# Load the segmentation model once at import time; fusing is best-effort.
model = YOLO(weights)
try: model.fuse()
except Exception: pass

# Warmup: one throwaway prediction on a black IMG_SIZE x IMG_SIZE frame, using
# the same predict() arguments as the real run.
_dummy = np.zeros((IMG_SIZE, IMG_SIZE, 3), np.uint8)
_ = model.predict(_dummy, task="segment", imgsz=IMG_SIZE, device=device,
                  conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET)

# =======================
# Precomputed
# =======================
# Target colours reordered RGB -> BGR and stored as float32, one row per colour,
# so they can be subtracted directly from cv2-loaded (BGR) pixels.
TARGETS_BGR_F32 = np.array([(r,g,b)[::-1] for (r,g,b) in TARGET_COLORS_RGB], dtype=np.float32)
# Squared tolerance: distances are compared squared, avoiding a sqrt per pixel.
TOL2            = TOLERANCE * TOLERANCE

# Overlay colour per class id. render_overlays() blends these tuples straight
# into frames read by cv2.imread, so the channel order is applied as BGR.
CLASS_COLOURS = {
    0:(255,255,0),1:(192,192,192),2:(0,128,255),3:(0,255,0),
    4:(255,0,255),5:(0,255,255),6:(255,128,0),7:(128,0,255),
    8:(0,0,128),9:(0,0,255),10:(128,128,0),11:(255,255,102)
}
# Text label per class id, drawn near each mask's centroid.
LABELS = {
    0:"BOOTS",1:"GREYTRAIN",2:"HIGHBARRIER1",3:"JUMP",4:"LOWBARRIER1",
    5:"LOWBARRIER2",6:"ORANGETRAIN",7:"PILLAR",8:"RAMP",9:"RAILS",
    10:"SIDEWALK",11:"YELLOWTRAIN"
}

# =======================
# Helpers
# =======================
def load_image_with_time(path: str):
    """Read one image as 3-channel BGR and measure how long the read took.

    Returns:
        ``(image, elapsed_ms)`` where ``image`` is whatever ``cv2.imread``
        produced (callers treat ``None`` as "could not be read") and
        ``elapsed_ms`` is the wall-clock read time in milliseconds.
    """
    started = time.perf_counter()
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    finished = time.perf_counter()
    elapsed_ms = (finished - started) * 1000.0
    return image, elapsed_ms

def chunked(iterable, n):
    """Yield consecutive slices of ``iterable``, each holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing; the last slice may be
    shorter than ``n``.
    """
    starts = range(0, len(iterable), n)
    yield from (iterable[start:start + n] for start in starts)

def highlight_rails_mask_only_fast(img_bgr, rail_mask):
    """Mark rail pixels whose colour is close to one of the target colours.

    Work is restricted to the bounding box of ``rail_mask``. A pixel is a
    candidate when it lies inside the rail mask and its squared BGR distance to
    any row of ``TARGETS_BGR_F32`` is at most ``TOL2``. Candidates are grouped
    into 8-connected components, and only components with area
    >= ``MIN_REGION_SIZE`` and bounding-box height >= ``MIN_REGION_HEIGHT``
    survive.

    Args:
        img_bgr: frame as an (H, W, 3) array in BGR order.
        rail_mask: (H, W) boolean rail mask.

    Returns:
        (H, W) boolean array, True on the surviving pixels (all False when the
        rail mask is empty or nothing qualifies).
    """
    frame_h, frame_w = img_bgr.shape[:2]
    result = np.zeros((frame_h, frame_w), dtype=bool)
    if not rail_mask.any():
        return result

    # Bounding box of the rail mask; everything below runs on this window only.
    rows, cols = np.nonzero(rail_mask)
    window = (
        slice(rows.min(), rows.max() + 1),
        slice(cols.min(), cols.max() + 1),
    )

    # Squared colour distance of every window pixel to every target colour.
    pixels = img_bgr[window].astype(np.float32)
    delta = pixels[:, :, None, :] - TARGETS_BGR_F32[None, None, :, :]
    sq_dist = (delta * delta).sum(axis=-1)
    near_target = (sq_dist <= TOL2).any(axis=-1)

    candidates = np.logical_and(near_target, rail_mask[window])
    n_labels, label_img, stats, _ = cv2.connectedComponentsWithStats(
        candidates.astype(np.uint8), 8
    )
    if n_labels <= 1:
        return result

    # Per-label keep flag used as a lookup table; label 0 (background) is
    # never kept.
    keep_label = (
        (stats[:, cv2.CC_STAT_AREA] >= MIN_REGION_SIZE)
        & (stats[:, cv2.CC_STAT_HEIGHT] >= MIN_REGION_HEIGHT)
    )
    keep_label[0] = False

    result[window] = keep_label[label_img]
    return result

def red_vs_green_score(red_mask, green_mask):
    """Build a 0..255 heat map of local red-versus-green dominance.

    Both masks are box-blurred with a ``HEAT_BLUR_KSIZE`` square kernel. The
    difference (red minus green) is divided by twice its largest absolute
    value and shifted by 0.5, so equal density maps to the middle of the range,
    the strongest red surplus to the top and the strongest green surplus to
    the bottom.

    Returns:
        uint8 array with the same height/width as the input masks.
    """
    kernel = (HEAT_BLUR_KSIZE, HEAT_BLUR_KSIZE)
    red_density = cv2.blur(red_mask.astype(np.float32), kernel)
    green_density = cv2.blur(green_mask.astype(np.float32), kernel)

    balance = red_density - green_density
    # Small epsilon keeps the division defined when both masks are empty.
    peak = float(np.max(np.abs(balance))) + 1e-6
    unit = (balance / (2.0 * peak) + 0.5)
    scaled = unit * 255.0
    return np.clip(scaled, 0, 255.0).astype(np.uint8)

# Returns: (list_of_triangle_xy, best_xy_or_None)
def purple_triangles(score, H):
    """Locate the top-centre point of every large "dark red" blob in ``score``.

    Pixels with ``score >= RED_SCORE_THRESH`` are kept, the top
    ``EXCLUDE_TOP_FRAC`` and bottom ``EXCLUDE_BOTTOM_FRAC`` of the frame are
    cleared, and a 5x9 rectangular morphological opening is applied. Each
    remaining 8-connected component whose area is at least
    ``MIN_DARK_RED_AREA`` and at least ``MIN_DARK_FRACTION`` of all remaining
    pixels contributes one point: the mean x of its topmost row, at that row's
    y.

    Args:
        score: uint8 heat map, as produced by ``red_vs_green_score``.
        H: frame height in pixels, used to size the excluded bands.

    Returns:
        ``(points, best)`` — ``points`` is a list of ``(x, y)`` int tuples and
        ``best`` is the one with the smallest y; ``([], None)`` when nothing
        qualifies.
    """
    hot = (score >= RED_SCORE_THRESH).astype(np.uint8)

    skip_top = int(H * EXCLUDE_TOP_FRAC)
    skip_bottom = int(H * EXCLUDE_BOTTOM_FRAC)
    # Guard against a zero-height band: hot[-0:] would clear the whole frame.
    if skip_top:
        hot[:skip_top, :] = 0
    if skip_bottom:
        hot[-skip_bottom:, :] = 0

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 9))
    hot = cv2.morphologyEx(hot, cv2.MORPH_OPEN, kernel, iterations=1)

    hot_pixels = int(hot.sum())
    if hot_pixels == 0:
        return [], None

    min_share = int(np.ceil(MIN_DARK_FRACTION * hot_pixels))
    n_labels, label_img, stats, _ = cv2.connectedComponentsWithStats(hot, 8)
    if n_labels <= 1:
        return [], None

    apexes = []
    for label in range(1, n_labels):
        area = stats[label, cv2.CC_STAT_AREA]
        if area < MIN_DARK_RED_AREA or area < min_share:
            continue
        rows, cols = np.nonzero(label_img == label)
        if rows.size == 0:
            continue
        apex_row = rows.min()
        apex_col = int(cols[rows == apex_row].mean())
        apexes.append((int(apex_col), int(apex_row)))

    if not apexes:
        return [], None

    # "Best" is the point highest up in the frame (smallest y).
    topmost = min(apexes, key=lambda xy: xy[1])
    return apexes, topmost

# Returns: (tri_best_xy, tri_count, mask_count, to_cpu_ms, post_ms, masks_np, classes_np, rail_mask, green, tri_positions)
def process_frame_post(frame_bgr, yolo_res):
    """Turn one segmentation result into rail masks and triangle positions.

    Two phases are timed separately: copying masks/classes to host memory
    (``to_cpu_ms``) and the rail/heat/triangle post-processing (``post_ms``).

    Args:
        frame_bgr: the original frame, (H, W, 3) BGR.
        yolo_res: one element of the list returned by ``model.predict``.

    Returns:
        A 10-tuple ``(tri_best_xy, tri_count, mask_count, to_cpu_ms, post_ms,
        masks_np, classes_np, rail_mask, green, tri_positions)``. When there
        are no masks at all, or none of class ``RAIL_ID``, the triangle fields
        are ``None`` / ``0`` / ``[]``, ``post_ms`` is 0.0 and the rail/green
        masks are ``None``.
    """
    frame_h, frame_w = frame_bgr.shape[:2]
    seg = yolo_res.masks
    if seg is None:
        return None, 0, 0, 0.0, 0.0, None, None, None, None, []

    # --- phase 1: device -> host copy (timed) ---
    xfer_start = time.perf_counter()
    masks_np = seg.data.cpu().numpy()  # [n,h,w]
    mask_count = int(masks_np.shape[0])
    # Prefer class ids stored on the masks object; otherwise use the boxes'.
    mask_cls = getattr(seg, "cls", None)
    cls_source = mask_cls if mask_cls is not None else yolo_res.boxes.cls
    classes_np = cls_source.cpu().numpy().astype(int)
    xfer_end = time.perf_counter()
    to_cpu_ms = (xfer_end - xfer_start) * 1000.0

    is_rail = (classes_np == RAIL_ID)
    if mask_count == 0 or classes_np.size == 0 or not np.any(is_rail):
        return None, 0, mask_count, to_cpu_ms, 0.0, masks_np, classes_np, None, None, []

    # --- phase 2: rail union -> colour filter -> heat map -> triangles (timed) ---
    post_start = time.perf_counter()
    rail_stack = masks_np[is_rail].astype(bool)               # [k,h,w]
    rail_union = np.any(rail_stack, axis=0).astype(np.uint8)  # [h,w]
    # NOTE(review): the union is stretched straight to the frame size; if
    # masks.data is at a padded inference resolution this could be slightly
    # misaligned — confirm against the Ultralytics Results/Masks docs.
    rail_mask = cv2.resize(
        rail_union, (frame_w, frame_h), interpolation=cv2.INTER_NEAREST
    ).astype(bool)

    green = highlight_rails_mask_only_fast(frame_bgr, rail_mask)
    red = np.logical_and(rail_mask, np.logical_not(green))
    score = red_vs_green_score(red, green)
    tri_positions, tri_best = purple_triangles(score, frame_h)
    post_end = time.perf_counter()
    post_ms = (post_end - post_start) * 1000.0

    return (tri_best, len(tri_positions), mask_count, to_cpu_ms, post_ms,
            masks_np, classes_np, rail_mask, green, tri_positions)

# --- rendering (excluded from timing) ---
def draw_triangle(img, x, y, size=TRI_SIZE_PX, colour=PURPLE):
    """Draw a filled, black-outlined triangle on ``img`` in place.

    The apex sits at ``(x, y)`` and the base lies ``int(size * 1.2)`` pixels
    below it, spanning ``x - size`` to ``x + size``.
    """
    base_y = y + int(size * 1.2)
    apex = [x, y]
    base_left = [x - size, base_y]
    base_right = [x + size, base_y]
    outline = np.array([apex, base_left, base_right], np.int32)

    cv2.fillConvexPoly(img, outline, colour)
    # 1-px anti-aliased black border so the marker stays visible on any background.
    cv2.polylines(img, [outline.reshape(-1, 1, 2)], True, (0, 0, 0), 1, cv2.LINE_AA)

def render_overlays(frame_bgr, masks_np, classes_np, rail_mask, green_mask, tri_positions):
    """Draw all masks (class color) + labels, rail tint/green, and purple triangles on a copy of the original frame.

    Args:
        frame_bgr: original (H, W, 3) BGR frame; it is not modified.
        masks_np: ``[n, h, w]`` array of instance masks, or ``None``.
        classes_np: length-``n`` array of class ids, or ``None``.
        rail_mask: (H, W) boolean rail mask to tint red, or ``None``.
        green_mask: (H, W) boolean mask painted solid green, or ``None``.
        tri_positions: iterable of ``(x, y)`` triangle apex positions.

    Returns:
        A new (H, W, 3) uint8 image with the overlays drawn.
    """
    out = frame_bgr.copy()
    H, W = out.shape[:2]
    alpha = 0.45

    if masks_np is not None and classes_np is not None and masks_np.size:
        for m, c in zip(masks_np, classes_np):
            # Upsample to frame resolution only when needed.
            m_u8 = m.astype(np.uint8)
            if m_u8.shape != (H, W):
                m_u8 = cv2.resize(m_u8, (W, H), interpolation=cv2.INTER_NEAREST)
            # Always index with a boolean mask. Previously a mask that already
            # had the frame's shape was used as-is, and a non-boolean (e.g.
            # float) array is not a valid index -> IndexError.
            m_full = m_u8.astype(bool)

            ys, xs = np.where(m_full)
            if not xs.size:
                continue  # empty mask: nothing to tint or label

            cls_id = int(c)
            color = CLASS_COLOURS.get(cls_id, (255, 255, 255))
            # Alpha-blend the class colour over the masked pixels.
            out[m_full] = (np.array(color, dtype=np.uint8) * alpha + out[m_full] * (1 - alpha)).astype(np.uint8)

            # Label near the centroid, clamped away from the top/left edges;
            # thick black text under thin white text gives an outline.
            xc, yc = int(xs.mean()), int(ys.mean())
            label = LABELS.get(cls_id, f"C{cls_id}")
            origin = (max(5, xc - 40), max(20, yc))
            cv2.putText(out, label, origin,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2, cv2.LINE_AA)
            cv2.putText(out, label, origin,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)

    # Rail tint + green highlight (if available)
    if rail_mask is not None:
        tint = out.copy()
        tint[rail_mask] = (0, 0, 255)  # red tint for rails
        out = cv2.addWeighted(tint, 0.30, out, 0.70, 0)
    if green_mask is not None:
        out[green_mask] = (0, 255, 0)

    # Draw ALL purple triangles we found
    for (x, y) in tri_positions:
        draw_triangle(out, x, y)

    return out

# =======================
# Batched execution with prints; overlays saved for first N
# =======================
def run_pipeline_with_prints_and_overlays():
    """Run detection + post-processing over every frame, print timings, save overlays.

    Frames are the ``*.jpg`` / ``*.png`` files in ``frames_dir`` in sorted
    order (optionally truncated to ``SHOW_FIRST_N``). They are read in chunks
    of ``BATCH`` on a shared thread pool and sent through ``model.predict``,
    then each result is post-processed with ``process_frame_post``. One timing
    line is printed per frame; rendering and writing overlays happens after
    the print and is not included in any printed figure.

    Returns:
        A list with one entry per input path, in the same order: the best
        triangle ``(x, y)`` for that frame, or ``None`` when no triangle was
        found or the file could not be read.

    Raises:
        FileNotFoundError: if ``frames_dir`` contains no matching images.
    """
    # "*.jpg" / "*.png" already match every "frame_*" file, so two globs cover
    # the same set the four original patterns did.
    paths = sorted(set(
        glob.glob(str(frames_dir / "*.jpg")) +
        glob.glob(str(frames_dir / "*.png"))
    ))
    if not paths:
        raise FileNotFoundError(f"No images in: {frames_dir}")
    if SHOW_FIRST_N is not None:
        paths = paths[:SHOW_FIRST_N]

    N = len(paths)
    results_triangle_xy = [None] * N

    def load_batch(pool, chunk_paths):
        """Read one chunk concurrently.

        Returns a list of ``(offset_in_chunk, path, image, read_ms)`` for the
        files that decoded. Unreadable files are reported on stderr and left
        out, but the offsets of the remaining files are preserved.
        """
        loaded = []
        timed_reads = pool.map(load_image_with_time, chunk_paths)
        for offset, (p, (img, r_ms)) in enumerate(zip(chunk_paths, timed_reads)):
            if img is None:
                print(f"skipping unreadable image: {p}", file=sys.stderr)
                continue
            loaded.append((offset, p, img, r_ms))
        return loaded

    idx_global = 0
    # One pool for the whole run instead of building a new one per batch.
    with ThreadPoolExecutor(max_workers=THREADS_IO) as pool:
        for chunk_paths in chunked(paths, BATCH):
            # Always advance by the size of the ORIGINAL chunk so that result
            # slots and frame numbers keep matching positions in `paths`, even
            # when some (or all) files in the chunk failed to load.
            chunk_start = idx_global
            idx_global += len(chunk_paths)

            loaded = load_batch(pool, chunk_paths)
            if not loaded:
                continue
            offsets, batch_paths, imgs_bgr, read_ms_list = (list(col) for col in zip(*loaded))
            B = len(imgs_bgr)

            t0_inf = time.perf_counter()
            res_list = model.predict(
                imgs_bgr, task="segment", imgsz=IMG_SIZE, device=device,
                conf=CONF, iou=IOU, verbose=False, half=half, max_det=MAX_DET,
                batch=B
            )
            # Wait for queued device work so the timer covers the real
            # inference. Best-effort: a missing/failed synchronize only makes
            # the timing less accurate, so it must not abort the run.
            try:
                if device == 0 and torch.cuda.is_available():
                    torch.cuda.synchronize()
                elif device == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
                    torch.mps.synchronize()
            except Exception:
                pass
            t1_inf = time.perf_counter()
            # Batch inference time split evenly across the frames in the batch.
            infer_ms_share = ((t1_inf - t0_inf) * 1000.0) / B

            for offset, path, img, yres, read_ms in zip(offsets, batch_paths, imgs_bgr, res_list, read_ms_list):
                (tri_best_xy, tri_count, mask_count, to_cpu_ms, post_ms,
                 masks_np, classes_np, rail_mask, green_mask, tri_positions) = process_frame_post(img, yres)

                pos = chunk_start + offset
                results_triangle_xy[pos] = tri_best_xy
                proc_ms = infer_ms_share + to_cpu_ms + post_ms
                fname = os.path.basename(path)
                frame_idx = pos + 1

                # Timing/diagnostic print (no rendering time included)
                print(f"[{frame_idx}/{N}] {fname}  "
                      f"read {read_ms:.1f} | infer {infer_ms_share:.1f} | "
                      f"to_cpu {to_cpu_ms:.1f} | post {post_ms:.1f} | "
                      f"masks {mask_count} | triangles {tri_count} "
                      f"=> proc {proc_ms:.1f} ms")

                # --- RENDERING (EXCLUDED from timing) ---
                if frame_idx <= RENDER_FIRST_N:
                    overlay = render_overlays(img, masks_np, classes_np, rail_mask, green_mask, tri_positions)
                    out_path = out_dir / f"overlay_{frame_idx:04d}_{fname}"
                    if not cv2.imwrite(str(out_path), overlay):
                        print(f"failed to write overlay: {out_path}", file=sys.stderr)

    return results_triangle_xy

# =======================
# Entry
# =======================
# Run the pipeline only when executed as a script (or notebook cell);
# the returned per-frame triangle list is deliberately discarded.
if __name__ == "__main__":
    _ = run_pipeline_with_prints_and_overlays()


YOLO11n-seg summary (fused): 113 layers, 2,836,908 parameters, 0 gradients, 10.2 GFLOPs
[1/79] frame_00000.png  read 37.3 | infer 122.5 | to_cpu 0.7 | post 158.6 | masks 4 | triangles 1 => proc 281.7 ms
[2/79] frame_00001.png  read 43.8 | infer 42.1 | to_cpu 2.0 | post 156.4 | masks 6 | triangles 1 => proc 200.5 ms
[3/79] frame_00002.png  read 126.1 | infer 53.3 | to_cpu 0.7 | post 178.3 | masks 3 | triangles 3 => proc 232.4 ms
[4/79] frame_00003.png  read 37.4 | infer 45.3 | to_cpu 0.9 | post 157.9 | masks 3 | triangles 2 => proc 204.2 ms
[5/79] frame_00004.png  read 39.2 | infer 46.0 | to_cpu 1.5 | post 145.1 | masks 5 | triangles 1 => proc 192.5 ms
[6/79] frame_00005.png  read 36.8 | infer 48.6 | to_cpu 0.8 | post 120.5 | masks 4 | triangles 1 => proc 169.8 ms
[7/79] frame_00006.png  read 37.4 | infer 50.7 | to_cpu 1.1 | post 122.3 | masks 5 | triangles 1 => proc 174.0 ms
[8/79] frame_00007.png  read 37.7 | infer 41.3 | to_cpu 0.9 | post 159.4 | masks 6 | triangles 2 => proc 201.6 m