In [1]:
import cv2
import time
import numpy as np


def draw_initial_instructions(frame, trackers_text):
    """Draw instructions on the initial frame with adaptive text size."""
    h, w = frame.shape[:2]
    
    # Calculate adaptive font scale based on frame width
    base_scale = w / 1280  # Normalize to 1920p width
    title_scale = 2.0 * base_scale
    text_scale = 1.5 * base_scale
    thickness_title = max(3, int(4 * base_scale))
    thickness_text = max(2, int(3 * base_scale))
    
    # Thickness for white border (outline) - moderately thicker for better visibility
    border_thickness_title = thickness_title + max(2, int(4 * base_scale))
    border_thickness_text = thickness_text + max(2, int(3 * base_scale))
    
    # Create black overlay for entire frame
    overlay = np.zeros_like(frame)
    cv2.rectangle(overlay, (0, 0), (w, h), (0, 0, 0), -1)
    frame = cv2.addWeighted(overlay, 0.7, frame, 0.3, 0)
    
    # Calculate text positions centered
    title_text = f"TRACKER COMPARISON: {trackers_text}"
    inst1_text = "1. Drag to select ROI (Region of Interest)"
    inst2_text = "2. Press ENTER to confirm selection"
    inst3_text = "3. Press ESC to cancel"
    inst4_text = "4. Press Q to quit during comparison"
    
    # Get text sizes
    (title_w, title_h), _ = cv2.getTextSize(title_text, cv2.FONT_HERSHEY_SIMPLEX, title_scale, thickness_title)
    
    # Center positions
    y_start = int(h * 0.3)
    y_spacing = int(80 * base_scale)
    x_center = w // 2
    
    # Draw title with WHITE BORDER + RED TEXT
    title_pos = (x_center - title_w // 2, y_start)
    # First: white border (outline)
    cv2.putText(frame, title_text, title_pos,
                cv2.FONT_HERSHEY_SIMPLEX, title_scale, (255, 255, 255), border_thickness_title, cv2.LINE_AA)
    # Second: red text on top
    cv2.putText(frame, title_text, title_pos,
                cv2.FONT_HERSHEY_SIMPLEX, title_scale, (255, 0, 0), thickness_title, cv2.LINE_AA)
    
    # Draw instructions with WHITE BORDER + RED TEXT
    y_pos = y_start + y_spacing * 2
    for inst_text in [inst1_text, inst2_text, inst3_text, inst4_text]:
        (text_w, text_h), _ = cv2.getTextSize(inst_text, cv2.FONT_HERSHEY_SIMPLEX, text_scale, thickness_text)
        inst_pos = (x_center - text_w // 2, y_pos)
        # First: white border (outline)
        cv2.putText(frame, inst_text, inst_pos,
                    cv2.FONT_HERSHEY_SIMPLEX, text_scale, (255, 255, 255), border_thickness_text, cv2.LINE_AA)
        # Second: red text on top
        cv2.putText(frame, inst_text, inst_pos,
                    cv2.FONT_HERSHEY_SIMPLEX, text_scale, (255, 0, 0), thickness_text, cv2.LINE_AA)
        y_pos += y_spacing
    
    return frame


def moving_average(data, window=30):
    """Smooth FPS values with larger window for more realistic values."""
    if len(data) < window:
        return np.mean(data)
    return np.mean(data[-window:])


def compute_tracking_stability(failures, total_frames):
    """Estimate tracking stability as (1 - failure_ratio)."""
    if total_frames == 0:
        return 0
    return max(0, 1 - failures / total_frames)


def draw_metrics_with_background(frame, tracker_name, processing_fps, playback_fps, stability, failures):
    """Draw metrics with black background box and larger text."""
    h, w = frame.shape[:2]
    
    # Calculate adaptive sizes
    base_scale = w / 3840  # Normalize based on half frame width
    name_scale = 2.5 * base_scale
    metric_scale = 3.0 * base_scale
    thickness_name = max(4, int(6 * base_scale))
    thickness_metric = max(3, int(5 * base_scale))
    
    # Draw black semi-transparent background box - increased height for new metric
    box_height = int(350 * base_scale)
    box_width = int(w * 0.9)
    box_x = int(w * 0.05)
    box_y = 20
    
    overlay = frame.copy()
    cv2.rectangle(overlay, (box_x, box_y), (box_x + box_width, box_y + box_height), (0, 0, 0), -1)
    frame = cv2.addWeighted(overlay, 0.6, frame, 0.4, 0)
    
    # Text positions
    text_x = box_x + 30
    y_start = box_y + int(70 * base_scale)
    y_spacing = int(70 * base_scale)
    
    # Draw tracker name
    cv2.putText(frame, tracker_name, (text_x, y_start),
                cv2.FONT_HERSHEY_SIMPLEX, name_scale, (255, 255, 255), thickness_name, cv2.LINE_AA)
    
    # Draw metrics - Processing FPS
    cv2.putText(frame, f"Processing FPS: {processing_fps:.1f}", (text_x, y_start + y_spacing),
                cv2.FONT_HERSHEY_SIMPLEX, metric_scale, (0, 255, 255), thickness_metric, cv2.LINE_AA)
    
    # Draw Playback FPS (real reproduction speed)
    cv2.putText(frame, f"Playback FPS: {playback_fps:.1f}", (text_x, y_start + y_spacing * 2),
                cv2.FONT_HERSHEY_SIMPLEX, metric_scale, (255, 165, 0), thickness_metric, cv2.LINE_AA)
    
    cv2.putText(frame, f"Stability: {stability:.2f}", (text_x, y_start + y_spacing * 3),
                cv2.FONT_HERSHEY_SIMPLEX, metric_scale, (0, 255, 0), thickness_metric, cv2.LINE_AA)
    
    cv2.putText(frame, f"Failures: {failures}", (text_x, y_start + y_spacing * 4),
                cv2.FONT_HERSHEY_SIMPLEX, metric_scale, (100, 100, 255), thickness_metric, cv2.LINE_AA)
    
    return frame


def run_tracker_comparison(video_path, tracker_A, tracker_B):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Unable to open video.")
        return

    # Get video FPS for proper playback speed
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    if video_fps <= 0:
        video_fps = 30  # Default fallback
    target_frame_time = 1.0 / video_fps  # Target time per frame in seconds
    print(f"[INFO] Video FPS: {video_fps:.2f}, Target frame time: {target_frame_time*1000:.2f}ms")

    ok, frame = cap.read()
    if not ok:
        print("Error: Cannot read the first frame.")
        return

    # Show initial instructions on full frame
    instruction_frame = draw_initial_instructions(frame.copy(), f"{tracker_A} vs {tracker_B}")
    
    # Scale down frame for ROI selection if video is large
    max_display = 1080
    h, w = frame.shape[:2]
    scale = 1.0
    if max(h, w) > max_display:
        scale = max_display / max(h, w)
        display_frame = cv2.resize(instruction_frame, (int(w * scale), int(h * scale)))
        roi_frame = cv2.resize(frame, (int(w * scale), int(h * scale)))
        print(f"[INFO] ROI selection scaled by {scale:.3f} for large video")
    else:
        display_frame = instruction_frame.copy()
        roi_frame = frame.copy()

    # Create window and center it on screen
    window_name = "Select ROI"
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    
    # Get display frame dimensions
    display_h, display_w = display_frame.shape[:2]
    
    # Estimate screen size (common resolutions) and calculate center position
    # For macOS with Retina displays, adjust these values based on your screen
    screen_width = 2560  # Adjust to your screen resolution
    screen_height = 1440  # Adjust to your screen resolution
    
    # Calculate position to center the window
    window_x = max(0, (screen_width - display_w) // 2)
    window_y = max(0, (screen_height - display_h) // 2)
    
    # Resize window to match display frame
    cv2.resizeWindow(window_name, display_w, display_h)
    
    # Move window to center
    cv2.moveWindow(window_name, window_x, window_y)

    # Show instructions
    cv2.imshow(window_name, display_frame)
    cv2.waitKey(5000)  # Show instructions for 5 seconds
    
    # Now show frame for ROI selection
    bbox_scaled = cv2.selectROI(window_name, roi_frame, fromCenter=False, showCrosshair=True)
    cv2.destroyWindow(window_name)
    bbox = tuple(int(v / scale) for v in bbox_scaled)
    print(f"[ROI] Selected region: {bbox}")

    # Define trackers
    tracker_dict = {
        "KCF": cv2.TrackerKCF_create,
        "CSRT": cv2.TrackerCSRT_create,
        "MOSSE": cv2.legacy.TrackerMOSSE_create,
        "MIL": cv2.TrackerMIL_create,
        "MedianFlow": cv2.legacy.TrackerMedianFlow_create,
    }

    if tracker_A not in tracker_dict or tracker_B not in tracker_dict:
        print("Error: Unknown tracker name. Use one of: KCF, CSRT, MOSSE, MIL, MedianFlow.")
        return

    tracker1 = tracker_dict[tracker_A]()
    tracker2 = tracker_dict[tracker_B]()
    tracker1.init(frame, bbox)
    tracker2.init(frame, bbox)

    # Metrics tracking
    tracking_times_A, tracking_times_B = [], []  # Algorithm speed
    real_frame_times_A, real_frame_times_B = [], []  # Real processing FPS
    playback_times = []  # Actual playback FPS (with wait time)
    failures_A, failures_B = 0, 0
    consecutive_failures_A, consecutive_failures_B = 0, 0
    max_consecutive_failures_A, max_consecutive_failures_B = 0, 0
    total_frames = 0

    print(f"\nRunning comparison {tracker_A} vs {tracker_B}...")
    print("Press [Q] or [ESC] to exit.\n")

    while True:
        iteration_start = time.time()  # Start measuring total iteration time
        
        ok, frame = cap.read()
        if not ok:
            break
        total_frames += 1

        frameA = frame.copy()
        frameB = frame.copy()

        # ========== Tracker A - Measure REAL frame processing time ==========
        frame_start_A = time.time()  # Start measuring full frame time
        
        track_start_A = time.time()
        okA, bboxA = tracker1.update(frameA)
        track_time_A = time.time() - track_start_A
        tracking_times_A.append(track_time_A)

        if okA:
            # Draw bounding box with thicker line
            p1 = (int(bboxA[0]), int(bboxA[1]))
            p2 = (int(bboxA[0] + bboxA[2]), int(bboxA[1] + bboxA[3]))
            cv2.rectangle(frameA, p1, p2, (255, 0, 0), 5)
            consecutive_failures_A = 0
        else:
            failures_A += 1
            consecutive_failures_A += 1
            max_consecutive_failures_A = max(max_consecutive_failures_A, consecutive_failures_A)
            
            # Draw failure message
            h_a, w_a = frameA.shape[:2]
            base_scale_a = w_a / 3840
            fail_scale = 2.0 * base_scale_a
            fail_thickness = max(4, int(6 * base_scale_a))
            cv2.putText(frameA, "TRACKING FAILURE", (int(w_a * 0.25), int(h_a * 0.5)),
                        cv2.FONT_HERSHEY_SIMPLEX, fail_scale, (0, 0, 255), fail_thickness, cv2.LINE_AA)

        stability_A = compute_tracking_stability(failures_A, total_frames)
        
        # Calculate processing FPS from frame times
        processing_fps_A = moving_average([1/max(t, 1e-6) for t in real_frame_times_A]) if real_frame_times_A else 0
        
        # Calculate playback FPS
        playback_fps = moving_average([1/max(t, 1e-6) for t in playback_times]) if playback_times else video_fps

        # Draw metrics with background
        frameA = draw_metrics_with_background(frameA, tracker_A, processing_fps_A, playback_fps, stability_A, failures_A)
        
        # Measure total frame time AFTER all drawing
        frame_time_A = time.time() - frame_start_A
        real_frame_times_A.append(frame_time_A)

        # ========== Tracker B - Measure REAL frame processing time ==========
        frame_start_B = time.time()  # Start measuring full frame time
        
        track_start_B = time.time()
        okB, bboxB = tracker2.update(frameB)
        track_time_B = time.time() - track_start_B
        tracking_times_B.append(track_time_B)

        if okB:
            # Draw bounding box with thicker line
            p1 = (int(bboxB[0]), int(bboxB[1]))
            p2 = (int(bboxB[0] + bboxB[2]), int(bboxB[1] + bboxB[3]))
            cv2.rectangle(frameB, p1, p2, (0, 255, 0), 5)
            consecutive_failures_B = 0
        else:
            failures_B += 1
            consecutive_failures_B += 1
            max_consecutive_failures_B = max(max_consecutive_failures_B, consecutive_failures_B)
            
            # Draw failure message
            h_b, w_b = frameB.shape[:2]
            base_scale_b = w_b / 3840
            fail_scale = 2.0 * base_scale_b
            fail_thickness = max(4, int(6 * base_scale_b))
            cv2.putText(frameB, "TRACKING FAILURE", (int(w_b * 0.25), int(h_b * 0.5)),
                        cv2.FONT_HERSHEY_SIMPLEX, fail_scale, (0, 0, 255), fail_thickness, cv2.LINE_AA)

        stability_B = compute_tracking_stability(failures_B, total_frames)
        
        # Calculate processing FPS from frame times
        processing_fps_B = moving_average([1/max(t, 1e-6) for t in real_frame_times_B]) if real_frame_times_B else 0

        # Draw metrics with background
        frameB = draw_metrics_with_background(frameB, tracker_B, processing_fps_B, playback_fps, stability_B, failures_B)
        
        # Measure total frame time AFTER all drawing
        frame_time_B = time.time() - frame_start_B
        real_frame_times_B.append(frame_time_B)

        combined_frame = np.hstack((frameA, frameB))

        # Scale display for large resolutions
        max_width = 1920
        if combined_frame.shape[1] > max_width:
            scale_disp = max_width / combined_frame.shape[1]
            combined_frame = cv2.resize(combined_frame, (0, 0), fx=scale_disp, fy=scale_disp)

        cv2.imshow("Tracker Comparison", combined_frame)
        
        # Calculate remaining time to maintain target FPS
        processing_time = time.time() - iteration_start
        remaining_time = max(0.001, target_frame_time - processing_time)  # At least 1ms
        wait_time_ms = int(remaining_time * 1000)
        
        key = cv2.waitKey(wait_time_ms) & 0xFF
        
        # Track actual playback time (processing + wait)
        iteration_time = time.time() - iteration_start
        playback_times.append(iteration_time)
        
        if key in [27, ord('q')]:
            print("Comparison stopped by user.")
            break

    cap.release()
    cv2.destroyAllWindows()

    # Calculate final metrics
    tracker_fps_A = np.mean([1/max(t, 1e-6) for t in tracking_times_A])
    tracker_fps_B = np.mean([1/max(t, 1e-6) for t in tracking_times_B])
    processing_fps_A = np.mean([1/max(t, 1e-6) for t in real_frame_times_A])
    processing_fps_B = np.mean([1/max(t, 1e-6) for t in real_frame_times_B])
    actual_playback_fps = np.mean([1/max(t, 1e-6) for t in playback_times])
    stability_A_final = compute_tracking_stability(failures_A, total_frames)
    stability_B_final = compute_tracking_stability(failures_B, total_frames)
    failure_rate_A = (failures_A / total_frames * 100) if total_frames > 0 else 0
    failure_rate_B = (failures_B / total_frames * 100) if total_frames > 0 else 0

    print("\n" + "="*60)
    print("FINAL METRICS COMPARISON")
    print("="*60)
    print(f"\nVideo Target FPS: {video_fps:.2f}")
    print(f"Actual Playback FPS: {actual_playback_fps:.2f}")
    print(f"\n{tracker_A}:")
    print(f"  - Processing FPS: {processing_fps_A:.2f}")
    print(f"  - Tracker FPS (Algorithm): {tracker_fps_A:.2f}")
    print(f"  - Tracking Stability: {stability_A_final:.2%}")
    print(f"  - Total Failures: {failures_A}/{total_frames} frames ({failure_rate_A:.1f}%)")
    print(f"  - Max Consecutive Failures: {max_consecutive_failures_A}")
    print(f"  - Avg Tracking Time: {np.mean(tracking_times_A)*1000:.2f} ms/frame")
    
    print(f"\n{tracker_B}:")
    print(f"  - Processing FPS: {processing_fps_B:.2f}")
    print(f"  - Tracker FPS (Algorithm): {tracker_fps_B:.2f}")
    print(f"  - Tracking Stability: {stability_B_final:.2%}")
    print(f"  - Total Failures: {failures_B}/{total_frames} frames ({failure_rate_B:.1f}%)")
    print(f"  - Max Consecutive Failures: {max_consecutive_failures_B}")
    print(f"  - Avg Tracking Time: {np.mean(tracking_times_B)*1000:.2f} ms/frame")
    
    print("\n" + "="*60)
    
    # Determine winner based on stability first, then processing FPS
    if stability_A_final > stability_B_final:
        print(f"Winner: {tracker_A} (better stability)")
    elif stability_B_final > stability_A_final:
        print(f"Winner: {tracker_B} (better stability)")
    else:
        if processing_fps_A > processing_fps_B:
            print(f"Winner: {tracker_A} (same stability, better processing FPS)")
        else:
            print(f"Winner: {tracker_B} (same stability, better processing FPS)")
    print("="*60 + "\n")


# ==========================================================
# RUN
# ==========================================================
if __name__ == "__main__":
    video_path = "14508384_2160_3840_60fps.mp4"
    run_tracker_comparison(video_path, tracker_A="CSRT", tracker_B="MOSSE")

[INFO] Video FPS: 60.00, Target frame time: 16.67ms
[INFO] ROI selection scaled by 0.281 for large video


2025-10-31 18:15:10.484 python[54210:37015667] +[IMKClient subclass]: chose IMKClient_Modern
2025-10-31 18:15:10.484 python[54210:37015667] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Select a ROI and then press SPACE or ENTER button!
Cancel the selection process by pressing c button!
[ROI] Selected region: (1973, 2968, 117, 113)

Running comparison CSRT vs MOSSE...
Press [Q] or [ESC] to exit.


FINAL METRICS COMPARISON

Video Target FPS: 60.00
Actual Playback FPS: 8.50

CSRT:
  - Processing FPS: 23.27
  - Tracker FPS (Algorithm): 33.79
  - Tracking Stability: 100.00%
  - Total Failures: 0/308 frames (0.0%)
  - Max Consecutive Failures: 0
  - Avg Tracking Time: 29.74 ms/frame

MOSSE:
  - Processing FPS: 75.53
  - Tracker FPS (Algorithm): 1452.85
  - Tracking Stability: 100.00%
  - Total Failures: 0/308 frames (0.0%)
  - Max Consecutive Failures: 0
  - Avg Tracking Time: 0.73 ms/frame

Winner: MOSSE (same stability, better processing FPS)



# Summary Report: CSRT vs MOSSE Trackers

## 1. Experiment Setup
- **Video resolution:** 4K (3840×2160)  
- **Total frames:** 308  
- **Original FPS:** 60.00  
- **Actual playback FPS:** 8.39  
- **ROI:** (1955, 2944, 149, 142) → 149×142 pixels  

---

## 2. Key Results

| Metric | CSRT | MOSSE | Winner |
|--------|------|--------|--------|
| **Tracker FPS (algorithm only)** | 32.59 | **1093.39** | MOSSE (~33× faster) |
| **Processing FPS (with rendering)** | 22.68 | **74.35** | MOSSE (~3.3× faster) |
| **Tracking Stability** | 100% | 100% | Tie |
| **Total Failures** | 0 / 308 | 0 / 308 | Tie |
| **Average Tracking Time** | 30.93 ms | **0.93 ms** | MOSSE |

**Summary:** Both trackers achieved perfect stability, but MOSSE was significantly faster than CSRT in both algorithm and overall processing speed.

---

## 3. Playback FPS Analysis

The actual playback speed (8.39 fps) is low due to the high processing time per frame (~119 ms total).  
Estimated breakdown per iteration:

| Component | Time (ms) | Percentage |
|------------|-----------|-------------|
| CSRT Processing | 44.09 | 37% |
| MOSSE Processing | 13.45 | 11% |
| Frame Combination (`np.hstack`) | 2.0 | 2% |
| Resize (4K → 1920px) | 3.0 | 2% |
| Display (`cv2.imshow`) | 1.0 | 1% |
| System Overhead | ~55.46 | 47% |
| **Total** | **~119 ms** | **100%** |

**Main causes of slowdown:**
- High-resolution (4K) frame processing  
- Two trackers running simultaneously  
- Expensive drawing operations (anti-aliasing, transparency)  
- System-level display overhead  

To reach 60 fps, the total processing time per iteration must decrease from approximately 119 ms to 16.67 ms (around 7× faster).

---

## 4. Interpretation

- **Rendering overhead:** ~13 ms per tracker (constant cost)  
- **MOSSE:** 93% of its total time is spent on rendering; the algorithm itself is extremely fast.  
- **CSRT:** The tracking computation is the main bottleneck (~31 ms per frame).  

Rendering time remains nearly constant across trackers, so optimization should focus on reducing frame size and drawing complexity.

---

## 5. Practical Recommendations

**Use CSRT when:**
- Tracking accuracy and robustness are more important than speed  
- Suitable for offline or post-processing workflows  

**Use MOSSE when:**
- Real-time performance is required  
- Working with high-resolution or long-duration videos  

**To improve playback FPS:**
1. Downscale video to 1080p  
2. Simplify rendering (avoid transparency and anti-aliased overlays)  
3. Use faster trackers (e.g., MOSSE for both sides)  
4. Enable parallel processing (multi-threading)  
5. Update metrics every *N* frames instead of every frame  

---

## 6. Final Conclusions

1. MOSSE is approximately 33× faster than CSRT while maintaining 100% tracking stability.  
2. Both trackers performed perfectly on the selected ROI.  
3. Rendering (~13 ms) represents a major portion of the overhead regardless of the tracker.  
4. Playback FPS (8.39) is primarily limited by CSRT’s slower processing speed.  
5. Achieving 60 fps would require about a 7× improvement in overall processing efficiency.  

**Winner: MOSSE** — Faster, equally stable, and suitable for real-time tracking applications.
