In [6]:
import base64

import cv2
import numpy as np
import requests
import os
from scipy.interpolate import interp1d
from scipy.interpolate import griddata


# Module-level configuration for the gaze tracker.

# Path to a still image; not referenced by the visible code (NOTE(review): possibly unused).
IMG_PATH = "image.jpg"
# API key read from the environment at import time; raises KeyError if the variable is unset.
API_KEY = os.environ["ROBOFLOW_API_KEY"]
DISTANCE_TO_OBJECT = 1250  # mm             # Works best at about 50 cm of real distance from the camera
HEIGHT_OF_HUMAN_FACE = 210  # mm
# Gaze detection endpoint on the local server (port 9001); the API key is passed in the query string.
GAZE_DETECTION_URL = (
    "http://127.0.0.1:9001/gaze/gaze_detection?api_key=" + API_KEY
)
previous_gaze_point = None  # Stores the previous gaze point
SMOOTHING_FACTOR = 0.7  # Smoothing factor: weight given to the previous gaze point (tune this value)

In [None]:
def detect_gazes(frame: np.ndarray):
    """
    Send an image frame to the gaze detection endpoint and return its predictions.

    The frame is JPEG-encoded, base64-encoded and POSTed as JSON to
    ``GAZE_DETECTION_URL``.

    Args:
        frame: Image array to analyse (as returned by ``cv2.VideoCapture.read``).

    Returns:
        The ``"predictions"`` list of the first element of the JSON response.
        An empty list is returned when the frame cannot be encoded, the
        request fails or times out, the server answers with a non-200 status,
        or the response body is not a non-empty JSON list.
    """
    encoded_ok, img_encode = cv2.imencode(".jpg", frame)
    if not encoded_ok:
        print("[ERROR] Failed to encode frame as JPEG.")
        return []
    img_base64 = base64.b64encode(img_encode)

    try:
        resp = requests.post(
            GAZE_DETECTION_URL,
            json={
                "api_key": API_KEY,
                "image": {"type": "base64", "value": img_base64.decode("utf-8")},
            },
            # Without a timeout a stalled server would block the capture loop forever.
            timeout=10,
        )
    except requests.RequestException as exc:
        print(f"[ERROR] Gaze detection request failed: {exc}")
        return []

    if resp.status_code != 200:
        print("[ERROR] Failed to get gaze predictions from Roboflow API.")
        return []

    try:
        results = resp.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not valid JSON.
        print("[ERROR] Gaze detection response was not valid JSON.")
        return []

    # Indexing [0] on an empty or non-list payload would raise; treat it as "no gaze".
    if not isinstance(results, list) or not results:
        return []
    return results[0].get("predictions", [])

def _build_screen_mapper(angles, targets):
    """
    Build a callable mapping a ``[yaw, pitch]`` pair to one screen coordinate.

    Uses piecewise-linear interpolation over the calibration samples. Queries
    outside the convex hull of the samples (where linear interpolation yields
    NaN) fall back to the value of the nearest calibration sample.
    """
    # Imported locally because the file's top-level imports do not provide these names.
    from scipy.interpolate import LinearNDInterpolator, NearestNDInterpolator

    linear = LinearNDInterpolator(angles, targets)
    nearest = NearestNDInterpolator(angles, targets)

    def mapper(yaw_pitch):
        query = np.asarray(yaw_pitch, dtype=float).reshape(1, 2)
        value = float(linear(query)[0])
        if np.isnan(value):
            value = float(nearest(query)[0])
        return value

    return mapper


def calibrate_with_roboflow(cap, calibration_points, capture_time=3):
    """
    Calibrate gaze detection by mapping gaze predictions (yaw, pitch) to
    screen coordinates using the given calibration points.

    For each calibration point a red dot is shown in the "Calibration" window
    for ``capture_time`` seconds while frames are read from ``cap`` and sent
    to ``detect_gazes``; the first gaze of every successful detection is
    recorded together with the point being displayed.

    Args:
        cap: Opened ``cv2.VideoCapture`` to read frames from.
        calibration_points: Iterable of ``(x, y)`` integer pixel coordinates.
        capture_time: Seconds spent collecting samples for each point.

    Returns:
        A tuple ``(f_interp_x, f_interp_y)`` of callables. Each accepts a
        ``[yaw, pitch]`` sequence and returns the interpolated screen x or y
        coordinate as a float.

    Raises:
        ValueError: If no calibration points are given, or if the collected
            samples are too few or too degenerate (fewer than three, or all
            collinear in yaw/pitch) to build a 2-D interpolation.
    """
    targets = list(calibration_points)
    if not targets:
        raise ValueError("At least one calibration point is required.")

    # The canvas must be large enough to contain every target; a canvas of
    # the camera frame's size would clip targets given in screen coordinates.
    min_width = max(sx for sx, _ in targets) + 1
    min_height = max(sy for _, sy in targets) + 1

    calibration_data = []
    tick_frequency = cv2.getTickFrequency()

    try:
        for (sx, sy) in targets:
            print(f"[CALIBRATION] Look at point ({sx}, {sy}) for {capture_time} seconds.")
            start_time = cv2.getTickCount()

            while (cv2.getTickCount() - start_time) / tick_frequency < capture_time:
                ret, frame = cap.read()
                if not ret:
                    print("[ERROR] Failed to capture frame during calibration.")
                    break

                # Display the calibration point
                canvas = np.zeros(
                    (max(frame.shape[0], min_height), max(frame.shape[1], min_width), 3),
                    dtype=np.uint8,
                )
                cv2.circle(canvas, (sx, sy), 10, (0, 0, 255), -1)
                cv2.imshow("Calibration", canvas)
                cv2.waitKey(1)

                # Get gaze data from Roboflow
                gazes = detect_gazes(frame)
                if gazes:
                    gaze = gazes[0]
                    calibration_data.append(((gaze["yaw"], gaze["pitch"]), (sx, sy)))
    finally:
        # Close the calibration window even if capture or detection raised.
        cv2.destroyAllWindows()

    if len(calibration_data) < 3:
        raise ValueError(
            "Calibration needs at least 3 gaze samples; "
            f"only {len(calibration_data)} were collected."
        )

    angles = np.array([sample[0] for sample in calibration_data], dtype=float)
    # A triangulation needs samples that span two dimensions in (yaw, pitch).
    if np.linalg.matrix_rank(angles - angles.mean(axis=0)) < 2:
        raise ValueError("Calibration gaze samples are collinear; cannot interpolate.")

    screen_x = np.array([sample[1][0] for sample in calibration_data], dtype=float)
    screen_y = np.array([sample[1][1] for sample in calibration_data], dtype=float)

    f_interp_x = _build_screen_mapper(angles, screen_x)
    f_interp_y = _build_screen_mapper(angles, screen_y)
    return f_interp_x, f_interp_y

def smooth_gaze_point(gaze_point, previous_gaze_point, screen_width, screen_height):
    """
    Smooth the gaze point using exponential smoothing and clamp it to the screen.

    Args:
        gaze_point: Current ``(x, y)`` gaze estimate.
        previous_gaze_point: Previously returned ``(x, y)`` point, or ``None``
            when there is no history yet.
        screen_width: Screen width in pixels; x is clamped to ``[0, screen_width - 1]``.
        screen_height: Screen height in pixels; y is clamped to ``[0, screen_height - 1]``.

    Returns:
        A tuple ``(x, y)`` of integers inside the screen bounds. With no
        previous point the current point is used as-is (only clamped);
        otherwise the previous point is weighted by ``SMOOTHING_FACTOR`` and
        the current point by ``1 - SMOOTHING_FACTOR``.
    """
    if previous_gaze_point is None:
        # No history to blend with, but the first sample must still be
        # clamped so it obeys the same bounds as every later sample.
        raw_x, raw_y = gaze_point
    else:
        raw_x = SMOOTHING_FACTOR * previous_gaze_point[0] + (1 - SMOOTHING_FACTOR) * gaze_point[0]
        raw_y = SMOOTHING_FACTOR * previous_gaze_point[1] + (1 - SMOOTHING_FACTOR) * gaze_point[1]

    smoothed_x = max(0, min(int(raw_x), screen_width - 1))
    smoothed_y = max(0, min(int(raw_y), screen_height - 1))
    return smoothed_x, smoothed_y

def main():
    """
    Run the interactive gaze tracker.

    Opens camera 0, runs a 15-point (5 columns x 3 rows) calibration, then
    repeatedly reads frames, maps the detected (yaw, pitch) to screen
    coordinates with the calibrated interpolators, smooths the result and
    draws it as a red dot in the "Gaze Tracking" window until ESC is pressed
    or a frame cannot be read.

    Updates the module-level ``previous_gaze_point`` with each smoothed point.
    The camera and all OpenCV windows are released even if calibration or
    tracking raises.
    """
    global previous_gaze_point

    # Camera setup
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("[ERROR] Unable to open the camera.")
        return

    # Screen resolution
    screen_width, screen_height = 1920, 1080

    # Define calibration points
    x_positions = [0, screen_width // 4, screen_width // 2, 3 * screen_width // 4, screen_width - 1]
    calibration_points = [
        (x, y)
        for y in [0, screen_height // 2, screen_height - 1]
        for x in x_positions
    ]

    try:
        # Calibration
        f_interp_x, f_interp_y = calibrate_with_roboflow(cap, calibration_points)

        print("[INFO] Starting gaze tracking. Press ESC to exit.")
        while True:
            ret, frame = cap.read()
            if not ret:
                print("[ERROR] Failed to capture frame.")
                break

            # Detect gaze
            gazes = detect_gazes(frame)
            if gazes:
                gaze = gazes[0]
                yaw, pitch = gaze["yaw"], gaze["pitch"]

                # Map to calibrated screen coordinates. np.ravel accepts both
                # scalar and array results from the interpolators.
                mapped_x = float(np.ravel(f_interp_x([yaw, pitch]))[0])
                mapped_y = float(np.ravel(f_interp_y([yaw, pitch]))[0])

                # A non-finite mapping cannot be converted to a pixel; skip the frame.
                if np.isfinite(mapped_x) and np.isfinite(mapped_y):
                    gaze_point = (int(mapped_x), int(mapped_y))

                    # Smooth the gaze point
                    smoothed_gaze_point = smooth_gaze_point(
                        gaze_point, previous_gaze_point, screen_width, screen_height
                    )
                    previous_gaze_point = smoothed_gaze_point

                    # Visualize the gaze point
                    canvas = np.zeros((screen_height, screen_width, 3), dtype=np.uint8)
                    cv2.circle(canvas, smoothed_gaze_point, 20, (0, 0, 255), -1)
                    cv2.imshow("Gaze Tracking", canvas)

            # Poll the keyboard on every frame, including frames without a
            # detected gaze, so ESC keeps working and the window stays responsive.
            if cv2.waitKey(1) & 0xFF == 27:  # ESC key
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

[CALIBRATION] Look at point (0, 0) for 3 seconds.
[CALIBRATION] Look at point (480, 0) for 3 seconds.
