In [2]:
import cv2
import numpy as np
from skimage import io
from batch_face import RetinaFace, LandmarkPredictor, draw_landmarks, Timer
from live_pose_estimator import SixDRep
import mediapipe as mp
from mediapipe.python.solutions.drawing_utils import DrawingSpec
from mediapipe.python.solutions.pose import PoseLandmark
import time

In [3]:
# For Mac users
#import torch

## Force CPU usage
#torch.set_default_tensor_type('torch.FloatTensor')

In [4]:
def get_landmarks(frame, faces):
    """Predict facial landmarks for the given faces in ``frame``.

    Thin wrapper around the module-level ``predictor``; ``from_fd=True`` is
    forwarded unchanged (presumably it marks ``faces`` as coming straight from
    the face detector -- confirm against the batch_face documentation).

    Returns whatever ``predictor`` returns (indexed per face by the callers).
    """
    return predictor(faces, frame, from_fd=True)

In [5]:
def draw_landmarks_cv(frame, faces, landmarks):
    """Draw the landmarks of the first face onto ``frame`` and return the result.

    Only index 0 of ``faces`` / ``landmarks`` is used: the pipeline is built
    around a single face in view. Drawing every face would need a loop here.
    """
    first_box = faces[0][0]
    first_landmarks = landmarks[0]
    return draw_landmarks(frame, first_box, first_landmarks)

In [6]:
def get_head_pose(frame, faces_pose):
    """Estimate the head pose of every face in ``faces_pose``.

    Delegates to the module-level ``head_pose_estimator``, passing the faces in
    the 'tuple' format (the box is the first element of each face entry).
    """
    return head_pose_estimator(
        faces_pose,
        frame,
        input_face_type='tuple',
        update_dict=True,
    )

In [7]:
def draw_head_pose_cube_cv(frame, faces, pose):
    """Draw the head-pose cube of the first face onto ``frame`` (in place).

    ``pose`` is unpacked as keyword arguments, so it must be a mapping whose
    keys match the parameters of ``head_pose_estimator.plot_pose_cube``.
    """
    first_box = faces[0][0]
    head_pose_estimator.plot_pose_cube(frame, first_box, **pose)

In [8]:
def updated_bbox(landmarks, margin=10):
    """Derive a face bounding box from the previous frame's landmarks.

    This avoids running the face detector on every frame: the box is the
    extent of the first face's landmark points, grown by ``margin`` pixels on
    every side.

    Args:
        landmarks: sequence whose first item is an (N, 2) array of (x, y)
            landmark points; only that first face is used.
        margin: padding added around the landmark extent. Defaults to 10,
            the value that used to be hard-coded. Keep it an integer when the
            landmarks are integer-typed (the padding is applied in place).

    Returns:
        A one-element list ``[[box, None, None]]`` with ``box`` an array
        ``[x1, y1, x2, y2]`` -- the same (box, landmarks, score) layout the
        rest of the notebook indexes as ``faces[0][0]``.
    """
    ldm_new = landmarks[0]
    (x1, y1), (x2, y2) = ldm_new.min(0), ldm_new.max(0)
    box_new = np.array([x1, y1, x2, y2])
    box_new[:2] -= margin  # move the top-left corner outwards
    box_new[2:] += margin  # move the bottom-right corner outwards
    return [[box_new, None, None]]

In [30]:
from batch_face import (
    RetinaFace,
)
from sixdrepnet.model import SixDRepNet
import os
import numpy as np
import cv2
from math import cos, sin

import torch
from PIL import Image
from sixdrepnet import utils

import torch
from torch import nn
import torch.nn.functional as F

# Use the first CUDA GPU when one is available and fall back to the CPU
# otherwise (RetinaFace takes gpu_id=-1 for CPU, e.g. on a Mac).
_use_cuda = torch.cuda.is_available()
detector = RetinaFace(gpu_id=0 if _use_cuda else -1)
cam = 1
device = torch.device('cuda' if _use_cuda else 'cpu')

# Head-pose network used by get_input_data().
# NOTE(review): with pretrained=False and an empty backbone_file no weights
# are loaded in this cell -- confirm that a checkpoint is loaded into `model`
# elsewhere before its predictions are relied upon.
model = SixDRepNet(backbone_name='RepVGG-B1g2',
                   backbone_file='',
                   deploy=True,
                   pretrained=False)
model.to(device)
model.eval()  # inference only: make sure no train-mode layers are active

# When True, draw_eye_gaze() converts the eye strip to grayscale first.
bw = False

class SixthEyeNet(nn.Module):
    """Small CNN that regresses two gaze angles from an eye strip plus head pose.

    The network expects a 3-channel eye image of size 70x210 (height x width,
    the size the notebook resizes the eye strip to) and a 3-value head-pose
    vector per sample. It returns two values per sample, which the notebook
    interprets as (pitch, yaw).

    It derives from ``nn.Module`` rather than ``nn.ModuleList``: the layers
    are plain attributes and the model is never used as a list. Parameter
    names are unchanged, so existing state dicts still load.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 9, 3)
        self.pool = nn.MaxPool2d(3, 3)
        self.conv2 = nn.Conv2d(9, 26, 3)
        # 3432 = 26 channels * 6 * 22: the feature map left after two
        # conv(3x3) + max-pool(3, stride 3) stages on a 3x70x210 input.
        self.fc1 = nn.Linear(3432, 600)
        self.fc2 = nn.Linear(600, 50)
        # 53 = 50 image features + 3 head-pose values concatenated in forward().
        self.fc3 = nn.Linear(53, 2)

    def forward(self, x):
        """Run the network on an ``(image_batch, head_pose_batch)`` tuple.

        Args:
            x: tuple ``(images, head_pos)`` with ``images`` of shape
                (batch, 3, 70, 210) and ``head_pos`` of shape (batch, 3).

        Returns:
            Tensor of shape (batch, 2).
        """
        x, head_pos = x
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Inject the head pose just before the final regression layer.
        x = torch.cat((x, head_pos), 1)
        x = self.fc3(x)
        return x

In [12]:
def _crop_eye_patch(image, center_x, center_y, x_offset, y_offset, coeff):
    """Cut the patch around one eye out of the original (un-resized) image.

    The eye centre and offsets are given in resized-image coordinates and are
    divided by ``coeff`` to map them back to ``image``. Every bound is clamped
    at 0 so an eye close to the top/left border yields a smaller patch instead
    of a wrapped-around (negative index) slice.
    """
    top = max(0, int((center_y - y_offset) / coeff))
    bottom = max(0, int((center_y + y_offset) / coeff))
    left = max(0, int((center_x - x_offset) / coeff))
    right = max(0, int((center_x + x_offset) / coeff))
    return image[top:bottom, left:right]


def get_input_data(image, offset_coeff=1) -> "list[dict] | None":
    """Detect faces in ``image`` and build the per-face inputs for the gaze net.

    The image is resized to a width of 1280 px for detection. For every face
    with a detection score of at least 0.95 the head pose is predicted with
    the module-level ``model`` and a side-by-side strip of both eye patches is
    cut from the original image.

    Args:
        image: frame as an array indexed ``[row, col, channel]``.
        offset_coeff: scale factor for the size of the eye patches.

    Returns:
        A list with one dict per accepted face, holding:
        ``p_pred_deg`` / ``y_pred_deg`` / ``r_pred_deg`` (euler angles in
        degrees, CPU tensors), ``image`` (the two eye patches concatenated
        horizontally), ``box`` and ``landmarks`` (both mapped back to the
        coordinates of ``image``).
        ``None`` if anything raised while processing the frame; the error
        arguments are printed.

    Faces whose face crop or eye patches come out empty are skipped on their
    own instead of aborting the whole frame.
    """
    try:
        coeff = 1280 / image.shape[1]
        resized_image = cv2.resize(image, (1280, int(image.shape[0]*coeff)))
        result = []
        with torch.no_grad():
            faces = detector(resized_image)
            for box, landmarks, score in faces:
                # Ignore low-confidence detections.
                if score < .95:
                    continue

                box_x_min = int(box[0])
                box_y_min = int(box[1])
                box_x_max = int(box[2])
                box_y_max = int(box[3])

                # Landmarks 0 and 1 are used as the two eye centres.
                x_3 = int(landmarks[0][0])
                y_3 = int(landmarks[0][1])
                x_4 = int(landmarks[1][0])
                y_4 = int(landmarks[1][1])

                bbox_width = abs(box_x_max - box_x_min)
                bbox_height = abs(box_y_max - box_y_min)

                # Enlarge the box before cropping. The x padding is derived
                # from the height and the y padding from the width, exactly as
                # in the other head-pose crops of this notebook, so all crops
                # stay consistent with each other.
                x_min = max(0, box_x_min - int(0.2*bbox_height))
                y_min = max(0, box_y_min - int(0.2*bbox_width))
                x_max = box_x_max + int(0.2*bbox_height)
                y_max = box_y_max + int(0.2*bbox_width)

                img = resized_image[y_min:y_max, x_min:x_max]
                if img.size == 0:
                    continue
                img = Image.fromarray(img)
                img = img.convert('RGB')
                img = transformations(img)

                # Add the batch dimension expected by the model.
                img = torch.Tensor(img[None, :]).to(device)

                R_pred = model(img)

                euler = utils.compute_euler_angles_from_rotation_matrices(
                    R_pred)*180/np.pi

                curr = {'p_pred_deg': euler[:, 0].cpu(),
                        'y_pred_deg': euler[:, 1].cpu(),
                        'r_pred_deg': euler[:, 2].cpu()
                        }

                # Eye patch size: based on the mean horizontal distance from
                # each eye to its nearer (un-padded) box edge.
                offset = abs(((x_3 - box_x_min)/2 + (box_x_max - x_4)/2)/2)
                x_offset = int(offset*1.2*offset_coeff)
                y_offset = int(offset*0.8*offset_coeff)

                right_eye = _crop_eye_patch(image, x_3, y_3, x_offset, y_offset, coeff)
                left_eye = _crop_eye_patch(image, x_4, y_4, x_offset, y_offset, coeff)
                if right_eye.size == 0 or left_eye.size == 0:
                    # cv2.resize / hconcat cannot handle empty patches.
                    continue

                # hconcat needs both patches to have the same shape.
                left_eye = cv2.resize(
                    left_eye, (right_eye.shape[1], right_eye.shape[0]))
                curr['image'] = cv2.hconcat([right_eye, left_eye])
                curr['box'] = list(map(lambda x: x/coeff, box))
                curr['landmarks'] = list(
                    map(lambda y: list(map(lambda x: x/coeff, y)), landmarks))
                result.append(curr)
    except Exception as e:
        # Best effort: a failing frame must not stop the camera loop.
        print(e.args)
        return None
    return result

In [13]:
def draw_eye_axis(img, yaw, pitch, roll, tdx, tdy, size=100):
    """Draw a single direction line on ``img`` starting at ``(tdx, tdy)``.

    Angles are given in degrees. The end point is the projection of an axis
    of length ``size`` for the given yaw (sign flipped) and pitch.

    Returns:
        ``img`` (the line is drawn in place).
    """
    deg_to_rad = np.pi
    pitch_rad = pitch * deg_to_rad / 180
    yaw_rad = -(yaw * deg_to_rad / 180)
    roll_rad = roll * deg_to_rad / 180  # converted for symmetry; not used by the projection

    end_x = size * (sin(yaw_rad)) + tdx
    end_y = size * (-cos(yaw_rad) * sin(pitch_rad)) + tdy

    start_point = (int(tdx), int(tdy))
    end_point = (int(end_x), int(end_y))
    cv2.line(img, start_point, end_point, (255, 255, 0), 3)

    return img

In [61]:
def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size = 100, draw_all_axes=False):
    """Draw the projected Z axis of a pose (optionally X and Y too) on ``img``.

    Args:
        img: image to draw on (modified in place).
        yaw, pitch, roll: angles in degrees.
        tdx, tdy: origin of the axes in pixels. If either is ``None`` the
            origin defaults to (width / 1.5, height / 1.5) of ``img``.
        size: length of the drawn axes in pixels.
        draw_all_axes: when True the X axis (colour (0, 0, 255)) and the
            Y axis (colour (0, 255, 0)) are drawn as well. Defaults to False,
            which matches the previous behaviour of drawing the Z axis only.

    Returns:
        ``img``.
    """
    pitch = pitch * np.pi / 180
    yaw = -(yaw * np.pi / 180)

    if tdx is None or tdy is None:
        height, width = img.shape[:2]
        tdx = width / 1.5
        tdy = height / 1.5

    origin = (int(tdx), int(tdy))

    if draw_all_axes:
        roll = roll * np.pi / 180

        # X-Axis pointing to right.
        x1 = size * (cos(yaw) * cos(roll)) + tdx
        y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy
        cv2.line(img, origin, (int(x1), int(y1)), (0, 0, 255), 4)

        # Y-Axis pointing down.
        x2 = size * (-cos(yaw) * sin(roll)) + tdx
        y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy
        cv2.line(img, origin, (int(x2), int(y2)), (0, 255, 0), 4)

    # Z-Axis (out of the screen): the direction the pose is facing.
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy
    cv2.line(img, origin, (int(x3), int(y3)), (255, 69, 0), 7)

    return img

In [41]:
def draw_eye_gaze(face, net, affinity_frame):
    """Predict the gaze of one face with ``net`` and draw it on ``affinity_frame``.

    Args:
        face: one entry produced by ``get_input_data`` (reads the keys
            'box', 'p_pred_deg', 'y_pred_deg', 'r_pred_deg' and 'image').
        net: gaze network called with an ``(eye_batch, head_pose_batch)`` tuple.
        affinity_frame: image the gaze line is drawn on (in place).

    The predicted pitch and (sign-flipped) yaw are printed and then drawn with
    ``draw_axis`` from the centre of the padded face box.
    """
    face_box = face['box']
    left, top, right, bottom = (int(face_box[k]) for k in range(4))

    # Pad the box the same way the head-pose crop does, so the gaze line
    # starts at the centre of that crop.
    box_w = abs(right - left)
    box_h = abs(bottom - top)
    pad_x = int(0.2*box_h)
    pad_y = int(0.2*box_w)
    left = max(0, left - pad_x)
    top = max(0, top - pad_y)
    right += pad_x
    bottom += pad_y

    head_pitch = face['p_pred_deg']
    head_yaw = face['y_pred_deg']
    head_roll = face['r_pred_deg']

    # Eye strip -> fixed 210x70 (width x height) input of the network.
    eye_strip = cv2.resize(face['image'], (210, 70),
                           interpolation=cv2.INTER_CUBIC)
    if bw:
        eye_strip = cv2.cvtColor(eye_strip, cv2.COLOR_BGR2GRAY)
    eye_tensor = transforms(eye_strip).to(device)

    # The head pose is fed in the order (pitch, roll, yaw).
    head_pos = torch.tensor(
        [[float(head_pitch), float(head_roll), float(head_yaw)]],
        dtype=torch.float32).to(device)
    eye_batch = torch.unsqueeze(eye_tensor, dim=0).to(device)

    prediction = net((eye_batch, head_pos)).tolist()[0]
    pitch = prediction[0]
    yaw = -prediction[1]

    print(pitch, yaw)

    center_x = left + int(.5*(right - left))
    center_y = top + int(.5*(bottom - top))
    draw_axis(affinity_frame, yaw, pitch, head_roll,
              center_x, center_y, size=130)
    return

In [17]:
# Purely visual option for the end user: when True, draw() overlays the
# detailed face mesh. Toggled at runtime with the 'f' key inside draw().
show_advanced_face_mesh_landmarks = False

# Short aliases for the MediaPipe modules used below.
mp_draw = mp.solutions.drawing_utils
mp_hol = mp.solutions.holistic

# Pose skeleton edges as a list of (start_index, end_index) pairs.
custom_connections = list(mp_hol.POSE_CONNECTIONS)

# Style of the lines connecting the hand landmarks.
hand_connections_style = DrawingSpec(color=(0, 255, 0), thickness=2)

# Pose landmark indices that are NOT drawn: 0-10, 17-22 and 25-32
# (in MediaPipe's pose numbering: face points, hand points of the pose model,
# and knees/ankles/feet), which leaves shoulders, elbows, wrists and hips.
excluded_pose_landmarks = [*range(0, 11), *range(17, 23), *range(25, 33)]

# One shared Holistic instance, reused for every frame.
holistic = mp_hol.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

def draw(live_frame, affinity_frame):
    """Run MediaPipe Holistic on ``live_frame`` and draw the result on ``affinity_frame``.

    Drawn (in place, on ``affinity_frame``):
      * the detailed face mesh, only while ``show_advanced_face_mesh_landmarks``
        is True (toggled with the 'f' key),
      * both hands with their connections,
      * the pose landmarks and connections that are visible (visibility > 0.5)
        and not listed in ``excluded_pose_landmarks``.

    Args:
        live_frame: camera frame in BGR channel order (it is converted with
            COLOR_BGR2RGB before being passed to MediaPipe).
        affinity_frame: image to draw on; landmark coordinates are scaled with
            the size of ``live_frame``.

    Returns:
        ``affinity_frame``.
    """
    global show_advanced_face_mesh_landmarks  # reassigned by the 'f' toggle below

    # Detection. (Uploading the frame to the GPU via cv2.cuda was tried and
    # dropped because of a version incompatibility.)
    image = cv2.cvtColor(live_frame, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False  # read-only while MediaPipe processes it
    img_h, img_w = live_frame.shape[:2]
    results = holistic.process(image)
    image.flags.writeable = True

    # 'f' toggles the detailed face mesh.
    # NOTE(review): this waitKey(10) adds ~10 ms per frame and consumes key
    # presses before the caller's own waitKey can see them (e.g. the 'q'
    # check of the camera loop). Consider polling the keyboard in one place.
    if cv2.waitKey(10) & 0xFF == ord('f'):
        show_advanced_face_mesh_landmarks = not show_advanced_face_mesh_landmarks

    if show_advanced_face_mesh_landmarks and results.face_landmarks:
        mp_draw.draw_landmarks(affinity_frame, results.face_landmarks, mp_hol.FACEMESH_TESSELATION,
                               mp_draw.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1),
                               # was (80, 256, 121): 256 is outside the 0-255 range of a colour channel
                               mp_draw.DrawingSpec(color=(80, 255, 121), thickness=1, circle_radius=1)
                               )

    # Draw Hand Landmarks & Connections
    for hand_landmarks in (results.right_hand_landmarks, results.left_hand_landmarks):
        mp_draw.draw_landmarks(affinity_frame, hand_landmarks, mp_hol.HAND_CONNECTIONS,
                               connection_drawing_spec=hand_connections_style)

    # Draw Upper Body Pose Landmarks & Connections
    if results.pose_landmarks:
        pose_points = results.pose_landmarks.landmark

        def to_pixel(point):
            # Landmark coordinates are normalised; scale them to pixels.
            return (int(point.x * img_w), int(point.y * img_h))

        def is_drawn(index):
            return (pose_points[index].visibility > 0.5
                    and PoseLandmark(index) not in excluded_pose_landmarks)

        # Landmarks
        for idx, landmark in enumerate(pose_points):
            if is_drawn(idx):
                cv2.circle(affinity_frame, to_pixel(landmark), 5, (0, 0, 255), -1)

        # Connections: drawn only when both end points are drawn.
        for start_idx, end_idx in custom_connections:
            if is_drawn(start_idx) and is_drawn(end_idx):
                cv2.line(affinity_frame, to_pixel(pose_points[start_idx]),
                         to_pixel(pose_points[end_idx]), (0, 255, 0), 2)
    return affinity_frame


In [31]:
# Import torchvision's transforms under an alias inside this cell: the bare
# name `transforms` is imported only in the later camera cell, which then
# rebinds it to a Compose pipeline, so it cannot be relied on here.
from torchvision import transforms as tv_transforms

# Pre-processing of the face crop for the head-pose model: resize, centre-crop
# to 224x224, convert to a tensor and normalise each channel.
transformations = tv_transforms.Compose([tv_transforms.Resize(224),
                                         tv_transforms.CenterCrop(224),
                                         tv_transforms.ToTensor(),
                                         tv_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

In [None]:
# Diagnostic only: print OpenCV's build configuration (version, extra modules,
# platform, CPU/HW features). Not needed by the tracking pipeline.
print(cv2.getBuildInformation())


  Version control:               4.10.0

  Extra modules:
    Location (extra):            D:/a/opencv-python/opencv-python/opencv_contrib/modules
    Version control (extra):     4.10.0

  Platform:
    Timestamp:                   2024-06-17T18:00:01Z
    Host:                        Windows 10.0.17763 AMD64
    CMake:                       3.24.2
    CMake generator:             Visual Studio 14 2015
    CMake build tool:            MSBuild.exe
    MSVC:                        1900
    Configuration:               Debug Release

  CPU/HW features:
    Baseline:                    SSE SSE2 SSE3
      requested:                 SSE3
    Dispatched code generation:  SSE4_1 SSE4_2 FP16 AVX AVX2
      requested:                 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX
      SSE4_1 (16 files):         + SSSE3 SSE4_1
      SSE4_2 (1 files):          + SSSE3 SSE4_1 POPCNT SSE4_2
      FP16 (0 files):            + SSSE3 SSE4_1 POPCNT SSE4_2 FP16 AVX
      AVX (8 files):             + SSSE3 SS

Opens the camera and passes each frame to the tracking functions. Comment or uncomment the function calls in the loop to select the desired tracking.

In [62]:
from torchvision import transforms
### Open camera ###
cap = cv2.VideoCapture(0)
detector = RetinaFace(gpu_id=0) #user gpu_id=-1 for Mac to indicate CPU
predictor = LandmarkPredictor(gpu_id=0) #user gpu_id=-1 for Mac to indicate CPU
head_pose_estimator = SixDRep(gpu_id=0) #user gpu_id=-1 for Mac to indicate CPU
detect_time = time.time()
faces = None

#cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
#cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Pre-processing of the eye strip for the gaze network (70x210, tensor).
# NOTE(review): this rebinds the name `transforms` from the torchvision module
# to the pipeline object. draw_eye_gaze() reads it under this name, so the
# binding is kept; renaming it would have to be done in both places.
transforms = transforms.Compose([transforms.ToPILImage(),
                                 transforms.Resize((70, 210)),
                                 transforms.ToTensor()])

net = SixthEyeNet()
EYE_MODEL_PATH = '../eye-gaze-data-loader/models/sixth_eye_net_combined.pth'
bw = False
# map_location lets a checkpoint saved on another device load on `device`;
# weights_only=True restricts unpickling to tensors/containers, which is all a
# state dict needs, and avoids the torch.load warning.
net.load_state_dict(torch.load(EYE_MODEL_PATH, map_location=device, weights_only=True))
net.to(device)
net.eval()

try:
    with torch.no_grad():
        while True:
            # Capture frame-by-frame
            ret, live_frame = cap.read()
            if not ret:
                print("Failed to read a frame from the camera.")
                break
            loop_time = time.time()

            ### NOTE: RGB values are normalized within RetinaFace ###

            # Time since the last full face detection
            elapsed_time = time.time() - detect_time

            ### Initialise a black frame ###
            black_frame = np.zeros_like(live_frame)

            # Re-detect when there is no face yet or the last detection is at
            # least 1 s old. A shorter interval means more detections, but
            # also a lower frame rate.
            if faces is None or elapsed_time >= 1:
                faces = detector(live_frame, cv=True, threshold=0.5)
                detect_time = time.time()
            else:
                ### Cheap update between detections: take the box from the min/max of the previous landmarks. ###
                ### It cannot notice new faces or a face that has left the frame. ###
                faces = updated_bbox(landmarks)

            if len(faces) == 0:
                print("NO face is detected!")
                # Detect again on the next frame instead of reusing landmarks
                # that do not exist (or are stale), and still show a frame so
                # the window stays responsive and 'q' keeps working.
                faces = None
                affinity_frame = black_frame
            else:
                ### Predict landmarks from face ###
                landmarks = get_landmarks(live_frame, faces)

                ### Estimate head pose from face ###
                pose = get_head_pose(live_frame, faces)

                ### Draw landmarks (AND/OR) pose cube ###
                black_frame = draw_landmarks_cv(black_frame, faces, landmarks)
                #draw_head_pose_cube_cv(black_frame, faces, pose[0])

                ### Hands, body pose and optional face mesh ###
                affinity_frame = draw(live_frame, black_frame)

                # Calculate and display FPS, Pitch, Yaw and Roll
                fps = 1 / (time.time() - loop_time)
                #cv2.putText(black_frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
                #cv2.putText(black_frame, f"Pitch: {pose[0]['pitch']:.2f}", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                #cv2.putText(black_frame, f"Yaw: {pose[0]['yaw']:.2f}", (10, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                #cv2.putText(black_frame, f"Roll: {pose[0]['roll']:.2f}", (10, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                ### Eye gaze: get_input_data returns None on failure, a list otherwise ###
                input_data = get_input_data(live_frame)
                for face in input_data or []:
                    draw_eye_gaze(face, net, affinity_frame)

            ### Display the resulting frame ###
            cv2.imshow('', affinity_frame)

            ### Press 'q' to exit the video window ###
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    ### Release the capture when done, even if the loop raised ###
    cap.release()
    cv2.destroyAllWindows()

  net.load_state_dict(torch.load(EYE_MODEL_PATH))


-12.016529083251953 0.11101677268743515
-22.414222717285156 -11.37082290649414
-24.22809600830078 -7.765000343322754
-19.75263023376465 -5.324861526489258
-21.600011825561523 -3.755385398864746
-24.07997703552246 -3.1457579135894775
-23.663419723510742 -2.9836950302124023
-22.876117706298828 -3.7056872844696045
-19.965925216674805 -2.3867428302764893
-23.294145584106445 3.9738831520080566
-22.95243263244629 -7.888794422149658
-22.89484214782715 -5.209085941314697
-22.80501365661621 -4.522103309631348
-22.058324813842773 -6.145267486572266
-21.32048225402832 -4.668886661529541
-22.311908721923828 -7.458078861236572
-20.839506149291992 -3.63850474357605
-20.514352798461914 -3.0966858863830566
-19.410627365112305 -0.8767509460449219
-18.05751609802246 -3.1525533199310303
-19.21438980102539 -1.0462682247161865
-17.18753433227539 -1.663245439529419
-17.942350387573242 -2.2920351028442383
-16.59168815612793 -18.043699264526367
-16.748689651489258 -16.148513793945312
-17.32608985900879 -11.44

In [None]:
from sixdrepnet import SixDRepNet
import sixdrepnet.utils as utils
from opencv_transforms import transforms as cv_transforms
import torch
import numpy as np

# Per-channel statistics used for normalisation (the standard ImageNet
# mean / std values).
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Geometric step: resize, then centre-crop to 224x224.
crop_resize = cv_transforms.Compose([
    cv_transforms.Resize(224),
    cv_transforms.CenterCrop(224),
])

# Value step: convert to a tensor and normalise each channel.
normalize = cv_transforms.Compose([
    cv_transforms.ToTensor(),
    cv_transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
])


def chunk_generator(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def chunk_call(model, chunk_size, input_tensor):
    """Call ``model`` on ``input_tensor`` in chunks of ``chunk_size`` and merge the results.

    Tensor outputs are concatenated along dimension 0; any other output type
    is treated as a sequence per chunk and flattened into one list.
    The type of the first chunk's output decides which merge is used.
    """
    outputs = [model(chunk) for chunk in chunk_generator(input_tensor, chunk_size)]
    if not isinstance(outputs[0], torch.Tensor):
        return flatten(outputs)
    return torch.cat(outputs, dim=0)

class SixDRep:
    """Batched head-pose estimation for the faces of a single frame.

    Wraps a ``SixDRepNet`` model: every face box is padded, cropped from the
    frame, pre-processed and passed through the network; the result is one
    ``{'pitch', 'yaw', 'roll'}`` dict (degrees) per face.
    """

    def __init__(self, gpu_id: int= -1, dict_path: str='') -> None:
        """Create the model; ``gpu_id=-1`` selects the CPU, otherwise ``cuda:<gpu_id>``."""
        self.model = SixDRepNet(gpu_id=gpu_id, dict_path=dict_path)
        if gpu_id == -1:
            self.device = torch.device('cpu')
        else:
            self.device = torch.device('cuda:{}'.format(gpu_id))

    @staticmethod
    def _expand_box(box):
        """Pad a face box and return ``(x_min, y_min, x_max, y_max, width, height)``.

        ``width`` / ``height`` are those of the un-padded box. The x padding is
        20% of the height and the y padding 20% of the width (kept exactly as
        before so the crops match what the rest of the pipeline uses). The
        top-left corner is clamped at 0.
        """
        x_min = int(box[0])
        y_min = int(box[1])
        x_max = int(box[2])
        y_max = int(box[3])
        bbox_width = abs(x_max - x_min)
        bbox_height = abs(y_max - y_min)

        x_min = max(0, x_min-int(0.2*bbox_height))
        y_min = max(0, y_min-int(0.2*bbox_width))
        x_max = x_max+int(0.2*bbox_height)
        y_max = y_max+int(0.2*bbox_width)
        return x_min, y_min, x_max, y_max, bbox_width, bbox_height

    def plot_pose_cube(self, frame, box, yaw, pitch, roll):
        """Draw the pose cube for ``box`` on ``frame``, centred on the padded box."""
        x_min, y_min, x_max, y_max, bbox_width, _ = self._expand_box(box)
        utils.plot_pose_cube(frame,  yaw, pitch, roll, x_min + int(.5*(x_max-x_min)), y_min + int(.5*(y_max-y_min)), size=bbox_width)

    def __call__(self, all_faces, frames, batch_size=None, input_face_type='tuple', update_dict=True):
        '''
        Estimate the head pose of every face in ``all_faces``.

        all_faces: faces found in the frame, in the format named by input_face_type
        frames: the single frame (np.ndarray indexed [row, col, channel]) that
            all faces are cropped from
            (NOTE(review): 0~255 uint8 is assumed; the expected channel order
            is not visible here -- confirm against the caller)
        batch_size: int, if None, no chunking
        input_face_type: str, 'tuple' (box is face[0]), 'dict' (box is
            face['box']) or 'box' (the face is the box itself)
        update_dict: bool, if True and input_face_type is 'dict', store each
            face's pose under its own 'head_pose' key

        Returns a list with one {'pitch', 'yaw', 'roll'} dict per face, in
        the order of ``all_faces`` (empty list for no faces).
        Raises ValueError for an unknown input_face_type.
        '''
        if input_face_type not in ('tuple', 'dict', 'box'):
            raise ValueError(
                "input_face_type must be 'tuple', 'dict' or 'box', got {!r}".format(input_face_type))
        if len(all_faces) == 0:
            # Nothing to stack or to run the model on.
            return []
        if batch_size is None:
            batch_size = len(all_faces) # no chunking

        imgs_for_model = []
        for face in all_faces:
            if input_face_type == 'tuple':
                box = face[0]
            elif input_face_type == 'dict':
                box = face['box']
            else:
                box = face
            x_min, y_min, x_max, y_max, _, _ = self._expand_box(box)
            img = frames[y_min:y_max, x_min:x_max]
            imgs_for_model.append(normalize(crop_resize(img)))

        imgs_for_model = torch.stack(imgs_for_model).to(self.device)
        with torch.no_grad():
            pred = chunk_call(self.model.model, batch_size, imgs_for_model)

        euler = utils.compute_euler_angles_from_rotation_matrices(pred)*180/np.pi
        p = euler[:, 0].cpu().detach().numpy()
        y = euler[:, 1].cpu().detach().numpy()
        r = euler[:, 2].cpu().detach().numpy()

        outputs = []
        for i, (pitch, yaw, roll) in enumerate(zip(p, y, r)):
            head_pose = {
                'pitch': pitch,
                'yaw': yaw,
                'roll': roll
            }
            outputs.append(head_pose)
            if update_dict and input_face_type == 'dict':
                # Write to the face this pose belongs to (not always face 0).
                all_faces[i]['head_pose'] = head_pose
        return outputs