In [6]:
import cv2
import glob
import pickle
import scipy.io
import scipy.stats
import numpy as np
import pandas as pd
import mediapipe as mp
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

## Data Pipeline

In [7]:
DEMO_IMG_NUMBER = 805

images_paths = glob.glob('data/AFLW2000/*.jpg')
print('Num of images: ', len(images_paths))

img = cv2.imread(images_paths[DEMO_IMG_NUMBER])
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

print('Image shape: ', img.shape)
plt.figure(figsize=(20, 10))
plt.axis('off')
plt.imshow(img);

Num of images:  0


IndexError: list index out of range

In [None]:
img_info_paths = glob.glob('data/AFLW2000/*.mat')
mat = scipy.io.loadmat(img_info_paths[DEMO_IMG_NUMBER])
mat['Pose_Para'][0][:3]  # Extracting pitch, yaw, and roll

In [None]:
face_mesh = mp.solutions.face_mesh.FaceMesh(
                                            min_detection_confidence=0.5, 
                                            min_tracking_confidence=0.5
                                            )
drawing_specs = mp.solutions.drawing_utils.DrawingSpec(thickness=1, circle_radius=1)
result = face_mesh.process(img)

img_h, img_w, img_c = img.shape
annot_img = img.copy()

if result.multi_face_landmarks != None:
    for face_landmarks in result.multi_face_landmarks:        
        mp.solutions.drawing_utils.draw_landmarks(
                                                    image=annot_img, 
                                                    landmark_list=face_landmarks, 
                                                    landmark_drawing_spec=drawing_specs
                                                    )
        for idx, lm in enumerate(result.multi_face_landmarks[0].landmark):
            if idx == 1:
                nose_x = lm.x * img_w
                nose_y = lm.y * img_h

print('nose position: ', (nose_x, nose_y))
plt.figure(figsize=(20, 10))
plt.axis('off')
plt.imshow(annot_img);

NameError: name 'img' is not defined

In [None]:
cols = []
for pos in ['nose_', 'forehead_', 'left_eye_', 'mouth_left_', 'chin_', 'right_eye_', 'mouth_right_']:
    for dim in ('x', 'y'):
        cols.append(pos+dim)
cols.extend(['pitch', 'yaw', 'roll'])

In [None]:
def extract_features(
                    img, face_mesh, 
                    NOSE = 1,
                    CHIN = 199,
                    FOREHEAD = 10,
                    LEFT_EYE = 33,
                    RIGHT_EYE = 263,
                    MOUTH_LEFT = 61,
                    MOUTH_RIGHT = 291,
                    mat=None
                    ):
    result = face_mesh.process(img)
    face_features = []

    if result.multi_face_landmarks != None:
        for face_landmarks in result.multi_face_landmarks:
            for idx, lm in enumerate(face_landmarks.landmark):
                if idx in [FOREHEAD, NOSE, MOUTH_LEFT, MOUTH_RIGHT, CHIN, LEFT_EYE, RIGHT_EYE]:
                    face_features.append(lm.x)
                    face_features.append(lm.y)
    else:
        face_features.extend([None] * 14)
    
    if mat:
        pose_angles = mat['Pose_Para'][0][:3]
        face_features.extend(pose_angles)
        
    return face_features

In [None]:
face_mesh = mp.solutions.face_mesh.FaceMesh(
                                            min_detection_confidence=0.5, 
                                            min_tracking_confidence=0.5
                                            )
s = extract_features(img, face_mesh)
annot_img = img.copy()

for i in range(len(s)//2):
        cv2.circle(annot_img, center=[int(s[i*2]*img_w), int(s[i*2+1]*img_h)], radius=2, color=(255, 0, 0))
plt.figure(figsize=(20, 10))
plt.axis('off')
plt.imshow(annot_img);

In [None]:
face_mesh = mp.solutions.face_mesh.FaceMesh(
                                            min_detection_confidence=0.5, 
                                            min_tracking_confidence=0.5
                                            )
poses = []

for img_idx, img_path in enumerate(images_paths):
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mat = scipy.io.loadmat(img_info_paths[img_idx])

    try:
        face_features = extract_features(img, face_mesh, mat=mat)
        poses.append(face_features)
    except:
        print("     Skipping image ", img_idx)

    if img_idx % 100 == 0:
        print('Extracted images: ', img_idx)

poses_df = pd.DataFrame(poses, columns=cols)

In [None]:
print(poses_df.head())
poses_df.to_csv('data/AFLW2000/poses.csv', index=False)

## Model Training

In [None]:
poses_df = pd.read_csv('data/AFLW2000/poses.csv')
poses_df = poses_df.dropna(axis=0)
poses_df

In [None]:
train_df, val_df = train_test_split(poses_df, test_size=0.2, random_state=42)

X_train = train_df.drop(['pitch', 'yaw', 'roll'], axis=1)
y_train = train_df[['pitch', 'yaw', 'roll']]

X_val = val_df.drop(['pitch', 'yaw', 'roll'], axis=1)
y_val = val_df[['pitch', 'yaw', 'roll']]

In [None]:
param_grid = {
            'estimator__kernel': ['linear', 'poly', 'rbf'],
            'estimator__C': scipy.stats.expon(scale=5),
            'estimator__degree': np.arange(2, 6),
            'estimator__coef0': np.arange(0, 6),
            'estimator__gamma': scipy.stats.expon(scale=.1),
            'estimator__epsilon': scipy.stats.expon(scale=.1)
            }

svr = SVR()
multi_out_svr = MultiOutputRegressor(svr)
random_search = RandomizedSearchCV(
                                    multi_out_svr, 
                                    param_grid, 
                                    scoring='neg_mean_squared_error', 
                                    random_state=42
                                    )
random_search.fit(X_train, y_train)
svr_model = random_search.best_estimator_

print('best parameters: ', random_search.best_params_)
print('train_rmse: ', np.sqrt(-random_search.best_score_))
print('validation_rmse: ', np.sqrt(mean_squared_error(y_val, svr_model.predict(X_val))))

In [None]:
param_grid = {
            'estimator__kernel': ['rbf'],
            'estimator__C': np.arange(0.6, 0.75, 0.01),
            'estimator__gamma': np.arange(0.09, 0.1, 0.001),
            'estimator__epsilon': np.arange(0.07, 0.08, 0.001)
            }

svr = SVR()
multi_out_svr = MultiOutputRegressor(svr)
grid_search = GridSearchCV(
                            multi_out_svr, 
                            param_grid, 
                            scoring='neg_mean_squared_error'
                            )
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_

print('best parameters: ', grid_search.best_params_)
print('train_rmse: ', np.sqrt(-grid_search.best_score_))
print('validation_rmse: ', np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

In [None]:
pickle.dump(model, open('models/head_pose.pkl', 'wb'))

### Model Inference

In [1]:
import cv2
import glob
import time
import pickle
import scipy.stats
import numpy as np
import pandas as pd
import mediapipe as mp

In [3]:
#with open('models/head_pose.pkl', 'rb') as f:
 #   model_head_pose = pickle.load(f)

face_mesh = mp.solutions.face_mesh.FaceMesh(
                                            min_detection_confidence=0.5, 
                                            min_tracking_confidence=0.5
                                            )

mp_drawing = mp.solutions.drawing_utils
drawing_spec = mp_drawing.DrawingSpec(
                                    color=(128,0,128),
                                    circle_radius=1,
                                    thickness=2
                                    )
p_face_mesh = mp.solutions.face_mesh

In [None]:
cap = cv2.VideoCapture(0)

while cap.isOpened():
    success, image = cap.read()
  
    start = time.time()

    image = cv2.cvtColor(cv2.flip(image,1),cv2.COLOR_BGR2RGB) #flipped for selfie view

    image.flags.writeable = False

    results = face_mesh.process(image)

    image.flags.writeable = True

    image = cv2.cvtColor(image,cv2.COLOR_RGB2BGR)

    img_h , img_w, img_c = image.shape
    face_2d = []
    face_3d = []

    if results.multi_face_landmarks:
        for face_landmarks in results.multi_face_landmarks:
            for idx, lm in enumerate(face_landmarks.landmark):
                if idx == 33 or idx == 263 or idx ==1 or idx == 61 or idx == 291 or idx==199:
                    if idx ==1:
                        nose_2d = (lm.x * img_w,lm.y * img_h)
                        nose_3d = (lm.x * img_w,lm.y * img_h,lm.z * 3000)
                    x,y = int(lm.x * img_w),int(lm.y * img_h)

                    face_2d.append([x,y])
                    face_3d.append(([x,y,lm.z]))


            #Get 2d Coord
            face_2d = np.array(face_2d,dtype=np.float64)

            face_3d = np.array(face_3d,dtype=np.float64)

            focal_length = 1 * img_w

            cam_matrix = np.array([[focal_length,0,img_h/2],
                                  [0,focal_length,img_w/2],
                                  [0,0,1]])
            distortion_matrix = np.zeros((4,1),dtype=np.float64)

            success,rotation_vec,translation_vec = cv2.solvePnP(face_3d,face_2d,cam_matrix,distortion_matrix)


            #getting rotational of face
            rmat,jac = cv2.Rodrigues(rotation_vec)

            angles,mtxR,mtxQ,Qx,Qy,Qz = cv2.RQDecomp3x3(rmat)

            x = angles[0] * 360
            y = angles[1] * 360
            z = angles[2] * 360

            #here based on axis rot angle is calculated
            if y < -10:
                text="Looking Left"
            elif y > 10:
                text="Looking Right"
            elif x < -10:
                text="Looking Down"
            elif x > 10:
                text="Looking Up"
            else:
                text="Forward"

            nose_3d_projection,jacobian = cv2.projectPoints(nose_3d,rotation_vec,translation_vec,cam_matrix,distortion_matrix)

            p1 = (int(nose_2d[0]),int(nose_2d[1]))
            p2 = (int(nose_2d[0] + y*10), int(nose_2d[1] -x *10))

            cv2.line(image,p1,p2,(255,0,0),3)

            cv2.putText(image,text,(20,50),cv2.FONT_HERSHEY_SIMPLEX,2,(0,255,0),2)
            cv2.putText(image,"x: " + str(np.round(x,2)),(500,50),cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),2)
            cv2.putText(image,"y: "+ str(np.round(y,2)),(500,100),cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),2)
            cv2.putText(image,"z: "+ str(np.round(z, 2)), (500, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)


        end = time.time()
        totalTime = end-start

        fps = 1/totalTime
        print("FPS: ",fps)

        cv2.putText(image,f'FPS: {int(fps)}',(20,450),cv2.FONT_HERSHEY_SIMPLEX,1.5,(0,255,0),2)

        mp_drawing.draw_landmarks(
                                image=image,
                                landmark_list=face_landmarks,
                                connections=mp.solutions.face_mesh.FACEMESH_CONTOURS,
                                connection_drawing_spec=drawing_spec,
                                landmark_drawing_spec=drawing_spec
                                )
    cv2.imshow('Head Pose Detection',image)
    if cv2.waitKey(5) & 0xFF ==27:
        break
cap.release()


FPS:  55.28714541811663
FPS:  142.94540249471746
FPS:  200.00495922941204
FPS:  221.72141460062377
FPS:  158.96547280651885
FPS:  220.97381592118435
FPS:  221.13692202246006
FPS:  221.62768824306474
FPS:  200.31061655284398
FPS:  222.6512368616626
FPS:  219.29854648122975
FPS:  221.7565824257164
FPS:  71.61426034694713
FPS:  200.3297511582366
FPS:  178.89972275538494
FPS:  61.414510579105354
FPS:  199.64320053310485
FPS:  249.94362672069602
FPS:  249.26035538123253
FPS:  219.9540615658923
FPS:  249.67581403654978
FPS:  250.01812112541725
FPS:  200.17677659523696
FPS:  250.2269418923756
FPS:  250.04793132228448
FPS:  249.4234062797336
FPS:  222.14416609289762
FPS:  221.5223407626492
FPS:  232.33279787292972
FPS:  199.9858866161255
FPS:  250.01812112541725
FPS:  250.00321869225726
FPS:  222.19123801451502
FPS:  249.9883180355227
FPS:  221.81522026548205
FPS:  275.4517633151639
FPS:  249.80964860035735
FPS:  199.97635167350052
FPS:  220.57870102550618
FPS:  181.35178139052232
FPS:  171.91