# Real-Time Head Pose Monitoring (Webcam)

## 0) Setup

In [None]:
# Uncomment to install the dependencies in a clean environment
# (%pip installs into the environment of the running kernel).
# %pip install opencv-python torch torchvision pillow mediapipe pandas

## 1) Configuration

In [2]:
import os
from dataclasses import dataclass

@dataclass
class Config:
    """Tunable settings for the webcam head-pose monitoring session.

    Every field has a default, so ``Config()`` yields a usable configuration;
    individual values can be overridden through keyword arguments.
    """

    # Camera
    camera_id: int = 0                  # index handed to cv2.VideoCapture

    # Model weights
    model_path: str = "hopenet_robust_alpha1.pkl"   # checkpoint file read by load_hopenet
    num_bins: int = 66                  # number of classification bins per angle head

    # Flag thresholds (degrees)
    # The absolute value of each smoothed angle is compared against these;
    # exceeding any one of them counts as "looking away".
    yaw_thr: float = 20.0
    pitch_thr: float = 15.0
    roll_thr: float = 10.0

    # Temporal logic
    ema_alpha: float = 0.2              # smoothing factor (0..1), higher = less smoothing
    no_face_grace_s: float = 1.0        # seconds allowed without face before warning
    away_required_s: float = 1.2        # looking-away must persist this long to flag

    # Logging
    log_dir: str = "logs"               # directory that receives the CSV event logs
    session_name: str = "session"       # filename prefix; a timestamp is appended

# Build the configuration with its defaults and make sure the log directory
# exists before anything tries to write into it.
cfg = Config()
os.makedirs(cfg.log_dir, exist_ok=True)
cfg  # last expression: shows the effective settings as the cell output

Config(camera_id=0, model_path='hopenet_robust_alpha1.pkl', num_bins=66, yaw_thr=20.0, pitch_thr=15.0, roll_thr=10.0, ema_alpha=0.2, no_face_grace_s=1.0, away_required_s=1.2, log_dir='logs', session_name='session')

## 2) Imports

In [None]:
import cv2
import time
import math
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
import mediapipe as mp

## Hopenet Model

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import math
import torch.nn.functional as F

class Hopenet(nn.Module):
    """Residual backbone with three classification heads: yaw, pitch and roll.

    Euler angles are predicted by binning: each head emits ``num_bins`` logits
    per sample, and the caller turns them into an angle by taking the expected
    value over the bins.

    Args:
        block: residual block class; must expose an ``expansion`` attribute and
            accept ``(inplanes, planes, stride, downsample)``.
        layers: number of blocks in each of the four stages.
        num_bins: number of output logits per angle head.
    """

    def __init__(self, block, layers, num_bins):
        # Running channel count consumed by _make_layer; assigned before the
        # Module initialiser, as in the reference implementation.
        self.inplanes = 64
        super().__init__()

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Fixed 7x7 pooling window (assumes the final feature map is 7x7,
        # i.e. 224x224 inputs — TODO confirm for other input sizes).
        self.avgpool = nn.AvgPool2d(7)

        feature_dim = 512 * block.expansion
        self.fc_yaw = nn.Linear(feature_dim, num_bins)
        self.fc_pitch = nn.Linear(feature_dim, num_bins)
        self.fc_roll = nn.Linear(feature_dim, num_bins)

        # Vestigial layer from previous experiments; unused in forward().
        # NOTE(review): presumably kept so checkpoints containing
        # fc_finetune.* keys still load — confirm before removing.
        self.fc_finetune = nn.Linear(feature_dim + 3, 3)

        # Re-initialise convolutions (zero-mean normal scaled by fan-out) and
        # reset batch-norm layers to the identity transform.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and advance ``self.inplanes``."""
        out_channels = planes * block.expansion

        # A 1x1 projection on the shortcut is needed whenever the first block
        # changes the spatial resolution or the channel count.
        downsample = None
        if stride != 1 or self.inplanes != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_channels
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Return ``(yaw_logits, pitch_logits, roll_logits)`` for a batch of images."""
        stem = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        features = self.layer1(stem)
        features = self.layer2(features)
        features = self.layer3(features)
        features = self.layer4(features)

        pooled = self.avgpool(features)
        flat = pooled.view(pooled.size(0), -1)

        return self.fc_yaw(flat), self.fc_pitch(flat), self.fc_roll(flat)

class ResNet(nn.Module):
    """Residual network with a single linear head (``fc_angles``).

    Intended for direct regression of the 3 Euler angles; the head width is
    set by ``num_classes``.

    Args:
        block: residual block class; must expose an ``expansion`` attribute and
            accept ``(inplanes, planes, stride, downsample)``.
        layers: number of blocks in each of the four stages.
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, block, layers, num_classes=1000):
        # Running channel count consumed by _make_layer; assigned before the
        # Module initialiser, as in the reference implementation.
        self.inplanes = 64
        super().__init__()

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Fixed 7x7 pooling window (assumes the final feature map is 7x7,
        # i.e. 224x224 inputs — TODO confirm for other input sizes).
        self.avgpool = nn.AvgPool2d(7)
        self.fc_angles = nn.Linear(512 * block.expansion, num_classes)

        # Re-initialise convolutions (zero-mean normal scaled by fan-out) and
        # reset batch-norm layers to the identity transform.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and advance ``self.inplanes``."""
        out_channels = planes * block.expansion

        # A 1x1 projection on the shortcut is needed whenever the first block
        # changes the spatial resolution or the channel count.
        downsample = None
        if stride != 1 or self.inplanes != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_channels
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the ``fc_angles`` output for a batch of images."""
        stem = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        features = self.layer1(stem)
        features = self.layer2(features)
        features = self.layer3(features)
        features = self.layer4(features)

        pooled = self.avgpool(features)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc_angles(flat)

class AlexNet(nn.Module):
    """AlexNet laid out as a Hopenet.

    Classifies each Euler angle into ``num_bins`` bins through three separate
    heads; the caller regresses the angle as the expected value over the bins.
    """

    def __init__(self, num_bins):
        super().__init__()

        # Convolutional trunk. The layer order fixes the indices used in the
        # state_dict keys (features.0, features.3, ...), so it must not change.
        trunk = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*trunk)

        # Shared fully connected stack feeding the three angle heads.
        shared = [
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
        ]
        self.classifier = nn.Sequential(*shared)

        self.fc_yaw = nn.Linear(4096, num_bins)
        self.fc_pitch = nn.Linear(4096, num_bins)
        self.fc_roll = nn.Linear(4096, num_bins)

    def forward(self, x):
        """Return ``(yaw_logits, pitch_logits, roll_logits)`` for a batch of images."""
        feature_map = self.features(x)
        # The trunk output is flattened to exactly 256*6*6 values per sample.
        flat = feature_map.view(feature_map.size(0), 256 * 6 * 6)
        hidden = self.classifier(flat)
        return self.fc_yaw(hidden), self.fc_pitch(hidden), self.fc_roll(hidden)


## 3) Hopenet model loader (with correct device handling)

In [5]:
class Bottleneck(nn.Module):
    """Three-convolution residual block (1x1 -> 3x3 -> 1x1) with a shortcut.

    The block outputs ``planes * expansion`` channels. ``downsample``, when
    given, is applied to the input to produce the shortcut branch; otherwise
    the input itself is used.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * self.expansion

        # 1x1 reduce
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # 3x3 spatial convolution; this is where the stride is applied
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # 1x1 expand
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        shortcut = x if self.downsample is None else self.downsample(x)

        y = self.conv1(x)
        y = self.bn1(y)
        y = self.relu(y)

        y = self.conv2(y)
        y = self.bn2(y)
        y = self.relu(y)

        y = self.conv3(y)
        y = self.bn3(y)

        y += shortcut
        return self.relu(y)

def load_hopenet(model_path: str, num_bins: int, device: torch.device):
    """Build a ResNet-50-shaped Hopenet, load its weights and return it in eval mode.

    Accepted checkpoint layouts:
      * a bare ``state_dict`` mapping (the original behaviour);
      * a training checkpoint that wraps the weights under a ``"state_dict"``
        or ``"model_state_dict"`` key;
      * either of the above saved from ``nn.DataParallel``, where every key
        carries a ``"module."`` prefix.

    Args:
        model_path: path of the checkpoint file.
        num_bins: number of bins per angle head; must match the checkpoint.
        device: device the weights are mapped to and the model is moved to.

    Returns:
        The ``Hopenet`` instance on ``device`` with ``eval()`` applied.

    Raises:
        ValueError: if the file does not deserialize to a dict.
        RuntimeError: from ``load_state_dict`` when keys or shapes do not match.
    """
    model = Hopenet(Bottleneck, [3, 4, 6, 3], num_bins)

    # SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a trusted source (or pass
    # weights_only=True on torch versions that support it).
    checkpoint = torch.load(model_path, map_location=device)
    if not isinstance(checkpoint, dict):
        raise ValueError("Invalid checkpoint format. Expected a state_dict dict.")

    # Training checkpoints commonly nest the weights next to optimizer/epoch
    # info; unwrap them instead of failing on unexpected keys.
    for wrapper_key in ("state_dict", "model_state_dict"):
        nested = checkpoint.get(wrapper_key)
        if isinstance(nested, dict):
            checkpoint = nested
            break

    # Weights saved from nn.DataParallel prefix every key with "module.".
    prefix = "module."
    if checkpoint and all(isinstance(key, str) and key.startswith(prefix) for key in checkpoint):
        checkpoint = {key[len(prefix):]: value for key, value in checkpoint.items()}

    model.load_state_dict(checkpoint)
    model.to(device)
    model.eval()
    return model

# Prefer CUDA when torch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Load the weights once; the same model instance is reused for every frame.
model = load_hopenet(model_path=cfg.model_path, num_bins=cfg.num_bins, device=device)

Using device: cuda


## 4) Preprocessing + MediaPipe face detection

In [6]:
# Input pipeline applied to every face crop before it is fed to the network:
# resize to 224x224, convert the PIL image to a tensor, then normalize each
# channel with the mean/std below.
# NOTE(review): these look like the standard ImageNet channel statistics —
# confirm they match the preprocessing used when the weights were trained.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# One module-level MediaPipe face detector, reused by detect_faces_mediapipe
# for every frame.
# NOTE(review): model_selection=0 is presumably MediaPipe's short-range model —
# verify against the MediaPipe documentation for the expected camera distance.
mp_face = mp.solutions.face_detection
face_detector = mp_face.FaceDetection(model_selection=0, min_detection_confidence=0.5)

def _clamp(v, lo, hi):
    return max(lo, min(hi, v))

def detect_faces_mediapipe(frame_bgr):
    """Return list of (x, y, w, h) in pixel coords.

    Runs the module-level ``face_detector`` on ``frame_bgr`` (converted to RGB
    first) and maps each relative bounding box to pixel coordinates.

    Each box is clipped to the frame by clamping its two corners, so a
    detection that starts outside the frame (negative ``xmin``/``ymin``) keeps
    its true right/bottom edge instead of being shifted into the frame at full
    size. Every returned box is at least 1x1 pixel and lies inside the frame.

    Returns an empty list when nothing is detected.
    """
    h, w = frame_bgr.shape[:2]
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    results = face_detector.process(frame_rgb)

    boxes = []
    for det in results.detections or []:
        bb = det.location_data.relative_bounding_box

        # Unclamped corners in pixels. The far corner is derived from the
        # unclamped near corner so in-frame boxes keep their original size.
        left = int(bb.xmin * w)
        top = int(bb.ymin * h)
        right = left + int(bb.width * w)
        bottom = top + int(bb.height * h)

        # Clamp the corners (not origin + size) to the image bounds.
        x = _clamp(left, 0, w - 1)
        y = _clamp(top, 0, h - 1)
        x_end = _clamp(right, x + 1, w)
        y_end = _clamp(bottom, y + 1, h)

        boxes.append((x, y, x_end - x, y_end - y))
    return boxes

def crop_preprocess_face(frame_bgr, bbox):
    """Cut ``bbox`` = (x, y, w, h) out of a BGR frame and return the model input tensor.

    The crop is converted to RGB and wrapped in a PIL image before the
    module-level ``preprocess`` pipeline is applied.
    """
    left, top, box_w, box_h = bbox
    rows = slice(top, top + box_h)
    cols = slice(left, left + box_w)

    # The PIL/torchvision transforms expect RGB channel order.
    crop_rgb = cv2.cvtColor(frame_bgr[rows, cols], cv2.COLOR_BGR2RGB)
    return preprocess(Image.fromarray(crop_rgb))

I0000 00:00:1772212933.244880   55500 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1772212933.247758   55689 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.2.8-0ubuntu0.24.04.1), renderer: AMD Radeon 780M Graphics (radeonsi, phoenix, LLVM 20.1.2, DRM 3.64, 6.17.0-14-generic)


## 5) Head pose estimation + smoothing + flagging + drawing

In [7]:
def estimate_head_pose(model, face_tensor, device: torch.device, num_bins=66):
    """Return yaw, pitch, roll in degrees (float).

    ``face_tensor`` is a single un-batched image tensor; a batch dimension is
    added and the tensor is moved to ``device`` before ``model`` is called.
    ``model`` must return three logit tensors (yaw, pitch, roll), each with
    ``num_bins`` entries along dim 1. Each angle is the softmax-weighted mean
    bin index mapped to degrees as ``index * 3 - 99``.
    """
    batch = face_tensor.unsqueeze(0).to(device)  # add the batch dimension
    with torch.no_grad():
        yaw_logits, pitch_logits, roll_logits = model(batch)

    bin_index = torch.arange(num_bins, dtype=torch.float32, device=device).unsqueeze(0)

    def _expected_degrees(logits):
        # Expected bin index under the softmax distribution, then index -> degrees.
        probabilities = F.softmax(logits, dim=1)
        degrees = torch.sum(probabilities * bin_index, dim=1) * 3 - 99
        return float(degrees.item())

    return (
        _expected_degrees(yaw_logits),
        _expected_degrees(pitch_logits),
        _expected_degrees(roll_logits),
    )

class EMASmoother:
    """Exponential moving average over a fixed-length sequence of values.

    ``alpha`` is the weight of the newest sample. ``state`` holds the current
    smoothed values, or ``None`` until the first update.
    """

    def __init__(self, alpha=0.2):
        self.alpha = alpha
        self.state = None  # (yaw,pitch,roll)

    def update(self, values):
        """Blend ``values`` into the running average and return the new state.

        The very first call stores and returns ``values`` unchanged; later
        calls return a tuple of ``(1 - alpha) * old + alpha * new``.
        """
        previous = self.state
        if previous is None:
            self.state = values
            return self.state

        weight = self.alpha
        blended = []
        for old, new in zip(previous, values):
            blended.append((1 - weight) * old + weight * new)
        self.state = tuple(blended)
        return self.state

def is_looking_away(yaw, pitch, roll, yaw_thr, pitch_thr, roll_thr):
    """Return True when the magnitude of any angle strictly exceeds its threshold."""
    angle_limits = ((yaw, yaw_thr), (pitch, pitch_thr), (roll, roll_thr))
    return any(abs(angle) > limit for angle, limit in angle_limits)

def draw_overlay(frame, bbox, yaw, pitch, roll, warning_text=None):
    """Draw the face box, the three angle read-outs and an optional warning onto ``frame``.

    ``bbox`` is (x, y, w, h) in pixels. The angle labels are stacked just above
    the box; ``warning_text``, when truthy, is drawn at a fixed position near
    the top-left corner of the frame. ``frame`` is modified in place.
    """
    left, top, box_w, box_h = bbox
    cv2.rectangle(frame, (left, top), (left + box_w, top + box_h), (255, 0, 0), 2)

    # (label, vertical offset above the top edge of the box)
    read_outs = (
        (f"Yaw: {yaw:6.2f}", 50),
        (f"Pitch:{pitch:6.2f}", 30),
        (f"Roll: {roll:6.2f}", 10),
    )
    for label, lift in read_outs:
        cv2.putText(frame, label, (left, top - lift),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.65, (0, 255, 0), 2)

    if not warning_text:
        return
    cv2.putText(frame, warning_text, (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


W0000 00:00:1772212933.263426   55675 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


## 6) CSV event logging

In [8]:
from datetime import datetime
def make_log_path(cfg):
    """Return ``<cfg.log_dir>/<cfg.session_name>_<YYYYmmdd_HHMMSS>.csv`` for the current local time."""
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    file_name = f"{cfg.session_name}_{stamp}.csv"
    return os.path.join(cfg.log_dir, file_name)

# The path (and its timestamp) is fixed when this cell runs. The monitoring
# cell opens it in "w" mode, so re-running only that cell overwrites the same
# file; re-run this cell first to start a fresh log.
log_path = make_log_path(cfg)
print("Logging to:", log_path)

def log_event(writer, t, event, yaw=None, pitch=None, roll=None, extra=""):
    """Write one event row through ``writer.writerow``.

    ``t`` and any supplied angle are formatted with three decimals; an angle
    left as ``None`` is written as an empty string. ``event`` and ``extra``
    are passed through unchanged.
    """
    def _cell(angle):
        # Empty cell for a missing angle, fixed-point text otherwise.
        if angle is None:
            return ""
        return f"{angle:.3f}"

    row = {"timestamp_s": f"{t:.3f}", "event": event}
    row["yaw"] = _cell(yaw)
    row["pitch"] = _cell(pitch)
    row["roll"] = _cell(roll)
    row["extra"] = extra
    writer.writerow(row)

Logging to: logs/session_20260227_182213.csv


## 7) Run real-time monitoring

In [9]:
# Real-time loop
#
# Reads frames from the webcam until 'q' is pressed or the camera stops
# delivering frames. Per frame: detect faces, then
#   * more than one face -> draw every box, log MULTI_FACE;
#   * no face            -> warn and log NO_FACE once the grace period is over;
#   * exactly one face   -> estimate + smooth the pose and run the
#                           AWAY_START / AWAY_FLAG / AWAY_END state machine.
# The camera and the preview window are released in a `finally` block and a
# SESSION_END row is written on every exit path, including errors and
# kernel interrupts.
cap = cv2.VideoCapture(cfg.camera_id)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open camera_id={cfg.camera_id}")

smoother = EMASmoother(alpha=cfg.ema_alpha)

start_t = time.time()
last_face_seen_t = start_t

away_start_t = None   # when sustained looking-away began
is_currently_away = False

window_name = "Head Pose Monitoring"

try:
    with open(log_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["timestamp_s","event","yaw","pitch","roll","extra"])
        writer.writeheader()
        log_event(writer, 0.0, "SESSION_START", extra=f"device={device}")

        # Stays "error" unless one of the two orderly exits below replaces it.
        end_reason = "error"
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    end_reason = "camera_read_failed"
                    break

                now = time.time()
                t = now - start_t

                bboxes = detect_faces_mediapipe(frame)

                warning = None

                # Multi-face handling
                if len(bboxes) > 1:
                    # Faces are present, so keep the no-face grace timer fresh;
                    # otherwise one missed detection right after a multi-face
                    # stretch would skip the grace period entirely.
                    last_face_seen_t = now

                    warning = "CRITICAL: Multiple faces detected!"
                    log_event(writer, t, "MULTI_FACE", extra=f"count={len(bboxes)}")
                    # Draw all boxes
                    for bb in bboxes:
                        cv2.rectangle(frame, (bb[0], bb[1]), (bb[0]+bb[2], bb[1]+bb[3]), (0, 0, 255), 2)
                    cv2.putText(frame, warning, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)

                elif len(bboxes) == 0:
                    # No face: allow grace period to avoid false alerts from detection misses
                    if (now - last_face_seen_t) > cfg.no_face_grace_s:
                        warning = "Warning: No face detected!"
                        log_event(writer, t, "NO_FACE")
                    cv2.putText(frame, warning or "No face (within grace period)", (10, 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255) if warning else (0, 255, 255), 2)

                    # Reset away tracking if face disappears
                    away_start_t = None
                    is_currently_away = False

                else:
                    # Exactly one face
                    bbox = bboxes[0]
                    last_face_seen_t = now

                    face_tensor = crop_preprocess_face(frame, bbox)
                    yaw, pitch, roll = estimate_head_pose(model, face_tensor, device, num_bins=cfg.num_bins)

                    # Smooth angles
                    yaw_s, pitch_s, roll_s = smoother.update((yaw, pitch, roll))

                    # Away logic
                    away_now = is_looking_away(yaw_s, pitch_s, roll_s, cfg.yaw_thr, cfg.pitch_thr, cfg.roll_thr)

                    if away_now:
                        if not is_currently_away:
                            # Just started deviating; start timer
                            away_start_t = now
                            is_currently_away = True
                            log_event(writer, t, "AWAY_START", yaw_s, pitch_s, roll_s)

                        # Has it lasted long enough to flag?
                        if away_start_t is not None and (now - away_start_t) >= cfg.away_required_s:
                            warning = "Warning: Looking away!"
                            log_event(writer, t, "AWAY_FLAG", yaw_s, pitch_s, roll_s, extra=f"duration_s={now-away_start_t:.2f}")

                    elif is_currently_away:
                        # Returned to normal
                        log_event(writer, t, "AWAY_END", yaw_s, pitch_s, roll_s, extra=f"duration_s={now-away_start_t:.2f}" if away_start_t is not None else "")
                        away_start_t = None
                        is_currently_away = False

                    draw_overlay(frame, bbox, yaw_s, pitch_s, roll_s, warning_text=warning)

                # Every branch has finished drawing; show the frame once.
                cv2.imshow(window_name, frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    end_reason = ""   # user quit: SESSION_END row keeps an empty `extra`
                    break
        finally:
            # Always close the session in the log, whatever ended the loop.
            log_event(writer, time.time()-start_t, "SESSION_END",
                      extra=f"reason={end_reason}" if end_reason else "")
finally:
    # Release the camera and the preview window even when an exception or a
    # kernel interrupt ends the loop; otherwise the device stays locked.
    cap.release()
    cv2.destroyAllWindows()

print("Done. Log saved to:", log_path)

QFontDatabase: Cannot find font directory /home/gokul-o-s/Desktop/Real-Time-VideoMonitor/real_t/lib/python3.11/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fonts.github.io/ for example) or switch to fontconfig.
QFontDatabase: Cannot find font directory /home/gokul-o-s/Desktop/Real-Time-VideoMonitor/real_t/lib/python3.11/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fonts.github.io/ for example) or switch to fontconfig.
QFontDatabase: Cannot find font directory /home/gokul-o-s/Desktop/Real-Time-VideoMonitor/real_t/lib/python3.11/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fonts.github.io/ for example) or switch to fontconfig.
QFontDatabase: Cannot find font directory /home/gokul-o-s/Desktop/Real-Time-VideoMonitor/real_t/lib/python3.11/site-packages/cv2/qt/fonts.
Note that Qt no longer ships fonts. Deploy some (from https://dejavu-fon

Done. Log saved to: logs/session_20260227_182213.csv


## 8) Quick log preview

In [10]:
import pandas as pd
# Load the CSV written by the monitoring loop and preview its first 20 rows.
# Angle columns are empty for events that were logged without a pose.
df = pd.read_csv(log_path)
df.head(20)

Collecting pandas
  Downloading pandas-3.0.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading pandas-3.0.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pandas
Successfully installed pandas-3.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Unnamed: 0,timestamp_s,event,yaw,pitch,roll,extra
0,0.0,SESSION_START,,,,device=cuda
1,9.133,AWAY_START,-1.129,-15.572,-2.389,
2,9.533,AWAY_END,3.277,-13.986,-7.002,duration_s=0.40
3,11.237,AWAY_START,20.549,0.683,-5.442,
4,11.901,AWAY_END,15.523,3.233,-7.863,duration_s=0.66
5,15.137,AWAY_START,4.991,16.02,-3.011,
6,16.105,AWAY_END,5.036,14.893,-3.538,duration_s=0.97
7,16.737,AWAY_START,-21.69,11.97,-2.419,
8,17.969,AWAY_FLAG,-71.412,-23.709,27.258,duration_s=1.23
9,18.005,AWAY_FLAG,-71.283,-23.586,27.276,duration_s=1.27
