# Computer vision 

**Aim**: This individual assessment consolidates the knowledge and practical skills needed to process video and to build object-detection and face-detection applications. The goal is to develop a computer vision system that can track and recognise moving objects and human faces.


### Retrieve Task 1 files

Run the cell below to retrieve `task1.mp4` and the ground-truth files from the public repository.

In [None]:
import os
import subprocess

repo = "object-tracking-and-face-detection"
# Make sure the notebook runs from inside the repository checkout,
# cloning it first when it is not present yet.
if os.path.basename(os.getcwd()) == repo:
    # Compare the directory name exactly so an unrelated folder whose name
    # merely ends with the repository name is not mistaken for the checkout.
    print("Repository already exists and path is correct. Continuing...")
elif os.path.exists(repo):
    print("Repository already exists, enter repo. Continuing...")
    os.chdir(repo)
else:
    # check=True raises CalledProcessError when the clone fails, instead of
    # falling through to a confusing FileNotFoundError from os.chdir().
    subprocess.run(
        [
            "git",
            "clone",
            "https://github.com/HandyBeeApp/object-tracking-and-face-detection.git",
        ],
        check=True,
    )
    os.chdir(repo)

## Imports and setup

In [None]:
%pip install opencv-python scikit-learn

In [None]:
import numpy as np
import cv2
import torch
import scipy
import torchvision
import matplotlib.pyplot as plt
import torchvision.transforms.functional as tvtf
from pathlib import Path
import sklearn.metrics
from typing import List, Dict, Tuple, Optional, Union
from numpy.typing import NDArray

# Box colours: matplotlib's default colour cycle, with each "#rrggbb" entry
# decoded into a tuple of three 0-255 integers.
COLOURS = [
    tuple(int(digits[pos : pos + 2], 16) for pos in range(0, 6, 2))
    for digits in (
        hex_code.strip("#")
        for hex_code in plt.rcParams["axes.prop_cycle"].by_key()["color"]
    )
]

# Darker alternative palette: the first three channels of the ten tab10
# colours, scaled to a maximum of 180 instead of 255.
COLOURS2 = [
    tuple(int(channel * 180) for channel in plt.cm.tab10(index)[:3])
    for index in range(10)
]



In [None]:
# Class labels indexed by the class ids used in the ground_truth_for_task2
# and ground_truth_for_task3 files (draw_detections looks names up here)
CLASS_LABELS = ["dog", "chair", "chair", "book", "person", "face"]

# Frames used to generate ground_truth files
EVAL_FRAMES = [0, 30, 40, 64, 150, 210, 270, 295, 306, 330, 360, 420, 449]

# NOTE(review): presumably the expected number of face tracks / distinct
# object tracks in the task video -- confirm against the evaluation code
GT_FACES_TRACKS_IN_VIDEO = 2
GT_DIFFERENT_TRACKS_IN_VIDEO = 12


# Convolutional network settings
RCNN_WEIGHTS = (
    torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights.DEFAULT)
RCNN_SCORE_THRESHOLD = 0.7  # Min confidence to keep detection

# Haar cascade settings (passed to detectMultiScale as scaleFactor,
# minNeighbors, minSize and maxSize)
SCALE_FACTOR = 1.1
MIN_NEIGHBORS = 3
MIN_SIZE = (40, 40)
MAX_SIZE = (100, 100)

## Tracking settings to avoid noise detections
# Minimum number of frames a track must span to be kept
# (10 frames is roughly 330 ms at 30 fps)
MIN_TRACK_LENGTH = 10


## Functions

### Utility Functions

In [None]:
############################################################################
# UTILITY FUNCTIONS
############################################################################

def tlbr_to_center(boxes: NDArray) -> List[List[float]]:
    """
    Compute the centre point of every box.

    Args:
        boxes: Iterable of boxes given as [tlx, tly, brx, bry]

    Returns:
        One [cx, cy] pair per box, in input order
    """
    return [
        [(left + right) / 2, (top + bottom) / 2]
        for left, top, right, bottom in boxes
    ]

def bbox_iou_matrix(a: NDArray, b: NDArray) -> NDArray:
    """
    Calculate IoU between all pairs of boxes

    Parameters:
        a (np.array [N, 4]): N boxes as [tlx, tly, brx, bry]
        b (np.array [M, 4]): M boxes as [tlx, tly, brx, bry]

    Both inputs may also be plain nested lists or a single flat box of
    four values; they are converted to float arrays of shape [-1, 4].

    Returns:
        np.array [N, M]: IoU for every pair. Pairs whose union area is zero
        (two degenerate, zero-area boxes) get an IoU of 0 instead of NaN.
    """
    # Work in float so integer boxes cannot overflow and the output buffer
    # below has a matching dtype.
    a = np.asarray(a, dtype=float).reshape(-1, 4)[:, None]  # [N, 1, 4]
    b = np.asarray(b, dtype=float).reshape(-1, 4)[None, :]  # [1, M, 4]

    tlx_a, tly_a, brx_a, bry_a = [a[..., i] for i in range(4)]
    tlx_b, tly_b, brx_b, bry_b = [b[..., i] for i in range(4)]

    # Find overlap area
    tlx_overlap = np.maximum(tlx_a, tlx_b)
    tly_overlap = np.maximum(tly_a, tly_b)
    brx_overlap = np.minimum(brx_a, brx_b)
    bry_overlap = np.minimum(bry_a, bry_b)

    # Clip to zero if no overlap
    intersection = (brx_overlap - tlx_overlap).clip(0) * (
        bry_overlap - tly_overlap
    ).clip(0)

    area_a = np.abs((brx_a - tlx_a) * (bry_a - tly_a))
    area_b = np.abs((brx_b - tlx_b) * (bry_b - tly_b))
    union = area_a + area_b - intersection

    # Divide only where the union is positive; a 0/0 here would otherwise
    # produce NaN and break any cost-matrix based matching downstream.
    iou = np.zeros(union.shape)
    np.divide(intersection, union, out=iou, where=union > 0)
    return iou

def load_yolo_ground_truth(fname: str, img_width: int, img_height: int) -> NDArray:
    """
    Read a YOLO-format annotation file and convert it to pixel corner boxes

    Each usable line holds: class x_center y_center width height, with the
    four geometry values normalized to 0-1. Lines with fewer than five
    whitespace-separated fields are skipped; extra fields are ignored.

    Args:
        fname: Path to .txt file
        img_width: Image width in pixels
        img_height: Image height in pixels

    Returns:
        np.array [N, 5]: [class_id, tlx, tly, brx, bry] in pixels.
        An empty [0, 5] array when the file is missing or has no usable lines.
    """
    if not Path(fname).exists():
        return np.array([]).reshape(0, 5)

    records = []
    with open(fname) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 5:
                continue

            label = int(fields[0])
            xc, yc, box_w, box_h = (float(value) for value in fields[1:5])

            # Scale the normalized centre/size form to pixel corners
            records.append(
                [
                    label,
                    (xc - box_w / 2) * img_width,
                    (yc - box_h / 2) * img_height,
                    (xc + box_w / 2) * img_width,
                    (yc + box_h / 2) * img_height,
                ]
            )

    if not records:
        return np.array([]).reshape(0, 5)
    return np.array(records)


def draw_tracks_on_video(
    video_path: str, tracks: List[Dict[str, Union[int, List]]], video_name: str
) -> None:
    """
    Draw tracks on video with consecutive track IDs

    The annotated video is written to "<video_name>2.mp4" with the input
    video's frame size and (integer) frame rate.

    Args:
        video_path: Input video
        tracks: Tracks from do_tracking(); each needs "start_frame" plus
                per-frame "boxes", "labels" and "scores"
        video_name: Output name (prefix of the output file)

    Note: Track IDs are automatically consecutive (0, 1, 2, ...)
          based on enumerate() iteration order
    """

    output_path = f"{video_name}2.mp4"

    vid = cv2.VideoCapture(video_path)
    vid_length = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
    vid_out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # The class-name table does not change between frames
    categories = RCNN_WEIGHTS.meta["categories"]

    try:
        for i in range(vid_length):
            # Read frame; stop when the decoder yields nothing, since the
            # reported frame count can exceed the frames actually readable
            ok, img = vid.read()
            if not ok:
                break

            # Draw all active tracks (track_id is consecutive: 0, 1, 2, ...)
            for track_id, track in enumerate(tracks):
                # A track covers frames start_frame .. start_frame + len - 1
                inner_idx = i - track["start_frame"]
                if not 0 <= inner_idx < len(track["boxes"]):
                    continue

                # Convert to plain Python ints for the OpenCV drawing calls
                tlx, tly, brx, bry = (
                    np.asarray(track["boxes"][inner_idx]).astype(np.int32).tolist()
                )
                label = track["labels"][inner_idx]
                score = track["scores"][inner_idx]

                # Color by track ID
                colour = COLOURS[track_id % len(COLOURS)]

                # Draw box
                cv2.rectangle(img, (tlx, tly), (brx, bry), color=colour, thickness=2)

                # Make label
                class_name = categories[label]
                text = f"ID:{track_id} {class_name} {score:.2f}"

                # Text background
                (text_width, text_height), _ = cv2.getTextSize(
                    text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
                )
                cv2.rectangle(
                    img,
                    (tlx, tly - text_height - 4),
                    (tlx + text_width, tly),
                    colour,
                    -1,
                )

                # Draw text
                cv2.putText(
                    img,
                    text,
                    (tlx, tly - 2),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (255, 255, 255),
                    1,
                    cv2.LINE_AA,
                )

            vid_out.write(img)
    finally:
        # Always release the handles so the output file is finalised even
        # when drawing fails part-way through
        vid.release()
        vid_out.release()

    print(f"Video saved to {output_path}")



def draw_detections(
    img: NDArray,
    boxes: NDArray,
    colours: List[Tuple[int, int, int]] = COLOURS,
    labels: Optional[List[str]] = None,
    is_tp: Optional[NDArray] = None,
    class_ids: Optional[NDArray] = None,
) -> None:
    """
    Draw boxes (and optional captions) onto an image in place

    Args:
        img: Image to draw on
        boxes: np.array [N, 4] [tlx, tly, brx, bry]; a flat array of at
               least four values is reshaped to [-1, 4]
        colours: Palette cycled by box index when is_tp is not given
        labels: Caption per box; takes precedence over class_ids
        is_tp: Per-box flag, truthy=true positive, falsy=false positive.
               When given it fixes the box colour and adds a TP/FP tag
        class_ids: Class IDs looked up in CLASS_LABELS when labels is None
    """
    if not isinstance(boxes, np.ndarray):
        boxes = np.array(boxes)

    # Nothing to draw for scalar or empty input
    if boxes.ndim == 0 or boxes.size == 0:
        return

    # Promote a flat array to 2D
    if boxes.ndim == 1:
        if len(boxes) < 4:
            return
        boxes = boxes.reshape(-1, 4)

    for idx, row in enumerate(boxes):
        tlx, tly, brx, bry = row[0], row[1], row[2], row[3]

        # Fixed colour pair when TP/FP flags are supplied, palette otherwise
        if is_tp is None:
            color = colours[idx % len(colours)]
        elif is_tp[idx]:
            color = (0, 255, 0)
        else:
            color = (255, 0, 0)

        top_left = (int(tlx), int(tly))
        bottom_right = (int(brx), int(bry))
        cv2.rectangle(img, top_left, bottom_right, color=color, thickness=2)

        # Base caption: explicit label first, class name as fallback
        label_text = ""
        if labels is not None:
            label_text = f"{labels[idx]}"
        elif class_ids is not None and idx < len(class_ids):
            class_id = int(class_ids[idx])
            if 0 <= class_id < len(CLASS_LABELS):
                label_text = CLASS_LABELS[class_id]

        # Append (or substitute) the TP/FP tag
        if is_tp is not None:
            verdict = "TP" if is_tp[idx] else "FP"
            label_text = f"{label_text} ({verdict})" if label_text else verdict

        if not label_text:
            continue

        # Filled strip behind the caption, directly above the box
        (text_width, text_height), _ = cv2.getTextSize(
            label_text, cv2.FONT_HERSHEY_PLAIN, 1.0, 1
        )
        cv2.rectangle(
            img,
            (top_left[0], top_left[1] - text_height - 4),
            (top_left[0] + text_width, top_left[1]),
            color=color,
            thickness=cv2.FILLED,
        )
        cv2.putText(
            img,
            label_text,
            (top_left[0], top_left[1] - 2),
            fontFace=cv2.FONT_HERSHEY_PLAIN,
            fontScale=1.0,
            color=(255, 255, 255),
        )


def is_face_in_person_box(face_box: List[float], person_box: List[float]) -> bool:
    """
    Test whether the centre of a face box lies within a person box

    Args:
        face_box: [x, y, w, h]
        person_box: [tlx, tly, brx, bry]

    Returns:
        True if the face centre is inside the person box (edges included)
    """
    centre_x = face_box[0] + face_box[2] / 2
    centre_y = face_box[1] + face_box[3] / 2

    horizontally_inside = person_box[0] <= centre_x <= person_box[2]
    return horizontally_inside and (person_box[1] <= centre_y <= person_box[3])


def filter_faces_by_person_boxes(
    face_detections: List[Dict[str, Union[List[float], str]]],
    person_boxes: List[List[float]],
) -> List[Dict[str, Union[List[float], str]]]:
    """
    Drop face detections that do not fall inside any person box

    Args:
        face_detections: Face detections, each with a "box" [x, y, w, h]
        person_boxes: Person boxes [tlx, tly, brx, bry]

    Returns:
        The detections whose centre lies in at least one person box,
        in their original order; empty when there are no person boxes
    """
    # Without any person there is nothing a face could belong to
    if len(person_boxes) == 0:
        return []

    return [
        detection
        for detection in face_detections
        if any(
            is_face_in_person_box(detection["box"], candidate)
            for candidate in person_boxes
        )
    ]



def remove_duplicate_face_detections(
    detections: List[Dict[str, Union[List[float], str]]], iou_threshold: float = 0.3
) -> List[Dict[str, Union[List[float], str]]]:
    """
    Suppress overlapping face boxes, preferring the larger box

    Boxes are visited from largest to smallest area; each box that has not
    been suppressed is kept and suppresses every box overlapping it with
    IoU above the threshold.

    Args:
        detections: Face detections, each with a "box" [x, y, w, h]
        iou_threshold: IoU above which two boxes count as duplicates

    Returns:
        Kept detections, ordered by decreasing area
    """
    if len(detections) == 0:
        return []

    xywh = np.array([entry["box"] for entry in detections])

    # Corner form [x1, y1, x2, y2] for the IoU computation
    corners = xywh.copy()
    corners[:, 2] = xywh[:, 0] + xywh[:, 2]
    corners[:, 3] = xywh[:, 1] + xywh[:, 3]

    overlap = bbox_iou_matrix(corners, corners)
    by_area_desc = np.argsort(xywh[:, 2] * xywh[:, 3])[::-1]

    suppressed = np.zeros(len(detections), dtype=bool)
    survivors = []
    for candidate in by_area_desc:
        if suppressed[candidate]:
            continue

        survivors.append(candidate)
        suppressed[candidate] = True
        # Everything overlapping the kept box too much is now a duplicate
        suppressed |= overlap[candidate] > iou_threshold

    return [detections[index] for index in survivors]



def export_specific_frames(
    video_path: str, output_dir: str, frame_numbers: List[int], prefix: str = "frame"
) -> List[str]:
    """
    Save specific frames from video

    Frames are written as "<prefix>_<frame number>.jpg" inside output_dir,
    which is created when missing. Duplicate frame numbers are exported
    once, and numbers outside the video are skipped with a warning.

    Args:
        video_path: Video file
        output_dir: Where to save
        frame_numbers: Frames to export (0-based)
        prefix: Filename prefix

    Returns:
        List of saved paths, in ascending frame order
    """
    import os

    # Make folder
    os.makedirs(output_dir, exist_ok=True)

    vid = cv2.VideoCapture(video_path)
    exported_frames = []

    try:
        vid_length = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
        # Round rather than truncate-and-add-one, so 29.97 reports as 30
        # without turning an exact 30 into 31
        fps = round(vid.get(cv2.CAP_PROP_FPS))

        # Clean frame list
        frame_numbers = sorted(set(frame_numbers))

        # Remove invalid frames
        valid_frames = [f for f in frame_numbers if 0 <= f < vid_length]
        if len(valid_frames) < len(frame_numbers):
            invalid = set(frame_numbers) - set(valid_frames)
            print(f"Warning: Skipping invalid frames: {invalid}")

        print("Video:")
        print(f"  Frames: {vid_length}")
        print(f"  FPS: {fps}")
        print(f"  Exporting {len(valid_frames)} frames")
        print(f"  Numbers: {valid_frames}")

        # Save each frame
        for frame_num in valid_frames:
            # Seek to the requested frame, then decode it
            vid.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = vid.read()

            if not ret:
                print(f"Warning: Can't read frame {frame_num}")
                continue

            # Make filename
            filename = f"{prefix}_{frame_num}.jpg"
            filepath = os.path.join(output_dir, filename)

            # Save
            cv2.imwrite(filepath, frame)
            exported_frames.append(filepath)

            print(f"Exported frame {frame_num} -> {filename}")
    finally:
        # Release the capture even if decoding or writing fails
        vid.release()

    print(f"\nSaved {len(exported_frames)} frames to {output_dir}")

    return exported_frames



### Detection functions

In [None]:
############################################################################
# DETECTION FUNCTIONS
############################################################################

def preprocess_image(image: NDArray) -> torch.Tensor:
    """
    Turn an image into a batched tensor for MaskRCNN

    The image is converted to a tensor, passed through the transforms
    bundled with RCNN_WEIGHTS, and given a leading batch dimension.
    """
    as_tensor = tvtf.to_tensor(image)
    weight_transforms = RCNN_WEIGHTS.transforms()
    return weight_transforms(as_tensor).unsqueeze(dim=0)


def get_detections(
    maskrcnn: torch.nn.Module, vid: cv2.VideoCapture
) -> List[Dict[str, NDArray]]:
    """
    Run model on all video frames

    The capture is rewound to frame 0 for processing and its original
    position is restored afterwards.

    Args:
        maskrcnn: Detection model, called on one preprocessed frame at a time
        vid: Opened video capture

    Returns:
        One dict per frame read, with "boxes", "labels" and "scores" numpy
        arrays holding the detections scoring above RCNN_SCORE_THRESHOLD.
        May be shorter than the reported frame count if decoding stops early.
    """
    vid_length = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    initial_frame = int(vid.get(cv2.CAP_PROP_POS_FRAMES))
    vid.set(cv2.CAP_PROP_POS_FRAMES, 0)

    all_detections = []
    try:
        for _ in range(vid_length):
            ok, img = vid.read()
            if not ok:
                # The reported frame count can exceed what is decodable;
                # stop instead of passing None to cvtColor
                break
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # Run model
            with torch.no_grad():
                result = maskrcnn(preprocess_image(img))[0]

            # Keep only high-confidence detections
            mask = result["scores"] > RCNN_SCORE_THRESHOLD

            boxes = result["boxes"][mask].detach().cpu().numpy()
            labels = result["labels"][mask].detach().cpu().numpy()
            scores = result["scores"][mask].detach().cpu().numpy()

            all_detections.append({"boxes": boxes, "labels": labels, "scores": scores})
    finally:
        # Put the capture back where the caller left it, even on failure
        vid.set(cv2.CAP_PROP_POS_FRAMES, initial_frame)

    print(f"Processed {len(all_detections)} frames")
    return all_detections


def object_detections(
    video_path: str, video_name: str = "task1"
) -> List[Dict[str, NDArray]]:
    """
    Run MaskRCNN on all frames

    Args:
        video_path: Video to process
        video_name: Currently unused; kept so existing callers keep working

    Returns:
        Per-frame detections as produced by get_detections()
    """
    print("Loading MaskRCNN...")
    # Load the same weights whose transforms and category names are used
    # elsewhere (RCNN_WEIGHTS). Only `weights` is passed: the legacy
    # `pretrained` flag is deprecated and redundant alongside it.
    maskrcnn = torchvision.models.detection.maskrcnn_resnet50_fpn(
        weights=RCNN_WEIGHTS
    )
    maskrcnn.eval()

    print("\nRunning detections...")
    vid = cv2.VideoCapture(video_path)
    try:
        all_detections = get_detections(maskrcnn, vid)
    finally:
        # Release the capture even if inference fails
        vid.release()

    return all_detections


def associate_object_tracking(
    track_boxes: NDArray, det_boxes: NDArray, method: str = "iou"
) -> Tuple[NDArray, NDArray]:
    """
    Pair existing tracks with new detections

    Args:
        track_boxes: Last box of each track [N, 4] [tlx, tly, brx, bry]
        det_boxes: New detections [M, 4] [tlx, tly, brx, bry]
        method: "iou" uses 1 - IoU as cost; any other value uses the
                Euclidean distance between box centres

    Returns:
        (track indices, detection indices) of the minimum-cost assignment
        found by scipy's linear_sum_assignment
    """
    if method != "iou":
        # Pairwise distance between the centres of every track/detection
        track_centres = np.array(tlbr_to_center(track_boxes))
        det_centres = np.array(tlbr_to_center(det_boxes))
        offsets = track_centres[:, None] - det_centres[None]
        cost = np.linalg.norm(offsets, axis=-1)
    else:
        # Higher overlap means lower cost
        overlap = bbox_iou_matrix(track_boxes, det_boxes)
        cost = 1 - overlap

    return scipy.optimize.linear_sum_assignment(cost)


def filter_short_tracks(
    tracks: List[Dict[str, Union[int, List]]], min_length: int = MIN_TRACK_LENGTH
) -> List[Dict[str, Union[int, List]]]:
    """
    Keep only the tracks that span at least min_length frames.
    Short-lived tracks are treated as detection noise.

    A track's length is the length of its 'boxes' list, or of its 'points'
    list when it has no 'boxes'. Tracks with neither key are dropped.

    Args:
        tracks: List of track dictionaries with 'boxes' or 'points'
        min_length: Minimum number of frames a track must appear in

    Returns:
        Filtered list of tracks, in the original order
    """
    survivors = []
    for track in tracks:
        # 'boxes' takes precedence over 'points' when both are present
        length_key = next(
            (key for key in ("boxes", "points") if key in track), None
        )
        if length_key is not None and len(track[length_key]) >= min_length:
            survivors.append(track)

    return survivors


def do_tracking(
    detections: List[Dict[str, NDArray]], association_method: str = "iou"
) -> List[Dict[str, Union[int, List]]]:
    """
    Link per-frame detections into tracks

    For every frame, open tracks are matched to the new detections. Matched
    tracks are extended, unmatched tracks are closed, and unmatched
    detections start new tracks. A track therefore covers consecutive
    frames beginning at its start_frame.

    Args:
        detections: Per-frame detections with boxes, labels, scores
        association_method: Passed to associate_object_tracking()
                            ('iou', anything else means centre distance)

    Returns:
        List of tracks with start_frame, boxes, labels, scores
        (closed tracks first, then the tracks still open at the end),
        with short tracks removed when MIN_TRACK_LENGTH > 1
    """
    active = []
    finished = []

    for frame_idx, frame_det in enumerate(detections):
        boxes = frame_det["boxes"]
        labels = frame_det["labels"]
        scores = frame_det["scores"]

        matched_tracks, matched_dets = [], []

        if frame_idx > 0 and len(active) > 0 and len(boxes) > 0:
            # Compare each open track's most recent box with the new boxes
            last_boxes = np.array([track["boxes"][-1] for track in active])
            matched_tracks, matched_dets = associate_object_tracking(
                last_boxes, boxes, method=association_method
            )

        # Extend every matched track with its detection
        for t_idx, d_idx in zip(matched_tracks, matched_dets):
            matched = active[t_idx]
            matched["boxes"].append(boxes[d_idx])
            matched["labels"].append(labels[d_idx])
            matched["scores"].append(scores[d_idx])

        # Retire unmatched tracks; pop from the highest index down so the
        # remaining indices stay valid
        unmatched_tracks = set(range(len(active))) - set(matched_tracks)
        finished.extend(
            active.pop(t_idx) for t_idx in sorted(unmatched_tracks, reverse=True)
        )

        # Every detection left over opens a new track
        unmatched_dets = set(range(len(boxes))) - set(matched_dets)
        for d_idx in unmatched_dets:
            active.append(
                {
                    "start_frame": frame_idx,
                    "boxes": [boxes[d_idx]],
                    "labels": [labels[d_idx]],
                    "scores": [scores[d_idx]],
                }
            )

    all_tracks = finished + active

    # Drop short tracks (noise)
    if MIN_TRACK_LENGTH > 1:
        long_enough = filter_short_tracks(all_tracks)
        print(f"Filtered out {len(all_tracks) - len(long_enough)} short tracks (noise)")
        all_tracks = long_enough

    return all_tracks


def do_face_tracking(
    face_detections_per_frame: List[List[Dict[str, Union[List[float], str]]]],
    association_method: str = "iou"
) -> List[Dict[str, Union[int, List]]]:
    """
    Link per-frame face detections into tracks (same scheme as do_tracking)

    Args:
        face_detections_per_frame: List of face detections for each frame
                                   Each detection has 'box' [x, y, w, h] and 'cascade_name'
        association_method: Passed to associate_object_tracking()
                            ('iou', anything else means centre distance)

    Returns:
        List of face tracks with start_frame, boxes [tlx, tly, brx, bry]
        and cascade_names (closed tracks first, then the tracks still open
        at the end), with short tracks removed when MIN_TRACK_LENGTH > 1
    """
    active = []
    finished = []

    for frame_idx, frame_faces in enumerate(face_detections_per_frame):
        # Re-express every [x, y, w, h] face as corner coordinates
        corner_boxes = []
        sources = []
        for face in frame_faces:
            left, top, box_w, box_h = face["box"]
            corner_boxes.append([left, top, left + box_w, top + box_h])
            sources.append(face["cascade_name"])

        if corner_boxes:
            det_boxes = np.array(corner_boxes)
        else:
            det_boxes = np.array([]).reshape(0, 4)

        matched_tracks, matched_dets = [], []

        if frame_idx > 0 and len(active) > 0 and len(det_boxes) > 0:
            # Compare each open track's most recent box with the new boxes
            last_boxes = np.array([track["boxes"][-1] for track in active])
            matched_tracks, matched_dets = associate_object_tracking(
                last_boxes, det_boxes, method=association_method
            )

        # Extend every matched track with its detection
        for t_idx, d_idx in zip(matched_tracks, matched_dets):
            matched = active[t_idx]
            matched["boxes"].append(det_boxes[d_idx])
            matched["cascade_names"].append(sources[d_idx])

        # Retire unmatched tracks; pop from the highest index down so the
        # remaining indices stay valid
        unmatched_tracks = set(range(len(active))) - set(matched_tracks)
        finished.extend(
            active.pop(t_idx) for t_idx in sorted(unmatched_tracks, reverse=True)
        )

        # Every detection left over opens a new track
        unmatched_dets = set(range(len(det_boxes))) - set(matched_dets)
        for d_idx in unmatched_dets:
            active.append(
                {
                    "start_frame": frame_idx,
                    "boxes": [det_boxes[d_idx]],
                    "cascade_names": [sources[d_idx]],
                }
            )

    all_tracks = finished + active

    # Drop short tracks (noise)
    if MIN_TRACK_LENGTH > 1:
        long_enough = filter_short_tracks(all_tracks)
        print(f"Filtered out {len(all_tracks) - len(long_enough)} short face tracks (noise)")
        all_tracks = long_enough

    return all_tracks


# Cascade label -> XML file shipped in cv2.data.haarcascades.
# Insertion order is the order in which the cascades are run.
_FACE_CASCADE_FILES = {
    "frontalface_default": "haarcascade_frontalface_default.xml",
    "profileface": "haarcascade_profileface.xml",
    "frontalface_alt": "haarcascade_frontalface_alt.xml",
    "frontalface_alt2": "haarcascade_frontalface_alt2.xml",
    "frontalface_alt3": "haarcascade_frontalface_alt_tree.xml",
}

# Filled on first use so the XML files are parsed once, not once per frame
_face_cascade_cache = {}


def _load_face_cascades():
    """Return the Haar cascade classifiers, loading them on first call."""
    if not _face_cascade_cache:
        for cascade_name, filename in _FACE_CASCADE_FILES.items():
            _face_cascade_cache[cascade_name] = cv2.CascadeClassifier(
                cv2.data.haarcascades + filename
            )
    return _face_cascade_cache


def facial_recognition_with_cascades(
    frame: NDArray, person_boxes: Optional[List[List[float]]] = None
) -> List[Dict[str, Union[List[float], str]]]:
    """
    Detect faces using Haar cascades

    Every cascade is run on the grayscale frame, overlapping hits are
    merged with remove_duplicate_face_detections(), and the result is
    optionally restricted to faces inside a person box.

    Args:
        frame: BGR image
        person_boxes: Person boxes [tlx, tly, brx, bry]
                     Only return faces inside these; None or an empty list
                     disables this filter

    Returns:
        List with box [x, y, w, h] and cascade_name
    """
    # Convert to gray
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Histogram equalization (cv2.equalizeHist) is intentionally not applied:
    # the author found that it can worsen detection.

    all_detections = []

    # Run each cascade
    for cascade_name, cascade in _load_face_cascades().items():
        faces = cascade.detectMultiScale(
            gray,
            scaleFactor=SCALE_FACTOR,
            minNeighbors=MIN_NEIGHBORS,
            minSize=MIN_SIZE,
            maxSize=MAX_SIZE
        )

        for x, y, w, h in faces:
            all_detections.append({"box": [x, y, w, h], "cascade_name": cascade_name})

    # Remove duplicates reported by several cascades for the same face
    all_detections = remove_duplicate_face_detections(all_detections)

    # Filter by person boxes
    if person_boxes is not None and len(person_boxes) > 0:
        all_detections = filter_faces_by_person_boxes(all_detections, person_boxes)

    return all_detections

def add_haar_detections_to_video(
    input_video_path: str,
    output_video_path: str,
    all_detections: Optional[List[Dict[str, NDArray]]] = None,
) -> None:
    """
    Add face detection and tracking to video

    Works in two passes over the input: the first detects faces in every
    frame, the second draws the resulting face tracks and writes the
    annotated video.

    Args:
        input_video_path: Input video
        output_video_path: Output video
        all_detections: Per-frame object detections; boxes with label 1
                        (person) restrict where faces are accepted. Frames
                        without any person box are not filtered.
    """
    vid = cv2.VideoCapture(input_video_path)
    vid_length = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # First pass: collect all face detections
    print("Detecting faces in all frames...")
    all_face_detections = []

    try:
        for i in range(vid_length):
            # Read frame
            ret, frame = vid.read()
            if not ret:
                break

            # Get person boxes for this frame (COCO class 1 = person)
            person_boxes = []
            if all_detections is not None and i < len(all_detections):
                det = all_detections[i]
                person_boxes = [
                    det["boxes"][idx]
                    for idx, label in enumerate(det["labels"])
                    if label == 1
                ]

            # Detect faces
            face_detections = facial_recognition_with_cascades(
                frame, person_boxes=person_boxes if person_boxes else None
            )

            all_face_detections.append(face_detections)
    finally:
        # Release the capture even if detection fails
        vid.release()

    # Report the frames actually read, which can be fewer than vid_length
    print(f"Detected faces in {len(all_face_detections)} frames")

    # Track faces across frames
    print("Tracking faces...")
    face_tracks = do_face_tracking(all_face_detections, association_method="centroid_distance")
    print(f"Created {len(face_tracks)} face tracks")

    # Second pass: draw tracked faces on video
    print("Drawing tracked faces on video...")
    vid = cv2.VideoCapture(input_video_path)
    fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
    vid_out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    try:
        for i in range(vid_length):
            # Read frame
            ret, frame = vid.read()
            if not ret:
                break

            # Draw all active face tracks
            for track_id, track in enumerate(face_tracks):
                # A track covers frames start_frame .. start_frame + len - 1
                inner_idx = i - track["start_frame"]
                if not 0 <= inner_idx < len(track["boxes"]):
                    continue

                # Box [tlx, tly, brx, bry] as plain Python ints for OpenCV
                tlx, tly, brx, bry = (
                    np.asarray(track["boxes"][inner_idx]).astype(np.int32).tolist()
                )

                # Color by track ID
                colour = COLOURS[track_id % len(COLOURS)]

                # Draw box
                cv2.rectangle(frame, (tlx, tly), (brx, bry), color=colour, thickness=2)

                # Make label
                text = f"Face ID:{track_id}"
                (text_width, text_height), _ = cv2.getTextSize(
                    text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
                )

                # Background
                cv2.rectangle(
                    frame,
                    (tlx, tly - text_height - 4),
                    (tlx + text_width, tly),
                    colour,
                    -1,
                )

                # Text
                cv2.putText(
                    frame,
                    text,
                    (tlx, tly - 2),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (255, 255, 255),
                    1,
                    cv2.LINE_AA,
                )

            vid_out.write(frame)
    finally:
        # Always release both handles so the output file is finalised
        vid.release()
        vid_out.release()

    print(f"Face tracking video saved to {output_video_path}")

### Evaluation Functions

In [None]:
# ============================================================================
# EVALUATION FUNCTIONS
# ============================================================================

def determine_true_positives(
    pred_boxes: NDArray, gt_boxes: NDArray, iou_thresh: float = 0.5
) -> NDArray:
    """
    Flag each prediction as a true or false positive

    A prediction is a true positive when its best IoU against any ground
    truth box reaches the threshold.

    Args:
        pred_boxes: np.array [N, 4] predictions [tlx, tly, brx, bry]
        gt_boxes: np.array [M, 4] ground truth [tlx, tly, brx, bry]
        iou_thresh: IoU threshold (default: 0.5)

    Returns:
        np.array [N]: 1 for true positive, 0 for false positive.
        Empty when there are no predictions; all zeros when there is no
        ground truth.
    """
    pred = pred_boxes if isinstance(pred_boxes, np.ndarray) else np.array(pred_boxes)
    truth = gt_boxes if isinstance(gt_boxes, np.ndarray) else np.array(gt_boxes)

    # Scalar or empty flat predictions: nothing to classify
    if pred.ndim == 0 or (pred.ndim == 1 and len(pred) == 0):
        return np.array([])
    if pred.ndim == 1:
        pred = pred.reshape(-1, 4)

    # Scalar or empty flat ground truth: every prediction is wrong
    if truth.ndim == 0 or (truth.ndim == 1 and len(truth) == 0):
        return np.zeros(pred.shape[0], dtype=int)
    if truth.ndim == 1:
        truth = truth.reshape(-1, 4)

    # Empty 2D inputs
    if pred.size == 0:
        return np.array([])
    if truth.size == 0:
        return np.zeros(pred.shape[0], dtype=int)

    # Best overlap of each prediction with any ground truth box
    best_overlap = bbox_iou_matrix(pred, truth).max(axis=1)
    return (best_overlap >= iou_thresh).astype(int)


def calculate_metrics(tp: int, fp: int, fn: int) -> Dict[str, float]:
    """
    Turn TP/FP/FN counts into precision, recall, F1 and accuracy via sklearn

    One synthetic binary sample is created per counted box so that the
    sklearn scorers can be applied. No true-negative samples are created,
    so the reported accuracy is the share of TP among TP + FP + FN.

    Args:
        tp: True positives
        fp: False positives
        fn: False negatives

    Returns:
        dict: precision, recall, f1, accuracy, tp, fp, fn, tn
        (tn is 1 only for the "nothing predicted, nothing expected" case,
        otherwise 0)
    """
    # Counts may arrive as numpy scalars or floats; normalise to plain ints
    tp, fp, fn = int(tp), int(fp), int(fn)

    if tp + fp + fn == 0:
        # Nothing detected and nothing expected: the ratios are undefined
        # (reported as 0) but the outcome itself is counted as correct.
        return {
            "precision": 0.0,
            "recall": 0.0,
            "f1": 0.0,
            "accuracy": 1.0,
            "tp": 0,
            "fp": 0,
            "fn": 0,
            "tn": 1,
        }

    # Samples are laid out as [TP..., FP..., FN...]:
    #   truth      -> 1 for TP, 0 for FP, 1 for FN
    #   prediction -> 1 for TP, 1 for FP, 0 for FN
    group_sizes = [tp, fp, fn]
    y_true = np.repeat([1.0, 0.0, 1.0], group_sizes)
    y_pred = np.repeat([1.0, 1.0, 0.0], group_sizes)

    scores = {
        "precision": sklearn.metrics.precision_score(y_true, y_pred, zero_division=0),
        "recall": sklearn.metrics.recall_score(y_true, y_pred, zero_division=0),
        "f1": sklearn.metrics.f1_score(y_true, y_pred, zero_division=0),
        "accuracy": sklearn.metrics.accuracy_score(y_true, y_pred),
    }

    # True negatives are not tracked in object detection
    scores.update({"tp": tp, "fp": fp, "fn": fn, "tn": 0})
    return scores


def evaluate_track_quality(
    tracks: List[Dict[str, Union[int, List]]],
    fps: int = 30
) -> Dict[str, Union[int, float]]:
    """
    Summarise how long the produced tracks are

    A track's length is the number of entries in its 'boxes' list. Lengths
    are reported in frames and, divided by fps, in seconds. Tracks are also
    bucketed by duration: short (< 1 s), medium (1-5 s inclusive) and
    long (> 5 s).

    Args:
        tracks: List of tracks from do_tracking() or do_face_tracking();
            each track must provide a 'boxes' sequence
        fps: Video frames per second (default: 30)

    Returns:
        Dictionary with track quality metrics (all zero when tracks is empty)

    Raises:
        ValueError: If tracks is non-empty and fps is not positive
    """
    if not tracks:
        # Same keys and value types as the populated result below
        return {
            'total_tracks': 0,
            'avg_track_length_frames': 0.0,
            'avg_track_length_seconds': 0.0,
            'median_track_length_frames': 0.0,
            'median_track_length_seconds': 0.0,
            'max_track_length_frames': 0,
            'max_track_length_seconds': 0.0,
            'min_track_length_frames': 0,
            'min_track_length_seconds': 0.0,
            'short_tracks': 0,  # < 1 second
            'medium_tracks': 0,  # 1-5 seconds
            'long_tracks': 0,   # > 5 seconds
        }

    # A non-positive frame rate cannot be converted to seconds
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")

    # Length of every track in frames
    track_lengths = [len(t['boxes']) for t in tracks]

    avg_length = float(np.mean(track_lengths))
    median_length = float(np.median(track_lengths))
    max_length = int(np.max(track_lengths))
    min_length = int(np.min(track_lengths))

    # Bucket boundaries expressed in frames
    one_second = fps
    five_seconds = 5 * fps
    short_tracks = sum(1 for n in track_lengths if n < one_second)
    medium_tracks = sum(1 for n in track_lengths if one_second <= n <= five_seconds)
    long_tracks = sum(1 for n in track_lengths if n > five_seconds)

    return {
        'total_tracks': len(tracks),
        'avg_track_length_frames': avg_length,
        'avg_track_length_seconds': float(avg_length / fps),
        'median_track_length_frames': median_length,
        'median_track_length_seconds': float(median_length / fps),
        'max_track_length_frames': max_length,
        'max_track_length_seconds': float(max_length / fps),
        'min_track_length_frames': min_length,
        'min_track_length_seconds': float(min_length / fps),
        'short_tracks': short_tracks,
        'medium_tracks': medium_tracks,
        'long_tracks': long_tracks,
    }


def evaluate_tracking_accuracy(
    detected_tracks: int,
    ground_truth_tracks: int
) -> Dict[str, float]:
    """
    Score the number of produced tracks against the expected number

    Only the two counts are compared; track identity is not checked.
    - TP: tracks covered by both counts (the smaller of the two)
    - FP: surplus detected tracks (over-tracking)
    - FN: ground-truth tracks with no counterpart (under-tracking)

    Args:
        detected_tracks: Number of tracks detected by the algorithm
        ground_truth_tracks: Ground truth number of unique objects/faces

    Returns:
        Dictionary produced by calculate_metrics() for these counts
    """
    surplus = detected_tracks - ground_truth_tracks

    if surplus >= 0:
        # At least as many tracks as expected: every GT track counts as found
        matched, extra, missing = ground_truth_tracks, surplus, 0
    else:
        # Fewer tracks than expected: every detected track counts as correct
        matched, extra, missing = detected_tracks, 0, -surplus

    return calculate_metrics(matched, extra, missing)


def visualize_iou_comparison(
    video_path: str,
    frame_num: int,
    pred_boxes: NDArray,
    gt_boxes: NDArray,
    is_tp: NDArray,
    output_dir: str,
    prefix: str = "frame",
    pred_class_ids: Optional[NDArray] = None,
    gt_class_ids: Optional[NDArray] = None,
) -> None:
    """
    Write a two-panel JPEG: ground truth on the left, predictions on the right

    The image is saved as "<prefix>_<frame_num, 6 digits>_iou_comparison.jpg"
    inside output_dir (created if needed). If the frame cannot be read, a
    message is printed and nothing is written.

    Args:
        video_path: Video file
        frame_num: Frame to show
        pred_boxes: Predictions [N, 4] [tlx, tly, brx, bry]
        gt_boxes: Ground truth [M, 4] [tlx, tly, brx, bry]
        is_tp: True positive flags
        output_dir: Where to save
        prefix: Filename prefix
        pred_class_ids: Class IDs for predictions
        gt_class_ids: Class IDs for ground truth
    """
    # Grab the requested frame and let go of the video straight away
    capture = cv2.VideoCapture(video_path)
    capture.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
    ok, bgr_frame = capture.read()
    capture.release()

    if not ok:
        print(f"Could not read frame {frame_num}")
        return

    # Panels are composed in RGB and converted back to BGR just before saving
    rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
    height, width = rgb_frame.shape[:2]

    def build_panel(boxes, caption, font_scale, **draw_kwargs):
        # Fresh copy of the frame with its boxes (if any) and a title.
        # draw_detections is expected to draw onto the image it is given.
        panel = rgb_frame.copy()
        if len(boxes) > 0:
            draw_detections(panel, boxes, **draw_kwargs)
        cv2.putText(
            panel,
            caption,
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (255, 255, 255),
            2,
        )
        return panel

    left_panel = build_panel(
        gt_boxes,
        "Ground Truth",
        1.0,
        labels=["GT"] * len(gt_boxes),
        class_ids=gt_class_ids,
    )
    right_panel = build_panel(
        pred_boxes,
        "Predictions (Green=TP, Red=FP)",
        0.6,
        is_tp=is_tp,
        class_ids=pred_class_ids,
    )

    # Canvas twice as wide as the frame: left half GT, right half predictions
    canvas = np.zeros((height, width * 2, 3), dtype=np.uint8)
    canvas[:, :width] = left_panel
    canvas[:, width:] = right_panel

    os.makedirs(output_dir, exist_ok=True)
    file_name = f"{prefix}_{frame_num:06d}_iou_comparison.jpg"
    cv2.imwrite(
        os.path.join(output_dir, file_name),
        cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR),
    )


def evaluate_detections_for_frame(
    pred_boxes: NDArray,
    gt_file: str,
    img_width: int,
    img_height: int,
    iou_thresh: float = 0.5,
    pred_class_ids: Optional[NDArray] = None,
) -> Tuple[Dict[str, float], NDArray, NDArray, NDArray]:
    """
    Score one frame's predicted boxes against its ground-truth file

    Matching uses box overlap only; pred_class_ids is accepted but not
    consulted here.

    Args:
        pred_boxes: np.array [N, 4] [tlx, tly, brx, bry]
        gt_file: Ground truth file
        img_width: Image width
        img_height: Image height
        iou_thresh: IoU threshold (default: 0.5)
        pred_class_ids: Class IDs (unused by this function)

    Returns:
        (metrics, is_tp, gt_boxes, gt_class_ids)
    """
    # Rows are indexed below as column 0 = class id, columns 1-4 = box
    gt_data = load_yolo_ground_truth(gt_file, img_width, img_height)
    num_preds = len(pred_boxes)

    if len(gt_data) == 0:
        # Without ground truth every prediction is a false positive
        if num_preds > 0:
            all_wrong = np.zeros(num_preds, dtype=int)
        else:
            all_wrong = np.array([])
        no_gt_boxes = np.array([]).reshape(0, 4)
        no_gt_classes = np.array([])
        return (
            calculate_metrics(0, num_preds, 0),
            all_wrong,
            no_gt_boxes,
            no_gt_classes,
        )

    gt_class_ids = gt_data[:, 0].astype(int)
    gt_boxes = gt_data[:, 1:5]

    is_tp = determine_true_positives(pred_boxes, gt_boxes, iou_thresh)

    matched = is_tp.sum()
    unmatched_preds = num_preds - matched
    # Clamp at zero in case more predictions matched than there are GT boxes
    missed_gt = max(0, len(gt_boxes) - matched)

    frame_scores = calculate_metrics(matched, unmatched_preds, missed_gt)
    return frame_scores, is_tp, gt_boxes, gt_class_ids


def evaluate_task2_detections(
    all_detections: List[Dict[str, NDArray]],
    video_name: str,
    gt_dir: str = "ground_truth_for_task2",
    iou_thresh: float = 0.5,
    eval_frames: Optional[List[int]] = None,
) -> Dict[str, Union[Dict[str, float], List[Dict[str, float]]]]:
    """
    Check object detection results (Task 2)

    For every evaluation frame that has both detections and a ground-truth
    file, the frame is scored, its TP/FP/FN are added to the running totals
    and a side-by-side comparison image is written to
    "evaluation_results/<video_name>_task2". Totals are printed at the end.

    Args:
        all_detections: Per-frame detections for the whole video; each entry
            provides "boxes" and "labels"
        video_name: Video name (e.g., "task"); the video read is
            "./<video_name>1.mp4"
        gt_dir: Ground truth folder (files named "task1_frame_<n>.txt")
        iou_thresh: IoU threshold (default: 0.5)
        eval_frames: Frames to check (default: EVAL_FRAMES)

    Returns:
        {"overall": metrics over all evaluated frames,
         "per_frame": list of per-frame metrics, each with a "frame" key}
    """
    # Frame size is needed to interpret the ground-truth files
    video_path = f"./{video_name}1.mp4"
    vid = cv2.VideoCapture(video_path)
    img_width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    img_height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    vid.release()

    # Use default frames if not provided
    if eval_frames is None:
        eval_frames = EVAL_FRAMES

    total_tp = 0
    total_fp = 0
    total_fn = 0
    frame_metrics = []

    print("OBJECT DETECTION EVALUATION")

    # Make output folder
    vis_dir = f"evaluation_results/{video_name.strip('_')}_task2"
    os.makedirs(vis_dir, exist_ok=True)

    for frame_num in eval_frames:
        # Frames beyond the end of the detections cannot be scored
        if frame_num >= len(all_detections):
            continue

        # Get predictions
        det = all_detections[frame_num]
        pred_boxes = det["boxes"]
        pred_labels = det["labels"]

        # GT file
        gt_file = Path(gt_dir) / f"task1_frame_{frame_num}.txt"

        if not gt_file.exists():
            print(f"Warning: No GT for frame {frame_num}")
            continue

        # Evaluate frame
        metrics, is_tp, gt_boxes, gt_class_ids = evaluate_detections_for_frame(
            pred_boxes,
            gt_file,
            img_width,
            img_height,
            iou_thresh,
            pred_class_ids=pred_labels,
        )

        total_tp += metrics["tp"]
        total_fp += metrics["fp"]
        total_fn += metrics["fn"]

        frame_metrics.append({"frame": frame_num, **metrics})

        # Save comparison image
        visualize_iou_comparison(
            video_path,
            frame_num,
            pred_boxes,
            gt_boxes,
            is_tp,
            vis_dir,
            prefix="task2",
            pred_class_ids=pred_labels,
            gt_class_ids=gt_class_ids,
        )

    # Overall metrics come from the summed counts, not from averaging frames
    overall_metrics = calculate_metrics(total_tp, total_fp, total_fn)

    print(f"Total: TP={total_tp} FP={total_fp} FN={total_fn}")
    print(f"  Precision: {overall_metrics['precision']:.4f}  Recall: {overall_metrics['recall']:.4f}  F1: {overall_metrics['f1']:.4f}  Accuracy: {overall_metrics['accuracy']:.4f}")
    print("=" * 60)

    return {"overall": overall_metrics, "per_frame": frame_metrics}


def evaluate_task3_face_detections(
    video_path: str,
    gt_dir: str = "ground_truth_for_task3",
    iou_thresh: float = 0.5,
    eval_frames: Optional[List[int]] = None,
    all_detections: Optional[List[Dict[str, NDArray]]] = None,
) -> Dict[str, Union[Dict[str, float], List[Dict[str, float]]]]:
    """
    Check face detection results (Task 3)

    For every evaluation frame that can be read and has a ground-truth file,
    faces are detected on that frame, scored against the ground truth, and a
    side-by-side comparison image is written to
    "evaluation_results/<name>_task3". Totals are printed at the end.

    The comparison images are rendered on the sibling video "<name>1.mp4"
    located next to video_path, where <name> is the stem of video_path with
    a trailing "2" removed (e.g. "task2.mp4" -> "task1.mp4").

    Args:
        video_path: Video file the faces are detected on
        gt_dir: Ground truth folder (files named "task1_frame_<n>.txt")
        iou_thresh: IoU threshold (default: 0.5)
        eval_frames: Frames to check (default: EVAL_FRAMES)
        all_detections: Object detections to get person boxes

    Returns:
        {"overall": metrics over all evaluated frames,
         "per_frame": list of per-frame metrics, each with a "frame" key}
    """
    # Use default if not provided
    if eval_frames is None:
        eval_frames = EVAL_FRAMES

    # Strip only the trailing "2" (task2 suffix) and keep the input's folder
    # so the comparison video is looked up next to the evaluated one.
    source = Path(video_path)
    stem = source.stem
    video_name = stem[:-1] if stem.endswith("2") else stem
    comparison_video = str(source.parent / f"{video_name}1.mp4")

    total_tp = 0
    total_fp = 0
    total_fn = 0
    frame_metrics = []

    print("FACE DETECTION EVALUATION")

    # Make output folder
    vis_dir = f"evaluation_results/{video_name}_task3"
    os.makedirs(vis_dir, exist_ok=True)

    vid = cv2.VideoCapture(video_path)
    try:
        img_width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        img_height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

        for frame_num in eval_frames:
            # Get frame
            vid.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = vid.read()

            if not ret:
                continue

            # Check for ground truth before detecting: without it the frame
            # cannot be scored, so running the detector would be wasted work.
            gt_file = Path(gt_dir) / f"task1_frame_{frame_num}.txt"

            if not gt_file.exists():
                print(f"No GT for frame {frame_num}")
                continue

            # Get person boxes from object detections (COCO class 1 = person)
            person_boxes = []
            if all_detections is not None and frame_num < len(all_detections):
                det = all_detections[frame_num]
                person_boxes = [
                    det["boxes"][i]
                    for i, label in enumerate(det["labels"])
                    if label == 1
                ]

            # Detect faces
            face_detections = facial_recognition_with_cascades(
                frame, person_boxes=person_boxes if person_boxes else None
            )

            # Convert [x, y, w, h] to corner format [tlx, tly, brx, bry]
            pred_boxes = [
                [x, y, x + w, y + h]
                for x, y, w, h in (face["box"] for face in face_detections)
            ]
            pred_boxes = (
                np.array(pred_boxes) if pred_boxes else np.array([]).reshape(0, 4)
            )

            # Class 5 = face
            pred_class_ids = (
                np.full(len(pred_boxes), 5) if len(pred_boxes) > 0 else np.array([])
            )

            # Evaluate
            metrics, is_tp, gt_boxes, gt_class_ids = evaluate_detections_for_frame(
                pred_boxes,
                gt_file,
                img_width,
                img_height,
                iou_thresh,
                pred_class_ids=pred_class_ids,
            )

            total_tp += metrics["tp"]
            total_fp += metrics["fp"]
            total_fn += metrics["fn"]

            frame_metrics.append({"frame": frame_num, **metrics})

            # Save comparison
            visualize_iou_comparison(
                comparison_video,
                frame_num,
                pred_boxes,
                gt_boxes,
                is_tp,
                vis_dir,
                prefix="task3",
                pred_class_ids=pred_class_ids,
                gt_class_ids=gt_class_ids,
            )
    finally:
        # Release the capture even if detection or evaluation raises
        vid.release()

    # Overall metrics come from the summed counts, not from averaging frames
    overall_metrics = calculate_metrics(total_tp, total_fp, total_fn)

    print(f"Total: TP={total_tp} FP={total_fp} FN={total_fn}")
    print(f"  Precision: {overall_metrics['precision']:.4f}  Recall: {overall_metrics['recall']:.4f}  F1: {overall_metrics['f1']:.4f}  Accuracy: {overall_metrics['accuracy']:.4f}")
    print("=" * 60)

    return {"overall": overall_metrics, "per_frame": frame_metrics}


In [None]:

# Export frames for Ground Truth annotation
# video_name = "task"
# FOR_TASK = "1"
# export_specific_frames(
#     video_path=(f"{video_name}{FOR_TASK}.mp4"),
#     output_dir=(f"ground_truth_frames/{video_name}{FOR_TASK}"),
#     frame_numbers=EVAL_FRAMES,
#     prefix=f"{video_name.strip('_')}{FOR_TASK}_frame",
# )

In [None]:
## Run parameters ##
# Base name of the videos; the cells below build "<video_name>1.mp4",
# "<video_name>2.mp4" and "<video_name>3.mp4" from it.
video_name = "task"
# When False, the ground-truth detection evaluations are skipped.
evaluate = True
# Frames to evaluate; None lets the evaluation functions fall back to EVAL_FRAMES.
eval_frames = None

## DETECTION PARAMETERS OVERRIDES ##
# MaskRCNN settings
# NOTE(review): presumably the minimum detection score kept by the object
# detector defined earlier in the file -- confirm against its usage.
RCNN_SCORE_THRESHOLD = 0.7


## Task 2

### Task 2 - Object detection and tracking

In [None]:
# Tracking method used to associate detections between frames
association_method = "centroid_distance"  # or 'iou'

# Input video for Task 2
video_path = f"./{video_name}1.mp4"

# Per-frame object detections; reused below for tracking, for the Task 2
# evaluation and for the person boxes in Task 3.
all_object_detections = object_detections(video_path, video_name)

In [None]:
# Link the per-frame detections into tracks with the chosen association method
print(f"Object Tracking Method: {association_method}")
all_tracks = do_tracking(
    all_object_detections, association_method=association_method
)

# NOTE: Track IDs are already consecutive (0, 1, 2, ...)
# They come from enumerate() in draw_tracks_on_video()
# (original author's note; draw_tracks_on_video is defined elsewhere in the file)
print(f"Total tracks after filtering: {len(all_tracks)}")

# Render the tracks onto the video
print("\nMaking tracked video...")
draw_tracks_on_video(video_path, all_tracks, video_name=video_name)

print("\nTask 2 done!")

### Task 2 - Evaluate Accuracy

In [None]:
# Evaluate Task 2 object detections with ground truth
# (skipped when `evaluate` is False or the ground-truth folder is missing)
if evaluate and Path("ground_truth_for_task2").exists():
    evaluate_task2_detections(
        all_object_detections,
        video_name,
        gt_dir="ground_truth_for_task2",
        iou_thresh=0.5,
        eval_frames=eval_frames,
    )

# Evaluate track quality (track-length statistics; needs no ground-truth files)
# NOTE(review): this part runs regardless of `evaluate`, and fps is fixed at 30
# here rather than read from the video -- confirm both are intended.
print("OBJECT TRACKING EVALUATION")
track_quality = evaluate_track_quality(all_tracks, fps=30)
print(f"Total Tracks:              {track_quality['total_tracks']}")
print(f"Average Track Length:      {track_quality['avg_track_length_frames']:.1f} frames ({track_quality['avg_track_length_seconds']:.2f}s)")
print(f"Median Track Length:       {track_quality['median_track_length_frames']:.0f} frames ({track_quality['median_track_length_seconds']:.2f}s)")


# Evaluate tracking accuracy against ground truth
# Compares only the number of tracks with the expected number of distinct objects
print("\nTracking Accuracy (vs Ground Truth):")
tracking_acc = evaluate_tracking_accuracy(
    detected_tracks=track_quality['total_tracks'],
    ground_truth_tracks=GT_DIFFERENT_TRACKS_IN_VIDEO
)
print(f"  Precision: {tracking_acc['precision']:.4f}  Recall: {tracking_acc['recall']:.4f}  F1: {tracking_acc['f1']:.4f}  Accuracy: {tracking_acc['accuracy']:.4f}")


## Task 3

### Task 3 - Face detection and tracking

In [None]:
## DETECTION PARAMETERS OVERRIDES ##
# Haar cascade settings
# NOTE(review): presumably read as module-level globals by
# facial_recognition_with_cascades() (defined elsewhere in the file) and
# forwarded to the cascade detector -- confirm against that function.
SCALE_FACTOR = 1.1
MIN_NEIGHBORS = 3
MIN_SIZE = (30, 30)
MAX_SIZE = (100, 100)

In [None]:
# Task 3: detect faces on the original video, link them into tracks, then draw
# the tracks on top of the Task 2 output video.
print("\nTask 3: Faces detection and tracking...")
input_video_detection_path = f"{video_name}1.mp4"  # faces are detected here
video_task2_path = f"{video_name}2.mp4"  # frames the face boxes are drawn on
output_video_path = f"{video_name}3.mp4"  # result of this cell


current_video = cv2.VideoCapture(input_video_detection_path)
video_length = int(current_video.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(current_video.get(cv2.CAP_PROP_FPS))
width = int(current_video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(current_video.get(cv2.CAP_PROP_FRAME_HEIGHT))

# First pass: collect all face detections
print("Detecting faces in all frames...")
all_face_detections = []

for i in range(video_length):
    # Read frame
    ret, frame = current_video.read()
    if not ret:
        break

    # Get person boxes for this frame
    person_boxes = []
    if all_object_detections is not None and i < len(all_object_detections):
        det = all_object_detections[i]
        # COCO class 1 = person
        for idx, label in enumerate(det["labels"]):
            if label == 1:
                person_boxes.append(det["boxes"][idx])

    # Detect faces
    face_detections = facial_recognition_with_cascades(
        frame, person_boxes=person_boxes if person_boxes else None
    )

    all_face_detections.append(face_detections)

current_video.release()
# Report the frames actually processed; the loop above stops early when a
# frame cannot be read, so this can be lower than the container's frame count.
print(f"Detected faces in {len(all_face_detections)} frames")

# Track faces across frames
print("Tracking faces...")
face_tracks = do_face_tracking(all_face_detections, association_method="centroid_distance")
print(f"Created {len(face_tracks)} face tracks")

# Second pass: draw tracked faces on video
print("\nDrawing tracked faces on video...")
current_video = cv2.VideoCapture(video_task2_path)
fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
vid_out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

for i in range(video_length):
    # Read frame
    ret, frame = current_video.read()
    if not ret:
        break

    # Draw all active face tracks
    for track_id, track in enumerate(face_tracks):
        # A track stores one box per consecutive frame starting at
        # start_frame, so the offset into it tells whether it is active here.
        start_frame = track["start_frame"]
        inner_idx = i - start_frame

        if 0 <= inner_idx < len(track["boxes"]):
            # Get box [tlx, tly, brx, bry]
            tlx, tly, brx, bry = track["boxes"][inner_idx].astype(np.int32)
            cascade_name = track["cascade_names"][inner_idx]

            # Color by track ID; wrap around using the length of the palette
            # that is actually indexed (COLOURS2, not COLOURS).
            colour = COLOURS2[track_id % len(COLOURS2)]

            # Draw box
            cv2.rectangle(frame, (tlx, tly), (brx, bry), color=colour, thickness=2)

            # Make label
            text = f"ID:{track_id} face:{cascade_name}"
            (text_width, text_height), _ = cv2.getTextSize(
                text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
            )

            # Filled background just above the box so the text stays readable
            cv2.rectangle(
                frame,
                (tlx, tly - text_height - 4),
                (tlx + text_width, tly),
                colour,
                -1,
            )

            # Text
            cv2.putText(
                frame,
                text,
                (tlx, tly - 2),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (255, 255, 255),
                1,
                cv2.LINE_AA,
            )

    vid_out.write(frame)

current_video.release()
vid_out.release()
print(f"Face tracking video saved to {output_video_path}")

### Task 3 - Evaluate Accuracy

In [None]:
# Evaluate Task 3 face detections with ground truth
# (skipped when `evaluate` is False or the ground-truth folder is missing)
# NOTE(review): evaluation frames are read from `video_task2_path`, while the
# face detection pass above reads `input_video_detection_path`. If the Task 2
# video already has object boxes drawn on it, the evaluated detections may
# differ from the ones that were tracked -- confirm this is intended.
if evaluate and Path("ground_truth_for_task3").exists():
    evaluate_task3_face_detections(
        video_task2_path,
        gt_dir="ground_truth_for_task3",
        iou_thresh=0.5,
        eval_frames=eval_frames,
        all_detections=all_object_detections,
    )


# Evaluate face track quality (track-length statistics; runs regardless of
# `evaluate`). `fps` is the frame rate read from the input video in the Task 3
# cell above.
print("FACE TRACKING QUALITY EVALUATION")
face_track_quality = evaluate_track_quality(face_tracks, fps=fps)
print(f"Total Tracks:              {face_track_quality['total_tracks']}")
print(f"Average Track Length:      {face_track_quality['avg_track_length_frames']:.1f} frames ({face_track_quality['avg_track_length_seconds']:.2f}s)")
print(f"Median Track Length:       {face_track_quality['median_track_length_frames']:.0f} frames ({face_track_quality['median_track_length_seconds']:.2f}s)")

# Evaluate face tracking accuracy against ground truth
# Compares only the number of face tracks with the expected number of faces
print("\nFace Tracking Accuracy (vs Ground Truth):")
face_tracking_acc = evaluate_tracking_accuracy(
    detected_tracks=face_track_quality['total_tracks'],
    ground_truth_tracks=GT_FACES_TRACKS_IN_VIDEO
)
print(f"  Precision: {face_tracking_acc['precision']:.4f}  Recall: {face_tracking_acc['recall']:.4f}  F1: {face_tracking_acc['f1']:.4f}  Accuracy: {face_tracking_acc['accuracy']:.4f}")
