**Object & Face recognition and Tracking**

**Name**: Hriddhi Doley
**Github link**: https://github.com/HriddhiDoley/Computer-Vision-Object-Tracking




# Task 1: Collect a short video & prepare ground truth

* The short video is aligned with the list of MS COCO class names.
  * The video is of 13s length and it is in .mp4 format
  * The video captures a moving person, a sports ball and a hat, plus a few static items: a cup, a teddy bear and a book.
* Ground truth is prepared using the Computer Vision Annotation Tool (CVAT).
* The output of CVAT is in .xml format; the code below extracts the frame, object ID and bounding boxes and exports them to a .csv file for later evaluation.

In [None]:
# Import libraries

import copy
import math
import numpy as np
import cv2
import torch
import torchvision
from torchvision.transforms import functional as tvtf
from google.colab.patches import cv2_imshow  # Use this for displaying images in Colab
import xml.etree.ElementTree as ET
import csv
import pandas as pd

## Forming the COCO Dataset class label
* Source: https://github.com/amikelive/coco-labels/blob/master/coco-labels-paper.txt

In [None]:
# COCO dataset class labels
# Position in this list is significant: entry 0 is a 'background' placeholder
# and is followed by 91 class names, so an integer label id can be used
# directly as an index (see COCO_INSTANCE_CATEGORY_NAMES[label_id] in Task 2).
# Do not reorder or remove entries.
COCO_INSTANCE_CATEGORY_NAMES = [
    'background',  # Added as index 0 for alignment with object detection models
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat',
    'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie',
    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window',
    'desk', 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors',
    'teddy bear', 'hair drier', 'toothbrush', 'hair brush'
]


## Extract bounding boxes coordinates and frames from .xml generated by CVAT

* Extract frame no, object ID and bounding boxes coordinates from the .xml file and dump these in a .csv file for future evaluation.
  *  Input is annotations.xml
  *  Output is groundtruth.csv, This file is later divided by objects for individual evaluations.


In [None]:
# Flatten the track/box annotations of an XML file into a CSV table
def xml_to_csv(xml_file, csv_file):
    """Write one CSV row per ``<box>`` element found in an annotation XML file.

    Args:
        xml_file: Path of the XML file to read; it is parsed with ``ET.parse``.
        csv_file: Path of the CSV file to create. An existing file is
            overwritten.

    The output starts with a header row. Each following row combines the
    ``id``/``label``/``source`` attributes of a ``<track>`` child of the
    document root with the attributes of one of its ``<box>`` children.
    An attribute that is absent from an element produces an empty cell.
    """
    track_fields = ('id', 'label', 'source')
    box_fields = ('frame', 'keyframe', 'outside', 'occluded',
                  'xtl', 'ytl', 'xbr', 'ybr', 'z_order')

    # Parse first, so a malformed XML file fails before the CSV is touched
    annotation_root = ET.parse(xml_file).getroot()

    with open(csv_file, 'w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(['track_id', 'label', 'source', *box_fields])

        for track in annotation_root.findall('track'):
            # Track-level values are repeated on every box row of that track
            track_values = [track.get(name) for name in track_fields]
            for box in track.findall('box'):
                box_values = [box.get(name) for name in box_fields]
                writer.writerow(track_values + box_values)

# Execution
# Note: these are module-level names; `csv_file` is re-bound to an open file
# object by the later detection cells.
xml_file = '/content/annotations.xml'  # Path to the XML file
csv_file = 'groundtruth.csv'  # Path to the output CSV file

# Runs immediately when the cell executes: reads xml_file and (over)writes
# csv_file, which is a relative path resolved against the working directory.
xml_to_csv(xml_file, csv_file)

# Task 2: Object Tracking and Recognition

* The model used for object detection is a pre-trained Mask R-CNN
* Objects are detected frame-by-frame in the video (task1.mp4)
* IoU-based association is used to assign unique object IDs and track the objects across frames
* Bounding boxes are drawn on the output video (task2.mp4), each annotated with its object ID, class label and confidence score.
* Confidence threshold of 0.5 is used to filter out low confidence predictions.
* A .csv file is also generated to capture the frame, object ID and bounding box coordinates for evaluation.


## Association method : IoU

In [None]:
# Intersection-over-Union (IoU) of two axis-aligned bounding boxes
def compute_iou(boxA, boxB):
    """Return the IoU of two boxes given as ``[x1, y1, x2, y2]``.

    Args:
        boxA: First box; any indexable of four numbers (list, tuple, array)
            with ``(x1, y1)`` the top-left and ``(x2, y2)`` the bottom-right
            corner.
        boxB: Second box, same layout as ``boxA``.

    Returns:
        Intersection area divided by union area. Widths and heights are
        computed as ``max - min + 1``, i.e. both corner coordinates are
        counted as part of the box.
    """
    # Corners of the intersection rectangle: the innermost edge on each side.
    xA = np.maximum(boxA[0], boxB[0])
    yA = np.maximum(boxA[1], boxB[1])
    xB = np.minimum(boxA[2], boxB[2])
    # Both boxes must take part here; using boxB twice over-estimates the
    # overlap whenever boxA ends above boxB.
    yB = np.minimum(boxA[3], boxB[3])

    # Clamp at 0 so boxes that do not overlap contribute no intersection area.
    interArea = np.maximum(0, xB - xA + 1) * np.maximum(0, yB - yA + 1)

    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # Union = sum of both areas minus the part counted twice.
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

## Object tracking dictionary initialization

In [None]:
# Tracker state, read and modified by update_tracker() as module globals
object_tracker = {}  # object ID -> bounding box most recently assigned to that ID
next_object_id = 0  # ID given to the next box that matches no tracked object
iou_threshold = 0.3  # IoU threshold to associate objects between frames

## CSV logging function

In [None]:
def log_to_csv(frame_num, object_id, bbox, csv_writer, label, score):
    """Append one detection record to the tracking CSV.

    Args:
        frame_num: Index of the frame the detection belongs to.
        object_id: Tracker ID assigned to the detection.
        bbox: Array of four coordinates ``(x1, y1, x2, y2)``; it must provide
            ``astype`` and is truncated to integers before writing.
        csv_writer: Object with a ``writerow`` method (e.g. ``csv.writer``).
        label: Class name of the detection.
        score: Confidence score of the detection.
    """
    left, top, right, bottom = bbox.astype(int)
    record = [frame_num, object_id, label, score, left, top, right, bottom]
    csv_writer.writerow(record)

## Update tracker function

In [None]:
# Associate the current frame's detections with tracked object IDs
def update_tracker(current_boxes):
    """Assign an object ID to every box of the current frame.

    Each box is compared, via ``compute_iou``, with the last known box of
    every tracked object. It takes over the ID of the object with the highest
    IoU above the module-level ``iou_threshold``; if there is none, it gets a
    fresh ID from ``next_object_id``.

    An ID is handed out at most once per call, so two boxes in the same frame
    never share an ID. Boxes are processed in the order given, so earlier
    boxes get first pick.

    Args:
        current_boxes: Iterable of ``[x1, y1, x2, y2]`` boxes for one frame.

    Returns:
        List of object IDs, aligned index-by-index with ``current_boxes``.

    Side effects:
        Updates the module-level ``object_tracker`` and ``next_object_id``.
        Tracked objects are never removed, so an ID can be re-used by a later
        box that overlaps its last known position.
    """
    global next_object_id
    current_ids = []
    claimed_ids = set()  # IDs already assigned to a box of this frame

    for new_box in current_boxes:
        best_id = None
        best_iou = iou_threshold  # a candidate must be strictly above the threshold
        for obj_id, prev_box in object_tracker.items():
            if obj_id in claimed_ids:
                continue
            iou = compute_iou(new_box, prev_box)
            if iou > best_iou:
                best_id, best_iou = obj_id, iou

        if best_id is None:
            # No sufficiently overlapping track: start a new one
            best_id = next_object_id
            next_object_id += 1

        # Written after the scan, so the dict is not modified while iterated
        object_tracker[best_id] = new_box
        claimed_ids.add(best_id)
        current_ids.append(best_id)

    return current_ids

## Preprocessing function

In [None]:
# Turn an OpenCV frame into a model-ready batch tensor
def preprocess(frame):
    """Convert a BGR frame into a one-image batch tensor on ``device``.

    The frame is converted from BGR to RGB channel order, turned into a
    tensor with ``tvtf.to_tensor``, moved to the module-level ``device`` and
    given a leading batch dimension of size 1.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return tvtf.to_tensor(rgb).to(device).unsqueeze(0)

## Execution of the Mask R-CNN model

In [None]:
# Load pre-trained Mask R-CNN model from torchvision (COCO dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True`; kept unchanged so the cell behaves as before — confirm
# against the installed torchvision version.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True).to(device)
model.eval()  # switch to evaluation (inference) mode

# Detections scoring at or below this confidence are discarded
confidence_threshold = 0.5

# Load video
video_input = 'task1.mp4'
cap = cv2.VideoCapture(video_input)

# Get video dimensions and set up output video (same size and frame rate as the input)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter('task2.mp4', fourcc, fps, (width, height))

# CSV file for logging object detection results
csv_file = open('object_tracking_results.csv', mode='w', newline='')

frame_num = 0  # Initialize frame counter

# try/finally: the capture, the writer and the CSV handle are released (and
# buffered rows flushed) even if processing a frame raises.
try:
    csv_writer = csv.writer(csv_file)
    # Write the header for the CSV file
    csv_writer.writerow(['Frame', 'Object ID', 'Label', 'Score', 'x1', 'y1', 'x2', 'y2'])

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # end of video or read failure

        # Preprocess frame and pass through the Mask R-CNN model
        img_tensor = preprocess(frame)
        with torch.no_grad():
            predictions = model(img_tensor)

        # Extract boxes, labels, and scores from predictions
        boxes = predictions[0]['boxes'].cpu().numpy()
        scores = predictions[0]['scores'].cpu().numpy()
        labels = predictions[0]['labels'].cpu().numpy()

        # Filter out low-confidence detections. Boxes, labels and scores are
        # filtered with the SAME mask so the i-th box always stays paired with
        # its own label and score, whatever order the model returns them in.
        keep = scores > confidence_threshold
        current_boxes = list(boxes[keep])
        current_labels = labels[keep]
        current_scores = scores[keep]

        # Update tracker and get object IDs for current frame
        current_ids = update_tracker(current_boxes)

        # Loop over the kept detections and annotate the frame
        for box, obj_id, label_id, score in zip(current_boxes, current_ids, current_labels, current_scores):
            (x1, y1, x2, y2) = box.astype(int)
            label = COCO_INSTANCE_CATEGORY_NAMES[label_id]
            # Draw bounding box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Display object ID, label, and score just above the box
            label_text = f"ID {obj_id}, {label}: {score:.2f}"
            cv2.putText(frame, label_text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Log the detection details into CSV
            log_to_csv(frame_num, obj_id, box, csv_writer, label, score)

        # Write the frame with detections to output video
        out.write(frame)

        # Display the frame (now using cv2_imshow)
        #cv2_imshow(frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # Increment frame counter
        frame_num += 1
finally:
    # Release video capture, writer, and CSV file
    cap.release()
    out.release()
    csv_file.close()
    cv2.destroyAllWindows()


# Task 3 : Face detection and tracking

The faces, eyes, and smiles of the person were detected, and the face object was tracked.
* ***Haar Cascades***: Three different pre-trained Haar Cascade classifiers are loaded :
  * haarcascade_frontalface_default.xml for face detection
  * haarcascade_eye.xml for detecting eyes
  * haarcascade_smile.xml for detecting smiles.
* ***Face Detection***: For each frame, faces are detected using face_cascade.detectMultiScale(). This function returns the coordinates of the bounding boxes around the detected faces.
* ***Eye and Smile Detection***: Inside each detected face, detect eyes and smiles, and draw corresponding bounding boxes inside the face region.
* ***Face Tracking***: Detected faces are tracked across frames using IoU-based matching, associating each face with a unique ID.
* ***Bounding Box Drawing***: Bounding boxes are drawn around detected faces, eyes, and smiles. The ID of each tracked face is displayed above the face bounding box.
* ***CSV Logging***: The face tracking information (frame number, object ID, bounding box coordinates) is logged to a CSV file (face_detection_results.csv).
* ***Output Video***: The video with overlaid face detection results is saved as task3.mp4.




## Object tracking dictionary initialization

In [None]:
# Re-initialise the tracker state read by update_tracker(): IDs restart at 0
# and boxes stored by an earlier run are discarded.
object_tracker = {}  # object ID -> bounding box most recently assigned to that ID
next_object_id = 0  # ID given to the next box that matches no tracked object
iou_threshold = 0.3  # IoU threshold to associate objects between frames

## Execution of the Haar Cascade classifier Face detection model.

In [None]:
# Load pre-trained Haar Cascade Classifiers for face, eye, and smile detection
# The model XML files are looked up in the directory given by cv2.data.haarcascades.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')
smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_smile.xml')

# CSV logging helper for face tracks (no label/score columns)
def log_to_csv(frame_num, object_id, bbox, csv_writer):
    """Append one tracked-box record to the CSV.

    Args:
        frame_num: Index of the frame the box belongs to.
        object_id: Tracker ID assigned to the box.
        bbox: Array of four coordinates ``(x1, y1, x2, y2)``; it must provide
            ``astype`` and is truncated to integers before writing.
        csv_writer: Object with a ``writerow`` method (e.g. ``csv.writer``).
    """
    left, top, right, bottom = bbox.astype(int)
    record = [frame_num, object_id, left, top, right, bottom]
    csv_writer.writerow(record)

# Load video
# NOTE(review): the input is the already-annotated Task 2 output, not the raw
# task1.mp4 — confirm this is intended.
video_input = 'task2.mp4'
cap = cv2.VideoCapture(video_input)

# Get video dimensions and set up output video (same size and frame rate as the input)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter('task3.mp4', fourcc, fps, (width, height))

# CSV file for logging face tracking results
csv_file = open('face_detection_results.csv', mode='w', newline='')

frame_num = 0  # Initialize frame counter

# try/finally: the capture, the writer and the CSV handle are released (and
# buffered rows flushed) even if processing a frame raises.
try:
    csv_writer = csv.writer(csv_file)
    # Write the header for the CSV file
    csv_writer.writerow(['Frame', 'Object ID', 'x1', 'y1', 'x2', 'y2'])

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # end of video or read failure

        # Convert frame to grayscale (required for Haar classifiers)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Detect faces using the Haar cascade (scale factor 1.3, min neighbours 5)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        current_boxes = []

        # Iterate over detected faces
        for (x, y, w, h) in faces:
            # Store as corner coordinates, the layout compute_iou works with
            current_boxes.append([x, y, x+w, y+h])

            # Eyes and smiles are searched only inside the face region.
            # roi_color is a slice of `frame`, so drawing on it draws on the frame.
            roi_gray = gray[y:y+h, x:x+w]
            roi_color = frame[y:y+h, x:x+w]
            eyes = eye_cascade.detectMultiScale(roi_gray)
            smiles = smile_cascade.detectMultiScale(roi_gray, 1.7, 22)

            # Draw face bounding box (drawn over again below, together with the ID)
            cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)

            # Draw eye bounding boxes within the face (coordinates relative to the ROI)
            for (ex, ey, ew, eh) in eyes:
                cv2.rectangle(roi_color, (ex, ey), (ex+ew, ey+eh), (0, 255, 0), 2)

            # Draw smile bounding box
            for (sx, sy, sw, sh) in smiles:
                cv2.rectangle(roi_color, (sx, sy), (sx+sw, sy+sh), (0, 0, 255), 2)

        # Update tracker and get object IDs for current frame
        current_ids = update_tracker(current_boxes)

        # Loop over the boxes and assign object IDs
        for box, obj_id in zip(current_boxes, current_ids):
            (x1, y1, x2, y2) = box
            # Draw bounding box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Display object ID just above the box
            label = f"ID {obj_id}"
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Log the detection details into CSV
            log_to_csv(frame_num, obj_id, np.array([x1, y1, x2, y2]), csv_writer)

        # Write the frame with detections to output video
        out.write(frame)

        # Display the frame
        # cv2_imshow(frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # Increment frame counter
        frame_num += 1
finally:
    # Release video capture, writer, and CSV file
    cap.release()
    out.release()
    csv_file.close()
    cv2.destroyAllWindows()


# Task 4: Evaluation / Analyse your object and face detection, recognition and tracking results

**Evaluation method:**
* A .csv file (object_tracking_results.csv) is also generated to capture the frame, object ID and bounding boxes coordinates for the detected objects.
* Ground Truth is prepared using a tool called Computer vision Annotation Tool(CVAT).
* The output of CVAT is in .xml format; the code in Task 1 extracts the frame, object ID and bounding boxes and exports them to a .csv file (groundtruth.csv) for evaluation.
* Comparison of Ground Truth and Detected object/ faces
  * The association method of IoU is used to compute the intersection over union between two bounding boxes.
  * For each object in the ground truth, we check whether there is a corresponding detected object/face in the same frame with an IoU greater than or equal to the threshold (0.5 in this case).
  * If a match is found, it's counted as a true positive.
  * If no match is found, the ground-truth object is counted as a false negative. Conversely, a detection that matches no ground-truth object in its frame is counted as a false positive.
  * Evaluation Metrics:
    * True Positive (TP): Correctly detected faces/objects.
    * False Positive (FP): Detected faces/objects that don’t correspond to any ground truth face/object.
    * False Negative (FN): Faces/objects in the ground truth that were not detected.
    * Precision: Proportion of detected faces/objects that are correct.
    * Recall: Proportion of actual faces/objects that were detected.
    * F1-Score: The harmonic mean of precision and recall, representing the balance between both.




In [None]:
def _group_boxes_by_frame(df):
    """Map each value of the 'Frame' column to its [x1, y1, x2, y2] boxes, in row order."""
    boxes_by_frame = {}
    for frame, x1, y1, x2, y2 in zip(df['Frame'], df['x1'], df['y1'], df['x2'], df['y2']):
        boxes_by_frame.setdefault(frame, []).append([x1, y1, x2, y2])
    return boxes_by_frame


def evaluate_object_detection(object_detection_file, groundtruth_file, iou_threshold=0.5):
    """Compare detected boxes with ground-truth boxes frame by frame.

    Both CSV files must contain the columns ``Frame``, ``x1``, ``y1``, ``x2``
    and ``y2``. Boxes are only compared within the same frame, using
    ``compute_iou(ground_truth_box, predicted_box)``.

    Counting rules:
        * True positive: a ground-truth box for which at least one prediction
          in the same frame reaches ``iou_threshold``.
        * False negative: a ground-truth box with no such prediction.
        * False positive: a predicted box for which no ground-truth box in the
          same frame reaches ``iou_threshold``.

    Matching is not one-to-one: several predictions may match the same
    ground-truth box, and none of them is then counted as a false positive.

    Args:
        object_detection_file: Path of the CSV with the predicted boxes.
        groundtruth_file: Path of the CSV with the ground-truth boxes.
        iou_threshold: Minimum IoU (inclusive) for two boxes to match.

    Returns:
        Dict with the keys "True Positive", "False Positive",
        "False Negative", "Precision", "Recall" and "F1 Score". A ratio whose
        denominator is zero is reported as 0.

    Raises:
        KeyError: If a required column is missing from either file.
    """
    # Load the files into pandas DataFrames
    object_detection_df = pd.read_csv(object_detection_file)
    groundtruth_df = pd.read_csv(groundtruth_file)

    # Group the boxes once per file instead of re-filtering the whole
    # DataFrame for every single row.
    pred_by_frame = _group_boxes_by_frame(object_detection_df)
    gt_by_frame = _group_boxes_by_frame(groundtruth_df)

    # Ground-truth pass: each ground-truth box is either found (TP) or missed (FN)
    true_positive = 0
    false_negative = 0
    for frame, gt_boxes in gt_by_frame.items():
        frame_predictions = pred_by_frame.get(frame, [])
        for gt_bbox in gt_boxes:
            if any(compute_iou(gt_bbox, pred_bbox) >= iou_threshold
                   for pred_bbox in frame_predictions):
                true_positive += 1
            else:
                false_negative += 1

    # Prediction pass: predictions that match no ground-truth box are FPs
    false_positive = 0
    for frame, pred_boxes in pred_by_frame.items():
        frame_groundtruth = gt_by_frame.get(frame, [])
        for pred_bbox in pred_boxes:
            if not any(compute_iou(gt_bbox, pred_bbox) >= iou_threshold
                       for gt_bbox in frame_groundtruth):
                false_positive += 1

    # Calculate Precision, Recall, and F1-Score (0 when the denominator is 0)
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Return evaluation metrics as a dictionary
    return {
        "True Positive": true_positive,
        "False Positive": false_positive,
        "False Negative": false_negative,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score,
    }



## Evaluation of Face Detection and Tracking



In [None]:
# Face evaluation inputs. Both CSVs must provide the Frame, x1, y1, x2, y2
# columns that evaluate_object_detection() reads.
face_detection_file_path = '/content/face_detection_results.csv'
groundtruth_file_path = '/content/groundtruth_face.csv'

# Call the evaluation function
evaluation_results = evaluate_object_detection(face_detection_file_path, groundtruth_file_path)
print(evaluation_results)

{'True Positive': 139, 'False Positive': 1, 'False Negative': 131, 'Precision': 0.9928571428571429, 'Recall': 0.5148148148148148, 'F1 Score': 0.6780487804878049}


## Evaluation of detection and Tracking of the object type Person

In [None]:
# Person evaluation inputs (presumably the per-class subsets of the tracking
# results and ground truth — verify how these files were produced). Both CSVs
# must provide the Frame, x1, y1, x2, y2 columns read by evaluate_object_detection().
person_detection_file_path = '/content/Object_Tracking_Results_Person.csv'
groundtruth_file_path = '/content/groundtruth_person.csv'

# Call the evaluation function
evaluation_results = evaluate_object_detection(person_detection_file_path, groundtruth_file_path)
print(evaluation_results)

{'True Positive': 290, 'False Positive': 131, 'False Negative': 13, 'Precision': 0.6888361045130641, 'Recall': 0.9570957095709571, 'F1 Score': 0.8011049723756907}


## Evaluation of detection and Tracking of the object type Ball

In [None]:
# Ball evaluation inputs (presumably the per-class subsets of the tracking
# results and ground truth — verify how these files were produced). Both CSVs
# must provide the Frame, x1, y1, x2, y2 columns read by evaluate_object_detection().
ball_detection_file_path = '/content/Object_Tracking_Results_Ball.csv'
groundtruth_file_path = '/content/groundtruth_ball.csv'

# Call the evaluation function
evaluation_results = evaluate_object_detection(ball_detection_file_path, groundtruth_file_path)
print(evaluation_results)

{'True Positive': 193, 'False Positive': 0, 'False Negative': 17, 'Precision': 1.0, 'Recall': 0.919047619047619, 'F1 Score': 0.9578163771712159}


## Evaluation of detection and Tracking of the object type Teddy

In [None]:

# Teddy bear evaluation inputs (presumably the per-class subsets of the tracking
# results and ground truth — verify how these files were produced). Both CSVs
# must provide the Frame, x1, y1, x2, y2 columns read by evaluate_object_detection().
teddy_detection_file_path = '/content/Object_Tracking_Results_teddy.csv'
groundtruth_file_path = '/content/groundtruth_teddy.csv'

# Call the evaluation function
evaluation_results = evaluate_object_detection(teddy_detection_file_path, groundtruth_file_path)
print(evaluation_results)

{'True Positive': 391, 'False Positive': 1, 'False Negative': 3, 'Precision': 0.9974489795918368, 'Recall': 0.9923857868020305, 'F1 Score': 0.994910941475827}


## Evaluation of detection and Tracking of the object type book

In [None]:

# Book evaluation inputs (presumably the per-class subsets of the tracking
# results and ground truth — verify how these files were produced). Both CSVs
# must provide the Frame, x1, y1, x2, y2 columns read by evaluate_object_detection().
book_detection_file_path = '/content/Object_Tracking_Results_Book.csv'
groundtruth_file_path = '/content/groundtruth_book.csv'

# Call the evaluation function
evaluation_results = evaluate_object_detection(book_detection_file_path, groundtruth_file_path)
print(evaluation_results)

{'True Positive': 272, 'False Positive': 4, 'False Negative': 124, 'Precision': 0.9855072463768116, 'Recall': 0.6868686868686869, 'F1 Score': 0.8095238095238095}


## Evaluation of detection and Tracking of the object type cup

In [None]:
# Cup evaluation inputs (presumably the per-class subsets of the tracking
# results and ground truth — verify how these files were produced). Both CSVs
# must provide the Frame, x1, y1, x2, y2 columns read by evaluate_object_detection().
cup_detection_file_path = '/content/Object_Tracking_Results_Cup.csv'
groundtruth_file_path = '/content/groundtruth_cup.csv'

# Call the evaluation function
evaluation_results = evaluate_object_detection(cup_detection_file_path, groundtruth_file_path)
print(evaluation_results)

{'True Positive': 394, 'False Positive': 4, 'False Negative': 1, 'Precision': 0.9899497487437185, 'Recall': 0.9974683544303797, 'F1 Score': 0.9936948297604035}
