In [2]:
import cv2
import numpy as np
import pandas as pd
import os
import time
import shutil

In [7]:
def extract_frames(video_path, output_folder, duration=30):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    end_frame = min(int(duration * video_fps), total_frames)
    frame_index = 0
    while cap.isOpened() and frame_index < end_frame:
        ret, frame = cap.read()
        if not ret:
            break
        output_path = f"{output_folder}/frame_{frame_index:05d}.jpg"
        cv2.imwrite(output_path, frame)
        frame_index += 1
    cap.release()
    cv2.destroyAllWindows()

In [8]:
video_path = "input_video.mp4"
output_folder = "./output_frames"
duration = 30
extract_frames(video_path, output_folder, duration)

In [20]:
def detect_faces_in_frames(frames_folder, output_folder):
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    total_time = 0
    frame_count = 0
    for filename in os.listdir(frames_folder):
        if filename.endswith(".jpg"):
            frame_path = os.path.join(frames_folder, filename)
            frame = cv2.imread(frame_path)
            start_time = time.time()
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            for (x, y, w, h) in faces:
                cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
            output_path = os.path.join(output_folder, filename)
            cv2.imwrite(output_path, frame)
            end_time = time.time()
            frame_processing_time = end_time - start_time
            total_time += frame_processing_time
            frame_count += 1
    average_time_per_frame = total_time / frame_count if frame_count > 0 else 0
    print("Average time per frame:", average_time_per_frame, "seconds")

In [21]:
def detect_faces_in_frames_coloured(frames_folder, output_folder):
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    total_time = 0
    frame_count = 0
    for filename in os.listdir(frames_folder):
        if filename.endswith(".jpg"):
            frame_path = os.path.join(frames_folder, filename)
            frame = cv2.imread(frame_path)
            start_time = time.time()
            faces = face_cascade.detectMultiScale(frame, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            for (x, y, w, h) in faces:
                cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
            output_path = os.path.join(output_folder, filename)
            cv2.imwrite(output_path, frame)
            end_time = time.time()
            frame_processing_time = end_time - start_time
            total_time += frame_processing_time
            frame_count += 1
    average_time_per_frame = total_time / frame_count if frame_count > 0 else 0
    print("Average time per frame coloured:", average_time_per_frame, "seconds")

In [26]:
frames_folder = "output_frames"
output_folder = "output_frames_with_faces"
output_folder_coloured = "output_frames_with_faces_coloured"
detect_faces_in_frames(frames_folder, output_folder)
detect_faces_in_frames_coloured(frames_folder, output_folder_coloured)

Average time per frame: 0.08027088459741755 seconds
Average time per frame coloured: 0.0893887150436847 seconds


In [27]:
def frames_to_video(frames_folder, output_video_path, fps=30):
    frame_files = [f for f in os.listdir(frames_folder) if f.endswith('.jpg')]
    frame_files.sort()
    frame_path = os.path.join(frames_folder, frame_files[0])
    frame = cv2.imread(frame_path)
    height, width, _ = frame.shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
    for filename in frame_files:
        frame_path = os.path.join(frames_folder, filename)
        frame = cv2.imread(frame_path)
        out.write(frame)
    out.release()
    print(f"Video saved successfully at '{output_video_path}'")

In [28]:
frames_folder = "output_frames_with_faces"
output_video_path = "output_video.mp4"
frames_to_video(frames_folder, output_video_path)
frames_folder = "output_frames_with_faces_coloured"
output_video_path = "output_video_coloured.mp4"
frames_to_video(frames_folder, output_video_path)

Video saved successfully at 'output_video.mp4'
Video saved successfully at 'output_video_coloured.mp4'


We can change the following things in the xml file:

Number of Stages in the Cascade: The cascade classifier in Viola-Jones is a sequence of increasingly complex stages. Each stage filters out a large portion of non-face regions. xml file defines the number of stages. More stages lead to higher accuracy but slower processing time. We can adjust this trade-off by modifying the cascade structure in the xml file.

Minimum Feature Size: This parameter specifies the minimum size of a Haar-like feature considered during detection. A smaller minimum size allows for detecting faces in various scales but requires evaluating more features, impacting speed. The minimum feature size is defined within the xml file.

Search Window Size: The algorithm scans sub-regions of the image (search windows) for faces. A larger search window size covers more area but takes longer to process. The xml file specifies the search window size, and adjusting it can affect detection time.

Boosting Threshold: The AdaBoost algorithm assigns weights to features based on their effectiveness in classifying faces. The boosting threshold determines the minimum weight a weak classifier needs to be included in the final cascade. A lower threshold leads to more features being considered, potentially impacting speed.

In [29]:
def calculate_iou(box1, box2):
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2
    x_left = max(x1, x2)
    y_top = max(y1, y2)
    x_right = min(x1 + w1, x2 + w2)
    y_bottom = min(y1 + h1, y2 + h2)
    intersection_area = max(0, x_right - x_left) * max(0, y_bottom - y_top)
    box1_area = w1 * h1
    box2_area = w2 * h2
    iou = intersection_area / float(box1_area + box2_area - intersection_area)
    return iou

In [48]:
def generate_face_tracks(frames_folder):
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    frame_files = [f for f in os.listdir(frames_folder) if f.endswith('.jpg')]
    frame_files.sort()
    tracks = {}
    final_tracks = {}
    frame_number = 0
    main_track_id = 0
    for filename in frame_files:
        frame_path = os.path.join(frames_folder, filename)
        print(frame_path)
        frame = cv2.imread(frame_path)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
        for track_id in list(tracks.keys()):
            last_frame_number, last_box = tracks[track_id][-1]
            best_iou = 0
            best_box_index = -1
            for i, (x, y, w, h) in enumerate(faces):
                iou = calculate_iou(last_box, (x, y, w, h))
                if iou > best_iou:
                    best_iou = iou
                    best_box_index = i
            if best_iou > 0.5:
                print("HI")
                print(best_iou)
                x, y, w, h = faces[best_box_index]
                tracks[track_id].append((frame_number, (x, y, w, h)))
            else:
                item_to_move = tracks.pop(track_id)
                final_tracks[track_id] = item_to_move
        for i, (x, y, w, h) in enumerate(faces):
            is_new_track = True
            for track_id in tracks:
                last_frame_number, last_box = tracks[track_id][-1]
                iou = calculate_iou(last_box, (x, y, w, h))
                if iou > 0.5:
                    is_new_track = False
                    break
            if is_new_track:
                print("hi")
                print(main_track_id)
                tracks[main_track_id] = [(frame_number, (x, y, w, h))]
                main_track_id += 1
        frame_number += 1
    final_tracks.update(tracks)
    return final_tracks

In [49]:
frames_folder = "output_frames"
unique_tracks = generate_face_tracks(frames_folder)

output_frames\frame_00000.jpg
output_frames\frame_00001.jpg
hi
0
output_frames\frame_00002.jpg
HI
0.8089648945775134
output_frames\frame_00003.jpg
HI
0.9130019657399607
output_frames\frame_00004.jpg
output_frames\frame_00005.jpg
output_frames\frame_00006.jpg
output_frames\frame_00007.jpg
output_frames\frame_00008.jpg
hi
1
output_frames\frame_00009.jpg
hi
2
output_frames\frame_00010.jpg
output_frames\frame_00011.jpg
hi
3
output_frames\frame_00012.jpg
HI
0.92340087890625
hi
4
output_frames\frame_00013.jpg
HI
0.9367640361720122
output_frames\frame_00014.jpg
HI
0.8925619834710744
output_frames\frame_00015.jpg
HI
0.8631459398069279
output_frames\frame_00016.jpg
HI
0.8604090672225118
hi
5
output_frames\frame_00017.jpg
HI
0.9833333333333333
output_frames\frame_00018.jpg
HI
0.9672153541424766
output_frames\frame_00019.jpg
HI
0.8920808440320326
output_frames\frame_00020.jpg
HI
0.8410987482614742
output_frames\frame_00021.jpg
HI
0.9262594027809437
hi
6
output_frames\frame_00022.jpg
HI
0.85082066

In [50]:
print(unique_tracks)

{0: [(1, (226, 213, 131, 131)), (2, (215, 211, 130, 130)), (3, (218, 213, 131, 131))], 1: [(8, (500, 164, 57, 57))], 2: [(9, (225, 213, 130, 130))], 4: [(12, (534, 176, 46, 46))], 5: [(16, (544, 148, 66, 66))], 6: [(21, (404, 90, 141, 141))], 7: [(23, (508, 147, 70, 70))], 8: [(27, (272, 322, 52, 52))], 3: [(11, (228, 213, 128, 128)), (12, (230, 217, 123, 123)), (13, (234, 217, 121, 121)), (14, (228, 214, 123, 123)), (15, (237, 217, 120, 120)), (16, (229, 217, 119, 119)), (17, (230, 217, 119, 119)), (18, (228, 217, 121, 121)), (19, (233, 216, 123, 123)), (20, (222, 211, 129, 129)), (21, (217, 209, 131, 131)), (22, (206, 204, 140, 140)), (23, (190, 211, 128, 128)), (24, (177, 210, 129, 129)), (25, (168, 214, 127, 127)), (26, (159, 212, 130, 130)), (27, (157, 214, 127, 127)), (28, (148, 211, 131, 131)), (29, (143, 210, 131, 131)), (30, (141, 210, 127, 127)), (31, (134, 207, 133, 133)), (32, (133, 209, 129, 129)), (33, (130, 211, 127, 127)), (34, (130, 212, 127, 127)), (35, (126, 210, 131

In [51]:
def draw_bbox_and_id(frame, bbox, track_id):
    x, y, w, h = bbox
    cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(frame, f'Track ID: {track_id}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return frame

In [58]:
input_folder = "output_frames/"
output_folder = "output_frames_with_faces_with_tracking/"
file_count = len([name for name in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, name))])
file_dict = {i: 0 for i in range(file_count)}
for track_id, bboxes in unique_tracks.items():
    for frame_number, bbox in bboxes:
        if file_dict[frame_number]==0:
            frame_path = f"{input_folder}frame_{frame_number:05d}.jpg"
            file_dict[frame_number] += 1
        else:
            frame_path = f"{output_folder}frame_{frame_number:05d}.jpg"
        frame = cv2.imread(frame_path)
        frame = draw_bbox_and_id(frame, bbox, track_id)
        output_path = f"{output_folder}frame_{frame_number:05d}.jpg"
        cv2.imwrite(output_path, frame)
print("All frames processed and saved.")

All frames processed and saved.


In [60]:
folder_with_faces = 'output_frames_with_faces_with_tracking'
source_folder = 'output_frames'
faces_files = os.listdir(folder_with_faces)
source_files = os.listdir(source_folder)
for file in source_files:
    if file not in faces_files:
        source_path = os.path.join(source_folder, file)
        destination_path = os.path.join(folder_with_faces, file)
        shutil.copyfile(source_path, destination_path)

In [61]:
frames_folder = "output_frames_with_faces_with_tracking"
output_video_path = "output_video_tracking.mp4"
frames_to_video(frames_folder, output_video_path)

Video saved successfully at 'output_video_tracking.mp4'


Some of the face tracks are good but the others are bad.

We observe that many people are being tracked in a single frame.

We also notice that some of the detections are random and do not correspond to faces.

Yes, a unique person is associated to the same track only if it's detected in each of the frames. Otherwise we will have to use other techniques like kalman filters for the same.

In [2]:
import os
import shutil
import random

original_train_images_dir = "archive/images/train/"
original_val_images_dir = "archive/images/val/"
original_train_labels_dir = "archive/labels/train/"
original_val_labels_dir = "archive/labels/val/"

new_train_images_dir = "new_dataset/images/train/"
new_val_images_dir = "new_dataset/images/val/"
new_train_labels_dir = "new_dataset/labels/train/"
new_val_labels_dir = "new_dataset/labels/val/"

os.makedirs(new_train_images_dir, exist_ok=True)
os.makedirs(new_val_images_dir, exist_ok=True)
os.makedirs(new_train_labels_dir, exist_ok=True)
os.makedirs(new_val_labels_dir, exist_ok=True)

selected_images = random.sample(os.listdir(original_train_images_dir), 100)

for image in selected_images:
    src_image = os.path.join(original_train_images_dir, image)
    dest_image = os.path.join(new_train_images_dir, image)
    shutil.copy(src_image, dest_image)

    label_filename = os.path.splitext(image)[0] + ".txt"
    src_label = os.path.join(original_train_labels_dir, label_filename)
    dest_label = os.path.join(new_train_labels_dir, label_filename)
    shutil.copy(src_label, dest_label)

for file_name in os.listdir(original_val_images_dir):
    src_val_image = os.path.join(original_val_images_dir, file_name)
    dest_val_image = os.path.join(new_val_images_dir, file_name)
    shutil.copy(src_val_image, dest_val_image)

    label_filename = os.path.splitext(file_name)[0] + ".txt"
    src_val_label = os.path.join(original_val_labels_dir, label_filename)
    dest_val_label = os.path.join(new_val_labels_dir, label_filename)
    shutil.copy(src_val_label, dest_val_label)

New dataset created with 100 images in the training set.


The link for the output video is: https://iiitaphyd-my.sharepoint.com/:v:/g/personal/harshit_aggarwal_research_iiit_ac_in/ETUhPau-81ZNtBG9zGKU2vIBH_J--7ZkymX8kbEGtv9b8g?nav=eyJyZWZlcnJhbEluZm8iOnsicmVmZXJyYWxBcHAiOiJPbmVEcml2ZUZvckJ1c2luZXNzIiwicmVmZXJyYWxBcHBQbGF0Zm9ybSI6IldlYiIsInJlZmVycmFsTW9kZSI6InZpZXciLCJyZWZlcnJhbFZpZXciOiJNeUZpbGVzTGlua0NvcHkifX0&e=CpEPJ2