### Imports

In [2]:
import os
import random
import numpy as np
import cv2
import matplotlib.pyplot as plt
import shutil
# Seed Python's `random` module so the random.random() / random.sample()
# draws in the cells below are repeatable when the notebook is run top to bottom.
random.seed(42)

### Dataset Curation and Preprocessing

This code is copied from other_detection_methods.ipynb for clarity. 

We try to train our own cascade classifier using the YouTube Faces (YT Faces) dataset (set up by Ben). The code below parses the npz files and creates a collection of positive examples with annotations:

In [None]:
# Build the positive training set for the cascade classifier: sample frames
# from the YT Faces .npz archives, save each one as a 224x224 JPEG under
# data/positives, and append one "<path> 1 <x> <y> <w> <h>" line per saved
# image to data/pos.txt.
count = 0

positives_dir = os.path.join("data", "positives")
positives_txt_path = os.path.join("data", "pos.txt")

# os.path.exists() accepts a single path, and the second positional argument
# of os.makedirs() is the permission mode, so the path pieces have to be
# joined before they are passed to either call.
if not os.path.exists(positives_dir):
    print("Creating data/positives directory!")
    os.makedirs(positives_dir)

# The context manager guarantees pos.txt is flushed and closed even when a
# file fails to load part-way through the run.
with open(positives_txt_path, "w") as positives_txt:
    for folder_idx in [1, 2, 3, 4]:

        # You should have these directories after running Ben's setup code
        # for yt_faces (e.g. via the corresponding flag in installData.py).
        path = os.path.join("data", "yt_faces", f"youtube_faces_with_keypoints_full_{folder_idx}",
                            f"youtube_faces_with_keypoints_full_{folder_idx}")

        with os.scandir(path) as entries:
            for file in entries:
                # Only .npz archives carry the arrays read below; skip anything
                # else (sub-directories, OS metadata files, ...).
                if not (file.is_file() and file.name.endswith(".npz")):
                    continue

                file_path = os.path.join(path, file.name)

                # Each loaded npz file contains multiple samples (eg, 79 in
                # Aaron_Eckhart_0.npz) in the last dimension. Indexing the
                # archive reads the array into memory, so the archive handle
                # can be released right after.
                with np.load(file_path) as arr:
                    color_images = arr["colorImages"]    # e.g. shape (231, 237, 3, 79)
                    bounding_boxes = arr["boundingBox"]  # e.g. shape (4, 2, 79)

                num_samples = color_images.shape[-1]

                # Iterate over each sample
                for sample_idx in range(num_samples):
                    # Skip 98% of examples since there are ~260k
                    if random.random() <= 0.98:
                        continue
                    count += 1
                    image_frame = color_images[:, :, :, sample_idx]  # (H, W, 3)
                    image_frame = cv2.cvtColor(image_frame, cv2.COLOR_RGB2BGR)
                    bbox = bounding_boxes[:, :, sample_idx]           # (4, 2)

                    # Resize image to 224x224
                    original_height, original_width = image_frame.shape[:2]
                    image_frame = cv2.resize(image_frame, (224, 224))

                    # Using x,y,w,h here as instructed by the training cascade classifier docs.
                    x = int(np.min(bbox[:, 0]))
                    y = int(np.min(bbox[:, 1]))
                    w = int(np.max(bbox[:, 0]) - x)
                    h = int(np.max(bbox[:, 1]) - y)

                    # Rescale bounding box coordinates to the resized image
                    x = int(x * (224 / original_width))
                    y = int(y * (224 / original_height))
                    w = int(w * (224 / original_width))
                    h = int(h * (224 / original_height))

                    img_filename = f"img{count}.jpg"
                    img_filepath = os.path.join(positives_dir, img_filename)
                    cv2.imwrite(img_filepath, image_frame)

                    # VISUALIZATION ONLY
                    # cv2.rectangle() expects x1,y1,x2,y2 (top left and bottom right coordinates)
                    # cv2.rectangle(image_frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    # plt.imshow(cv2.cvtColor(image_frame, cv2.COLOR_BGR2RGB))
                    # plt.axis("off")
                    # plt.show()

                    positives_txt.write(f"data/positives/{img_filename} 1 {x} {y} {w} {h}\n")

print(count)

In [None]:
# Draw a small random subset of the positive annotations so that a quick
# training run can confirm the pipeline works at all.
input_file = os.path.join("data","pos.txt")
output_file = os.path.join("data","pos_subset.txt")
subset_size = 1500

with open(input_file, "r") as f1:
    lines = f1.readlines()

# random.sample() raises ValueError when asked for more items than the
# population holds, so cap the request at the number of available lines
# (the same guard the negatives helper uses).
selected_lines = random.sample(lines, min(subset_size, len(lines)))
if len(selected_lines) < subset_size:
    print(f"Only {len(selected_lines)} positive annotations available (wanted {subset_size}).")

with open(output_file, "w") as f2:
    f2.writelines(selected_lines)


We now create negative examples:

In [None]:
def pick_random_images_and_write_to_negative_txt(src_dir, dest_dir, num_files=2000):
    """Copy a random selection of files into ``dest_dir`` and list them in data/neg.txt.

    Args:
        src_dir: Directory to sample from. Only regular files directly inside
            it are considered; sub-directories are ignored.
        dest_dir: Existing directory the selected files are copied into.
        num_files: Maximum number of files to pick. If ``src_dir`` holds fewer
            files, all of them are used.

    Side effects:
        Appends one line per copied file (the path of the copy) to
        ``data/neg.txt`` and prints a summary. The file is opened in append
        mode, so repeated calls accumulate entries.
    """
    negatives_txt_path = os.path.join("data", "neg.txt")

    all_files = [f for f in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, f))]

    selected_files = random.sample(all_files, min(num_files, len(all_files)))

    # The context manager closes (and therefore flushes) neg.txt as soon as
    # the listing is written instead of leaving the handle open.
    with open(negatives_txt_path, "a") as negatives_txt:
        for file in selected_files:
            # Build the destination once with os.path.join so the path written
            # to neg.txt is exactly the path the file was copied to, using the
            # separator of the current platform.
            dest_path = os.path.join(dest_dir, file)
            shutil.copy(os.path.join(src_dir, file), dest_path)
            negatives_txt.write(f"{dest_path}\n")

    print(f"Copied {len(selected_files)} files from {src_dir} to {dest_dir}.")

In [None]:
# Source folders of the Kaggle dataset, followed by the folder that collects
# the negative examples.
kaggle_house_dir = os.path.join("data", "kaggle_room_street_data", "house_data")
kaggle_street_dir = os.path.join("data", "kaggle_room_street_data", "street_data")
negatives_dir = os.path.join("data","negatives")
"""
This data was taken from: https://www.kaggle.com/datasets/mikhailma/house-rooms-streets-image-dataset/data"
Simply download and extract into /data.

My reasoning for choosing housing/street data as negative examples was because a) these are higher quality than the CIFAR-10 and caltech negative examples, and b) I think faces are more likely to be seen with these backgrounds so maybe it might help for the negative examples to be indicative of actual backgrounds?
"""

# Create the output folder on first use.
negatives_dir_missing = not os.path.exists(negatives_dir)
if negatives_dir_missing:
    print("Creating data/negatives directory!")
    os.makedirs(negatives_dir)

# Sample from the house images first, then from the street images.
for source_dir in (kaggle_house_dir, kaggle_street_dir):
    pick_random_images_and_write_to_negative_txt(source_dir, negatives_dir)

### Training the classifier

The following commands should be run from the root directory of the repo

First, we create a vector file of positive samples from the annotations in `data/pos_subset.txt`:

```bash
opencv_createsamples.exe -info data/pos_subset.txt -w 64 -h 64 -num 3000 -vec data/pos.vec
```
> This tool is part of a collection of tools that can be installed from https://sourceforge.net/projects/opencvlibrary/files/3.4.16/

We can train the classifier using

```bash
opencv_traincascade.exe -data cascade/ -vec data/pos.vec -bg data/neg.txt -w 64 -h 64 -numPos 1000 -numNeg 2000 -numStages 10 -minHitRate 0.99
```
> This tool is also part of the same collection of tools.

Error message (note that this run used `-minHitRate 0.95` rather than the `0.99` shown above):
```sh
PS C:\Users\syeda\OneDrive\Desktop\4th Year\COSC444\face-recognition> C:\Users\syeda\Downloads\opencv\build\x64\vc15\bin\opencv_traincascade.exe -data cascade/ -vec data/pos.vec -bg data/neg.txt -w 64 -h 64 -numPos 1000 -numNeg 2000 -numStages 10 -minHitRate 0.95
PARAMETERS:                                                                                                 7783963ff
cascadeDirName: cascade/
vecFileName: data/pos.vec
bgFileName: data/neg.txt
numPos: 1000
numNeg: 2000
numStages: 10
precalcValBufSize[Mb] : 1024
precalcIdxBufSize[Mb] : 1024
stageType: BOOST
featureType: HAAR
sampleWidth: 64
sampleHeight: 64
boostType: GAB
minHitRate: 0.95
maxFalseAlarmRate: 0.5
weightTrimRate: 0.95
maxDepth: 1
maxWeakCount: 100
mode: BASIC
Number of unique features given windowSize [64,64] : 8103936

===== TRAINING 0-stage =====
<BEGIN
POS count : consumed   1000 : 1000
Train dataset for temp stage can not be filled. Branch training terminated.
Cascade classifier can't be trained. Check the used training parameters.
```