# HOG + SVM

Train an SVM classifier on HOG features, using the YT Faces dataset for face examples and a separate natural-images dataset for non-face examples.

In [None]:
import os
import random
import cv2
import matplotlib.pyplot as plt
import numpy as np

from yt_faces import YTFacesDataset
from skimage.feature import hog
from skimage import exposure

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import SVC
from imutils.object_detection import non_max_suppression

import seaborn as sns

# Seed Python's built-in `random` module so the random.shuffle calls in this
# notebook are repeatable. NOTE: this does not seed NumPy's separate RNG.
random.seed(42)

In [None]:
# Face examples: YTFacesDataset wrapper rooted at data/yt_faces.
YT_FACES_DATASET = YTFacesDataset(os.path.join("data", "yt_faces"))
# Directory that supplies the non-face (label 0) examples.
NON_FACE_DATASET_PATH = os.path.join("data", "natural_images", "Other")

# NOTE(review): frames_per_recording presumably limits how many frames are
# taken from each recording -- confirm against yt_faces.YTFacesDataset.load.
YT_FACES_DATASET.load(frames_per_recording=10) # ~4 mins to load faces, ~8 mins to pre-process and load splits

### Constants

In [None]:
# Default HOG hyperparameters; they are forwarded to skimage.feature.hog as
# `orientations`, `pixels_per_cell` and `cells_per_block` respectively.
ORIENTATIONS = 9
PIXELS_PER_CELL = (4, 4)
CELLS_PER_BLOCK = (1, 1)

In [None]:
# Visual sanity check: compute the HOG of one sample photograph and show it
# next to the (padded) input image.
sample_image_path = "samples/face_detection/solvay_color.jpg"
image = cv2.imread(sample_image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError(f"Could not read image: {sample_image_path}")

# Pad with black borders so that both dimensions become multiples of 16.
height, width = image.shape[:2]
target_height = ((height + 15) // 16) * 16
target_width = ((width + 15) // 16) * 16
pad_rows = target_height - height
pad_cols = target_width - width
pad_top = pad_rows // 2
pad_left = pad_cols // 2
# When the required padding is odd, the extra pixel goes to the bottom/right
# edge so the padded image really reaches the target size.
image = cv2.copyMakeBorder(
    image,
    pad_top,
    pad_rows - pad_top,
    pad_left,
    pad_cols - pad_left,
    cv2.BORDER_CONSTANT,
    value=(0, 0, 0),
)

fd, hog_image = hog(
    image,
    orientations=ORIENTATIONS,
    pixels_per_cell=PIXELS_PER_CELL,
    cells_per_block=CELLS_PER_BLOCK,
    visualize=True,
    feature_vector=True,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)

ax1.axis('off')
ax1.imshow(image, cmap=plt.cm.gray)
ax1.set_title('Input image')

# Stretch the HOG rendering's intensity range so the gradients are visible.
hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

ax2.axis('off')
ax2.imshow(hog_image_rescaled, cmap=plt.cm.gray)
ax2.set_title('Histogram of Oriented Gradients')
plt.show()

In [None]:
def extract_hog_features(image,
                         orientations=ORIENTATIONS,
                         pixels_per_cell=PIXELS_PER_CELL,
                         cells_per_block=CELLS_PER_BLOCK,
                         visualize=False):
    """
        Compute the HOG descriptor of an image with skimage's hog().

        The descriptor is always computed with L2-Hys block normalisation,
        transform_sqrt=True and feature_vector=True.

        Returns the feature vector alone, or a (features, hog_image) tuple
        when `visualize` is truthy.
    """
    # Options that are identical for both call variants.
    hog_options = dict(
        orientations=orientations,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
        block_norm='L2-Hys',
        transform_sqrt=True,
        feature_vector=True,
    )

    if not visualize:
        return hog(image, **hog_options)

    descriptor, rendering = hog(image, visualize=True, **hog_options)
    return descriptor, rendering

In [None]:
def get_feature_vectors_from_folder(folder, label, **kwargs):

    """
        Load the images in a folder and compute one HOG feature vector per image.

        Adapted from Ghaith's function in baseline.ipynb

        Each file is read as grayscale, resized to 'img_size' and passed to
        extract_hog_features. Files that cv2 cannot decode are skipped, so fewer
        than 'max_images' samples may be returned.

        kwargs could contain these keys:
            - 'max_images': int value, limiting num images to add to the dataset
            - HOG hyperparameters: 'orientations' (int), 'pixels_per_cell', 'cells_per_block'
            - 'img_size': (width, height) every image is resized to, defaults to (480, 480)

        Returns a tuple containing 3 elements:
            i)   a numpy array of feature vectors (one entry per loaded image)
            ii)  a numpy array containing the passed in label repeated once per loaded image
                 i.e. all 1's if we choose positive examples folder (i.e. data/face/Face folder) or all 0's for negative examples
            iii) a list with the full path of every loaded image, in the same order
    """
    # os.listdir returns entries in arbitrary order; sorting first makes the
    # seeded shuffle select the same files on every machine.
    images_paths = sorted(os.listdir(folder))
    random.shuffle(images_paths)

    # Get kwargs, should just default to constants defined earlier
    max_images      = kwargs.get('max_images', None)
    orientations    = kwargs.get('orientations', ORIENTATIONS)
    pixels_per_cell = kwargs.get('pixels_per_cell', PIXELS_PER_CELL)
    cells_per_block = kwargs.get('cells_per_block', CELLS_PER_BLOCK)
    img_size        = kwargs.get('img_size', (480, 480))

    if max_images is not None:
        images_paths = images_paths[:max_images]

    features = []
    paths = []
    for image_path in images_paths:
        full_image_path = os.path.join(folder, image_path)
        img = cv2.imread(full_image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Not a readable image (cv2.imread returns None on failure).
            continue
        img = cv2.resize(img, img_size)
        feature_vec = extract_hog_features(img,
                                        orientations=orientations,
                                        pixels_per_cell=pixels_per_cell,
                                        cells_per_block=cells_per_block)
        features.append(feature_vec)
        paths.append(full_image_path)

    return np.array(features), np.full(len(features), label), paths

In [None]:
# NOTE(review): splits["train"] is used as a folder path below -- presumably
# the directory of pre-processed training faces; confirm in yt_faces.
splits = YT_FACES_DATASET.get_splits()

# Positive class (label 1): HOG vectors for at most 2000 face images.
face_vectors, face_labels, face_paths = get_feature_vectors_from_folder(
    splits["train"], label=1, max_images=2000
)

# Negative class (label 0): capped at the number of face samples so the two
# classes are at most equally sized.
non_face_vectors, non_face_labels, non_face_paths = get_feature_vectors_from_folder(
    NON_FACE_DATASET_PATH, label=0, max_images=len(face_vectors)
)

In [None]:
# Combine face and non-face samples into one feature matrix and label vector.
X = np.vstack((face_vectors, non_face_vectors))
Y = np.hstack((face_labels, non_face_labels))

paths = face_paths + non_face_paths

# Manually shuffling so we can keep track of the list of image_paths as well.
# random.seed() does not affect NumPy, so a seeded NumPy generator is used
# here to make the train/test split reproducible between runs.
shuffled_indices = np.random.default_rng(42).permutation(len(X))

X = X[shuffled_indices]
Y = Y[shuffled_indices]
shuffled_paths = [paths[ind] for ind in shuffled_indices]

# 80% of the shuffled samples for training, the remaining 20% for testing.
split_index = int(len(X) * 0.8)
X_train = X[:split_index]
X_test  = X[split_index:]
Y_train = Y[:split_index]
Y_test  = Y[split_index:]
shuffled_paths_train = shuffled_paths[:split_index]
shuffled_paths_test = shuffled_paths[split_index:]

In [None]:
# Fit an RBF-kernel support vector classifier on the training split.
svm = SVC(kernel="rbf", random_state=42)
svm.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out test split and print summary metrics.
Y_hat = svm.predict(X_test)

print(f"Accuracy: {accuracy_score(Y_test, Y_hat)}")
print("Classification Report:\n", classification_report(Y_test, Y_hat))

In [None]:

# Plot the confusion matrix of the test predictions as an annotated heatmap.
# The label set is the sorted union of true and predicted labels, and it is
# passed explicitly so the matrix rows/columns and the tick labels always
# agree, even if one class is missing from Y_test.
class_labels = np.union1d(Y_test, Y_hat)
cm = confusion_matrix(Y_test, Y_hat, labels=class_labels)

plt.figure(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)

plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
# Show up to five randomly chosen misclassified test images together with
# their true and predicted labels.
incorrect_indices = np.flatnonzero(Y_hat != Y_test).tolist()

# Shuffle with an unseeded RNG so each run shows a different sample, then
# restore the fixed seed used by the rest of the notebook.
random.seed(None)
random.shuffle(incorrect_indices)
random.seed(42)

num_to_display = min(len(incorrect_indices), 5)

if num_to_display == 0:
    # Nothing to plot; avoid creating an empty, zero-height figure.
    print("No misclassified test samples to display.")
else:
    plt.figure(figsize=(15, num_to_display))

    for i, idx in enumerate(incorrect_indices[:num_to_display]):
        true_label = "Face" if Y_test[idx] == 1 else "Non-Face"
        predicted_label = "Face" if Y_hat[idx] == 1 else "Non-Face"
        img = cv2.imread(shuffled_paths_test[idx], cv2.IMREAD_GRAYSCALE)

        plt.subplot(1, num_to_display, i + 1)
        plt.imshow(img, cmap="gray")
        plt.title(f"True: {true_label}, Pred: {predicted_label}")
        plt.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
def pyramid(image, scale=1.5, min_size=(128, 128)):
    """
        Yield `image` followed by successively smaller resized copies.

        Args:
            image: array with shape (height, width, ...) accepted by cv2.resize.
            scale: factor both dimensions are divided by per level; must be > 1.
            min_size: (width, height); iteration stops before producing a level
                that is smaller than this in either dimension.

        Yields:
            The original image first, then each downscaled level.

        Raises:
            ValueError: if `scale` is not greater than 1 (the pyramid would
                never shrink and the generator would never finish).
    """
    if scale <= 1:
        raise ValueError(f"scale must be greater than 1, got {scale}")

    yield image
    while True:
        new_w = int(image.shape[1] / scale)
        new_h = int(image.shape[0] / scale)
        # Decide from the target dimensions before resizing: this skips a
        # wasted resize of the rejected level and never asks cv2 for an
        # empty image.
        too_small = new_h < min_size[1] or new_w < min_size[0]
        if too_small or new_w < 1 or new_h < 1:
            break
        image = cv2.resize(image, (new_w, new_h))
        yield image

def sliding_window(image, step_size, window_size):
    """
        Yield every full-size window of `image` on a regular grid.

        Args:
            image: array indexed as image[row, column].
            step_size: stride in pixels between windows, in both directions.
            window_size: (width, height) of each window.

        Yields:
            (x, y, window) where (x, y) is the top-left corner of the window
            and `window` is the corresponding slice of `image`.
    """
    win_w, win_h = window_size
    # Largest top-left coordinates for which the window still fits entirely.
    last_y = image.shape[0] - win_h
    last_x = image.shape[1] - win_w
    # `+ 1` makes the last valid position inclusive; without it an image the
    # same size as the window would produce no windows at all.
    for y in range(0, last_y + 1, step_size):
        for x in range(0, last_x + 1, step_size):
            yield x, y, image[y:y + win_h, x:x + win_w]

def resize_and_pad(image, target_size):
    """
        Resize `image` to fit inside `target_size` while keeping its aspect
        ratio, then centre it on a black canvas of exactly `target_size`.

        Args:
            image: array with shape (height, width) or (height, width, channels).
            target_size: (width, height) of the returned canvas.

        Returns:
            A uint8 array of shape (target_h, target_w), plus a trailing
            channel axis when the resized image has one.
    """
    h, w = image.shape[:2]
    target_w, target_h = target_size
    scale = min(target_w / w, target_h / h)
    # Clamp to at least one pixel: for extreme aspect ratios the shorter side
    # would otherwise truncate to 0, which cv2.resize rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized_img = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # Give the canvas the same trailing (channel) axes as the resized image so
    # colour input is padded too instead of failing on assignment.
    canvas_shape = (target_h, target_w) + resized_img.shape[2:]
    result = np.zeros(canvas_shape, dtype=np.uint8)
    x_offset = (target_w - new_w) // 2
    y_offset = (target_h - new_h) // 2
    result[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized_img

    return result

def live_face_detector():
    """
        Interactive webcam demo for the trained classifier.

        Streams frames from camera index 0 into a preview window. Pressing
        'q' classifies the current frame with the module-level `svm` (after
        grayscale conversion, resize_and_pad to 480x480 and HOG extraction)
        and shows the captured image, its HOG rendering and the annotated
        frame. Pressing 'e', or a failed frame read, ends the loop. The
        camera and all OpenCV windows are always released on exit.
    """
    camera = cv2.VideoCapture(0)

    if not camera.isOpened():
        print("Error: Could not access the webcam.")
        return

    print("Press q to capture the frame, e to exit the program.")

    capture_key = ord('q')
    exit_key = ord('e')

    try:
        while True:
            grabbed, frame = camera.read()
            if not grabbed:
                break

            cv2.imshow("Camera Feed", frame)
            pressed = cv2.waitKey(1) & 0xFF

            if pressed == exit_key:
                break
            if pressed != capture_key:
                continue

            # Capture requested: run the current frame through the classifier.
            gray = resize_and_pad(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), (480, 480))
            feature_vec, histogram = extract_hog_features(gray, visualize=True)
            decision = svm.decision_function(feature_vec.reshape(1, -1))

            is_face = bool(decision > 0)
            result = "Face" if is_face else "Non-Face"
            print(f"Prediction: {result}, Decision: {decision[0]}")

            # Green text for a face, red otherwise (OpenCV colours are BGR).
            color = (0, 255, 0) if is_face else (0, 0, 255)
            cv2.putText(frame, result, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)

            cv2.imshow("Captured Image", gray)
            cv2.imshow("HOG", histogram)
            cv2.imshow("Prediction", frame)
    finally:
        camera.release()
        cv2.destroyAllWindows()

In [None]:
# Start the interactive webcam demo; this runs until 'e' is pressed or a
# frame can no longer be read from the camera.
live_face_detector()
