# Image Processing and Face Detection
This notebook performs image processing using OpenCV, Tesseract, and PIL to extract text and detect faces from images in a zip file.

## Steps:
1. Extract images from a zip file.
2. Detect faces in the images.
3. Extract text from the images using OCR.
4. Create a contact sheet of the faces detected in each image whose text contains a search keyword.


In [1]:
# importing necessary libraries
import zipfile
from PIL import Image, ImageDraw
import pytesseract
import cv2 as cv
import numpy as np
import os
import shutil
from IPython.display import display

In [2]:
# Module-level Haar cascade classifier for frontal faces, built once when this
# cell runs. cv.data.haarcascades is the path prefix of the cascade XML files
# bundled with the OpenCV package; rec_faces() reuses this shared instance.
face_cascade = cv.CascadeClassifier(cv.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Empty an existing directory, or create it when it is missing
def clear_directory(output_dir):
    """
    Ensures that a directory exists and contains nothing.

    When the path does not exist it is created (including missing parents).
    When it exists, every entry directly inside it is removed: files and
    symbolic links are unlinked, sub-directories are removed recursively.
    The directory itself is kept.

    Args:
        output_dir (str): The path to the directory to be cleared or created.
    """
    # Nothing to clear yet: just create the directory and stop.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        return

    for entry in os.listdir(output_dir):
        target = os.path.join(output_dir, entry)
        try:
            # Links are tested together with files so that a link pointing at
            # a directory is unlinked instead of having its target deleted.
            unlinkable = os.path.isfile(target) or os.path.islink(target)
            if unlinkable:
                os.unlink(target)
            elif os.path.isdir(target):
                shutil.rmtree(target)
        except Exception as err:
            # Best effort: report the entry that failed and keep going.
            print(f'Failed to delete {target}. Reason: {err}')

In [3]:
# Unpack a ZIP archive into a freshly emptied directory
def extract_zip(zip_path, output_dir):
    """
    Unpacks every member of a ZIP archive into the output directory.

    The output directory is emptied (or created) first, before the archive
    is opened, so previous contents never mix with the extracted files.

    Args:
        zip_path (str): The path to the ZIP file.
        output_dir (str): The path to the output directory.
    Returns:
        str: The path to the output directory, unchanged.
    """
    clear_directory(output_dir)
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=output_dir)
    return output_dir


In [4]:
# Return the names of the entries found in a directory
def list_of_files(output_dir):
    """
    Returns the names of all entries directly inside a directory.

    The names come straight from os.listdir, so they are bare names (not
    full paths) and include sub-directories as well as files.

    Args:
        output_dir (str): The path to the directory.
    Returns:
        list: Names of the entries in the directory.
    """
    entries = os.listdir(output_dir)
    return entries

In [5]:
# Function to access files in a directory
def access_file(output_dir):
    """
    Lists the regular files in a directory in a stable, sorted order.

    os.listdir returns entries in arbitrary order and also reports
    sub-directories. Sorting makes repeated calls on the same directory
    return the names in the same order, and skipping sub-directories keeps
    non-file entries away from code that opens every name as an image.

    Args:
        output_dir (str): The path to the directory.
    Returns:
        list: Sorted names (not full paths) of the files directly inside
            the directory.
    """
    return sorted(
        name
        for name in os.listdir(output_dir)
        if os.path.isfile(os.path.join(output_dir, name))
    )

In [6]:
# Function to extract text from images using Tesseract OCR
def read_text_from_files(output_dir, *, binarize=False):
    """
    Reads and extracts text from images in a directory using Tesseract OCR.

    Files are processed in the order returned by access_file, so the result
    lines up index-for-index with that listing.

    Args:
        output_dir (str): The path to the directory containing images.
        binarize (bool): When True, images that are not already in 1-bit
            mode are converted to mode "1" before OCR. Defaults to False,
            which passes each image to Tesseract as opened.
    Returns:
        list: List of text strings extracted from each image.
    """
    list_of_texts = []
    for file in access_file(output_dir):
        # The context manager closes the underlying file once OCR is done.
        with Image.open(os.path.join(output_dir, file)) as pic:
            # Image.convert returns a new image and leaves `pic` untouched,
            # so the converted image must be captured to have any effect.
            if binarize and pic.mode != "1":
                ocr_input = pic.convert("1")
            else:
                ocr_input = pic
            list_of_texts.append(pytesseract.image_to_string(ocr_input))
    return list_of_texts

In [7]:
# Function to detect faces in images
def rec_faces(output_dir):
    """
    Detects faces in images within a specified directory using OpenCV.

    Files are processed in the order returned by access_file, and exactly
    one entry is appended per file, so the result lines up index-for-index
    with that listing.

    Args:
        output_dir (str): The path to the directory containing images.
    Returns:
        list: One list per file holding the detected face boxes as
            [x, y, w, h] lists. The list is empty when no face is found or
            when the file cannot be read as an image.
    """
    list_of_faces = []
    for file in access_file(output_dir):
        cv_image = cv.imread(os.path.join(output_dir, file))
        if cv_image is None:
            # cv.imread reports an unreadable or non-image file by returning
            # None instead of raising. Record "no faces" so the result stays
            # aligned with the file list.
            list_of_faces.append([])
            continue

        gray = cv.cvtColor(cv_image, cv.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.334, minNeighbors=5)
        # Detections arrive as an ndarray; any other return value (the
        # no-detection case) is treated as an empty result.
        if isinstance(faces, np.ndarray):
            list_of_faces.append(faces.tolist())
        else:
            list_of_faces.append([])
    return list_of_faces

In [8]:
# Function to create a contact sheet for detected faces
def ready_contact_sheet(word, output_dir, zip_path):
    """
    Creates a contact sheet of detected faces from images containing a specific keyword.

    The ZIP file is extracted into output_dir, every extracted image is
    scanned for faces and OCR text, and each image whose text contains
    `word` is reported:

    * with faces: "Results found in <file>" is printed and a contact sheet
      of the face thumbnails is displayed;
    * without faces: "Results found but there were no faces in <file>" is
      printed and processing continues with the next image.

    Thumbnails are at most 100x100 pixels and are laid out five per row in
    fixed 100x100 cells. The sheet has as many rows as the number of faces
    requires.

    Args:
        word (str): The keyword to search for in the images' text.
        output_dir (str): The directory to store extracted images.
        zip_path (str): The path to the ZIP file containing images.
    Returns:
        None
    """
    columns = 5  # thumbnails per contact-sheet row
    cell = 100   # edge length in pixels of one contact-sheet cell

    extract_zip(zip_path, output_dir)
    filenames = access_file(output_dir)
    faces_per_file = rec_faces(output_dir)
    text_per_file = read_text_from_files(output_dir)

    # The three lists are index-aligned: one entry per extracted file.
    for filename, face_boxes, text in zip(filenames, faces_per_file, text_per_file):
        if word not in text:
            continue

        # Crop and shrink every detected face while the image file is open.
        thumbnails = []
        with Image.open(os.path.join(output_dir, filename)) as image:
            for (x, y, w, h) in face_boxes:
                face = image.crop((x, y, x + w, y + h))
                face.thumbnail((cell, cell))
                thumbnails.append(face)

        # Handle case where no faces are found: report it and move on to
        # the next file instead of abandoning the remaining files.
        if not thumbnails:
            print("Results found but there were no faces in {}".format(filename))
            continue

        # Create the contact sheet with enough rows for every thumbnail.
        rows = (len(thumbnails) + columns - 1) // columns
        contact_sheet = Image.new(thumbnails[0].mode, (columns * cell, rows * cell))
        for index, thumb in enumerate(thumbnails):
            row, col = divmod(index, columns)
            # Fixed-size cells keep thumbnails of different sizes from
            # overlapping one another.
            contact_sheet.paste(thumb, (col * cell, row * cell))

        print("Results found in {}".format(filename))
        display(contact_sheet)

In [None]:
# Example function call:
# Uncomment the line below to run the function with your parameters
# ready_contact_sheet("Mark", "output_directory", "/path/to/images.zip")