# 02. IT24102217 – Face Detection and Cleaning
###### Technique: Image loading, grayscale conversion, Haar Cascade detection, cropping, resizing.
###### Problem: The dataset contained unreadable images and photos without detectable faces, which added noise to the training data.
###### Solution:

Skip unreadable images.

Convert images to grayscale.

Detect faces using Haar Cascade.

Crop the first detected face in each image and resize it to 50×50 pixels.

Append only valid faces to dataset.

Flatten each 50×50 face image into a vector of 2,500 features.

###### Why Needed: Guarantees clean, standardized face-only data, improving reliability of training and reducing errors.
###### Visualization: Bar chart showing valid faces collected vs. skipped images (load error / no face).

In [None]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt

# Build the face dataset: walk every label folder, detect a face in each image
# with a Haar cascade, and keep one 50x50 grayscale crop per accepted image.
# Expects `labels`, `data_path` and `category_dict` to be defined by an earlier cell.

# OpenCV does not raise when the cascade XML cannot be read; it silently returns
# an empty classifier that only fails later inside detectMultiScale. Fail fast
# with a readable message instead.
face_classifier = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
if face_classifier.empty():
    raise RuntimeError(
        "Failed to load Haar cascade 'haarcascade_frontalface_default.xml'; "
        "check that the file exists in the working directory."
    )

data = []    # cropped 50x50 grayscale faces, one entry per accepted image
target = []  # label ids, kept strictly parallel to `data`
skipped_load_error = 0
skipped_no_face = 0
skipped_processing_error = 0  # images that failed during crop/resize/label lookup
processed_images_count = 0

for label in labels:
    imgs_path = os.path.join(data_path, label)  # folder holding this category's images
    img_names = os.listdir(imgs_path)

    for img_name in img_names:
        img_path = os.path.join(imgs_path, img_name)
        processed_images_count += 1  # counts every directory entry, readable or not
        img = cv2.imread(img_path)

        # cv2.imread signals an unreadable file by returning None, not by raising.
        if img is None:
            print(f"Warning: Could not load image {img_path}")
            skipped_load_error += 1
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_classifier.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
        )

        # `len(...) == 0` rather than `not faces`: a non-empty result is a NumPy
        # array, whose truth value is ambiguous.
        if len(faces) == 0:
            print(f"No faces detected in {img_name}")
            skipped_no_face += 1
            continue

        # Only the first detection is used, so each image contributes at most
        # one sample and `data` / `target` stay one-to-one with accepted images.
        x, y, w, h = faces[0]

        # Compute everything that can fail BEFORE touching the output lists, so
        # an error can never leave `data` one element longer than `target`.
        try:
            cropped_face = cv2.resize(gray[y:y + h, x:x + w], (50, 50))
            label_id = category_dict[label]
        except (cv2.error, KeyError) as e:
            print(f"Error processing image {img_name}: {e}")
            skipped_processing_error += 1
            continue

        data.append(cropped_face)
        target.append(label_id)

        # Preview of the accepted face (optional; one figure per accepted image).
        plt.imshow(cropped_face, cmap="gray")
        plt.title(f"Detected Face - {label}")
        plt.axis("off")
        plt.show()

print(f"Finished processing. Collected {len(data)} faces.")
print(f"Images skipped due to loading errors: {skipped_load_error}")
print(f"Images skipped due to no face detected: {skipped_no_face}")
print(f"Images skipped due to processing errors: {skipped_processing_error}")
print(f"Total images attempted: {processed_images_count}")


In [None]:
import matplotlib.pyplot as plt

# Summary chart: number of face samples kept versus number of source images
# dropped. Relies on `data`, `skipped_load_error` and `skipped_no_face` left
# behind by the face-collection step.

# Bar heights. Load failures and face-less images are lumped together as the
# "invalid" image sources.
valid_faces_count = len(data)
invalid_images_count = sum((skipped_load_error, skipped_no_face))

# Bar labels and their matching values, in plotting order.
categories = ['Valid Faces Collected', 'Images Skipped (No Face/Load Error)']
counts = [valid_faces_count, invalid_images_count]

# Draw the two-bar chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(categories, counts, color=['green', 'red'])
ax.set_ylabel("Count")
ax.set_title("Distribution of Valid Faces vs. Skipped Images")
plt.xticks(rotation=45, ha='right')  # long category names need slanted labels
fig.tight_layout()
plt.show()

In [None]:
# Convert the collected faces and labels to NumPy arrays and flatten every
# face image into one feature row (n_samples, height * width).
data = np.array(data)
if data.size == 0:
    # An empty list becomes a 1-D array with no image axes to flatten; report
    # that directly instead of failing with a bare IndexError on data.shape[1].
    raise ValueError("No faces were collected; cannot build the feature matrix.")
# -1 lets NumPy infer the per-sample feature count, and leaves an already
# flattened 2-D array unchanged so this cell can be re-run safely.
data = data.reshape(data.shape[0], -1)
target = np.array(target)
print("Flattened data shape:", data.shape)