<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Face_Matching_Dataset_Creation_and_Reverse_Image_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install -q pandas opencv-python tqdm

In [3]:
import os
import random
import shutil
import requests
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm
from zipfile import ZipFile
from pathlib import Path

In [4]:
def create_face_test_dataset(output_dir="forensic_face_test", num_images=20, seed=42):
    """
    Create a test dataset for face forensic analysis using VGGFace2 dataset.

    Args:
        output_dir: Output directory for the test dataset
        num_images: Number of images to include (default: 20)
        seed: Random seed for reproducibility
    """
    random.seed(seed)
    np.random.seed(seed)

    # Create directory structure
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "metadata"), exist_ok=True)

    print(f"Creating face test dataset with {num_images} images...")

    # VGGFace2 test set (smaller portion of VGGFace2)
    # Note: In a real setting, you would need to apply for access to this dataset
    # We're providing the code assuming access has been granted
    # URL: https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/

    vggface_dir = os.path.join(output_dir, "vggface2_test")

    # For this example, let's assume the dataset has been downloaded and extracted
    # If you have the dataset downloaded, replace the following comment with actual path
    if not os.path.exists(vggface_dir):
        print("Please download the VGGFace2 test dataset from:")
        print("https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/")
        print(f"Extract it to {vggface_dir}")
        print("The dataset requires application for access.")

        # As an alternative, we can use a small sample from FFHQ dataset
        print("Using Flickr-Faces-HQ (FFHQ) sample as an alternative...")
        ffhq_sample_url = "https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails128x128/00000.png"

        # Create temporary directory
        temp_dir = os.path.join(output_dir, "temp")
        os.makedirs(temp_dir, exist_ok=True)

        # Download a few sample images from FFHQ thumbnails
        sample_images = []
        for i in range(30):  # Download 30 to have enough to choose from
            img_id = str(i).zfill(5)
            img_url = f"https://github.com/NVlabs/ffhq-dataset/raw/master/thumbnails128x128/{img_id}.png"
            img_path = os.path.join(temp_dir, f"{img_id}.png")

            try:
                response = requests.get(img_url, stream=True)
                if response.status_code == 200:
                    with open(img_path, 'wb') as f:
                        for chunk in response:
                            f.write(chunk)
                    sample_images.append((img_path, f"person_{len(sample_images) // 2}", len(sample_images) % 2))
            except Exception as e:
                print(f"Error downloading {img_url}: {e}")

        # Process these images
        selected_images = sample_images[:num_images-1]  # Save one slot for a manipulated image
    else:
        # If VGGFace2 is available, use it
        person_dirs = [d for d in os.listdir(vggface_dir) if os.path.isdir(os.path.join(vggface_dir, d))]

        # Select people with multiple images for identity clustering
        selected_people = random.sample(person_dirs, min(num_images // 2, len(person_dirs)))

        selected_images = []
        for person in selected_people:
            person_dir = os.path.join(vggface_dir, person)
            person_images = os.listdir(person_dir)

            # Take 2 images for each person
            if len(person_images) >= 2:
                selected = random.sample(person_images, 2)
                for img in selected:
                    selected_images.append((os.path.join(person_dir, img), person, selected.index(img)))

    # Process the images
    metadata = []
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    for i, (img_path, person_id, img_index) in enumerate(tqdm(selected_images, desc="Processing images")):
        if i >= num_images - 1:  # Save one slot for manipulated image
            break

        # New file name
        new_filename = f"{i+1:02d}_{person_id}_{Path(img_path).name}"
        dst_path = os.path.join(output_dir, "images", new_filename)

        # Copy the image
        shutil.copy2(img_path, dst_path)

        # Detect faces
        try:
            image = cv2.imread(img_path)
            if image is None:
                print(f"Failed to read image: {img_path}")
                continue

            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.3, 5)

            if len(faces) > 0:
                x, y, w, h = faces[0]
                face_coords = {"x": int(x), "y": int(y), "width": int(w), "height": int(h)}
            else:
                face_coords = {"x": 0, "y": 0, "width": image.shape[1], "height": image.shape[0]}

            # Add to metadata
            metadata.append({
                "filename": new_filename,
                "person_id": person_id,
                "image_index": img_index,
                "source": "vggface2" if os.path.exists(vggface_dir) else "ffhq",
                "face_detected": len(faces) > 0,
                "face_x": face_coords["x"],
                "face_y": face_coords["y"],
                "face_width": face_coords["width"],
                "face_height": face_coords["height"],
                "image_width": image.shape[1],
                "image_height": image.shape[0],
                "has_duplicate": any(m["person_id"] == person_id for m in metadata)
            })
        except Exception as e:
            print(f"Error processing {img_path}: {e}")

    # Create a manipulated version of one image
    if metadata:
        # Pick a random image to manipulate
        original_idx = random.randint(0, len(metadata)-1)
        original_file = os.path.join(output_dir, "images", metadata[original_idx]["filename"])

        # Create a manipulated version
        manipulated_filename = f"manipulated_{metadata[original_idx]['filename']}"
        manipulated_file = os.path.join(output_dir, "images", manipulated_filename)

        # Load the image
        img = cv2.imread(original_file)
        if img is not None:
            # Apply a simple manipulation (face swap or morphing)
            if len(metadata) > 1:
                # Try to find another image of a different person
                other_persons = [i for i, m in enumerate(metadata)
                                if m["person_id"] != metadata[original_idx]["person_id"]]

                if other_persons:
                    # Face swap (simplified version)
                    donor_idx = random.choice(other_persons)
                    donor_file = os.path.join(output_dir, "images", metadata[donor_idx]["filename"])
                    donor_img = cv2.imread(donor_file)

                    if donor_img is not None and metadata[original_idx]["face_detected"] and metadata[donor_idx]["face_detected"]:
                        # Get face regions
                        x1, y1, w1, h1 = (metadata[original_idx]["face_x"], metadata[original_idx]["face_y"],
                                          metadata[original_idx]["face_width"], metadata[original_idx]["face_height"])

                        x2, y2, w2, h2 = (metadata[donor_idx]["face_x"], metadata[donor_idx]["face_y"],
                                          metadata[donor_idx]["face_width"], metadata[donor_idx]["face_height"])

                        # Resize donor face to match original face size
                        donor_face = donor_img[y2:y2+h2, x2:x2+w2]
                        donor_face_resized = cv2.resize(donor_face, (w1, h1))

                        # Create a copy of the original image
                        manipulated_img = img.copy()

                        # Apply face swap with simple blending
                        manipulated_img[y1:y1+h1, x1:x1+w1] = donor_face_resized

                        # Save manipulated image
                        cv2.imwrite(manipulated_file, manipulated_img)
                        manipulation_type = "face_swap"
                    else:
                        # Apply blur as fallback
                        manipulated_img = img.copy()
                        h, w = manipulated_img.shape[:2]
                        face_region = manipulated_img[y1:y1+h1, x1:x1+w1]
                        blurred_face = cv2.GaussianBlur(face_region, (25, 25), 0)
                        manipulated_img[y1:y1+h1, x1:x1+w1] = blurred_face
                        cv2.imwrite(manipulated_file, manipulated_img)
                        manipulation_type = "face_blur"
                else:
                    # Apply blur to face
                    x, y, w, h = (metadata[original_idx]["face_x"], metadata[original_idx]["face_y"],
                                 metadata[original_idx]["face_width"], metadata[original_idx]["face_height"])
                    face_region = img[y:y+h, x:x+w]
                    blurred_face = cv2.GaussianBlur(face_region, (25, 25), 0)
                    img[y:y+h, x:x+w] = blurred_face
                    cv2.imwrite(manipulated_file, img)
                    manipulation_type = "face_blur"

                # Add to metadata
                metadata.append({
                    "filename": manipulated_filename,
                    "original_filename": metadata[original_idx]["filename"],
                    "person_id": f"MANIPULATED_{metadata[original_idx]['person_id']}",
                    "source": "MANIPULATED",
                    "face_detected": metadata[original_idx]["face_detected"],
                    "face_x": metadata[original_idx]["face_x"],
                    "face_y": metadata[original_idx]["face_y"],
                    "face_width": metadata[original_idx]["face_width"],
                    "face_height": metadata[original_idx]["face_height"],
                    "image_width": metadata[original_idx]["image_width"],
                    "image_height": metadata[original_idx]["image_height"],
                    "has_duplicate": True,
                    "manipulation_type": manipulation_type,
                    "original_image": metadata[original_idx]["filename"]
                })

    # Save metadata to CSV
    df = pd.DataFrame(metadata)
    df.to_csv(os.path.join(output_dir, "metadata", "face_images_metadata.csv"), index=False)

    # Create a README file
    with open(os.path.join(output_dir, "README.md"), "w") as f:
        f.write("# Forensic Face Analysis Test Dataset\n\n")
        f.write(f"This dataset contains {len(df)} images for testing face detection, clustering, and forensic analysis.\n\n")
        f.write("## Dataset Statistics\n\n")
        f.write(f"- Total images: {len(df)}\n")
        f.write(f"- Unique individuals: {len(df['person_id'].unique())}\n")
        f.write(f"- Images with detected faces: {df['face_detected'].sum()}\n")
        f.write(f"- Manipulated images: {len(df[df['source'] == 'MANIPULATED'])}\n\n")
        f.write("## Usage Instructions\n\n")
        f.write("This dataset is designed to test face detection, clustering, and manipulation detection capabilities.\n")
        f.write("The metadata CSV contains ground truth information for validation purposes.\n")

    print(f"Dataset created successfully at: {output_dir}")
    print(f"Total images created: {len(df)}")

    # Cleanup temporary files if they exist
    if os.path.exists(os.path.join(output_dir, "temp")):
        shutil.rmtree(os.path.join(output_dir, "temp"))

if __name__ == "__main__":
    create_face_test_dataset(num_images=20)

Creating face test dataset with 20 images...
Please download the VGGFace2 test dataset from:
https://www.robots.ox.ac.uk/~vgg/data/vgg_face2/
Extract it to forensic_face_test/vggface2_test
The dataset requires application for access.
Using Flickr-Faces-HQ (FFHQ) sample as an alternative...


Processing images: 0it [00:00, ?it/s]


KeyError: 'person_id'