In [7]:
import os
import shutil
import warnings

import cv2
import imgaug as ia
import numpy as np
from imgaug import augmenters as iaa
from tqdm import tqdm

In [3]:
# Dataset location on Kaggle and the output tree this notebook writes to.
input_path = "/kaggle/input/labeled-faces-in-the-wild/LFW_Dataset_11_15/LFW_Dataset_11_15"
output_path = "/kaggle/working/LFW_Processed"
train_path, test_path = (
    os.path.join(output_path, split_dir)
    for split_dir in ("training_data", "testing_data")
)

# Fail fast when the dataset is missing. Before raising, walk down the
# expected folder chain and print each level's contents so the correct
# path can be read off the output.
if not os.path.exists(input_path):
    print("\nError: Could not find dataset at:", input_path)
    print("\nTrying to locate your dataset...")

    probe_dir = "/kaggle/input"
    if os.path.exists(probe_dir):
        print("\nContents of /kaggle/input:")
        entries = os.listdir(probe_dir)
        print(entries)

        # Descend one expected folder at a time; stop at the first level
        # that does not contain the next expected name.
        for expected in ('labeled-faces-in-the-wild', 'LFW_Dataset_11_15'):
            if expected not in entries:
                break
            probe_dir = os.path.join(probe_dir, expected)
            print(f"\nContents of {expected}:")
            entries = os.listdir(probe_dir)
            print(entries)

    raise FileNotFoundError(f"Could not find dataset at: {input_path}\nPlease verify the path and try again.")

print(f"\nDataset found at: {input_path}")
print("Processing...")


Dataset found at: /kaggle/input/labeled-faces-in-the-wild/LFW_Dataset_11_15/LFW_Dataset_11_15
Processing...


In [4]:
# Create output directories
os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

# Seed the random sources used by this notebook so that re-running it from a
# fresh kernel produces the same augmented images and the same shuffle.
SEED = 42
np.random.seed(SEED)  # np.random.shuffle is used for the train/test split
ia.seed(SEED)         # imgaug's global RNG, sampled by the augmenters below

# Augmentation setup: every call to `augmentation(image=...)` applies the
# steps below in order, each with freshly sampled parameters.
augmentation = iaa.Sequential([
    iaa.Fliplr(0.5),  # horizontal flip for 50% of the images
    iaa.Affine(
        rotate=(-20, 20),      # degrees
        scale=(0.8, 1.2),      # zoom factor
        translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}  # shift as a fraction of image size
    ),
    iaa.GaussianBlur(sigma=(0, 1.0)),
    iaa.LinearContrast((0.8, 1.2)),
    iaa.AdditiveGaussianNoise(scale=(0, 0.05*255))  # noise std up to 5% of the 0-255 range
])

def augment_image(image, num_augmentations):
    """Return a list of `num_augmentations` augmented variants of `image`.

    Each variant is produced by its own call to the notebook-level
    `augmentation` pipeline, so the variants are generated independently.
    The list is empty when `num_augmentations` is zero or negative.
    """
    return [augmentation(image=image) for _ in range(num_augmentations)]

def process_person(person_dir, person_name, total_images=15, train_count=10):
    """Build the train/test image lists for one person.

    Reads every .png/.jpg/.jpeg file in `person_dir`, tops the set up to
    `total_images` with augmented copies when there are too few originals,
    shuffles the result and splits it.

    Args:
        person_dir: Directory holding this person's images.
        person_name: Name of the person (currently unused; kept so existing
            callers keep working).
        total_images: Number of images to end up with before splitting.
        train_count: How many of those go to the training split; the
            remaining `total_images - train_count` go to the test split.

    Returns:
        `(train_images, test_images)` as two lists of images. Both lists are
        empty when the directory contains no readable image.

    NOTE(review): the shuffle happens after augmentation, so an augmented
    copy of a training image can land in the test split. Confirm this is
    acceptable for the downstream evaluation.
    """
    # os.listdir returns names in arbitrary order; sort so that the choice of
    # "first `total_images`" and the augmentation distribution are stable.
    image_files = sorted(
        f for f in os.listdir(person_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    original_images = []
    for img_file in image_files:
        img = cv2.imread(os.path.join(person_dir, img_file))
        if img is not None:  # cv2.imread returns None for unreadable files
            original_images.append(img)

    num_images = len(original_images)
    if num_images == 0:
        # Nothing to augment from; avoid dividing by zero below.
        return [], []

    if num_images < total_images:
        needed = total_images - num_images

        # Spread exactly `needed` augmentations over the originals: every
        # image gets `per_image`, and the first `remainder` images get one
        # extra. No originals are displaced by surplus augmented copies.
        per_image, remainder = divmod(needed, num_images)

        augmented_images = []
        for i, img in enumerate(original_images):
            to_generate = per_image + (1 if i < remainder else 0)
            if to_generate > 0:
                augmented_images.extend(augment_image(img, to_generate))

        all_images = original_images + augmented_images
    else:
        all_images = original_images[:total_images]  # Take the first ones if more are available

    # Shuffle in place, then split
    np.random.shuffle(all_images)
    train_images = all_images[:train_count]
    test_images = all_images[train_count:total_images]

    return train_images, test_images

def save_images(images, output_dir, person_name, prefix):
    """Write `images` as numbered JPEGs under `output_dir/person_name`.

    Files are named `<prefix>_01.jpg`, `<prefix>_02.jpg`, ... in the order
    the images are given. The person directory is created if needed, even
    when `images` is empty.

    Args:
        images: Iterable of images accepted by `cv2.imwrite`.
        output_dir: Root directory of the split (e.g. the training folder).
        person_name: Sub-directory name for this person.
        prefix: File name prefix, e.g. "train" or "test".

    Returns:
        The number of images that were written successfully. A warning is
        issued for every image `cv2.imwrite` reports it could not write.
    """
    person_output_dir = os.path.join(output_dir, person_name)
    os.makedirs(person_output_dir, exist_ok=True)

    written = 0
    for i, img in enumerate(images, start=1):
        # Named `image_path` so it does not shadow the notebook-level
        # `output_path` variable.
        image_path = os.path.join(person_output_dir, f"{prefix}_{i:02d}.jpg")
        # cv2.imwrite signals failure through its boolean return value, so
        # it must be checked or failed writes go unnoticed.
        if cv2.imwrite(image_path, img):
            written += 1
        else:
            warnings.warn(f"Could not write image: {image_path}")
    return written

In [5]:
# Get all person folders: one sub-directory of the dataset root per person.
# os.listdir returns entries in arbitrary order, so sort them to process the
# persons in the same order on every run.
person_folders = sorted(
    d for d in os.listdir(input_path)
    if os.path.isdir(os.path.join(input_path, d))
)

print(f"\nFound {len(person_folders)} persons to process")


Found 58 persons to process


In [6]:
# Build and save the train/test split for every person folder.
for person in tqdm(person_folders, desc="Processing persons"):
    train_split, test_split = process_person(os.path.join(input_path, person), person)

    # Write each split under its own root, training first and then testing.
    for split_images, split_root, split_prefix in (
        (train_split, train_path, "train"),
        (test_split, test_path, "test"),
    ):
        save_images(split_images, split_root, person, split_prefix)

# Summary of where the data went and how much was produced.
summary_lines = [
    "\nProcessing complete!",
    f"Training data saved to: {train_path}",
    f"Testing data saved to: {test_path}",
    f"\nTotal persons processed: {len(person_folders)}",
    "Training images per person: 10",
    "Testing images per person: 5",
]
print("\n".join(summary_lines))

Processing persons: 100%|██████████| 58/58 [00:07<00:00,  7.45it/s]


Processing complete!
Training data saved to: /kaggle/working/LFW_Processed/training_data
Testing data saved to: /kaggle/working/LFW_Processed/testing_data

Total persons processed: 58
Training images per person: 10
Testing images per person: 5





In [8]:
# Bundle the processed dataset into a single zip archive.
folder_path = "/kaggle/working/LFW_Processed"  # Change this to your folder name
zip_file = "LFW_Processed.zip"  # Output zip file

# shutil.make_archive appends the ".zip" extension itself, so pass the name
# without it.
archive_base = zip_file.replace(".zip", "")
shutil.make_archive(archive_base, format='zip', root_dir=folder_path)
print(f"Zipped {folder_path} as {zip_file}")

Zipped /kaggle/working/LFW_Processed as LFW_Processed.zip
