### Pull out specified number of images from a folder


In [10]:
import os
import shutil

def copy_images(source_dir, destination_dir, num_images=1600):
    """
    Copies the first `num_images` files, in alphabetical order, from a source folder
    to a destination folder.

    Every regular file directly inside `source_dir` is a candidate: subdirectories are
    skipped and no filtering by file extension is applied.

    Parameters:
        source_dir (str): Path to the folder containing the original images.
        destination_dir (str): Path to the folder where selected images will be copied.
            It is created (including missing parents) if it does not exist.
        num_images (int): Number of images to copy.

    Raises:
        FileNotFoundError: If `source_dir` does not exist.
        ValueError: If `num_images` is negative, or if `source_dir` holds fewer than
            `num_images` files.
    """
    # Ensure source directory exists
    if not os.path.exists(source_dir):
        raise FileNotFoundError(f"Source directory not found: {source_dir}")

    # A negative count would silently slice from the end of the list; reject it instead
    if num_images < 0:
        raise ValueError(f"num_images must be non-negative, got {num_images}.")

    # Ensure destination directory exists; create it if it doesn't
    os.makedirs(destination_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order, so sort explicitly to make
    # "the first N files" deterministic across runs and platforms
    all_files = sorted(
        f for f in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, f))
    )

    # Ensure there are enough files in the source directory
    if len(all_files) < num_images:
        raise ValueError(f"Source directory contains only {len(all_files)} files, but {num_images} are requested.")

    # Select the first `num_images` files
    selected_files = all_files[:num_images]

    # Copy the selected files to the destination directory
    for file_name in selected_files:
        src_path = os.path.join(source_dir, file_name)
        dest_path = os.path.join(destination_dir, file_name)
        shutil.copy(src_path, dest_path)

    print(f"Copied {num_images} images to {destination_dir}")

# Example usage: point the two folders at your own paths before running.
source_folder = "./oxfordflowers/jpg"   # folder holding the original images
destination_folder = "./crushit"        # folder that receives the copies
number_of_images = 1600                 # how many files to copy

copy_images(
    source_dir=source_folder,
    destination_dir=destination_folder,
    num_images=number_of_images,
)


Copied 1600 images to ./crushit


### Resize the shorter side to 256 while maintaining the aspect ratio, then save a 256x256 center crop to the specified output directory

In [26]:
import os
from PIL import Image
from torchvision import transforms

def preprocess_image(image_path):
    """
    Preprocess an image: resize it so its smaller side is 256 pixels (keeping the
    original aspect ratio), then take a 256x256 center crop.

    Parameters:
        image_path (str): Path to the input image.

    Returns:
        PIL.Image.Image: Preprocessed RGB image.
    """
    # Define the transformation to resize the smaller dimension and center crop to 256x256
    transform = transforms.Compose([
        transforms.Resize(256),  # Resize the smaller dimension to 256 (maintains aspect ratio)
        transforms.CenterCrop(256)  # Center crop to 256x256
    ])

    # Open the file inside a context manager so its handle is released even if
    # decoding or transforming raises part-way through a large batch of files
    with Image.open(image_path) as source_image:
        # Convert to RGB if the image is not already in RGB mode
        if source_image.mode != "RGB":
            rgb_image = source_image.convert("RGB")
        else:
            rgb_image = source_image

        # Apply the transformations while the underlying file is still open
        processed_image = transform(rgb_image)

    return processed_image

def process_images_in_folder(input_folder, output_folder):
    """
    Run `preprocess_image` on every image found directly inside `input_folder`
    and write each result to `output_folder` under the same file name.

    Parameters:
        input_folder (str): Folder containing the input images.
        output_folder (str): Folder where the processed images will be saved;
            created if it does not exist.
    """
    # File name suffixes (compared case-insensitively) that count as images
    image_suffixes = ('.png', '.jpg', '.jpeg')

    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        # Skip anything that does not look like an image file
        if not entry.lower().endswith(image_suffixes):
            continue

        result = preprocess_image(os.path.join(input_folder, entry))
        result.save(os.path.join(output_folder, entry))

# Folder holding the images to preprocess, and the folder that receives the results.
# Change both to match your own layout.
input_folder = './crushit'
output_folder = './crushit2'

# Preprocess every image in `input_folder` and write the results to `output_folder`
process_images_in_folder(input_folder=input_folder, output_folder=output_folder)


# Pack images into shards of npy format, with specified batch size for each shard


In [28]:
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from PIL import Image

# Set the device to CUDA if available, otherwise fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Transformation: convert images to tensors and scale them to [-1, 1]
transform = transforms.Compose([
    transforms.ToTensor(),  # Converts the image to a Tensor (C, H, W) with values in [0, 1]
    # (x - 0.5) / 0.5 == 2 * x - 1, which maps [0, 1] exactly onto [-1, 1].
    # Standardizing with ImageNet statistics first would leave the values outside
    # [0, 1], so a following 2 * x - 1 would not land in [-1, 1]. Using Normalize
    # rather than a lambda also keeps the pipeline picklable for DataLoader workers.
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Define a custom Dataset to load images from a flat folder (no subfolders)
class CustomImageDataset(torch.utils.data.Dataset):
    """
    Dataset over the image files that sit directly inside a single folder.

    Files are selected by extension (case-insensitive) and indexed in sorted
    file-name order, so a given index refers to the same file on every run.

    Parameters:
        image_folder (str): Folder containing the images.
        transform (callable, optional): Applied to each RGB PIL image after loading.
    """

    # Lower-case extensions, including the dot, that are treated as images
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.transform = transform
        # os.listdir order is arbitrary, so sort for a reproducible index -> file mapping.
        # Compare lower-cased names so files such as "IMG.JPG" are not silently dropped,
        # and require the dot so a name that merely ends in "jpg" is not picked up.
        self.image_paths = [
            os.path.join(image_folder, fname)
            for fname in sorted(os.listdir(image_folder))
            if fname.lower().endswith(self.IMAGE_EXTENSIONS)
        ]

    def __len__(self):
        """Return the number of image files found in the folder."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """
        Load the image at position `idx`, convert it to RGB and apply the transform.

        Returns:
            The transformed image. When the result is a tensor it is moved to the
            module-level `device`; otherwise it is returned unchanged.
        """
        image_path = self.image_paths[idx]
        # convert() returns a new, fully loaded image, so the file can be closed right away
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')  # Ensure the image is RGB

        if self.transform is not None:
            image = self.transform(image)  # Apply the transformation (to tensor and normalization)

        # Only tensors have .to(); without a transform the image is still a PIL image.
        # NOTE: transferring to a CUDA device here means the DataLoader should stay
        # at num_workers=0.
        if isinstance(image, torch.Tensor):
            image = image.to(device)

        return image

# Wrap the flat image folder in a torch dataset (replace the path with your own folder)
image_folder = "./crushit2"
dataset = CustomImageDataset(image_folder=image_folder, transform=transform)

# Batch the dataset in its stored order; each batch is stacked to (B, C, H, W)
batch_size = 64
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

# Walk through the batches, printing each batch's shape and the device it lives on
for images in dataloader:
    print(images.shape, images.device, sep="\n")


torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0
torch.Size([64, 3, 256, 256])
cuda:0


KeyboardInterrupt: 