In [3]:
pip install numpy

Collecting numpyNote: you may need to restart the kernel to use updated packages.

  Using cached numpy-2.2.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached numpy-2.2.1-cp312-cp312-win_amd64.whl (12.6 MB)
Installing collected packages: numpy
Successfully installed numpy-2.2.1


In [7]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# Parameters for the synthetic "page" dataset generated by this cell.
# NOTE(review): input_folder is an absolute path on one specific machine; it has
# to be edited before the notebook can run anywhere else.
input_folder = r"C:\Users\harshika.pathak\Desktop\WORK\object_detection\stars"  # Folder holding the source images that get pasted onto pages
output_folder = "combined_dataset"  # Folder (relative path) that receives the generated pages
dataset_size = 200 # Number of pages to generate
page_size = (512, 512)  # Size of each generated page as (width, height) in pixels
image_size = (64, 64)  # Size every source image is resized to, as (width, height) in pixels
max_images_per_page = 4  # Inclusive upper bound on how many images are pasted onto one page

# Function to load and resize all images from the input folder
def load_and_resize_images(input_folder, image_size):
    """Load every readable image file in ``input_folder`` and resize it.

    Args:
        input_folder: Directory to scan (not recursive).
        image_size: Target size passed to ``cv2.resize`` as (width, height).

    Returns:
        A list of resized image arrays, ordered by file name. Grayscale and
        4-channel inputs are converted to 3 channels first. Channel order is
        OpenCV's BGR, which is also what ``cv2.imwrite`` expects.

    Raises:
        ValueError: If the folder contains no file that OpenCV can decode.
    """
    images = []
    # os.listdir returns entries in arbitrary order; sorting keeps the index of
    # each image stable between runs and machines.
    for filename in sorted(os.listdir(input_folder)):
        filepath = os.path.join(input_folder, filename)
        if not os.path.isfile(filepath):
            continue  # sub-directories cannot be decoded as images

        # IMREAD_UNCHANGED keeps the file's own channel count (1, 3 or 4).
        img = cv2.imread(filepath, cv2.IMREAD_UNCHANGED)
        if img is None:
            continue  # not an image OpenCV understands

        # cv2.imread yields BGR(A), so use the BGR-named conversions.
        if img.ndim == 2:  # Grayscale image
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif img.shape[2] == 4:  # Image with alpha channel: drop the alpha
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

        images.append(cv2.resize(img, image_size, interpolation=cv2.INTER_AREA))

    if not images:
        raise ValueError("No valid images found in the input folder.")
    return images

# Function to place multiple images on a single page without overriding
def create_combined_page(page_size, images, max_images):
    """Paste randomly chosen images at random positions onto a white page.

    Between 1 and ``max_images`` images (inclusive) are drawn with replacement
    from ``images``. Only pixels with at least one non-zero channel are copied,
    so pure-black pixels of a pasted image leave the page untouched. Pasted
    images may overlap; a later image overwrites an earlier one where its
    pixels are non-black.

    Args:
        page_size: Page size as (width, height).
        images: Non-empty sequence of 3-channel image arrays (height, width, 3).
        max_images: Largest number of images to paste (must be >= 1).

    Returns:
        A uint8 array of shape (height, width, 3).

    Raises:
        ValueError: If a selected image is larger than the page.
    """
    page_w, page_h = page_size

    # Create a blank (white) page
    page = np.full((page_h, page_w, 3), 255, dtype=np.uint8)

    # Randomly decide the number of images to place
    num_images = np.random.randint(1, max_images + 1)

    for _ in range(num_images):
        # Randomly select an image
        img = images[np.random.randint(len(images))]
        img_h, img_w = img.shape[:2]
        if img_w > page_w or img_h > page_h:
            raise ValueError(
                f"Image of size {img_w}x{img_h} does not fit on a {page_w}x{page_h} page."
            )

        # randint's upper bound is exclusive: the +1 lets the image reach the
        # right/bottom edge and makes an image exactly as large as the page valid.
        x_offset = np.random.randint(0, page_w - img_w + 1)
        y_offset = np.random.randint(0, page_h - img_h + 1)

        # Copy every non-black pixel in one masked assignment. `region` is a
        # view into `page`, so writing to it updates the page itself.
        visible = (img > 0).any(axis=2)
        region = page[y_offset:y_offset + img_h, x_offset:x_offset + img_w]
        region[visible] = img[visible]

    return page

# Function to create the dataset
def create_dataset(input_folder, output_folder, dataset_size, page_size, image_size, max_images_per_page):
    """Generate ``dataset_size`` combined pages and save them as PNG files.

    Source images are loaded once with ``load_and_resize_images``; each page is
    produced by ``create_combined_page`` and written to
    ``<output_folder>/page_NNNN.png`` with NNNN starting at 0001.

    Args:
        input_folder: Directory holding the source images.
        output_folder: Directory for the generated pages (created if missing).
        dataset_size: Number of pages to generate.
        page_size: Page size as (width, height).
        image_size: Size each source image is resized to, as (width, height).
        max_images_per_page: Largest number of images pasted onto one page.

    Raises:
        ValueError: Propagated from ``load_and_resize_images`` when no image
            could be loaded.
        IOError: If a page could not be written to disk.
    """
    # Load and resize input images
    images = load_and_resize_images(input_folder, image_size)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    print(f"Generating {dataset_size} combined images...")
    for i in tqdm(range(dataset_size)):
        # Create a combined page
        page = create_combined_page(page_size, images, max_images_per_page)

        # Save the generated page. cv2.imwrite signals failure through its
        # return value rather than an exception, so check it explicitly.
        output_path = os.path.join(output_folder, f"page_{i+1:04d}.png")
        if not cv2.imwrite(output_path, page):
            raise IOError(f"Could not write image: {output_path}")

    print(f"Dataset created in directory: {output_folder}")

# Run the generator with the parameters configured at the top of this cell
create_dataset(
    input_folder=input_folder,
    output_folder=output_folder,
    dataset_size=dataset_size,
    page_size=page_size,
    image_size=image_size,
    max_images_per_page=max_images_per_page,
)


Generating 200 combined images...


  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [00:07<00:00, 25.37it/s]

Dataset created in directory: combined_dataset





In [10]:
import cv2
import os
import xml.etree.ElementTree as ET
import numpy as np

# Function to parse XML annotation file and extract bounding boxes
def parse_annotation(xml_file):
    """Read the bounding boxes from an XML annotation file.

    The file is expected to contain ``object`` elements directly under the
    root, each with a ``bndbox`` child holding ``xmin``, ``ymin``, ``xmax`` and
    ``ymax`` (the layout used by Pascal VOC annotations).

    Args:
        xml_file: Path to the annotation file.

    Returns:
        A tuple ``(bboxes, root)``. ``bboxes`` is a list of dicts with integer
        ``xmin``/``ymin``/``xmax``/``ymax`` values, in document order;
        ``root`` is the parsed root element.
    """
    def read_coord(bndbox, tag):
        # Some exporters write coordinates as floats ("12.0", "10.6"), which a
        # bare int() rejects; parse as float and round to the nearest pixel.
        return int(round(float(bndbox.find(tag).text)))

    root = ET.parse(xml_file).getroot()
    bboxes = []

    for obj in root.findall("object"):
        bndbox = obj.find("bndbox")
        bboxes.append({
            "xmin": read_coord(bndbox, "xmin"),
            "ymin": read_coord(bndbox, "ymin"),
            "xmax": read_coord(bndbox, "xmax"),
            "ymax": read_coord(bndbox, "ymax"),
        })

    return bboxes, root

# Function to save updated annotations
def save_annotation(xml_file, root, bboxes, output_dir, new_name):
    """Write a copy of an annotation tree with new bounding boxes.

    Args:
        xml_file: Path of the source annotation. Not read here; kept so
            existing callers keep working.
        root: Root element of the source annotation. It is not modified.
        bboxes: One dict per ``object`` element, in document order, with
            ``xmin``/``ymin``/``xmax``/``ymax`` values. Pairing stops at the
            shorter of the two sequences.
        output_dir: Existing directory the new file is written into.
        new_name: File name without extension; the result is saved as
            ``<output_dir>/<new_name>.xml``.
    """
    import copy  # local import: the surrounding cell does not import copy

    # Work on a copy so the caller's tree keeps its original coordinates and
    # can safely be reused for the next augmentation.
    new_root = copy.deepcopy(root)

    for obj, bbox in zip(new_root.findall("object"), bboxes):
        for key in ("xmin", "ymin", "xmax", "ymax"):
            obj.find(f"bndbox/{key}").text = str(bbox[key])

    # Point <filename> at the augmented image instead of the original one, so
    # loaders that resolve the image through this tag pick the matching file.
    # NOTE(review): assumes the augmented image keeps the original extension
    # (falls back to ".png") -- confirm against the code that writes the image.
    filename_el = new_root.find("filename")
    if filename_el is not None:
        extension = os.path.splitext(filename_el.text or "")[1] or ".png"
        filename_el.text = new_name + extension

    ET.ElementTree(new_root).write(os.path.join(output_dir, new_name + ".xml"))

# Function to apply data augmentation
def augment_image(image, bboxes):
    """Create four augmented variants of an image with matching boxes.

    Args:
        image: Image array as returned by ``cv2.imread``.
            # assumes uint8 pixel values in [0, 255] -- TODO confirm for other inputs
        bboxes: List of dicts with ``xmin``/``ymin``/``xmax``/``ymax`` in pixels.

    Returns:
        A list of ``(image, bboxes)`` tuples in this fixed order:
        horizontal flip, Gaussian noise, Gaussian blur, perspective skew.
        The noise and blur variants reuse the input ``bboxes`` list unchanged.
    """
    h, w = image.shape[:2]
    augmented_images = []

    # Flip horizontally: x coordinates mirror around the image width, and
    # min/max swap roles so that xmin stays the left edge.
    flipped_image = cv2.flip(image, 1)
    flipped_bboxes = [{
        "xmin": w - bbox["xmax"],
        "ymin": bbox["ymin"],
        "xmax": w - bbox["xmin"],
        "ymax": bbox["ymax"],
    } for bbox in bboxes]
    augmented_images.append((flipped_image, flipped_bboxes))

    # Add Gaussian noise. The noise must stay signed until after the addition:
    # casting it to uint8 first wraps negative samples to large positive
    # values, which would turn zero-mean noise into a brightening.
    noise = np.random.normal(0, 25, image.shape)
    noisy_image = np.clip(np.rint(image.astype(np.float32) + noise), 0, 255).astype(np.uint8)
    augmented_images.append((noisy_image, bboxes))

    # Apply Gaussian blur
    blurred_image = cv2.GaussianBlur(image, (5, 5), 0)
    augmented_images.append((blurred_image, bboxes))

    # Skewness transformation (perspective warp): the top edge stays fixed and
    # the bottom edge is squeezed inwards by 10% of the width on each side.
    pts1 = np.float32([[0, 0], [w, 0], [0, h], [w, h]])
    pts2 = np.float32([[0, 0], [w, 0], [int(0.1 * w), h], [int(0.9 * w), h]])
    matrix = cv2.getPerspectiveTransform(pts1, pts2)
    skewed_image = cv2.warpPerspective(image, matrix, (w, h))
    skewed_bboxes = []
    for bbox in bboxes:
        corners = np.array([
            [bbox["xmin"], bbox["ymin"]],
            [bbox["xmax"], bbox["ymin"]],
            [bbox["xmax"], bbox["ymax"]],
            [bbox["xmin"], bbox["ymax"]]
        ], dtype=np.float32)
        # perspectiveTransform expects shape (1, N, 2); take the batch back off.
        warped = cv2.perspectiveTransform(corners[None, :, :], matrix)[0]
        x_coords, y_coords = warped[:, 0], warped[:, 1]
        # Axis-aligned box around the warped corners. Round to the nearest
        # pixel (truncation would turn 99.9999 into 99) and keep the box
        # inside the image.
        skewed_bboxes.append({
            "xmin": max(0, int(round(float(x_coords.min())))),
            "ymin": max(0, int(round(float(y_coords.min())))),
            "xmax": min(w, int(round(float(x_coords.max())))),
            "ymax": min(h, int(round(float(y_coords.max()))))
        })
    augmented_images.append((skewed_image, skewed_bboxes))

    return augmented_images

# Main script: augment every annotated page image and save the results.
# NOTE(review): both input paths are absolute paths on one specific machine.
input_images_dir = r"C:\Users\harshika.pathak\Desktop\WORK\object_detection\combined_dataset"
input_annotations_dir = r"C:\Users\harshika.pathak\Desktop\WORK\object_detection\annotated_images"
output_images_dir = "augmented images"
output_annotations_dir = "augmented annotations"

os.makedirs(output_images_dir, exist_ok=True)
os.makedirs(output_annotations_dir, exist_ok=True)

# Sorted so files are processed in a stable order on every run.
for image_file in sorted(os.listdir(input_images_dir)):
    if not image_file.endswith(".png"):
        continue

    # splitext only strips the final extension; str.replace would also rewrite
    # a ".png" that happens to appear earlier in the file name.
    base_name = os.path.splitext(image_file)[0]
    image_path = os.path.join(input_images_dir, image_file)
    xml_file = os.path.join(input_annotations_dir, base_name + ".xml")

    # Pages without an annotation are skipped instead of aborting the run.
    if not os.path.isfile(xml_file):
        print(f"Skipping {image_file}: no annotation file at {xml_file}")
        continue

    # cv2.imread returns None (no exception) when the file cannot be decoded.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping {image_file}: could not be read as an image")
        continue

    bboxes, root = parse_annotation(xml_file)

    augmentations = augment_image(image, bboxes)

    # The index of each augmentation becomes the "_aug<idx>" suffix shared by
    # the saved image and its annotation file.
    for idx, (aug_image, aug_bboxes) in enumerate(augmentations):
        new_name = f"{base_name}_aug{idx}"
        out_image_path = os.path.join(output_images_dir, new_name + ".png")
        # cv2.imwrite signals failure through its return value.
        if not cv2.imwrite(out_image_path, aug_image):
            raise IOError(f"Could not write image: {out_image_path}")
        save_annotation(xml_file, root, aug_bboxes, output_annotations_dir, new_name)


In [11]:
pip install torchvision


Collecting torchvisionNote: you may need to restart the kernel to use updated packages.

  Downloading torchvision-0.20.1-cp312-cp312-win_amd64.whl.metadata (6.2 kB)
Collecting torch==2.5.1 (from torchvision)
  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting filelock (from torch==2.5.1->torchvision)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch==2.5.1->torchvision)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch==2.5.1->torchvision)
  Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch==2.5.1->torchvision)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch==2.5.1->torchvision)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch==2.5.1->torchvision)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 