In [43]:
import os
import shutil
import random
import cv2
import numpy as np

In [44]:
def create_dir_structure(output_path):
    """
    Build the image/label folder layout used for the train and val splits.

    Creates ``images/train``, ``images/val``, ``labels/train`` and
    ``labels/val`` beneath ``output_path``. Folders that already exist are
    left as they are.
    """
    for relative_dir in ('images/train', 'images/val', 'labels/train', 'labels/val'):
        os.makedirs(os.path.join(output_path, relative_dir), exist_ok=True)

In [45]:
def get_file_pairs(base_image_dir, base_label_dir=None):
    """
    Gets the pairs of image and label files from provided directories.

    An image is any file whose name ends (case-insensitively) with ``.png``,
    ``.jpg`` or ``.jpeg``. Its label is the file with the same stem and a
    ``.txt`` extension inside ``base_label_dir``.

    Args:
        base_image_dir: Directory that is listed for image files.
        base_label_dir: Directory that is listed for ``.txt`` label files.
            When omitted, labels are looked up in ``base_image_dir``.

    Returns:
        A list of ``(image_file_name, label_file_name)`` tuples, sorted by
        image file name. Images without a matching label are left out.
    """
    if base_label_dir is None:
        base_label_dir = base_image_dir

    # Every suffix carries its leading dot so that only real extensions match
    # (a bare 'jpg' would also accept names such as 'notjpg').
    image_extensions = ('.png', '.jpg', '.jpeg')

    # A set keeps the per-image membership check constant time.
    label_names = {
        name for name in os.listdir(base_label_dir)
        if name.lower().endswith('.txt')
    }

    image_label_pairs = []
    # os.listdir gives no ordering guarantee; sorting makes the result repeatable.
    for img_file in sorted(os.listdir(base_image_dir)):
        if not img_file.lower().endswith(image_extensions):
            continue
        label_file = img_file.rsplit('.', 1)[0] + '.txt'
        if label_file in label_names:
            image_label_pairs.append((img_file, label_file))

    return image_label_pairs

In [46]:
def split_data_percentage(image_label_pairs, train_ratio=0.8):
    """
    Splits the data into training and validation sets (Percentage)

    Args:
        image_label_pairs: Sequence of (image, label) pairs to split.
        train_ratio: Fraction of the pairs assigned to the training set; the
            training size is ``int(len(image_label_pairs) * train_ratio)``.

    Returns:
        ``(train_pairs, val_pairs)`` as two lists drawn from a random
        permutation of the input.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled_pairs = list(image_label_pairs)
    random.shuffle(shuffled_pairs)

    split_index = int(len(shuffled_pairs) * train_ratio)
    train_pairs = shuffled_pairs[:split_index]
    val_pairs = shuffled_pairs[split_index:]

    return train_pairs, val_pairs

In [47]:
def split_data_fix_amount(image_label_pairs, train_count=80):
    """
    Splits the data into training and validation sets (Fix Amount)

    Args:
        image_label_pairs: Sequence of (image, label) pairs to split.
        train_count: Number of pairs to place in the training set. Values
            above the number of available pairs are capped to it; negative
            values are treated as 0.

    Returns:
        ``(train_pairs, val_pairs)``: ``train_pairs`` holds ``train_count``
        randomly chosen pairs and ``val_pairs`` holds the remainder.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled_pairs = list(image_label_pairs)
    train_count = max(0, min(train_count, len(shuffled_pairs)))

    random.shuffle(shuffled_pairs)
    # The first train_count items are the training set; the rest is validation.
    train_pairs = shuffled_pairs[:train_count]
    val_pairs = shuffled_pairs[train_count:]

    return train_pairs, val_pairs

In [48]:
def copy_files(pairs, base_path, output_path, dataset_type):
    """
    Copy every image/label pair from ``base_path`` into the split folders.

    Images go to ``<output_path>/images/<dataset_type>`` and labels to
    ``<output_path>/labels/<dataset_type>``; both are read from ``base_path``.
    The destination folders are expected to exist already.
    """
    image_dest_dir = os.path.join(output_path, f'images/{dataset_type}')
    label_dest_dir = os.path.join(output_path, f'labels/{dataset_type}')

    for image_name, label_name in pairs:
        # Image first, then its label, each keeping its original file name.
        for file_name, dest_dir in ((image_name, image_dest_dir), (label_name, label_dest_dir)):
            shutil.copy(os.path.join(base_path, file_name), os.path.join(dest_dir, file_name))

In [49]:
def copy_files_with_preprocessing(pairs, base_image_dir, base_label_dir, output_image_dir, output_label_dir, preprocess_func=None):
    """
    Copies the files to the appropriate directories with optional preprocessing.

    Each image is loaded with ``cv2.imread``, optionally passed through
    ``preprocess_func`` and written with ``cv2.imwrite``; its label file is
    copied unchanged. A pair is skipped (with a printed message) when:

    * the source image or label is missing,
    * the image or label already exists in the destination,
    * the image cannot be decoded, or
    * the processed image cannot be written.

    Args:
        pairs: Iterable of ``(image_file_name, label_file_name)`` tuples.
        base_image_dir: Directory the images are read from.
        base_label_dir: Directory the labels are read from.
        output_image_dir: Directory the images are written to (created if needed).
        output_label_dir: Directory the labels are copied to (created if needed).
        preprocess_func: Optional callable taking the loaded image and
            returning the image to write.
    """
    os.makedirs(output_image_dir, exist_ok=True)
    os.makedirs(output_label_dir, exist_ok=True)

    for img_file, lbl_file in pairs:
        img_path = os.path.join(base_image_dir, img_file)
        label_path = os.path.join(base_label_dir, lbl_file)

        # Skip if the image or label file is missing
        if not os.path.exists(img_path) or not os.path.exists(label_path):
            print(f"Skipping missing file pair: {img_file}, {lbl_file}")
            continue

        output_image_file = os.path.join(output_image_dir, img_file)
        output_label_file = os.path.join(output_label_dir, lbl_file)

        # Skip if the image or label file already exists in the destination
        if os.path.exists(output_image_file) or os.path.exists(output_label_file):
            print(f"Skipping already  existing file pair: {img_file}, {lbl_file}")
            continue

        image = cv2.imread(img_path)

        # cv2.imread reports an undecodable file by returning None, not by raising.
        if image is None:
            print(f"Skipping unreadable image: {img_file}")
            continue

        # Apply preprocessing if specified
        if preprocess_func:
            image = preprocess_func(image)

        # Only copy the label once the image is on disk, so the destination
        # never ends up with a label that has no image.
        if not cv2.imwrite(output_image_file, image):
            print(f"Skipping pair, failed to write image: {output_image_file}")
            continue

        shutil.copy(label_path, output_label_file)
        print(f"Copied {img_file} and {lbl_file} with preprocessing to {output_image_dir}")

In [50]:
def apply_image_sharpening(image):
    """
    Return a sharpened copy of ``image``.

    The image is convolved with a 3x3 kernel whose centre weight is 5 and
    whose four direct neighbours are -1 (corners are 0).
    """
    sharpen_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    # -1 is the ddepth argument of cv2.filter2D.
    return cv2.filter2D(image, -1, sharpen_kernel)

In [51]:
def apply_gaussian_noise(image, mean=0, var=0.1):
    """
    Return ``image`` with additive Gaussian noise as a ``uint8`` array.

    Noise is sampled from ``np.random.normal(mean, sqrt(var))`` with the same
    shape as ``image``, scaled by 255, added to the image, and the sum is
    clipped to ``[0, 255]``.
    """
    std_dev = var ** 0.5
    noise = np.random.normal(mean, std_dev, image.shape).astype(np.float32)
    perturbed = image + noise * 255
    return np.clip(perturbed, 0, 255).astype(np.uint8)

In [52]:
def prepare_dataset(base_path='raw_dataset', output_path='data', train_ratio=None, train_count=None):
    """
    Split the image/label pairs found in ``base_path`` into train and val sets
    and copy them into the directory layout under ``output_path``.

    Args:
        base_path: Directory holding both the images and their ``.txt`` labels.
        output_path: Root of the output layout (``images/``, ``labels/`` with
            ``train`` and ``val`` below each).
        train_ratio: Fraction of pairs for training. Takes precedence over
            ``train_count`` when both are given.
        train_count: Fixed number of pairs passed to ``split_data_fix_amount``;
            used when ``train_ratio`` is ``None``.

    Raises:
        ValueError: If neither ``train_ratio`` nor ``train_count`` is given.
    """
    # Fail before touching the filesystem when no split size was requested.
    if train_ratio is None and train_count is None:
        raise ValueError("Provide either train_ratio or train_count")

    create_dir_structure(output_path)

    # Images and labels sit side by side in base_path (copy_files reads both
    # from it), so the same directory is listed for both.
    image_label_pairs = get_file_pairs(base_path, base_path)
    if train_ratio is not None:
        train_pairs, val_pairs = split_data_percentage(image_label_pairs, train_ratio)
    else:
        train_pairs, val_pairs = split_data_fix_amount(image_label_pairs, train_count)

    copy_files(train_pairs, base_path, output_path, 'train')
    copy_files(val_pairs, base_path, output_path, 'val')

In [53]:
def separate_raw_data_to_preprocessing(base_path='my_data_1', sharpened_output_path='sharpened_data', noisy_output_path='noisy_data'):
    """
    Write a sharpened and a Gaussian-noise variant of an already split dataset.

    ``base_path`` is read as ``images/{train,val}`` and ``labels/{train,val}``.
    Each output root is given the same layout; images are processed with
    ``apply_image_sharpening`` or ``apply_gaussian_noise`` respectively and
    labels are copied alongside them.

    Args:
        base_path: Root of the source dataset.
        sharpened_output_path: Root directory for the sharpened variant.
        noisy_output_path: Root directory for the noisy variant.
    """
    splits = ('train', 'val')
    # (output root, preprocessing function), processed in this order.
    variants = (
        (sharpened_output_path, apply_image_sharpening),
        (noisy_output_path, apply_gaussian_noise),
    )

    # Create directory structure
    for output_root, _ in variants:
        create_dir_structure(output_root)

    # Pair files once per split; every variant reuses the same pairs.
    sources = {}
    for split in splits:
        image_dir = os.path.join(base_path, f'images/{split}')
        label_dir = os.path.join(base_path, f'labels/{split}')
        sources[split] = (image_dir, label_dir, get_file_pairs(image_dir, label_dir))

    for output_root, preprocess_func in variants:
        for split in splits:
            image_dir, label_dir, pairs = sources[split]
            copy_files_with_preprocessing(
                pairs,
                image_dir,
                label_dir,
                os.path.join(output_root, 'images', split),
                os.path.join(output_root, 'labels', split),
                preprocess_func=preprocess_func,
            )

In [56]:
# Earlier inline version of the dataset preparation, kept for reference only.
# NOTE(review): stale — `split_data` is not defined in the cells shown here.
# base_path = 'raw_dataset'
# output_path = 'my_data'
# create_dir_structure(output_path)

# image_label_pairs = get_file_pairs(base_path)
# train_pairs, val_pairs = split_data(image_label_pairs)

# copy_files(train_pairs, base_path, output_path, 'train')
# copy_files(val_pairs, base_path, output_path, 'val')

# Disabled split step; presumably what produced 'my_data_1' — TODO confirm.
# prepare_dataset(base_path='raw_dataset_2/', output_path='my_data_1', train_count=100)
# Write the sharpened and noisy variants of the dataset found in 'my_data_1'.
separate_raw_data_to_preprocessing(base_path='my_data_1')

Skipping already  existing file pair: frame_101280.jpg, frame_101280.txt
Skipping already  existing file pair: frame_101460.jpg, frame_101460.txt
Skipping already  existing file pair: frame_101820.jpg, frame_101820.txt
Skipping already  existing file pair: frame_101940.jpg, frame_101940.txt
Skipping already  existing file pair: frame_1020.jpg, frame_1020.txt
Skipping already  existing file pair: frame_102000.jpg, frame_102000.txt
Skipping already  existing file pair: frame_102180.jpg, frame_102180.txt
Skipping already  existing file pair: frame_102240.jpg, frame_102240.txt
Skipping already  existing file pair: frame_102420.jpg, frame_102420.txt
Skipping already  existing file pair: frame_102480.jpg, frame_102480.txt
Skipping already  existing file pair: frame_103440.jpg, frame_103440.txt
Skipping already  existing file pair: frame_103500.jpg, frame_103500.txt
Skipping already  existing file pair: frame_103680.jpg, frame_103680.txt
Skipping already  existing file pair: frame_104880.jpg,