In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import glob
import tqdm
from PIL import Image

import os

## Explore Data

In [2]:
# Every source image of the dataset lives in one flat folder as a .jpg file.
all_images_folder_path = os.path.join("data", "dfg", "images")
jpg_pattern = os.path.join(all_images_folder_path, "*.jpg")
all_images_paths = glob.glob(jpg_pattern)

print("Number of images:", len(all_images_paths))

Number of images: 16264


In [3]:
# Load both annotation files; the dataset's "test" file is used as the validation split here.
annotation_files = {"train": "data/dfg/train.json", "val": "data/dfg/test.json"}
loaded_annotations = {}
for split_name, json_path in annotation_files.items():
    with open(json_path) as f:
        loaded_annotations[split_name] = json.load(f)

train_json = loaded_annotations["train"]
val_json = loaded_annotations["val"]

In [4]:
# Compare the image files on disk with the file names referenced by the annotation files.
# os.path.basename strips the directory part using the separator of the current OS;
# splitting on a literal backslash only worked on Windows.
all_filenames = {os.path.basename(path) for path in all_images_paths}
train_filenames = {image['file_name'] for image in train_json['images']}
val_filenames = {image['file_name'] for image in val_json['images']}

print("Number of all names:", len(all_filenames))
print("Number of available train names:", len(train_filenames))
print("Number of available val names:", len(val_filenames))

# Names that are referenced by either split AND actually present on disk.
print("Total number of available names:", len((train_filenames | val_filenames) & all_filenames))

Number of all names: 16264
Number of available train names: 14029
Number of available val names: 1703
Total number of available names: 15732


## Transform Data

In [5]:
import copy

def prepare_set_annotations_dict(annotation_json, include_segmentation=False):
    """
    Group a flat list of annotation records by the image they belong to.

    Every record is shallow-copied without its 'image_id' key (and, unless requested, without its
    'segmentation' key) and appended to the list stored under that image ID. The input records
    themselves are left untouched.

    Parameters:
    - annotation_json: A list of annotation dictionaries. Each one must contain an 'image_id' key;
      a record without it raises KeyError.
    - include_segmentation: When False (the default) the 'segmentation' field, if present, is left
      out of the copied records. When True it is kept.

    Returns:
    - A dictionary mapping each 'image_id' to the list of its trimmed annotation dictionaries, in the
      order the annotations appear in the input.
    """
    dropped_keys = {'image_id'} if include_segmentation else {'image_id', 'segmentation'}
    grouped = {}

    for record in annotation_json:
        owner_id = record['image_id']
        trimmed = {key: value for key, value in record.items() if key not in dropped_keys}
        grouped.setdefault(owner_id, []).append(trimmed)

    return grouped

def add_images_to_master_dict(image_data_list, set_type, master_dict):
    """
    Registers images under the given set type in the master dictionary.

    Each image is stored under its 'id' as a shallow copy of its data with an extra
    'annotations' key initialised to None. Copying keeps the caller's dictionaries
    (e.g. the entries of the loaded annotation JSON) free of the added key.

    Parameters:
    - image_data_list: List of dictionaries containing image data. Each must have an 'id' key.
    - set_type: The set type to add the images to ('train' or 'val'). Must already be a key
      of master_dict.
    - master_dict: The master dictionary where the images will be added. Modified in place;
      an existing entry with the same image ID is overwritten.
    """
    set_entries = master_dict[set_type]
    for image_data in image_data_list:
        entry = dict(image_data)  # shallow copy: do not mutate the caller's record
        entry['annotations'] = None
        set_entries[image_data['id']] = entry

def update_annotations(annotations_dict, set_type, master_dict):
    """
    Attaches annotation lists to the images already registered for a set type.

    Image IDs that appear in annotations_dict but are not registered under the set type are
    skipped; registered images without an entry in annotations_dict are left unchanged.

    Parameters:
    - annotations_dict: Dictionary containing annotations, keyed by image ID.
    - set_type: The set type to update annotations for ('train' or 'val').
    - master_dict: The master dictionary to update (modified in place).
    """
    for image_id in annotations_dict:
        if image_id not in master_dict[set_type]:
            continue
        master_dict[set_type][image_id]['annotations'] = annotations_dict[image_id]

def remove_images_without_annotations(set_type, master_dict):
    """
    Drops every image whose 'annotations' value is still None from the given set type.

    Images with an empty annotation list are kept; only the None placeholder counts as
    "without annotations".

    Parameters:
    - set_type: The set type to remove images from ('train' or 'val').
    - master_dict: The master dictionary to update (modified in place).
    """
    set_entries = master_dict[set_type]

    # Collect the IDs first: a dict must not change size while it is being iterated.
    unannotated_ids = [im_id for im_id, data in set_entries.items() if data['annotations'] is None]

    for im_id in unannotated_ids:
        set_entries.pop(im_id)

In [6]:
# Build the master dictionary split by split: group the annotations per image, register the
# images, attach their annotations, then drop images that ended up with none.
split_sources = {'train': train_json, 'val': val_json}

annotations_by_split = {
    split: prepare_set_annotations_dict(source['annotations'])
    for split, source in split_sources.items()
}
train_annotations = annotations_by_split['train']
val_annotations = annotations_by_split['val']

master_dict = {split: {} for split in split_sources}

for split, source in split_sources.items():
    add_images_to_master_dict(source['images'], split, master_dict)
    update_annotations(annotations_by_split[split], split, master_dict)
    remove_images_without_annotations(split, master_dict)

print("Number of train labels:", len(master_dict['train']))
print("Number of val labels:", len(master_dict['val']))

Number of train labels: 13970
Number of val labels: 1703


In [7]:
# Persist the merged structure for later reuse.
# Note: JSON object keys are always strings, so non-string image IDs come back as strings
# when this file is loaded again.
master_json_path = "data/dfg/master.json"
with open(master_json_path, "w") as master_file:
    json.dump(master_dict, master_file)

### Save and Prepare Data

In [8]:
# The class list written to dfg_tsd.yaml is taken from train_json['categories'] only,
# so confirm that the validation file declares exactly the same categories.
train_json['categories'] == val_json['categories'] 

True

In [9]:
# Dataset description file: fixed header followed by one "id: name" line per category.
# The header is spelled out line by line so that the whitespace-only line and the trailing
# space after "names:" are visible in the source.
yaml_header_lines = [
    "",
    "path: dfg/",
    "train: 'train/images'",
    "val: 'val/images'",
    " ",
    "# class names",
    "names: ",
]
yaml_class_lines = [
    f"  {category['id']}: {category['name']}" for category in train_json['categories']
]

# The trailing empty item reproduces the final newline before surrounding whitespace is stripped.
yaml_text = "\n".join(yaml_header_lines + yaml_class_lines + [""]).strip()


with open("dfg_tsd.yaml", "w") as f:
    f.write(yaml_text)

In [10]:
# Root folder of the converted dataset. Built with os.path.join so the separator matches the
# current OS (a hard-coded backslash is just part of the directory name outside Windows).
datasets_path = os.path.join("datasets", "dfg")

In [11]:
def prepare_label_text(image_info):
    """
    Prepares and returns a text string containing the formatted annotations for a single image.

    Each annotation in image_info['annotations'] carries a 'category_id' and a 'bbox' given as
    [x_min, y_min, width, height] in pixels. The box is first clipped to the image rectangle
    (its corners are clamped to [0, width] x [0, height]); the center (cx, cy) and size (bw, bh)
    are then derived from the clipped corners and divided by the image dimensions. Clipping the
    corners, rather than the center and size separately, guarantees the written box lies fully
    inside the image. Annotations with a truthy 'ignore' value are skipped.

    Parameters:
    - image_info: A dictionary containing keys 'annotations', 'width', and 'height'. 'annotations' is a
      list of dictionaries with keys 'category_id' and 'bbox', and optionally 'ignore'. 'width' and
      'height' are the dimensions of the image in pixels.

    Returns:
    - A string with one line per kept annotation, formatted as
      "category_id center_x center_y bbox_width bbox_height", where the four box values are
      normalized to [0, 1] and written with five decimal places. Lines are separated by "\\n";
      the result is an empty string when no annotation is kept.
    """
    img_w = image_info['width']
    img_h = image_info['height']

    lines = []
    for obj in image_info['annotations']:

        if obj.get('ignore', False):
            continue

        x_min, y_min, box_w, box_h = obj['bbox'][:4]

        # Clamp the corners to the image, then rebuild center/size from what is left.
        x1 = min(max(x_min, 0), img_w)
        y1 = min(max(y_min, 0), img_h)
        x2 = min(max(x_min + box_w, 0), img_w)
        y2 = min(max(y_min + box_h, 0), img_h)

        ci = obj['category_id']
        cx = (x1 + x2) / (2 * img_w)
        cy = (y1 + y2) / (2 * img_h)
        bw = (x2 - x1) / img_w
        bh = (y2 - y1) / img_h

        lines.append(f"{ci} {cx:.5f} {cy:.5f} {bw:.5f} {bh:.5f}")

    return "\n".join(lines)

In [12]:
# Spot-check the label formatting on the train image stored under ID 0.
sample_image_id = 0
image_info = master_dict['train'][sample_image_id]
prepare_label_text(image_info)

'78 0.18958 0.40648 0.05417 0.12963\n158 0.18542 0.30833 0.05417 0.07037\n136 0.02865 0.28519 0.02604 0.06296\n179 0.18411 0.24444 0.05260 0.06111\n59 0.18255 0.15000 0.05260 0.13333'

In [13]:
import concurrent.futures

def process_image_and_label(set_type, image_info, all_images_folder_path, datasets_path):
    """
    Processes a single image and its corresponding label by saving the image in PNG format and
    the label in TXT format.

    The image is read from all_images_folder_path and written to
    <datasets_path>/<set_type>/images/<stem>.png; the label text produced by prepare_label_text
    is written to <datasets_path>/<set_type>/labels/<stem>.txt, where <stem> is the file name
    without its extension. Both output folders are created if they do not exist yet.

    Parameters:
    - set_type: 'train' or 'val' indicating the dataset type.
    - image_info: Dictionary containing the image information. Must provide 'file_name' plus the
      keys prepare_label_text needs ('annotations', 'width', 'height').
    - all_images_folder_path: Path to the folder containing all images.
    - datasets_path: Path to the base dataset folder where processed images and labels will be saved.
    """
    file_name = image_info['file_name']
    # splitext removes only the real extension, whatever it is; a plain ".jpg" string replace
    # would also hit ".jpg" inside the name and do nothing for e.g. ".JPG" or ".jpeg".
    stem = os.path.splitext(file_name)[0]

    images_dir = os.path.join(datasets_path, set_type, 'images')
    labels_dir = os.path.join(datasets_path, set_type, 'labels')
    # exist_ok makes this safe when many worker threads reach this point at once.
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    # Save image; the context manager closes the source file even if saving fails.
    with Image.open(os.path.join(all_images_folder_path, file_name)) as image:
        image.save(os.path.join(images_dir, stem + ".png"))

    # Save label
    label = prepare_label_text(image_info)
    with open(os.path.join(labels_dir, stem + ".txt"), "w") as f:
        f.write(label)

def process_set(master_dict, set_type, all_images_folder_path, datasets_path, max_workers=100):
    """
    Exports every image and label of one dataset split using a pool of worker threads.

    One process_image_and_label task is submitted per image of the split. A progress bar advances
    as tasks finish, and an exception raised inside a task is re-raised here when its result
    is collected.

    Parameters:
    - master_dict: Master dictionary containing 'train' and 'val' keys with their respective image data.
    - set_type: 'train' or 'val' indicating the dataset type to process.
    - all_images_folder_path: Path to the folder containing all images.
    - datasets_path: Path to the base dataset folder where processed images and labels will be saved.
    - max_workers: Maximum number of threads to use. Default is 100.
    """
    # Snapshot the entries so the submission loop works on a fixed list.
    split_images = list(master_dict[set_type].values())

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(process_image_and_label, set_type, info, all_images_folder_path, datasets_path)
            for info in split_images
        ]

        finished_tasks = concurrent.futures.as_completed(pending)
        for task in tqdm.tqdm(finished_tasks, total=len(pending), position=0):
            task.result()  # surfaces any exception raised by the worker

In [14]:
# Export both splits, train first, with the same thread-pool size.
for set_type in ('train', 'val'):
    process_set(master_dict, set_type, all_images_folder_path, datasets_path, max_workers=500)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13970/13970 [17:51<00:00, 13.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1703/1703 [03:11<00:00,  8.90it/s]
