In [None]:
from pathlib import Path
import polars as pl
import lt_lib.data.preprocessing as preprocessing
from lt_lib.data.tiling import tile_images, tile_images_mono_process

# Render inline figures at retina (high-DPI) resolution.
%config InlineBackend.figure_format = 'retina'
# Load the autoreload extension and enable mode 2, so that edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Run switches checked by the `if <flag>:` guards in the cells below.
# Real-data preprocessing (section 1).
PROCESS_REAL_DATA = False
# Synthetic-data preprocessing (section 2).
PROCESS_SYNTHETIC_DATA = True

# 1. Preprocessing real data
## 1.1 Train-val split

In [None]:
# Directory handed to the train/val split helper as `imgs_dir`.
RAW_DATA_IMG_DIR_PATH = Path("/content/datasets/dataset_v0/imgs")

# Arguments of the split call, grouped so they can be read and tuned in one place.
real_split_kwargs = {
    "imgs_dir": RAW_DATA_IMG_DIR_PATH,
    "imgs_extension": ".png",
    "val_fraction": 0.2,
    "seed": 42,
}

# The helper is only invoked when real-data processing is switched on.
if PROCESS_REAL_DATA:
    preprocessing.train_val_split_and_move_rareplanes_real_img_xml_and_geojson_files(
        **real_split_kwargs
    )

## 1.2 Gather all annotations in one gts.csv file for each real dataset (train-val-test)

In [None]:
# Root of the real dataset; the split sub-directories used below live under it.
REAL_DATA_ROOT_DIR_PATH = Path("/content/datasets/dataset_v0")

if PROCESS_REAL_DATA:
    # Same call for every split sub-directory, in the order train -> val -> test.
    for split_name in ("train", "val", "test"):
        # The two returned values are not needed here; `save_to_file=True` is passed
        # so the helper is asked to persist its result itself.
        _, _ = preprocessing.get_all_annotations_from_rareplanes_geojsons(
            root_dir_path=REAL_DATA_ROOT_DIR_PATH / split_name,
            tiled_version=True,
            imgs_extension=".png",
            save_to_file=True,
        )

# 2. Preprocessing synthetic data
## 2.1 Copy fraction of the train synthetic data

In [None]:
# NOTE(review): this is a machine-specific external-volume path, unlike the
# /content/datasets/... paths used by the other cells; adjust it before running
# elsewhere. It also rebinds RAW_DATA_IMG_DIR_PATH, which the real-data split
# cell (section 1.1) assigns as well.
RAW_DATA_IMG_DIR_PATH = Path(
    "/Volumes/Maxtor-Loic/Loïc/4-KTH/2023-2024/Master Thesis/Datasets/RarePlanes/synthetic/imgs"
)

# Arguments of the copy call, grouped so they can be read and tuned in one place.
synthetic_copy_kwargs = {
    "imgs_dir": RAW_DATA_IMG_DIR_PATH,
    "imgs_extension": ".png",
    "fraction_to_copy": 0.1,
    "seed": 42,
}

# The helper is only invoked when synthetic-data processing is switched on.
if PROCESS_SYNTHETIC_DATA:
    preprocessing.copy_fraction_of_random_rareplanes_synthetic_img_and_xml_files(
        **synthetic_copy_kwargs
    )

## 2.2 Gather all annotations in one gts.csv

In [None]:
# Annotations directory of the sampled synthetic dataset, passed to the helper below.
SYNTHETIC_ANNOTATIONS_DIR_PATH = Path(
    "/content/datasets/synthetic_data_sampled_10percent_seed42/annotations"
)

if PROCESS_SYNTHETIC_DATA:
    # Short local alias for the long helper name.
    gather_synthetic_gts = preprocessing.get_all_synthetic_gts_from_rareplanes_xmls
    # The return value is discarded; `save_to_file=True` is passed so the helper
    # is asked to persist its result itself.
    _ = gather_synthetic_gts(
        annotations_dir_path=SYNTHETIC_ANNOTATIONS_DIR_PATH,
        imgs_extension=".png",
        clip=True,
        save_to_file=True,
    )

## 2.3 Tile images from a directory

In [None]:
# Root directory of the sampled synthetic dataset that gets tiled.
ROOT_BASE_DATA_DIR_PATH = Path("/content/datasets/synthetic_data_sampled_10percent_seed42")

# Tiling strategy name and its settings, forwarded unchanged to `tile_images`.
TILING_TYPE = "grid_tiling"
TILING_ARGS = dict(
    tile_shape=512,
    overlap=32,
    min_bbox_area_on_tile=0.2,
    clip_boxes=True,
    png_compression_level=3,
)

# Arguments of the tiling call, grouped so they can be read and tuned in one place.
tile_images_kwargs = {
    "root_data_dir": ROOT_BASE_DATA_DIR_PATH,
    "tiling_type": TILING_TYPE,
    "tiling_args": TILING_ARGS,
    "images_extension": ".png",
    "processes": 4,
    "chunksize": 1,
}

# Tiling is only run when synthetic-data processing is switched on.
if PROCESS_SYNTHETIC_DATA:
    tile_images(**tile_images_kwargs)

## 2.4 Copy a number of tiles from the tiled data

In [None]:
# Root directory of the tiled synthetic dataset from which tiles are sampled.
ROOT_TILED_DATA_DIR_PATH = Path(
    "/content/datasets/synthetic_data_sampled_10percent_seed42/tiled_dataset"
)

if PROCESS_SYNTHETIC_DATA:
    # Short local alias for the long helper name.
    copy_random_tiles = preprocessing.copy_nb_of_random_rareplanes_synthetic_tile_and_gts_files
    copy_random_tiles(
        root_data_dir=ROOT_TILED_DATA_DIR_PATH,
        imgs_extension=".png",
        select_nb=4500,
        seed=42,
    )

## 2.5 Adjust labels based on wingspan value

In [None]:
# CSV passed as the wingspan reference (`gts_with_wingspan_path`).
GTS_WITH_WINGSPAN_PATH = Path(
    "/content/datasets/synthetic_data_sampled_10percent_seed42/annotations/gts_with_wingspan.csv"
)
# CSV of the sampled tiled dataset whose labels are adjusted (`gts_to_adjust_path`).
GTS_TO_ADJUST_PATH = Path(
    "/content/datasets/synthetic_data_sampled_10percent_seed42/tiled_dataset/sampled_synthetic_tiled_data/annotations/gts.csv"
)

# Guarded like the other synthetic-data steps: without this check the cell ran
# (and, with `save_to_file=True`, asked the helper to write its result) even when
# PROCESS_SYNTHETIC_DATA was switched off.
if PROCESS_SYNTHETIC_DATA:
    preprocessing.adjust_labels_base_on_wingspan(
        gts_with_wingspan_path=GTS_WITH_WINGSPAN_PATH,
        gts_to_adjust_path=GTS_TO_ADJUST_PATH,
        # presumably wingspan bin edges used to derive the labels; units not
        # visible here — TODO confirm against the helper.
        wingspan_label_bins=[0, 14, 36],
        save_to_file=True,
    )