In [67]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Get the sets

In [68]:
from satellitepy.data.labels import read_label, init_satellitepy_label, set_image_keys, get_all_satellitepy_keys
from satellitepy.utils.path_utils import get_file_paths
from satellitepy.data.utils import get_satellitepy_dict_values, count_unique_values, get_satellitepy_table, read_img, set_satellitepy_dict_values
from satellitepy.data.bbox import BBox
from satellitepy.data.patch import is_truncated, shift_bboxes, create_patch_polygon, get_intersection
from satellitepy.data.tools import show_labels_on_images

import os
os.environ["OPENCV_IO_MAX_IMAGE_PIXELS"] = str(pow(2,60))
import cv2
import random
import numpy as np
import itertools
import sys
from pathlib import Path
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import json

In [38]:
# img_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_dataset/images/")
# label_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_dataset/labels_fineair/role_th_50/")
# label_format = 'fineair' # 
# test_img_sz = 8000
# intersection_th = 0.95
# test_sum_ratio_th = 0.15

# train_label_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/train/labels_fineair")
# train_img_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/train/images")
# test_label_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/test/labels_fineair")
# test_img_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/test/images")



In [39]:
img_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/train/images")
label_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/train/labels_fineair")
label_format = 'satellitepy' # 
test_img_sz = 6000
intersection_th = 0.9
test_sum_ratio_th = 0.15
bbox_for_intersection = 'obboxes'
img_read_module = 'cv2'
img_extension = 'png' if img_read_module == 'cv2' else 'tif'
train_label_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/only_train/labels")
train_img_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/only_train/images")
test_label_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/val/labels")
test_img_folder = Path("/mnt/2tb-1/satellitepy/data/FR24_sets/val/images")



## Get patch labels

In [40]:
def get_patch_dict(label, test_img_size, margin=100, intersection_th=0.95):

    satellitepy_fac = get_satellitepy_table()['fineair-class']
    
    # Define patch starting coords
    patch_start_coords = []
    for i, bbox_corners in enumerate(label[bbox_for_intersection]):
        bbox = BBox(corners=bbox_corners)
        x_min, x_max, y_min, y_max = bbox.get_bbox_limits(bbox.corners)
        x_0, y_0 = np.maximum(x_min - margin,0), np.maximum(y_min - margin,0)
        patch_start_coords.append([x_0,y_0])

    # Set patch 
    patch_dict = {
        'test_indices': [[] for _ in range(len(patch_start_coords))],
        'train_indices':[[] for _ in range(len(patch_start_coords))],
        'test_fac':[[] for _ in range(len(patch_start_coords))],
        'test_fac_count':[[0 for fac_i in range(len(set(satellitepy_fac.values())))] for _ in range(len(patch_start_coords))],
        'train_fac':[[] for _ in range(len(patch_start_coords))],
        'train_fac_count':[[0 for fac_i in range(len(set(satellitepy_fac.values())))] for _ in range(len(patch_start_coords))],
        'start_coords': patch_start_coords,
        }
    for i, patch_start_coord in enumerate(patch_start_coords):
        x_0, y_0 = patch_start_coord
        patch_polygon = create_patch_polygon(x_0=x_0, y_0=y_0, patch_size=test_img_size)
        fineair_classes = get_satellitepy_dict_values(label,task='fineair-class')
        for j, bbox_corners in enumerate(label[bbox_for_intersection]):
            intersection = get_intersection(bbox_corners=bbox_corners, patch_polygon=patch_polygon)
            # is_truncated_bbox = is_truncated(bbox_corners=bbox_corners, patch_polygon=patch_polygon, relative_area_threshold=relative_area_thr)
            ## Set the labels to empty because of cutoff objects
            if intersection == 0:
                continue
            if intersection > 0 and intersection_th > intersection:
                # patch_dict['labels'][i] = []
                patch_dict['test_indices'][i] = []
                patch_dict['test_fac'][i] = []
                break
            else:
                # patch_dict['labels'][i] = set_image_keys(get_all_satellitepy_keys(), patch_dict['labels'][i], label, j)
                patch_dict['test_indices'][i].append(j)
                patch_dict['test_fac'][i].append(fineair_classes[j])
        patch_dict['train_indices'][i] = list(set(range(len(label[bbox_for_intersection])))-set(patch_dict['test_indices'][i]))
        patch_dict['train_fac'][i] = [fineair_classes[j] for j in patch_dict['train_indices'][i]]

        # Set train fac count and test fac count
        for fac in patch_dict['train_fac'][i]:
            fac_i = satellitepy_fac[fac]
            patch_dict['train_fac_count'][i][fac_i] += 1 
        for fac in patch_dict['test_fac'][i]:
            fac_i = satellitepy_fac[fac]
            patch_dict['test_fac_count'][i][fac_i] += 1
    return patch_dict
        

In [41]:
label_paths = get_file_paths(label_folder)


patch_dicts = {}

for label_path in label_paths:
    label = read_label(label_path=label_path,label_format=label_format)
    patch_dict = get_patch_dict(label=label,test_img_size=test_img_sz, intersection_th=intersection_th)
    label_file_name = label_path.name
    patch_dicts[label_file_name] = patch_dict



## Get one patch from each original image for the test set

In [42]:
def unique_lists_with_indices(list_of_lists):
    unique_dict = {}
    
    for idx, sublist in enumerate(list_of_lists):
        # Convert the list to a tuple so it can be used as a dictionary key
        tuple_sublist = tuple(sublist)
        
        # Store the sublist and its first occurrence index if it's unique
        if tuple_sublist not in unique_dict:
            unique_dict[tuple_sublist] = idx
    
    # Extract unique lists and their indices
    unique_lists = [list(key) for key in unique_dict.keys()]
    unique_indices = list(unique_dict.values())
    
    return unique_lists, unique_indices

# Test case
# input_list = [[1, 1], [2, 2], [1, 1]]
# unique_lists, unique_indices = unique_lists_with_indices(input_list)
# print("Unique Lists:", unique_lists)
# print("Indices of First Occurrences:", unique_indices)

In [43]:
def compute_test_train_pairs(patch_dicts):
    
    orig_test_train_pair_indices = []
    all_test_train_pairs = []
    all_test_train_pair_indices = []
    # Iterate through each dictionary in B
    for img_name, patch_dict in patch_dicts.items():
        test_indices, unique_patch_indices = unique_lists_with_indices(patch_dict['test_indices'])
        ## Remove empty list from test indices
        if len(unique_patch_indices) == 0:
            print(img_name)
        else:
            for i, test_ind in enumerate(test_indices.copy()):
                if test_ind == []:
                    test_indices.pop(i)
                    unique_patch_indices.pop(i)
        train_indices = [ind for i, ind in enumerate(patch_dict['train_indices']) if i in unique_patch_indices]
        if img_name == 'O_Hare_Int_Airport_23FEB28165710.json':
            print(patch_dict['train_indices'])
        # train_fac = [fac for i, fac in enumerate(patch_dict['train_fac']) if i in unique_patch_indices]
        # test_fac = [fac for i, fac in enumerate(patch_dict['test_fac']) if i in unique_patch_indices]
        test_fac_count = [fac for i, fac in enumerate(patch_dict['test_fac_count']) if i in unique_patch_indices]
        train_fac_count = [fac for i, fac in enumerate(patch_dict['train_fac_count']) if i in unique_patch_indices]
        test_train_pairs = list(zip(test_fac_count, train_fac_count))
        test_train_pair_indices = list(zip(test_indices, train_indices))
        all_test_train_pairs.append(test_train_pairs)
        all_test_train_pair_indices.append(test_train_pair_indices)
        orig_test_train_pair_indices.append(unique_patch_indices)
    return all_test_train_pairs, all_test_train_pair_indices, orig_test_train_pair_indices


In [44]:
test_train_pairs, test_train_pair_indices, orig_test_train_pair_indices = compute_test_train_pairs(patch_dicts)
# print(test_train_pair_indices[0][17])
# print(orig_test_train_pair_indices)
# print(list(patch_dicts.values())[0]['test_indices'][17])
# print(list(patch_dicts.values())[0]['train_indices'])

for i, s in enumerate(test_train_pairs):
    if len(s) <= 0:
        print(s)
        print(len(s),i)
        print(list(patch_dicts.keys())[i])
        print(list(patch_dicts.values())[i])


## Cartesian options

In [45]:
# Calculate the number of elements in each set
# lengths = [len(s) for s in test_train_pairs]

def cartesian_with_indices(all_test_train_pairs):
    # Calculate the product of all test-train pairs across dictionaries in B
    for combination in itertools.product(*all_test_train_pairs):
         # Find indices of elements in their respective sets
        indices = [s.index(elem) for s, elem in zip(all_test_train_pairs, combination)]
        yield combination, indices

def random_cartesian_with_indices(sets, num_samples):
    for _ in range(num_samples):
        # Randomly choose an index for each set
        indices = [random.randint(0, len(s) - 1) for s in sets]
        # Get the elements at the chosen indices
        sample = [sets[i][index] for i, index in enumerate(indices)]
        yield sample, indices

In [46]:

def get_cartesian_test_train_pairs(sets, test_sum_ratio_th):
    best_indices = [0]*len(sets)
    best_sum_ratio_dif = np.inf
    best_test_to_all_ratio = np.Inf
    # for combination, indices in cartesian_with_indices(sets):
    for combination, indices in random_cartesian_with_indices(test_train_pairs, 100000):

        test_sum = np.sum(np.array(combination)[:,0],axis=0)
        train_sum = np.sum(np.array(combination)[:,1],axis=0)
        total_sum = np.sum(a=[test_sum,train_sum],axis=1)

        test_sum_ratio = test_sum / total_sum[0]
        train_sum_ratio = train_sum / total_sum[1]
        test_to_all_ratio = total_sum[0]/np.sum(total_sum)
        sum_ratio_dif = np.sum(np.abs(test_sum_ratio-train_sum_ratio)) # train and test set ratio difference

        if (best_test_to_all_ratio+0.001 >= np.abs(test_to_all_ratio-test_sum_ratio_th)) and (sum_ratio_dif <= best_sum_ratio_dif):
            print('# Test instances: ', test_sum)
            print('# Train instances: ',train_sum)
            print('# Total instances (ratio): ', total_sum, test_to_all_ratio)
            print('Absolute sum diff: ', sum_ratio_dif)
            print(indices)
            best_sum_ratio_dif = sum_ratio_dif
            best_test_to_all_ratio = np.abs(test_to_all_ratio-test_sum_ratio_th)
            best_indices = indices
    return best_indices

In [47]:
result = get_cartesian_test_train_pairs(test_train_pairs, test_sum_ratio_th=test_sum_ratio_th)

# Test instances:  [ 11   4  40  42  17   0  19   0  42 101   3  10  19  29   5   5   3   6   6   1   0  37   6]
# Train instances:  [ 53 162 487 377 157   0  94   0 663 616  44  37 102 209 120  39  33  75  36  81   0 633 184]
# Total instances (ratio):  [ 406 4202] 0.0881076388888889
Absolute sum diff:  0.45720428695694987
[8, 2, 8, 4, 2, 12, 4, 11, 0, 5, 15, 12, 22, 11, 10, 11, 6, 27, 8, 3, 12, 14, 2, 16, 10, 2, 10, 9, 6, 5, 19, 12, 26, 3, 5, 12, 14, 11, 10, 6]
# Test instances:  [ 10  25  61  42  17   0  13   0  77 121   2   4  15  32   6   0   7   6  13  18   0  65  60]
# Train instances:  [ 54 141 466 377 157   0 100   0 628 596  45  43 106 206 119  44  29  75  29  64   0 605 130]
# Total instances (ratio):  [ 594 4014] 0.12890625
Absolute sum diff:  0.34064108952001326
[18, 0, 3, 11, 7, 10, 26, 2, 5, 11, 5, 8, 7, 32, 14, 0, 2, 18, 2, 11, 11, 0, 13, 9, 11, 4, 11, 7, 18, 5, 13, 2, 16, 3, 2, 7, 6, 3, 2, 0]
# Test instances:  [ 20   4  64  33  22   0  19   0  82  98   1   7  11  34  

In [48]:
print(result)

[6, 4, 17, 3, 9, 8, 22, 0, 8, 2, 16, 8, 20, 6, 26, 6, 2, 25, 1, 1, 9, 0, 8, 12, 5, 5, 4, 6, 13, 5, 2, 11, 6, 9, 0, 13, 12, 5, 7, 12]


In [49]:
def remove_all_files(folder_path):
    folder = Path(folder_path)
    for file in folder.iterdir():
        if file.is_file():
            file.unlink()


## Save the splits

In [58]:
# remove_all_files(test_img_folder)
# remove_all_files(test_label_folder)
# remove_all_files(train_img_folder)
# remove_all_files(train_label_folder)
def save_sets(patch_dicts,best_indices,test_train_pair_indices, orig_test_train_pair_indices):
    
    # for file_name, patch_dict in patch_dicts.items():
    file_names = list(patch_dicts.keys())
    tasks = get_all_satellitepy_keys()
    for i, ind in enumerate(best_indices):
        # i : original image index
        # ind : patch index within the original image

        orig_ind = orig_test_train_pair_indices[i][ind]
        # Save labels
        file_name = Path(file_names[i]).stem
        train_img_path = train_img_folder/f"{file_name}.png"
        if train_img_path.is_file():
             print(f'{file_name} exists in the destination train folder, skipped...')
             continue
        patch_dict = patch_dicts[file_names[i]]
        label_path = label_folder / file_names[i]
        label = read_label(label_path=label_path,label_format=label_format)
        test_label = init_satellitepy_label()
        train_label = init_satellitepy_label()
        x_0, y_0 = patch_dict['start_coords'][orig_ind]

        # Set test and train labels
        ## Unique values are calculated previously for test_train_pairs
        ## Use the corresponding unique value indices to find the train-test pair indices
        test_indices, train_indices = test_train_pair_indices[i][ind]
        # test_indices = patch_dict['test_indices'][ind]
        # train_indices = patch_dict['train_indices'][ind]
        # print(test_train_pair_indices[i][ind])
        print(test_indices)
        print(f"Test image has {len(test_indices)} airplanes.")
        print(train_indices)
        # if len(test_indices) == 0:
        #     print(f"{file_name} has no test objects, original image will be saved into the train folder.")
        #     train_label_path = train_label_folder / f"{file_name}.json"
        #     with open(str(train_label_path), 'w') as f:
        #         json.dump(train_label, f, indent=4)
        #     img = read_img(img_path=img_folder/f"{file_name}.tif", module='rasterio')
        #     cv2.imwrite(str(train_img_folder/f"{file_name}.png"),img)
        #     continue
        for task in tasks:
            task_values = get_satellitepy_dict_values(label,task=task)
            test_task_values = []
            train_task_values = []
            for task_value_i, task_value in enumerate(task_values):
                if task_value_i in test_indices:
                    test_task_values.append(task_value)
                elif task_value_i in train_indices:
                    train_task_values.append(task_value)
            test_label = set_satellitepy_dict_values(test_label,task=task,value=test_task_values)
            train_label = set_satellitepy_dict_values(train_label,task=task,value=train_task_values)

        # shift test bboxes
        for bbox_task in ['obboxes','hbboxes']:
            bbox_values = get_satellitepy_dict_values(test_label,task=bbox_task)
            shifted_bbox_values = (np.array(bbox_values) - [x_0, y_0]).tolist()
            test_label = set_satellitepy_dict_values(test_label,task=bbox_task,value=shifted_bbox_values)

        test_label_path = test_label_folder / f"{file_name}.json"
        train_label_path = train_label_folder / f"{file_name}.json"

        with open(str(test_label_path), 'w') as f:
                json.dump(test_label, f, indent=4)
        with open(str(train_label_path), 'w') as f:
                json.dump(train_label, f, indent=4)

        # Save images
        img = read_img(img_path=str(img_folder/f"{file_name}.{img_extension}"), module=img_read_module)
        test_img = img[y_0:y_0+test_img_sz, x_0:x_0+test_img_sz, :]
        train_mask = np.ones_like(img, dtype=np.uint8)
        train_mask[y_0:y_0+test_img_sz, x_0:x_0+test_img_sz, :] = 0
        train_img = img*train_mask
        cv2.imwrite(str(train_img_path),train_img)
        cv2.imwrite(str(test_img_folder/f"{file_name}.png"),test_img)


In [69]:
save_sets(patch_dicts=patch_dicts,
    best_indices=result,
    test_train_pair_indices=test_train_pair_indices,
    orig_test_train_pair_indices=orig_test_train_pair_indices)

Amsterdam_23MAR14104929 exists in the destination train folder, skipped...
Bangkok_23FEB21040003 exists in the destination train folder, skipped...
Beijing_23SEP13031225 exists in the destination train folder, skipped...
Beijing_Capital_International_22DEC04031345 exists in the destination train folder, skipped...
Cairo_24JAN04084426 exists in the destination train folder, skipped...
Cairo_24JAN29084910 exists in the destination train folder, skipped...
Charles_De_Gaulle_23MAY29105259 exists in the destination train folder, skipped...
Charlotte_Douglas_International_Airport_24JAN18161805 exists in the destination train folder, skipped...
Chengdu_Shuangliu_International_Airport_23NOV27035335 exists in the destination train folder, skipped...
Chongqing_Jiangbei_International_Airport_23JUL16034146 exists in the destination train folder, skipped...
Dallas_Fort_Worth_21JUL13170358 exists in the destination train folder, skipped...
Dallas_Fort_Worth_22JUN16172659 exists in the destination tr

error: OpenCV(4.5.4) /tmp/pip-req-build-1mhygbdf/opencv/modules/imgcodecs/src/loadsave.cpp:77: error: (-215:Assertion failed) pixels <= CV_IO_MAX_IMAGE_PIXELS in function 'validateInputImageSize'


In [208]:
show_labels_on_images(image_folder=test_img_folder,
        label_folder=test_label_folder,
        mask_folder=None,
        label_format='satellitepy',
        img_read_module='cv2',
        out_folder=Path('/home/murat/Projects/satellitepy/docs/temp_fineair_set'),
        tasks=['coarse-class','obboxes'],
        rescaling=1.0,
        interpolation_method=None)

100%|██████████| 2/2 [00:48<00:00, 24.21s/it]
