In [1]:
import cv2
from pathlib import Path
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
from os.path import exists
import os.path

In [3]:
# Root directory that the raster and dataset paths built with os.path.join in this notebook start from.
dir_prefix = '/data'

In [3]:
def tile_image(image: np.ndarray,
               mask: np.ndarray,
               vertical_splits: int,
               horizontal_splits: int,
               img_tiling_prefix: str,
               mask_tiling_prefix: str,
               target_imgtile_path: str,
               target_masktile_path: str,
               target_csv_dataset_path: str,
               to_exclude_condition,
               imgtype: str = '.tif',
               skip_img: bool = False,
               skip_mask: bool = False
               ) -> pd.DataFrame:
    """Cut an image and its mask into a grid of tiles and index the usable ones.

    The grid has ``horizontal_splits`` rows and ``vertical_splits`` columns.
    Tile height/width is the integer quotient of the image height/width by the
    split count; the last row and last column are extended to the image border
    so that no pixels are dropped.

    Every tile is written to disk (unless skipped or already present), even
    tiles that the exclusion filter later rejects. Only tiles that pass the
    filter are listed in the CSV index.

    Args:
        image: Array whose first two axes are (height, width).
        mask: Array whose first two axes match those of ``image``.
        vertical_splits: Number of tile columns (splits along the width).
        horizontal_splits: Number of tile rows (splits along the height).
        img_tiling_prefix: Filename prefix for image tiles.
        mask_tiling_prefix: Filename prefix for mask tiles.
        target_imgtile_path: Directory that receives the image tiles.
        target_masktile_path: Directory that receives the mask tiles.
        target_csv_dataset_path: Path of the CSV index to write.
        to_exclude_condition: Callable ``(image_tile, mask_tile) -> bool``;
            a truthy result leaves the tile pair out of the CSV index.
        imgtype: File extension (including the dot) for tile files.
        skip_img: When True, image tiles are never written.
        skip_mask: When True, mask tiles are never written.

    Returns:
        DataFrame with columns ``imgPath`` and ``maskPath`` sorted by
        ``imgPath`` -- the same rows that are written to the CSV.

    Raises:
        AssertionError: If image and mask differ in height or width.
        ValueError: If a split count is below 1 or larger than the
            corresponding image dimension (which would yield empty tiles).
    """
    imgheight, imgwidth = image.shape[:2]

    assert imgheight == mask.shape[0],'imgheight == mask.shape[0] failed'
    assert imgwidth == mask.shape[1],'imgwidth == mask.shape[1] failed'

    if horizontal_splits < 1 or vertical_splits < 1:
        raise ValueError(
            f"split counts must be >= 1, got horizontal_splits={horizontal_splits}, "
            f"vertical_splits={vertical_splits}")
    if horizontal_splits > imgheight or vertical_splits > imgwidth:
        raise ValueError(
            f"cannot split a {imgheight}x{imgwidth} image into "
            f"{horizontal_splits}x{vertical_splits} non-empty tiles")

    # parents=True so a fresh dataset directory tree can be created in one go.
    Path(target_imgtile_path).mkdir(parents=True, exist_ok=True)
    Path(target_masktile_path).mkdir(parents=True, exist_ok=True)
    Path(target_csv_dataset_path).parent.mkdir(parents=True, exist_ok=True)

    tile_height = imgheight // horizontal_splits
    tile_width = imgwidth // vertical_splits

    excluded_regions_num = 0
    # (image tile path, mask tile path) for every tile the filter keeps.
    usable_tiles = []

    for j in range(horizontal_splits):
        y = j * tile_height
        # The last row absorbs the remainder of the integer division.
        y1 = imgheight if j == horizontal_splits - 1 else y + tile_height
        for i in range(vertical_splits):
            x = i * tile_width
            # The last column absorbs the remainder of the integer division.
            x1 = imgwidth if i == vertical_splits - 1 else x + tile_width

            tiles_im = image[y:y1, x:x1]
            tiles_ma = mask[y:y1, x:x1]

            img_tile_filename = f"{target_imgtile_path}/{img_tiling_prefix}{j}_{i}{imgtype}"
            mask_tile_filename = f"{target_masktile_path}/{mask_tiling_prefix}{j}_{i}{imgtype}"

            # Existing tile files are left untouched, so re-runs do not rewrite them.
            if not skip_img and not exists(img_tile_filename):
                cv2.imwrite(img_tile_filename, tiles_im)
            if not skip_mask and not exists(mask_tile_filename):
                cv2.imwrite(mask_tile_filename, tiles_ma)

            if to_exclude_condition(tiles_im, tiles_ma):
                excluded_regions_num += 1
            else:
                usable_tiles.append((img_tile_filename, mask_tile_filename))

    usable_tiles_dataset = pd.DataFrame(usable_tiles, columns=['imgPath', 'maskPath']).sort_values('imgPath')
    usable_tiles_dataset.to_csv(target_csv_dataset_path, index=False)
    print(f"Excluded {excluded_regions_num} regions for {target_csv_dataset_path}")
    return usable_tiles_dataset

In [5]:
def create_one_channel_forest_dataset(horizontal_splits,
                                      vertical_splits,
                                      FINAL_DATASET_NAME,
                                      img_file,
                                      mask_file,
                                      skip_img=False,
                                      skip_mask=False):
    """Tile one single-channel image against one mask and write the dataset CSV.

    Both rasters are read from ``<dir_prefix>/raw_forest``. Tiles go to
    ``<dir_prefix>/<FINAL_DATASET_NAME>/<image stem>`` and
    ``<dir_prefix>/<FINAL_DATASET_NAME>/<mask stem>``; the CSV index is written
    next to them as ``Dataset-<image stem>-<mask stem>.csv``.

    Args:
        horizontal_splits: Number of tile rows, forwarded to ``tile_image``.
        vertical_splits: Number of tile columns, forwarded to ``tile_image``.
        FINAL_DATASET_NAME: Name of the output dataset directory under ``dir_prefix``.
        img_file: Filename of the image raster inside ``raw_forest``.
        mask_file: Filename of the mask raster inside ``raw_forest``.
        skip_img: Forwarded to ``tile_image``; when True image tiles are not written.
        skip_mask: Forwarded to ``tile_image``; when True mask tiles are not written.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image or the mask file.
    """
    raw_dir = os.path.join(dir_prefix, 'raw_forest')
    satellite_img_path = os.path.join(raw_dir, img_file)
    nks_mask_path = os.path.join(raw_dir, mask_file)
    img_stem = Path(satellite_img_path).stem
    mask_stem = Path(nks_mask_path).stem

    dataset_dir = os.path.join(dir_prefix, FINAL_DATASET_NAME)
    target_imgtile_path = os.path.join(dataset_dir, img_stem)
    target_masktile_path = os.path.join(dataset_dir, mask_stem)
    target_csv_dataset_path = os.path.join(dataset_dir, f"Dataset-{img_stem}-{mask_stem}.csv")

    os.makedirs(target_imgtile_path, exist_ok=True)
    os.makedirs(target_masktile_path, exist_ok=True)

    def to_exclude_condition(tiles_im, tiles_ma):
        """Exclude a tile when over half its mask is negative or over half its image is zero."""
        # Negative mask pixels are treated as "no label": the original note says class
        # labels are all greater than 0 and everything else is a large negative value.
        mostly_unlabelled = (tiles_ma < 0).sum() / tiles_ma.size > 0.5
        mostly_empty = (tiles_im == 0).sum() / tiles_im.size > 0.5
        return mostly_unlabelled or mostly_empty

    # cv2.imread signals an unreadable file by returning None instead of raising.
    image = cv2.imread(satellite_img_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        raise FileNotFoundError(f"Could not read image raster: {satellite_img_path}")
    mask = cv2.imread(nks_mask_path, cv2.IMREAD_UNCHANGED)
    if mask is None:
        raise FileNotFoundError(f"Could not read mask raster: {nks_mask_path}")

    # Clamp to the 8-bit range before the uint8 cast so out-of-range values do not wrap.
    image[image < 0] = 0
    image[image > 255] = 255
    image = np.uint8(image)

    # Nearest-neighbour keeps mask values intact (no blended values) while matching the image size.
    mask = cv2.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)

    tile_image(
        image=image,
        mask=mask,
        vertical_splits=vertical_splits,
        horizontal_splits=horizontal_splits,
        img_tiling_prefix=img_stem + '_split_',
        mask_tiling_prefix=mask_stem + '_split_',
        target_imgtile_path=target_imgtile_path,
        target_masktile_path=target_masktile_path,
        target_csv_dataset_path=target_csv_dataset_path,
        to_exclude_condition=to_exclude_condition,
        skip_img=skip_img,
        skip_mask=skip_mask
    )

    def count_files(directory):
        """Count regular files (not sub-directories) directly inside ``directory``."""
        return sum(1 for name in os.listdir(directory)
                   if os.path.isfile(os.path.join(directory, name)))

    print(f"num of files in {target_imgtile_path}: {count_files(target_imgtile_path)}")
    print(f"num of files in {target_masktile_path}: {count_files(target_masktile_path)}")
    print()


In [6]:
# The eight single-channel image files; each one is tiled against every mask listed below.
args = ['wv2_0.tif', 'wv2_1.tif', 'wv2_2.tif', 'wv2_3.tif',
        'wv2_4.tif', 'wv2_5.tif', 'wv2_6.tif', 'wv2_7.tif']
mask_files = ['nks.tif', 'nks_hrsume.tif', 'train_polygons.tif']
splits_per_side = 40  # 40 x 40 grid of tiles per image

# Masks are the outer loop and images the inner one, so all images are
# processed for one mask before moving on to the next mask.
for mask_name in mask_files:
    for img_file in args:
        create_one_channel_forest_dataset(
            horizontal_splits=splits_per_side,
            vertical_splits=splits_per_side,
            FINAL_DATASET_NAME='ForestDataset8C',
            img_file=img_file,
            mask_file=mask_name,
        )

Excluded 874 regions for /data/ForestDataset8C/Dataset-wv2_0-nks.csv
num of files in /data/ForestDataset8C/wv2_0: 1600
num of files in /data/ForestDataset8C/nks: 1600

Excluded 874 regions for /data/ForestDataset8C/Dataset-wv2_1-nks.csv
num of files in /data/ForestDataset8C/wv2_1: 1600
num of files in /data/ForestDataset8C/nks: 1600

Excluded 874 regions for /data/ForestDataset8C/Dataset-wv2_2-nks.csv
num of files in /data/ForestDataset8C/wv2_2: 1600
num of files in /data/ForestDataset8C/nks: 1600

Excluded 874 regions for /data/ForestDataset8C/Dataset-wv2_3-nks.csv
num of files in /data/ForestDataset8C/wv2_3: 1600
num of files in /data/ForestDataset8C/nks: 1600

Excluded 874 regions for /data/ForestDataset8C/Dataset-wv2_4-nks.csv
num of files in /data/ForestDataset8C/wv2_4: 1600
num of files in /data/ForestDataset8C/nks: 1600

Excluded 874 regions for /data/ForestDataset8C/Dataset-wv2_5-nks.csv
num of files in /data/ForestDataset8C/wv2_5: 1600
num of files in /data/ForestDataset8C/nks

In [8]:
# Sanity check: number of entries in one image-tile directory.
# Built from dir_prefix so it follows the same data root as the rest of the notebook.
len(os.listdir(os.path.join(dir_prefix, 'ForestDataset8C', 'wv2_4')))

1600

In [8]:
# NOTE(review): bare `exit` cell -- in IPython this presumably ends the kernel session, so a
# "Run All" would stop here and never reach the cells below. Confirm this stop is intentional.
exit

`nks_old` je stara verzija NKS klasifikacije. Subasic je dobio pristup boljim klasifikacijama `train_polygons` i `nks_hrsume` (mislim da je to maska s poluautomatskim označavanjem podataka), a brojevi klasa malo su pomaknuti kako bi se poklapali s drugim maskama.

In [None]:
# Load the four mask rasters unchanged, then report their shapes followed by
# the distinct values in each one together with how often each value occurs.
mask_names = ('nks_old', 'nks', 'nks_hrsume', 'train_polygons')
nks_old, nks, nks_hrsume, train_polygons = [
    cv2.imread(os.path.join(dir_prefix, name + '.tif'), cv2.IMREAD_UNCHANGED)
    for name in mask_names
]
loaded_masks = dict(zip(mask_names, (nks_old, nks, nks_hrsume, train_polygons)))

# All shapes are printed first, then all value frequencies.
for name, raster in loaded_masks.items():
    print(f"{name} shape: {raster.shape}")
for name, raster in loaded_masks.items():
    print(f"{name} freqs: {np.unique(raster, return_counts=True)}")

nks_old shape: (851, 964)
nks shape: (2605, 2814)
nks_hrsume shape: (851, 964)
train_polygons shape: (4170, 4626)
nks_old freqs: (array([-3.4e+38,  1.0e+00,  2.0e+00,  3.0e+00,  4.0e+00,  5.0e+00,
        6.0e+00,  7.0e+00,  8.0e+00,  9.0e+00,  1.0e+01,  1.1e+01,
        1.2e+01,  1.3e+01,  1.4e+01], dtype=float32), array([440166,   1241,    322,   6264,   3904,  19341,   6936,   4348,
        24765,  10283,  80180,  24209,  23012, 117578,  57815]))
nks freqs: (array([-3.4e+38,  1.0e+00,  2.0e+00,  3.0e+00,  4.0e+00,  5.0e+00,
        6.0e+00,  7.0e+00,  8.0e+00,  9.0e+00,  1.0e+01,  1.1e+01],
      dtype=float32), array([3941754,  450152,  543822,   97827,  626456,   10203,   38057,
        992117,   67195,  349082,  136990,   76815]))
nks_hrsume freqs: (array([-3.4e+38,  1.0e+00,  2.0e+00,  3.0e+00,  4.0e+00,  5.0e+00,
        6.0e+00,  7.0e+00,  8.0e+00,  1.0e+01,  1.1e+01,  1.2e+01],
      dtype=float32), array([440141,  26149,  80413,  10232, 117493,  24246,  29920,  57872,
      

In [None]:
# Rows-to-columns ratio for the (851, 964) and (2605, 2814) mask shapes reported above.
for n_rows, n_cols in ((851, 964), (2605, 2814)):
    print(n_rows / n_cols)

0.8827800829875518
0.925728500355366


##  Forest Dataset


In [None]:
# One Channel Dataset
# Tiles the single image 'wv2_2.tif' against the 'nks.tif' mask into a 40 x 40 grid
# and writes the CSV index of usable tiles. Mask tiles are not written (skip_mask=True).

horizontal_splits = 40
vertical_splits = 40
FINAL_DATASET_NAME = 'ForestDataset'
satellite_img_path = os.path.join(dir_prefix, 'wv2_2.tif')
nks_mask_path = os.path.join(dir_prefix, 'nks.tif')
img_tiling_prefix = Path(satellite_img_path).stem + '_split_'
mask_tiling_prefix = Path(nks_mask_path).stem + '_split_'
target_imgtile_path = os.path.join(dir_prefix, FINAL_DATASET_NAME, Path(satellite_img_path).stem)
target_masktile_path = os.path.join(dir_prefix, FINAL_DATASET_NAME, Path(nks_mask_path).stem)
target_csv_dataset_path = os.path.join(dir_prefix,
                                       FINAL_DATASET_NAME,
                                       f"Dataset-{Path(satellite_img_path).stem}-{Path(nks_mask_path).stem}.csv")

# exist_ok=True makes the cell re-runnable; unlike a swallowed FileExistsError it still
# fails loudly if the target path exists but is a regular file.
os.makedirs(target_imgtile_path, exist_ok=True)
os.makedirs(target_masktile_path, exist_ok=True)


def to_exclude_condition(tiles_im, tiles_ma):
    """Exclude a tile when over half its mask is negative or over half its image is zero."""
    # Class labels in the mask are all greater than 0; everything else is a large negative value.
    mostly_unlabelled = (tiles_ma < 0).sum() / tiles_ma.size > 0.5
    mostly_empty = (tiles_im == 0).sum() / tiles_im.size > 0.5
    return mostly_unlabelled or mostly_empty


# cv2.imread signals an unreadable file by returning None instead of raising.
image = cv2.imread(satellite_img_path, cv2.IMREAD_UNCHANGED)
if image is None:
    raise FileNotFoundError(f"Could not read image raster: {satellite_img_path}")
mask = cv2.imread(nks_mask_path, cv2.IMREAD_UNCHANGED)
if mask is None:
    raise FileNotFoundError(f"Could not read mask raster: {nks_mask_path}")

# Clamp to the 8-bit range before the uint8 cast so out-of-range values do not wrap.
image[image < 0] = 0
image[image > 255] = 255
image = np.uint8(image)

# Nearest-neighbour keeps mask values intact while matching the image size.
mask = cv2.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)


tile_image(
    image=image,
    mask=mask,
    vertical_splits=vertical_splits,
    horizontal_splits=horizontal_splits,
    img_tiling_prefix=img_tiling_prefix,
    mask_tiling_prefix=mask_tiling_prefix,
    target_imgtile_path=target_imgtile_path,
    target_masktile_path=target_masktile_path,
    target_csv_dataset_path=target_csv_dataset_path,
    to_exclude_condition=to_exclude_condition,
    skip_mask=True
)

# Drop the notebook-level reference to the full raster once tiling is done.
del image


Excluded 873 regions


## Three Channel Dataset

In [None]:
# Three Channel Dataset
# Stacks 'wv2_0.tif', 'wv2_1.tif' and 'wv2_2.tif' into one 3-channel image, tiles it
# against the 'nks.tif' mask into a 40 x 40 grid and writes the CSV index of usable tiles.

horizontal_splits = 40
vertical_splits = 40
FINAL_DATASET_NAME = 'Three_Channel_ForestDataset'
satellite_img_path_0 = os.path.join(dir_prefix, 'wv2_0.tif')
satellite_img_path_1 = os.path.join(dir_prefix, 'wv2_1.tif')
satellite_img_path_2 = os.path.join(dir_prefix, 'wv2_2.tif')
nks_mask_path = os.path.join(dir_prefix, 'nks.tif')
img_tiling_prefix = 'wv2_012' + '_split_'
mask_tiling_prefix = Path(nks_mask_path).stem + '_split_'
target_imgtile_path = os.path.join(dir_prefix, FINAL_DATASET_NAME, 'wv2_012')
target_masktile_path = os.path.join(dir_prefix, FINAL_DATASET_NAME, Path(nks_mask_path).stem)
# tile_image requires a CSV path; the name follows the Dataset-<image>-<mask>.csv
# pattern used by the one-channel datasets.
target_csv_dataset_path = os.path.join(dir_prefix,
                                       FINAL_DATASET_NAME,
                                       f"Dataset-wv2_012-{Path(nks_mask_path).stem}.csv")

# makedirs with exist_ok=True creates the dataset directory and both tile directories
# and keeps the cell re-runnable (plain os.mkdir raised FileExistsError on a second run).
os.makedirs(target_imgtile_path, exist_ok=True)
os.makedirs(target_masktile_path, exist_ok=True)


def to_exclude_condition(tiles_im, tiles_ma):
    """Exclude a tile when over half its mask is negative or over half its image is zero."""
    # Class labels in the mask are all greater than 0; everything else is a large negative value.
    mostly_unlabelled = (tiles_ma < 0).sum() / tiles_ma.size > 0.5
    mostly_empty = (tiles_im == 0).sum() / tiles_im.size > 0.5
    return mostly_unlabelled or mostly_empty


# cv2.imread signals an unreadable file by returning None instead of raising.
channel_images = []
for channel_path in (satellite_img_path_0, satellite_img_path_1, satellite_img_path_2):
    channel = cv2.imread(channel_path, cv2.IMREAD_UNCHANGED)
    if channel is None:
        raise FileNotFoundError(f"Could not read image raster: {channel_path}")
    channel_images.append(channel)
image_0, image_1, image_2 = channel_images

image = np.dstack(channel_images)

# Clamp to the 8-bit range before the uint8 cast so out-of-range values do not wrap.
image[image < 0] = 0
image[image > 255] = 255
image = np.uint8(image)

# Visual check of a crop of the stacked image.
plt.imshow(image[3000:6000, 3000:6000, :])
plt.title('image')
plt.show()

mask = cv2.imread(nks_mask_path, cv2.IMREAD_UNCHANGED)
if mask is None:
    raise FileNotFoundError(f"Could not read mask raster: {nks_mask_path}")
# Nearest-neighbour keeps mask values intact while matching the image size.
mask = cv2.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)


tile_image(
    image=image,
    mask=mask,
    vertical_splits=vertical_splits,
    horizontal_splits=horizontal_splits,
    img_tiling_prefix=img_tiling_prefix,
    mask_tiling_prefix=mask_tiling_prefix,
    target_imgtile_path=target_imgtile_path,
    target_masktile_path=target_masktile_path,
    target_csv_dataset_path=target_csv_dataset_path,
    to_exclude_condition=to_exclude_condition
)
