In [1]:
import os
import fiona
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import rasterio as rio
from helpers import utils

## Get all Images and Label-Files from Directory

In [2]:
DATADIR = "/home/jovyan/work/data"
DATASET = "DENMARK"
IMAGETYPE = ".tif"
LABELTYPE = ".shp"

PATH = os.path.join(DATADIR, DATASET)

# Only consider files that live directly inside PATH. Every later cell opens
# a file via os.path.join(PATH, <name>), so a bare file name discovered in a
# sub-directory (e.g. the tile output folder this notebook creates) could not
# be opened. The default passed to next() keeps the "nothing found" behaviour
# when PATH does not exist instead of raising.
_, _, top_level_files = next(os.walk(PATH), (PATH, [], []))

# Sorted so that regions[0] and the tile order do not depend on the order in
# which the file system happens to list the directory.
regions = sorted(name for name in top_level_files if name.endswith(IMAGETYPE))
labels = sorted(name for name in top_level_files if name.endswith(LABELTYPE))

print(f"Found {len(regions)} Regions and {len(labels)} Label-Files")

Found 3 Regions and 1 Label-Files


## Set Tile Size and Overlap

To ensure equal tile sizes, the overlap is computed dynamically from the number of vertical and horizontal tiles.

In [6]:
tiles_h = 40
tiles_v = 40
width = 250
height = 250


def _tile_overlap(n_tiles, tile_size, total_size):
    """Return the overlap (in px) between neighbouring tiles along one axis.

    The overlap is chosen so that `n_tiles` tiles of `tile_size` px span
    exactly `total_size` px. The result may be fractional and may be negative
    when the tiles are too small to cover the axis. With fewer than two tiles
    there are no neighbours, so the overlap is 0 (avoids a division by zero).
    """
    if n_tiles < 2:
        return 0.0
    return ((n_tiles * tile_size) - total_size) / (n_tiles - 1)


# Only the raster dimensions are needed here; the context manager closes the
# dataset handle again instead of leaving it open for the kernel's lifetime.
with rio.open(os.path.join(PATH, regions[0])) as example_src:
    ncols, nrows = example_src.width, example_src.height

h_overlap = _tile_overlap(tiles_h, width, ncols)
v_overlap = _tile_overlap(tiles_v, height, nrows)

print(f"Generating {tiles_h * tiles_v} tiles per region with: \n - tile size: {width} x {height} px \n - region size: {ncols} x {nrows} px \n - vertical overlap: {v_overlap} px \n - horizontal overlap: {h_overlap} px")

Generating 1600 tiles per region with: 
 - tile size: 250 x 250 px 
 - region size: 8000 x 8000 px 
 - vertical overlap: 51.282051282051285 px 
 - horizontal overlap: 51.282051282051285 px


## Generate Collection

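Each entry contains the name of the original region file, the tile number, the tile image as an RGB array, and the pixel coordinates of the point labels that fall inside the tile.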

NOTE: If there is ever more than one shapefile, add the mapping here.

In [7]:
# Gather the first two coordinates (X, Y) of every feature in every label file.
xy_pairs = []
for label_file in labels:
    with fiona.open(os.path.join(PATH, label_file)) as features:
        for feature in features:
            xy_pairs.append(list(feature["geometry"]["coordinates"][:2]))

# One row per labelled point.
point_labels = pd.DataFrame(data=xy_pairs, columns=["X", "Y"])
print(f"Found {len(point_labels)} Point Labels")

Found 1133 Point Labels


In [9]:
# Cut every region into tiles_v x tiles_h tiles and attach the point labels
# that fall inside each tile, expressed in tile-local pixel coordinates.
collection = []
for region in regions:
    name_clean = region.replace(".tif", "")

    # The context manager closes the dataset once all tiles have been read.
    with rio.open(os.path.join(PATH, region)) as src:
        ncols, nrows = src.width, src.height
        bounds = list(src.bounds)  # [left, bottom, right, top]
        left, bottom, right, top = bounds
        big_window = rio.windows.Window(col_off=0, row_off=0, width=ncols, height=nrows)

        # Pixels per map unit, derived from the raster itself instead of a
        # hard-coded factor (8000 px over 1000 map units gives the former 8).
        px_per_unit_x = ncols / (right - left)
        px_per_unit_y = nrows / (top - bottom)

        # Keep only the points strictly inside this region's bounding box.
        in_region = (
            (point_labels.X > left) & (point_labels.X < right)
            & (point_labels.Y > bottom) & (point_labels.Y < top)
        )
        # .copy() so the assignments below write to an independent frame
        # rather than to a slice of point_labels (SettingWithCopyWarning).
        region_points = point_labels.loc[in_region].copy()

        # Translate map coordinates to pixel coordinates.
        # Window row offsets count downward from the top edge of the raster,
        # so Y has to be measured from `top`; measuring from `bottom` paired
        # each tile with the labels of its vertically mirrored counterpart.
        # NOTE(review): assumes a north-up raster - confirm for this dataset.
        region_points["X"] = ((region_points["X"] - left) * px_per_unit_x).round().astype(int)
        region_points["Y"] = ((top - region_points["Y"]) * px_per_unit_y).round().astype(int)

        # Traverse tiles row by row, column by column.
        for row in range(tiles_v):
            row_off = int(row * (height - v_overlap))
            # Columns are counted by tiles_h (the original looped over tiles_v,
            # which is only correct when both counts are equal).
            for col in range(tiles_h):
                col_off = int(col * (width - h_overlap))
                # Read the tile; clip against the full raster at the edges.
                window = rio.windows.Window(
                    col_off=col_off, row_off=row_off, width=width, height=height
                ).intersection(big_window)
                # src.read returns (bands, rows, cols); np.transpose reverses
                # the axes, so the stored image is (cols, rows, 3).
                image = np.transpose(src.read(window=window)[:3])

                # Points inside the tile: half-open pixel ranges, so a point on
                # the tile's first row/column (local index 0) is kept.
                in_tile = (
                    (region_points.X >= col_off) & (region_points.X < col_off + width)
                    & (region_points.Y >= row_off) & (region_points.Y < row_off + height)
                )
                tile_points = region_points.loc[in_tile]
                # Shift to tile-local pixel coordinates.
                np_points = tile_points.to_numpy() - np.array([col_off, row_off])

                collection.append({
                    "file": name_clean,
                    "tile": str(col + (row * tiles_h) + 1),
                    "image": image,
                    "points": np_points,
                    "npoints": len(np_points),
                })

collection = pd.DataFrame(collection)
print(f"Generated Collection of {len(collection)} Tiles")

[637000.0, 6078000.0, 638000.0, 6079000.0]
[638000.0, 6078000.0, 639000.0, 6079000.0]
Generated Collection of 3200 Tiles


## Write To Data Directory

In [11]:
TARGET_TYPE = ".npy"

# Output layout: <PATH>/<width>x<height>/images/<file>_<tile>.npy
NEW_PATH = os.path.join(PATH, f"{width}x{height}")
IMAGE_PATH = os.path.join(NEW_PATH, "images")
# makedirs with exist_ok creates NEW_PATH and IMAGE_PATH in one call and lets
# the cell be re-run; os.mkdir raised FileExistsError on the second run.
os.makedirs(IMAGE_PATH, exist_ok=True)

# Save every tile image as its own NumPy array file.
for item in collection.itertuples(index=False):
    name = item.file + "_" + item.tile + TARGET_TYPE
    np.save(os.path.join(IMAGE_PATH, name), item.image)

In [None]:
LABEL_PATH = os.path.join(NEW_PATH, "points")
# exist_ok lets the cell be re-run; os.mkdir raised FileExistsError.
os.makedirs(LABEL_PATH, exist_ok=True)

# Only tiles that contain at least one point get a label file.
items_with_label = collection[collection.npoints > 0]
for item in items_with_label.itertuples(index=False):
    name = item.file + "_" + item.tile + "_points.npy"
    np.save(os.path.join(LABEL_PATH, name), item.points)

## Define Train, Val and Test Sets

In [None]:
# Folder that receives the train/validation/test list files.
SETS_PATH = os.path.join(NEW_PATH, "image_sets")
# exist_ok lets the cell be re-run; os.mkdir raised FileExistsError.
os.makedirs(SETS_PATH, exist_ok=True)

In [None]:
# Tiles that go into the split.
# DEFAULT would be the full set (finished_set = collection);
# IDEA 1, currently active: keep only tiles with at least one label.
has_label = collection.npoints > 0
finished_set = collection[has_label].reset_index(drop=True)

In [None]:
# Default 70 - 20 - 10 Split
total = len(finished_set)
train = total * 0.7
val = total * 0.2
test = total * 0.1
TARGET_TYPE = ".jpg"

train_file = open(os.path.join(SETS_PATH, "training.txt"), "w") 
val_file = open(os.path.join(SETS_PATH, "validation.txt"), "w") 
test_file = open(os.path.join(SETS_PATH, "test.txt"), "w") 

for index, item in finished_set.iterrows():
    name = item['file'] + "_" + item['tile'] + TARGET_TYPE + "\n"
    if index < train:
        train_file.write(name) 
    elif index < (train+val):
        val_file.write(name) 
    else:
        test_file.write(name)

train_file.close() 
val_file.close()     
test_file.close() 

# EXPLORATION ZONE

In [None]:
# Share of tiles that carry at least one point label.
total = len(collection)
labeled_tiles = collection[collection.npoints > 0]
have = len(labeled_tiles)
share = have/total*100
print(f"From a total of {total} tiles, {have} have a label assigned ({share} %)")

In [10]:
# Show the tile table; the rendered output lists only the first and last rows.
collection

Unnamed: 0,file,tile,image,points,npoints
0,2019_1km_6078_637,1,"[[[206, 215, 216], [205, 214, 216], [205, 214,...",[],0
1,2019_1km_6078_637,2,"[[[121, 124, 114], [123, 121, 111], [126, 123,...",[],0
2,2019_1km_6078_637,3,"[[[130, 123, 97], [112, 107, 92], [106, 98, 76...",[],0
3,2019_1km_6078_637,4,"[[[87, 98, 68], [89, 100, 69], [90, 102, 70], ...",[],0
4,2019_1km_6078_637,5,"[[[111, 108, 84], [113, 109, 81], [113, 109, 8...",[],0
...,...,...,...,...,...
3195,2019_1km_6078_638,1596,"[[[60, 57, 44], [60, 58, 46], [61, 59, 46], [6...",[],0
3196,2019_1km_6078_638,1597,"[[[100, 95, 91], [104, 98, 96], [109, 104, 93]...",[],0
3197,2019_1km_6078_638,1598,"[[[154, 152, 154], [149, 147, 149], [157, 156,...",[],0
3198,2019_1km_6078_638,1599,"[[[212, 180, 174], [203, 169, 161], [182, 151,...",[],0
