In [1]:
import rasterio as rio
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import cv2
from collections import defaultdict

import os
path = os.path.join
np.random.seed(seed=42)

In [2]:
RASTERS_PATH = "/mnt/d/work/data/agrivision/rasters/"
DATASET_DF = "/mnt/d/work/data/agrivision/train_val_initial_2020-03-25.csv"
OVERSAMPLE_DIR = "/mnt/d/work/data/agrivision/oversampled"
OVERSAMPLE_DF = "/mnt/d/work/data/agrivision/oversampled.csv"
LABELS_RASTER_NAMES = [
    'cloud_shadow',
    'double_plant',
    'planter_skip',
    'standing_water',
    'waterway',
    'weed_cluster'
]
IMAGE_RASTER_NAMES = [
    'rgb',
    'nir'
]
BOUNDARIES_RASTER_NAME = 'boundary'
IMAGE_NAME_TEMPLATE = "{field_id}_{start_x}-{start_y}-{end_x}-{end_y}"

## Select train data

In [3]:
dataset = pd.read_csv(DATASET_DF)
dataset = dataset[dataset['ds_part'] == 'train']
train_fields = dataset['field_id'].tolist()
available_rasters = os.listdir(RASTERS_PATH)
train_rasters = filter(lambda field: field in train_fields, available_rasters)
train_rasters = list(train_rasters)

In [4]:
cols = dataset.columns
cols

Index(['double_plant', 'cloud_shadow', 'waterway', 'weed_cluster',
       'planter_skip', 'standing_water', 'mask', 'boundary', 'combined_mask',
       'id', 'name', 'field_id', 'ds_part', 'start_x', 'start_y', 'end_x',
       'end_y'],
      dtype='object')

## Select data with planter skips

In [5]:
# ps = planter_skip
raster_w_ps = []
for raster_dir in tqdm(train_rasters):
    ps_path = path(RASTERS_PATH, raster_dir, "planter_skip.tif")
    with rio.open(ps_path) as ps:
        ps_img = ps.read()
    has_ps = np.sum(ps_img) > 0
    if has_ps:
        raster_w_ps.append(raster_dir)

HBox(children=(IntProgress(value=0, max=403), HTML(value='')))




In [6]:
print(f"total rasters: {len(os.listdir(RASTERS_PATH))}")
print(f"rasters w/ ps: {len(raster_w_ps)}")

total rasters: 543
rasters w/ ps: 41


## Generate roi in rasters

In [7]:
def make_rois(seed_coords, amount=10, roi_size=512):
    half_side = roi_size//2
    roi_seeds_idx = np.random.choice(seed_coords.shape[0], size=amount, replace=True)
    roi_seeds = seed_coords[roi_seeds_idx]
    rois_topleft = roi_seeds - half_side
    rois_bottomright = roi_seeds + half_side
    rois = np.hstack([rois_topleft, rois_bottomright])
    return rois

In [8]:
oversample_rois = {}
for raster_dir in tqdm(raster_w_ps):
    ps_path = path(RASTERS_PATH, raster_dir, "planter_skip.tif")
    with rio.open(ps_path) as ps:
        ps_img = ps.read()
        ps_img = ps_img[0]
    raster_shape = ps_img.shape
    ps_pixel_coord = np.transpose(ps_img.nonzero())
    rois = make_rois(ps_pixel_coord, amount=25)
    topleft_in_bounds = np.logical_or.reduce(rois[:,[0,1]] >= 0, axis=1)
    bottomright_in_bounds = (
        np.logical_or.reduce(rois[:,[2]] <= raster_shape[0], axis=1)
        &
        np.logical_or.reduce(rois[:,[3]] <= raster_shape[1], axis=1)
    )
    rois_in_bounds = topleft_in_bounds & bottomright_in_bounds
    if rois_in_bounds.sum() > 0:
        rois = rois[rois_in_bounds]
        oversample_rois[raster_dir] = rois

HBox(children=(IntProgress(value=0, max=41), HTML(value='')))




## Cut out and write to disk

In [9]:
def write_rois(img, rois, write_path, field_id, img_format):
    for roi in rois:
        start_y, start_x, end_y, end_x = roi
        image_name = IMAGE_NAME_TEMPLATE.format(
            field_id=field_id,
            start_x=start_x,
            start_y=start_y,
            end_x=end_x,
            end_y=end_y
        )
        image_path = path(write_path, image_name + img_format)
        roi_img = img[:, start_y:end_y, start_x:end_x]
        roi_img = np.transpose(roi_img, axes=(1,2,0))
        cv2.imwrite(image_path, cv2.cvtColor(roi_img, cv2.COLOR_RGB2BGR))

In [10]:
# TODO: generate dataframe
for raster, rois in tqdm(oversample_rois.items()):
    raster_path = path(RASTERS_PATH, raster)
    # Make images
    for image_raster_name in IMAGE_RASTER_NAMES:
        write_path = path(OVERSAMPLE_DIR, 'images', image_raster_name)
        os.makedirs(write_path, exist_ok=True)
        with rio.open(path(raster_path, image_raster_name + '.tif')) as img_ds:
            img = img_ds.read()     
        write_rois(img, rois, write_path, raster, '.jpg')
    
    # Make labels
    for label_raster_name in LABELS_RASTER_NAMES:
        write_path = path(OVERSAMPLE_DIR, 'labels', label_raster_name)
        os.makedirs(write_path, exist_ok=True)
        with rio.open(path(raster_path, label_raster_name + '.tif')) as img_ds:
            img = img_ds.read()
            
        write_rois(img, rois, write_path, raster, '.png')
    
    # Make boudary
    write_path = path(OVERSAMPLE_DIR, 'boundaries')
    os.makedirs(write_path, exist_ok=True)
    with rio.open(path(raster_path, BOUNDARIES_RASTER_NAME + '.tif')) as img_ds:
        img = img_ds.read()

    write_rois(img, rois, write_path, raster, '.png')

HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




## Create dataframe for oversampled data

In [11]:
meta_dict = defaultdict(list)
for img_name in os.listdir(path(OVERSAMPLE_DIR, 'images', 'rgb')):
    meta_dict['name'].append(img_name)
    meta_dict['ds_part'].append('train')

In [15]:
oversample_df = pd.DataFrame(meta_dict)
oversample_df.to_csv(path('/mnt/d/work/data/agrivision/', 'oversample.csv'))

In [17]:
dataset = pd.read_csv(DATASET_DF)

In [23]:
dataset_os = pd.concat([dataset, oversample_df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [25]:
dataset_os.to_csv(path('/mnt/d/work/data/agrivision/', 'oversample.csv'))

In [24]:
dataset_os

Unnamed: 0,boundary,cloud_shadow,combined_mask,double_plant,ds_part,end_x,end_y,field_id,id,mask,name,planter_skip,standing_water,start_x,start_y,waterway,weed_cluster
0,262144.0,0.0,0.0,0.0,train,2826.0,4677.0,WJZJEE14I,WJZJEE14I_2314-4165-2826-4677,0.0,WJZJEE14I_2314-4165-2826-4677.jpg,0.0,0.0,2314.0,4165.0,0.0,174211.0
1,226182.0,0.0,0.0,0.0,train,1525.0,1342.0,XD6GQK3HF,XD6GQK3HF_1013-830-1525-1342,0.0,XD6GQK3HF_1013-830-1525-1342.jpg,0.0,0.0,1013.0,830.0,0.0,35362.0
2,166559.0,0.0,0.0,0.0,train,2050.0,3744.0,91MB8NI61,91MB8NI61_1538-3232-2050-3744,0.0,91MB8NI61_1538-3232-2050-3744.jpg,0.0,0.0,1538.0,3232.0,0.0,2524.0
3,262144.0,0.0,0.0,0.0,train,8904.0,7918.0,VLABVAH3T,VLABVAH3T_8392-7406-8904-7918,0.0,VLABVAH3T_8392-7406-8904-7918.jpg,0.0,0.0,8392.0,7406.0,38693.0,0.0
4,223071.0,205743.0,0.0,0.0,train,2877.0,4136.0,393VYUQ83,393VYUQ83_2365-3624-2877-4136,0.0,393VYUQ83_2365-3624-2877-4136.jpg,0.0,0.0,2365.0,3624.0,0.0,0.0
5,262144.0,0.0,0.0,22708.0,train,9602.0,1344.0,T89JYE12U,T89JYE12U_9090-832-9602-1344,0.0,T89JYE12U_9090-832-9602-1344.jpg,0.0,0.0,9090.0,832.0,0.0,24851.0
6,141389.0,0.0,0.0,0.0,train,9400.0,1073.0,24M49T6CC,24M49T6CC_8888-561-9400-1073,0.0,24M49T6CC_8888-561-9400-1073.jpg,0.0,0.0,8888.0,561.0,0.0,5726.0
7,204888.0,0.0,0.0,0.0,train,1266.0,2831.0,J4IXXECHN,J4IXXECHN_754-2319-1266-2831,0.0,J4IXXECHN_754-2319-1266-2831.jpg,0.0,0.0,754.0,2319.0,0.0,9081.0
8,212736.0,0.0,0.0,0.0,train,2875.0,2677.0,KVQI28WTL,KVQI28WTL_2363-2165-2875-2677,0.0,KVQI28WTL_2363-2165-2875-2677.jpg,0.0,0.0,2363.0,2165.0,0.0,84193.0
9,262144.0,0.0,0.0,0.0,train,2244.0,6310.0,F2JPFEHLL,F2JPFEHLL_1732-5798-2244-6310,0.0,F2JPFEHLL_1732-5798-2244-6310.jpg,0.0,0.0,1732.0,5798.0,0.0,28810.0
