# Building a training dataset for "computer Vision" deep learning

In this note book we are building a training set of small tiles to use to train a 'U' shaped CNN built on thensor flow. the Steps involed are:
* select a site
* load RGB geomedian (what we want to use to do the classification on)
* load a corrosponding slice of classified data to use as a 'truth' mask
* process both into nice shaped tiles
* save to file as .npy for fast loading by data generator

In [13]:
import numpy as np
import os
import sys
import xarray as xr
import tensorflow as tf
import datacube
import matplotlib.pyplot as plt
import json
import pandas as pd
from odc.algo import to_f32, xr_geomedian, int_geomedian

sys.path.insert(1, os.path.abspath("../Tools"))
import imageio
from dea_tools.datahandling import pan_sharpen_brovey, load_ard
from tools.GEOC_utils import tile_array


In [14]:
dc = datacube.Datacube(app="urban_segmentation")

In [15]:
coord_list = "small_training_areas.txt"
with open(coord_list) as tweetfile:
    coordinate_dict = json.loads(tweetfile.read())


In [16]:
coordinate_dict

{'sydney_CBD_2015': {'x': [151.1021, 151.3021],
  'y': [-34.0026, -33.8026],
  'time': '2015'},
 'newcastle_2015': {'x': [151.7012, 151.9012],
  'y': [-32.961400000000005, -32.7614],
  'time': '2015'},
 'almota_park_2015': {'x': [135.41070000000002, 135.6107],
  'y': [-34.754000000000005, -34.554],
  'time': '2015'},
 'north_adeliade_2015': {'x': [138.5008, 138.7008],
  'y': [-34.803200000000004, -34.6032],
  'time': '2015'},
 'lake_mungo_2015': {'x': [142.9695, 143.1695],
  'y': [-33.8292, -33.6292],
  'time': '2015'},
 'hobart_2015': {'x': [147.2083, 147.4083],
  'y': [-42.9625, -42.762499999999996],
  'time': '2015'},
 'coral_bay_2015': {'x': [113.67240000000001, 113.8724],
  'y': [-23.243100000000002, -23.0431],
  'time': '2015'},
 'yara_yara_2015': {'x': [115.71270000000001, 115.9127],
  'y': [-29.7421, -29.542099999999998],
  'time': '2015'},
 'perth_2015': {'x': [115.8147, 116.01469999999999],
  'y': [-32.0743, -31.874299999999998],
  'time': '2015'},
 'darwin_2015': {'x': [130.

In [17]:
def save_data_to_file(data_array, label_array, area_label='default', year='2015'):
    """saves both the data and label file using same name but in respective folders"""
    

    ID_store = []
    for i in range(0,data_array.shape[0]):
    
        data_path = f'training_data/pancro_LC_labeled/{area_label}{year}{i}.npy'
        label_path = f'training_data/pancro_LC_labeled/labels/{area_label}{year}{i}.npy'
        np.save(data_path, data_array[i])
        np.save(label_path, label_array[i])
        
        ID_store +=[f'{area_label}{year}{i}.npy']
        
    print('added all of these IDs to the folder:')
    return(ID_store)
    

In [18]:
def process_landcover(da, lc_class):
    """creates mask of urban area from Landcover level 3. Does tiling to 128x128 """
    
        #first landcover 
    Urban = xr.ones_like(da.level3).where(da.level3 == lc_class, 0)
    Urban = Urban.astype('int16', casting='safe')

    #change order of dementions to what the GEOC utils expect
    Urban_array = Urban.transpose('y','x','time')

    #tile the urban classification layer
    tile_Urban = tile_array(Urban_array, xsize=128, ysize=128, overlap=0.5)
    
    return(tile_Urban)

In [19]:
def process_geomedian(data):
    """ processed Geomedian, reshapes and tiles"""
    # make geomedian a neat array 
    Geomedian_array = data.to_array()

    #change order of dementions to what the GEOC utils expect
    Geomedian_array = Geomedian_array.transpose('y','x','variable')

    tile_geomedian = tile_array(Geomedian_array, xsize=128, ysize=128, overlap=0.5)
    
    return(tile_geomedian)

In [20]:
def make_panchromatic_median(data):
    """rescales panchromatic data to be between -1 and + 1. also generates a median over loaded epoc
    don't use if using a geomedian""""
    
    rescaled = (data - data.mean()) / data.std()
    median = rescaled.median(dim='time', keep_attrs=None)
    return median
    

In [21]:
def do_pand_sharpen(data):
    """perorm pan sharpening"""
    
    # Perform Brovey pan-sharpening and return three numpy.arrays
    red_sharpen, green_sharpen, blue_sharpen = pan_sharpen_brovey(
    band_1=data.nbart_red,
    band_2=data.nbart_green,
    band_3=data.nbart_blue,
    pan_band=data.nbart_panchromatic)
    
    
    # Copy the coordinates and dimensions from the original dataset
    time = data.time
    y = data.y
    x = data.x
    coords = [time, y, x]
    dims = ['time', 'y', 'x']

    # Create new data arrays
    red_data_array = xr.DataArray(red_sharpen, coords=coords, dims=dims)
    green_data_array = xr.DataArray(green_sharpen, coords=coords, dims=dims)
    blue_data_array = xr.DataArray(blue_sharpen, coords=coords, dims=dims)

    sharp_rgb = xr.Dataset({'red': red_data_array, 
                            'green': green_data_array, 
                            'blue': blue_data_array}, 
                           coords={'time': time, 'y': y,'x': x})
    
    return sharp_rgb

In [22]:
# import json

# with open('training_data/training_sites.txt', 'w') as convert_file:
#      convert_file.write(json.dumps(query_dictionary))


### load the two datasets
we are using geomedian and land cover as our stating point

In [23]:
# dictionary = {}

In [25]:
for keys in coordinate_dict:
    
    location = coordinate_dict[keys]
    
    lname = keys

    query = {
        "y": location['y'],
        "x": location['x'],
        "time": location['time']
    }
    
    #load both datasets

    # Load DEA Land Cover data from the datacube
    landcover = dc.load(
            product="ga_ls_landcover_class_cyear_2",
            output_crs="EPSG:3577",
            measurements=["level3"],
            resolution=(-15, 15),
            **query
            )

        # Load DEA Land Cover data from the datacube
    # Set the filters to apply
    filters = [("opening", 5), ("dilation", 5)]


    ARD_lc = load_ard(dc=dc,
            products=['ga_ls8c_ard_3', 'ga_ls7e_ard_3'],
            resampling = 'bilinear',
            output_crs="EPSG:3577",
            measurements=['nbart_blue','nbart_green','nbart_red','nbart_panchromatic'],
            resolution=(-15, 15),
            group_by='solar_day',
            ls7_slc_off=False,
            mask_filters=filters,
            **query
            )
    
#     print(Geomedian)
    sharp_rgb = do_pand_sharpen(ARD_lc)
    
    sharp_median = make_panchromatic_median(sharp_rgb)
    
    tiled_urban = process_landcover(landcover, lc_class=215)
    
    tiled_median = process_geomedian(sharp_median)
    
    #save to file
    
    save_data_to_file(tiled_median, tiled_urban, area_label=lname, year=str(location['time'][0]))


Finding datasets
    ga_ls8c_ard_3
    ga_ls7e_ard_3 (ignoring SLC-off observations)
Applying morphological filters to pixel quality mask: [('opening', 5), ('dilation', 5)]
Applying pixel quality/cloud mask (oa_fmask)
Loading 46 time steps
added all of these IDs to the folder: ['sydney_CBD_201520.npy', 'sydney_CBD_201521.npy', 'sydney_CBD_201522.npy', 'sydney_CBD_201523.npy', 'sydney_CBD_201524.npy', 'sydney_CBD_201525.npy', 'sydney_CBD_201526.npy', 'sydney_CBD_201527.npy', 'sydney_CBD_201528.npy', 'sydney_CBD_201529.npy', 'sydney_CBD_2015210.npy', 'sydney_CBD_2015211.npy', 'sydney_CBD_2015212.npy', 'sydney_CBD_2015213.npy', 'sydney_CBD_2015214.npy', 'sydney_CBD_2015215.npy', 'sydney_CBD_2015216.npy', 'sydney_CBD_2015217.npy', 'sydney_CBD_2015218.npy', 'sydney_CBD_2015219.npy', 'sydney_CBD_2015220.npy', 'sydney_CBD_2015221.npy', 'sydney_CBD_2015222.npy', 'sydney_CBD_2015223.npy', 'sydney_CBD_2015224.npy', 'sydney_CBD_2015225.npy', 'sydney_CBD_2015226.npy', 'sydney_CBD_2015227.npy', 'sy

ValueError: No data available for query: ensure that the products specified have data for the time and location requested