In [3]:
import numpy as np
import xarray as xr
import geopandas as gpd
import pandas as pd
from matplotlib import pyplot as plt
from osgeo import gdal, ogr, gdal_array
import dask
import datacube 
from datacube.helpers import ga_pq_fuser
from datacube.storage import masking
from datacube.utils import geometry
import os
#import custom functions
import sys
sys.path.append('src')
import DEAPlotting, SpatialTools, BandIndices, DEADataHandling
from load_data import load_data
from transform_tuple import transform_tuple
from query_from_shp import query_from_shp
from rsgislib.segmentation import segutils
from rasterstats import zonal_stats
from imageSeg import imageSeg
import fiona
import rasterio.features

In [2]:
# --- Folder locations ---------------------------------------------------
# Both folders end with a slash because later cells build file paths by
# plain string concatenation (e.g. results + AOI + ...).
data = 'data/'
results = "results/"

# Shapefile used both to build the datacube query and to mask the loaded
# data to the area of interest.
shp_fpath = 'data/spatial/murrumbidgee_boundingbox.shp'

# --- Run identification -------------------------------------------------
# AOI and year are only labels: they are joined into output file names.
AOI = 'Murrum_randomForest'
year = 'Winter2013'

# Start / end dates handed to the query.
# NOTE(review): the label above says "Winter" but this window covers the
# whole calendar year -- confirm which one is intended.
time_period = ('2013-01-01', '2013-12-31')

# Landsat sensors to load.
sensors = ['ls5','ls7','ls8']

# --- Thresholds -----------------------------------------------------------
# Not referenced by any cell visible in this notebook; the spelling of
# `wofs_theshold` is kept in case other code depends on the name.
threshold = 0.8
wofs_theshold = 0.15
#-----------------------------------------

In [3]:
# Keep every output of this run in its own sub-folder: <results>/<AOI>_<year>/
run_name = AOI + "_" + year

# Only extend `results` the first time this cell runs. Without the guard a
# second execution would append the run name again and silently redirect
# all outputs to results/<run>/<run>/.
if not results.rstrip("/").endswith(run_name):
    results = results + run_name + "/"

# `directory` is the same path without the trailing slash.
directory = results.rstrip("/")

# makedirs also creates a missing parent folder and, with exist_ok=True,
# does nothing when the folder is already there.
os.makedirs(directory, exist_ok=True)

In [4]:
# Build the query from the shapefile and the requested time window.
start_date, end_date = time_period
query = query_from_shp(shp_fpath, start_date, end_date, dask_chunks = 1000)

# --- Landsat ---------------------------------------------------------------
# Alternative loader, kept for reference:
# dc = datacube.Datacube(app='dc_name')
# landsat = DEADataHandling.load_clearlandsat(dc,query=query, sensors=sensors, product='nbart',
#                        masked_prop=0.75)
export_path = data + AOI + "_" + year + '.nc'
landsat = load_data(dc_name = 'irrigated_areas', sensors=sensors,
                    export_name = export_path, query=query)

# --- WOfS (currently disabled) ------------------------------------------------
# dc = datacube.Datacube(app='wofs')
# del query['time']
# wofs_alltime = dc.load(product = 'wofs_summary', **query)

# --- Mask the loaded data to the polygon ----------------------------------------
# Only the first feature of the shapefile is used.
with fiona.open(shp_fpath) as shapes:
    crs = geometry.CRS(shapes.crs_wkt)
    geom = geometry.Geometry(next(iter(shapes))['geometry'], crs=crs)

# Rasterise the polygon (reprojected to the data's CRS) onto the data grid.
geobox = landsat.geobox
mask = rasterio.features.geometry_mask(
    [geom.to_crs(geobox.crs)],
    out_shape=geobox.shape,
    transform=geobox.affine,
    all_touched=False,
    invert=True,
)

# Apply the polygon mask to the dataset.
landsat = landsat.where(mask)
#wofs_alltime = wofs_alltime.where(mask)
#datacube.storage.storage.write_dataset_to_netcdf(landsat, results + AOI + "_" + year + '.nc')

ls5_loading...
ls5_loaded
ls7_loading...


  if not landsat_ds:


ls7_loaded


  if not landsat_ds:


ls8_loading...
ls8_loaded


  if not landsat_ds:
  return self.array[key]


In [5]:
# Display the dataset summary (dimensions, coordinates, data variables).
landsat

<xarray.Dataset>
Dimensions:  (time: 99, x: 24218, y: 15524)
Coordinates:
  * y        (y) float64 -3.714e+06 -3.714e+06 ... -4.102e+06 -4.102e+06
  * x        (x) float64 1.011e+06 1.011e+06 1.011e+06 ... 1.616e+06 1.616e+06
  * time     (time) datetime64[ns] 2013-05-01T00:16:24 ... 2013-09-30T00:10:24.500000
Data variables:
    blue     (time, y, x) float64 dask.array<shape=(99, 15524, 24218), chunksize=(1, 1000, 1000)>
    green    (time, y, x) float64 dask.array<shape=(99, 15524, 24218), chunksize=(1, 1000, 1000)>
    red      (time, y, x) float64 dask.array<shape=(99, 15524, 24218), chunksize=(1, 1000, 1000)>
    nir      (time, y, x) float64 dask.array<shape=(99, 15524, 24218), chunksize=(1, 1000, 1000)>
    swir1    (time, y, x) float64 dask.array<shape=(99, 15524, 24218), chunksize=(1, 1000, 1000)>
    swir2    (time, y, x) float64 dask.array<shape=(99, 15524, 24218), chunksize=(1, 1000, 1000)>
Attributes:
    crs:      EPSG:3577

In [None]:
#band indices calculation

def ndvi_ufunc(ds):
    """Return NDVI, ``(nir - red) / (nir + red)``, from ``ds.nir`` and ``ds.red``.

    The arithmetic is wrapped in ``xr.apply_ufunc`` with
    ``dask='parallelized'`` and a float output dtype.
    """
    def _normalised_difference(nir_band, red_band):
        difference = nir_band - red_band
        total = nir_band + red_band
        return difference / total

    return xr.apply_ufunc(
        _normalised_difference,
        ds.nir,
        ds.red,
        dask='parallelized',
        output_dtypes=[float],
    )

def brightness_ufunc(ds):
    """Return the brightness index of ``ds``.

    Brightness is the square root of the summed squares of the green, red,
    nir and swir1 bands. The arithmetic is wrapped in ``xr.apply_ufunc``
    with ``dask='parallelized'`` and a float output dtype.
    """
    def _root_sum_of_squares(green_band, red_band, nir_band, swir_band):
        sum_of_squares = (green_band**2 + red_band**2
                          + nir_band**2 + swir_band**2)
        return sum_of_squares**(1/2.0)

    return xr.apply_ufunc(
        _root_sum_of_squares,
        ds.green,
        ds.red,
        ds.nir,
        ds.swir1,
        dask='parallelized',
        output_dtypes=[float],
    )

# Per-timestep indices; the reductions below collapse the time dimension.
NDVI_landsat = ndvi_ufunc(landsat)
brightness_landsat = brightness_ufunc(landsat)

#calculate per pixel summary stats
print('calculating NDVI stats')
NDVI_max = NDVI_landsat.max('time').rename('NDVI_max')
NDVI_mean = NDVI_landsat.mean('time').rename('NDVI_mean')
NDVI_std = NDVI_landsat.std('time').rename('NDVI_std')
NDVI_min = NDVI_landsat.min('time').rename('NDVI_min')
NDVI_range = NDVI_max - NDVI_min
NDVI_range = NDVI_range.rename('NDVI_range')

print('calculating birghtness stats')
brightness_max = brightness_landsat.max('time').rename('brightness_max')
brightness_mean = brightness_landsat.mean('time').rename('brightness_mean')
brightness_std = brightness_landsat.std('time').rename('brightness_std')
brightness_min = brightness_landsat.min('time').rename('brightness_min')

print('resampling timeseries')
# Mean NDVI per 'M' resampling bin.
NDVI_landsat_resample = NDVI_landsat.resample(time='M').mean('time')
y = NDVI_landsat.coords['y']
x = NDVI_landsat.coords['x']

print('calculating argmax stats') #SLOW BECAUSE DASK ARRAYS COMPUTED
# Pull the resampled stack into memory ONCE and reuse it for both argmax and
# argmin; calling .values twice would compute the whole stack twice.
resampled_ndvi = NDVI_landsat_resample.values
# NOTE(review): np.argmax / np.argmin appear to treat NaN as the extreme
# value, so a pixel with any NaN bin would get the index of its first NaN for
# BOTH statistics -- confirm whether NaN-aware handling is wanted here.
timeofmax = xr.DataArray(resampled_ndvi.argmax(axis=0),
                         coords = [y, x], dims = ['y', 'x'], name='time_of_max')
timeofmin = xr.DataArray(resampled_ndvi.argmin(axis=0),
                         coords = [y, x], dims = ['y', 'x'], name='time_of_min')
del resampled_ndvi  # release the in-memory stack; it is not needed below

# NDVI change per resampling step between the time of minimum and maximum.
# Where both fall in the same step the interval is zero; mask those pixels to
# NaN so the division does not produce +/-inf features.
steps_between = timeofmax - timeofmin
rate = (NDVI_max-NDVI_min)/steps_between.where(steps_between != 0)

calculating NDVI stats
calculating birghtness stats
resampling timeseries
calculating argmax stats


  x = np.divide(x1, x2, out)


In [None]:
# Feature layers used for the GeoTIFF export and as classifier inputs.
# Names and layers are declared as pairs so the two lists cannot drift apart.
# The last entry is brightness_min: it used to be a second copy of
# brightness_std, which duplicated one feature and dropped another.
feature_layers = [
    ('NDVI_max', NDVI_max),
    ('NDVI_mean', NDVI_mean),
    ('NDVI_std', NDVI_std),
    ('NDVI_min', NDVI_min),
    ('NDVI_range', NDVI_range),
    ('timeofmax', timeofmax),
    ('timeofmin', timeofmin),
    ('rate', rate),
    ('brightness_max', brightness_max),
    ('brightness_mean', brightness_mean),
    ('brightness_std', brightness_std),
    ('brightness_min', brightness_min),
]
xray_list = [layer for _, layer in feature_layers]
names = [layer_name for layer_name, _ in feature_layers]

### Image segmentation for use in masking AFTER the RF classifier

In [None]:
# Geotransform and projection shared by every exported layer; NDVI_max.tif
# is the layer later used for image segmentation.
transform, projection = transform_tuple(NDVI_max, (NDVI_max.x, NDVI_max.y), epsg=3577)
# SpatialTools.array_to_geotiff(results + AOI + "_" + year + "ndvimax.tif",
#               NDVI_max.values, geo_transform = transform,
#               projection = projection, nodata_val=np.nan)

# Write every layer to disk so the catchment does not have to be reloaded.
tif_prefix = results + AOI + "_" + year + "_"
for layer_name, layer in zip(names, xray_list):
    SpatialTools.array_to_geotiff(
        tif_prefix + layer_name + ".tif",
        layer.values,
        geo_transform = transform,
        projection = projection,
        nodata_val=np.nan,
    )


In [None]:
# All segmentation inputs/outputs share the <results><AOI>_<year> prefix.
seg_prefix = results + AOI + "_" + year

InputNDVIStats = seg_prefix + "_NDVI_max.tif"          # input raster
KEAFile = seg_prefix + '.kea'
SegmentedKEAFile = seg_prefix + '_sheperdSEG.kea'
SegmentedTiffFile = seg_prefix + '_sheperdSEG.tif'
SegmentedPolygons = seg_prefix + '_SEGpolygons.shp'     # read back when filtering the RF result

imageSeg(InputNDVIStats, KEAFile, SegmentedKEAFile,
         SegmentedTiffFile, SegmentedPolygons, minPxls = 100)

### Generate a training dataset

The training dataset used to be generated by the code that is now commented out.

It is now generated in R as a random sample: 10,000 points are extracted

per class across the Murrumbidgee, and that file is rasterized directly here.


In [None]:
# Rasterize the training dataset onto the grid of the exported NDVI_max layer.
ndvi_max_path = results + AOI + "_" + year + "_NDVI_max.tif"
NDVI_max = xr.open_rasterio(ndvi_max_path).squeeze()

# Geotransform and projection of that GeoTIFF.
transform, projection = transform_tuple(NDVI_max, (NDVI_max.x, NDVI_max.y), epsg=3577)

# Size of the grid to rasterize onto.
# NOTE(review): if the squeezed array is (y, x), `width` actually holds the
# row count and `height` the column count; the (height, width) order in the
# call below would then compensate -- confirm against rasterize_vector.
width, height = NDVI_max.shape

# Burn the 'id' field of the training samples into a raster, also written to disk.
training_set = SpatialTools.rasterize_vector(
    results + "murrumbidgee_training_samples.shp",
    height, width, transform, projection,
    field='id',
    raster_path= results + AOI + "_" + year +'training_raster.tif',
)
#xr.DataArray(training_set, coords = [NDVI_max.y, NDVI_max.x], dims = ['y', 'x'], name='training areas').plot(figsize=(10,10))

In [None]:
# peel_landuse = gpd.read_file('data/spatial/Peel_landuse_small.shp')
# peel_landuse = peel_landuse.to_crs(epsg=3577)

# peel_trainset = peel_landuse[(peel_landuse.TertiaryAL == 430) | #irrigated cropping
#                         (peel_landuse.TertiaryAL == 330) |      #cropping
#                         (peel_landuse.TertiaryAL == 220) |      #forestry
#                         (peel_landuse.TertiaryAL == 133) |      #native cover (bushland)
#                         (peel_landuse.TertiaryAL == 541)]       #urban

# peel_trainset = peel_trainset[['TertiaryAL', 'd_Tertiary', 'geometry']]
# peel_trainset.columns = ['id', 'class', 'geometry']
# peel_trainset.to_file(results + AOI + "_" + year + "_peel_trainset.shp")
# peel_trainset.plot(column = 'class', legend=True, figsize=(7,7))

# #rasterize the training dataset
# NDVI_max = xr.open_rasterio(results + AOI + "_" + year + "ndvimax.tif")
# NDVI_max = NDVI_max.squeeze()
# #get the transform and projection of our gtiff
# transform, projection = transform_tuple(NDVI_max, (NDVI_max.x, NDVI_max.y), epsg=3577)
# #find the width and height of the xarray dataset we want to mask
# width,height = NDVI_max.shape
# # rasterize vector
# training_set = SpatialTools.rasterize_vector(results + AOI + "_" + year + "_peel_trainset.shp",
#                height, width, transform, projection, field='id',raster_path= results + AOI + "_" + year +'training_raster.tif')
# #xr.DataArray(training_set, coords = [NDVI_max.y, NDVI_max.x], dims = ['y', 'x'], name='training areas').plot(figsize=(10,10))

In [None]:
# Report how many pixels each class occupies in the training raster.
# Counts are taken from the raster opened here, so this cell no longer
# depends on `training_set` still being alive from another cell.
k = xr.open_rasterio(results + AOI + "_" + year +'training_raster.tif')
k = k.squeeze()
classes, pixel_counts = np.unique(k.values, return_counts=True)
for c, n_pixels in zip(classes, pixel_counts):
    print('Class {c} contains {n} pixels'.format(c=c,n=n_pixels))

### Create a Random Forest Model

In [None]:
# Read in our training data
roi_ds = gdal.Open(results + AOI + "_" + year +'training_raster.tif', gdal.GA_ReadOnly)
roi = roi_ds.GetRasterBand(1).ReadAsArray().astype(np.uint16)

# Stack every feature layer into one (rows, cols, bands) numpy array
# (SLOW BECAUSE DASK ARRAYS ARE COMPUTED).
# Dedicated names are used for the grid size so the x / y variables of other
# cells are not overwritten. `z` is kept: the classification cell reads it.
n_rows, n_cols = NDVI_max.shape
z = len(xray_list)
img = np.zeros((n_rows, n_cols, z))
for band_index, layer in enumerate(xray_list):
    print('adding slice to array...')
    img[:, :, band_index] = layer.values

# Replace every non-finite value (NaN and +/-inf) with 0 for the classifier.
# np.nan_to_num would turn +/-inf into the largest float64 instead, which
# cannot be represented once the classifier casts its input to float32.
img_noNaNs = np.where(np.isfinite(img), img, 0.0)

In [None]:
# Show one feature layer next to the rasterized training data.
# Each entry: (subplot position, image, title, extra imshow arguments).
preview_panels = [
    (121, img_noNaNs[:, :, 1], 'NDVI', {}),
    (122, roi, 'AOI Training Data', {'cmap': plt.cm.Spectral}),
]
for position, image, panel_title, imshow_kwargs in preview_panels:
    plt.subplot(position)
    plt.imshow(image, **imshow_kwargs)
    plt.title(panel_title)

In [None]:
# Pixels with a non-zero value in the training raster are the training samples.
is_training = roi > 0

# How many training samples do we have?
n_samples = is_training.sum()
print('We have {n} samples'.format(n=n_samples))

# Which class labels occur among them?
labels = np.unique(roi[is_training])
print('The training data include {n} classes: {classes}'.format(n=labels.size,
                                                                classes=labels))

# Feature matrix (one row per training pixel) and the matching label vector.
x = img_noNaNs[is_training, :]
y = roi[is_training]

print('Our X matrix is sized: {sz}'.format(sz=x.shape))
print('Our y array is sized: {sz}'.format(sz=y.shape))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize our model with 250 trees.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" is no
# longer accepted by recent scikit-learn releases.
# random_state fixes the seed so that re-running the notebook reproduces the
# same forest (and therefore the same OOB score and prediction map).
rf = RandomForestClassifier(n_estimators=250, oob_score=True, verbose=True,
                            n_jobs=4, max_features="sqrt", random_state=42)

# Fit our model to training data
rf = rf.fit(x, y)

In [1]:
# Persist the fitted classifier next to the other outputs of this run.
# `load` is imported as well because the classification cell can use it to
# restore the model.
from joblib import dump, load
model_path = results + 'murrumbidgee_rfModel.joblib'
dump(rf, model_path)

In [None]:
# Out-of-bag score of the fitted forest, expressed as a percentage.
oob_percent = rf.oob_score_ * 100
print('Our OOB prediction of accuracy is: {oob}%'.format(oob=oob_percent))

In [None]:
# Display the importance of each input layer, paired by position with `names`.
for band_name, importance in zip(names, rf.feature_importances_):
    print('Band {b} importance: {imp}'.format(b=band_name, imp=importance))

In [None]:
# Cross-tabulate true vs. predicted class to see how each class performs.
# NOTE: x / y are the same samples the model was fitted on, so this table
# describes the fit to the training data, not independent accuracy.
predicted = rf.predict(x)
df = pd.DataFrame({'truth': y, 'predict': predicted})

# Rows are the true classes, columns the predictions; margins add totals.
confusion = pd.crosstab(df['truth'], df['predict'], margins=True)
print(confusion)

### Classify our image

In [None]:
# If we need to load back in the trained RF model:
# rf = load(results + 'murrumbidgee_rfModel.joblib')

# Flatten the (rows, cols, bands) stack into a (rows * cols, bands) table:
# one row per pixel, one column per feature layer.
n_rows, n_cols, n_bands = img_noNaNs.shape
new_shape = (n_rows * n_cols, n_bands)

img_as_array = img_noNaNs[:, :, :z].reshape(new_shape)
print('Reshaped from {o} to {n}'.format(o=img_noNaNs.shape,
                                        n=img_as_array.shape))

# Predict a class for every pixel, then fold the flat result back onto the
# image grid.
print('generating prediction')
class_prediction = rf.predict(img_as_array).reshape((n_rows, n_cols))

In [None]:
# Plot the full class map next to a binary map of the irrigated class.
import matplotlib.patches as mpatches

# Distinct classes present in the prediction (only needed by the optional
# legend code below).
values = np.unique(class_prediction)

# Class code highlighted in the right-hand panel; the filtering cell treats
# the same code as "irrigated".
irrigated_class = 430

plt.figure(figsize=(15,15))

# Left panel: every predicted class.
plt.subplot(121)
im = plt.imshow(class_prediction, interpolation='none')
# colors = [im.cmap(im.norm(value)) for value in values]
# patches = [ mpatches.Patch(color=colors[i], label="Level {l}".format(l=values[i]) ) for i in range(len(values)) ]
# plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0. )

# Right panel: 1 where the pixel was predicted as the irrigated class, else 0.
plt.subplot(122)
irr = (class_prediction == irrigated_class).astype(int)
plt.imshow(irr)
plt.title('Irrigation Pixels Only')

In [None]:
# Write the class map to a GeoTIFF on the same grid as NDVI_max; 0 is
# declared as the nodata value.
transform, projection = transform_tuple(NDVI_max, (NDVI_max.x, NDVI_max.y), epsg=3577)
classpredict_path = results + AOI + "_" + year + "classpredict.tif"
SpatialTools.array_to_geotiff(
    classpredict_path,
    class_prediction,
    geo_transform = transform,
    projection = projection,
    nodata_val=0,
)

### Use image segmentation polygons to filter the results of the RF classifier

In [None]:
# Paths shared by several steps of this cell.
run_prefix = results + AOI + "_" + year
classpredict_path = run_prefix + "classpredict.tif"
irrigated_shp_path = run_prefix + "_Irrigated.shp"

class_predict = xr.open_rasterio(classpredict_path).squeeze()

# Segmentation polygons written by the image-segmentation step.
gdf = gpd.read_file(results + AOI + '_' + year + '_SEGpolygons.shp')

# Majority predicted class inside each segment.
segment_stats = zonal_stats(vectors=gdf['geometry'], raster=classpredict_path, stats='majority')
gdf['majority'] = pd.DataFrame(segment_stats)['majority']

# Polygon areas, in the units of the layer's CRS.
gdf['area'] = gdf['geometry'].area

# Keep only segments that are both small enough and mostly irrigated.
smallArea = gdf['area'] <= 5500000
irrigated = gdf['majority'] == 430.0 #filtering for irrigated areas only
gdf = gdf[smallArea & irrigated]

# Export the surviving polygons.
gdf.to_file(irrigated_shp_path)

# Geotransform and projection of the class-prediction GeoTIFF.
transform, projection = transform_tuple(class_predict, (class_predict.x, class_predict.y), epsg=3577)

# Size of the grid to rasterize onto.
# NOTE(review): if the squeezed array is (y, x), `width` holds the row count
# and `height` the column count; the (height, width) order in the call below
# would then compensate -- confirm against rasterize_vector.
width, height = class_predict.shape

# Burn the filtered polygons back onto the prediction grid.
gdf_raster = SpatialTools.rasterize_vector(
    irrigated_shp_path,
    height, width, transform, projection,
    raster_path=run_prefix + "_Irrigated.tif",
)

In [None]:
# Preview the rasterized irrigated-segment layer.
preview_size = (10, 10)
plt.figure(figsize=preview_size)
plt.imshow(gdf_raster, interpolation='none')