In [1]:
import os
import random
from random import sample

import numpy as np
import rioxarray as rioxr

from shapely.geometry import Point
import geopandas as gpd

from rasterio.crs import CRS

# following
# https://www.neonscience.org/resources/learning-hub/tutorials/merge-lidar-geotiff-py
from osgeo import gdal

In [2]:
# Load the classified rasters for the selected year
year = 2020

# directory where the NAIP classifications for `year` are stored
root = f'/home/jovyan/msai4earth-esa/iceplant_detection/temp/LS_filter_clip_preds_{year}'

# file paths of the merged prediction rasters
s_fp = os.path.join(root, f'LS_merged_crs26910_S_{year}.tif')
w_fp = os.path.join(root, f'LS_merged_crs26910_W_{year}.tif')
r_fp = os.path.join(root, f'LS_merged_crs26911_{year}_clip.tif')

# open the S and W rasters; squeeze() drops any length-1 dimensions
s_raster, w_raster = (rioxr.open_rasterio(fp).squeeze() for fp in (s_fp, w_fp))
#r_raster = rioxr.open_rasterio(r_fp).squeeze()

In [4]:
# Display the S raster opened above (bare last expression -> notebook repr)
s_raster

In [11]:
# count number of pixels in each category
# NOTE(review): `raster` is not assigned in the cells visible here (only
# `s_raster` and `w_raster` are) -- confirm it is defined before this cell runs.

# 0 = noniceplant
# 1 = iceplant
# 2 = ground
# 3 = water

#cats = np.unique(raster)
cats = [0,1,2,3]

# count_nonzero tallies the matching pixels directly; np.where would first
# build full index arrays for every match just so their length can be taken
n_pix = [int(np.count_nonzero(np.asarray(raster == cat))) for cat in cats]
n_pix

[4442352, 194962, 1924182, 389701]

In [12]:
# Recreation of sampling design by SEPAL:
# sample_size = (sum_i(W_i * S_i) / std_error)**2, where W_i is the share of
# pixels in class i and S_i = sqrt(U_i * (1 - U_i))

# ---------------------------------------------
# --------------- PARAMETERS ------------------
# target standard error for all the points
std_error = 0.015

# estimated user's accuracies TP/(TP+FP), one per class
U = [0.7, 0.6, 0.9, 0.95]
# ---------------------------------------------

# share of the total pixel count that falls in each class
total_pix = sum(n_pix)
pix_prop = [count / total_pix for count in n_pix]

# standard deviation associated with each user's accuracy
stdv = [np.sqrt(acc * (1 - acc)) for acc in U]

# per-class product of pixel share and standard deviation
X = [share * dev for share, dev in zip(pix_prop, stdv)]

sample_size = (sum(X) / std_error) ** 2
sample_size

# distributing sample size among classes
#[...]

717.7567023864519

In [41]:
# STRATIFIED RANDOM SAMPLING
# Draw n_samples[k] random pixels from each class cats[k] of `raster` and
# collect their class together with their lon/lat (EPSG:4326) coordinates.
# NOTE(review): `raster` is not assigned in the cells visible here (only
# `s_raster` and `w_raster` are) -- confirm it is defined before this cell runs.
SEED = 42  # fixed seed so re-running the cell reproduces the same sample
cats = [0, 1, 2, 3]
n_samples = [3, 3, 3, 3]
n_cats = len(cats)
points = []
classes = []

assert len(cats) == len(n_samples), "need one sample count per class"

# private generator: reproducible without touching the global random state
rng = random.Random(SEED)

for cat, n_cat_samples in zip(cats, n_samples):
    # row (y) and column (x) indices of every pixel classified as `cat`
    hits = np.where(raster == cat)
    rows, cols = hits[0], hits[1]

    # sample positions inside the index arrays instead of building a Python
    # list holding one (row, col) tuple per matching pixel;
    # raises ValueError if the class has fewer pixels than requested
    picked = np.asarray(rng.sample(range(len(rows)), n_cat_samples), dtype=np.intp)

    # x and y coordinates in raster CRS corresponding to sampled pixels,
    # as plain numpy values so Point receives ordinary floats
    x_coord = raster.x.values[cols[picked]]
    y_coord = raster.y.values[rows[picked]]

    # make points and record the class of each one
    points.extend(Point(x, y) for x, y in zip(x_coord, y_coord))
    classes.extend([cat] * n_cat_samples)

points_df = gpd.GeoDataFrame({'geometry': points,
                              'class' : classes},
                             crs=raster.rio.crs)

points_df = points_df.to_crs(CRS.from_epsg(4326))  # change to lat/lon crs

points_df['LON'] = points_df.geometry.x
points_df['LAT'] = points_df.geometry.y

# keep only the class and lon/lat columns
points_df = points_df.drop(['geometry'], axis=1)

In [42]:
# write the sampled points to CSV; the index column is labelled PLOTID
sample_csv_path = 'tiny_sample.csv'
points_df.to_csv(sample_csv_path, index_label='PLOTID')