This notebook does a stratified random sampling of points from the three rasters that make up the Santa Barbara County Coast. 
The number of samples per class is specified by the user. 
The output points are given in the EPSG:4326 CRS (latitude/longitude) and can be used directly as input for the validation platform Collect Earth Online.

In [None]:
import os
import numpy as np
import pandas as pd

import geopandas as gpd
import rioxarray as rioxr
from random import sample

from shapely.geometry import Point 

from rasterio.crs import CRS

In [None]:
# Sampling configuration
# `year` and `prefix` identify the prediction run; every file name used in this
# notebook (rasters, pixel-count tables, output csv) is derived from them.
year = 2020
prefix = 'salt13_p30'
root = f'/home/jovyan/msai4earth-esa/iceplant_detection/processing_results/{prefix}_clip_preds_{year}'

# pixel classes in raster
# 0 = other vegetation, 1 = iceplant, 2 = low ndvi, 3 = water
cats = [0, 1, 2, 3]

# -----------------------------------------------------
# strata used in the stratified random sampling
# 0 = other vegetation, 1 = iceplant, 2 = non-vegetation
# (stratum 2 merges pixel classes 2 and 3)
stratum = [0, 1, 2]
# number of samples per stratum (same order as `stratum`)
samples_per_strata = [300, 300, 410]

# -----------------------------------------------------
# whether to save the sampled points
save_pts = False

# -----------------------------------------------------
# counts of pixels per stratum (pixel classes 2 and 3 are merged) in each raster
# later cells read it positionally: one row per raster, one column per stratum
pixel_count = pd.read_csv(f'{prefix}_{year}_combined_pixel_counts_per_raster.csv')

# number of pixels per pixel class (classes 2 and 3 kept separate) in each raster
# later cells read it positionally: one row per raster, columns 2 and 3 = classes 2 and 3
pixel_count_unmerged = pd.read_csv(f'{prefix}_{year}_pixel_counts_per_raster.csv')

In [None]:
# Open the three prediction rasters that together cover the coast.
# Each file is named <prefix>_merged_<tag>_<year>.tif and lives under `root`;
# squeeze() drops any length-1 dimensions from the opened array.
raster_tags = ['crs26910_S', 'crs26910_W', 'crs26911']

rasters = []
for tag in raster_tags:
    fp = os.path.join(root, f'{prefix}_merged_{tag}_{year}.tif')
    rasters.append(rioxr.open_rasterio(fp).squeeze())

# keep the individual names available for interactive use
r_26910_S, r_26910_W, r_26911 = rasters

In [None]:
# STRATIFIED RANDOM SAMPLING
# Build `dist`, a list with one row per stratum and one entry per raster:
# dist[n][m] = number of samples of stratum n to draw from raster m
raster_ids = [0, 1, 2]   # 1st, 2nd and 3rd raster
dist = []

for c in stratum:

    # pixels of this stratum found in each of the three rasters
    column = list(pixel_count.iloc[:, c])
    first, second, third = column[0], column[1], column[2]
    total = first + second + third

    # Assign every requested sample of this stratum to a raster.
    # A raster is picked with probability proportional to the number of
    # pixels of the stratum it contains, so each pixel of the stratum is
    # equally likely to be selected no matter which raster it sits in.
    raster_draws = np.random.choice(raster_ids,
                                    size=samples_per_strata[c],
                                    p=[first/total, second/total, third/total])

    # tally how many samples landed on each raster
    dist.append([int(np.count_nonzero(raster_draws == k)) for k in raster_ids])

dist

In [None]:
# Distribute the number of points per raster among the two merged pixel classes
# 2 = low-NDVI (ground)
# 3 = water
merged_pixel_classes = [2,3]

# This cell rewrites `dist` in place (3 strata rows -> 4 pixel-class rows).
# Running it a second time would read the ground row as if it were the merged
# stratum and silently corrupt the allocation, so fail loudly instead.
if len(dist) != len(stratum):
    raise RuntimeError('dist does not have one row per stratum; '
                       're-run the stratified sampling cell before this one')

ground_samples = []
water_samples = []
for r in range(3):

    # samples allocated to the merged (non-vegetation) stratum in this raster
    n_to_split = dist[2][r]

    # nothing to split: also covers a raster without any class 2/3 pixels,
    # where the probabilities below would be 0/0
    if n_to_split == 0:
        ground_samples.append(0)
        water_samples.append(0)
        continue

    # number of pixels in raster from each merged class
    n2 = pixel_count_unmerged.iloc[r,2]
    n3 = pixel_count_unmerged.iloc[r,3]
    s = n2 + n3

    which_to_sample = np.random.choice(merged_pixel_classes,   # the class to sample from
                                       size = n_to_split,
                                       # the probability to sample from each class
                                       # (proportional to the # of pixels from that class in the raster)
                                       p = [n2/s, n3/s])
    ground_samples.append(int(np.count_nonzero(which_to_sample == 2)))
    water_samples.append(int(np.count_nonzero(which_to_sample == 3)))

# replace last row (3rd stratum) with the samples distributed to each class,
# so that dist[c][r] is indexed by pixel class c (0, 1, 2, 3) and raster r
dist.pop()
dist += [ground_samples, water_samples]
dist

In [None]:
# Sample points from the rasters.
# For every raster and every pixel class, draw dist[class][raster] pixels
# uniformly at random (without replacement) among the pixels of that class,
# turn them into points in the raster's CRS and reproject to EPSG:4326.
all_points = []

# cycle through rasters, then through the pixel classes to sample
for i, raster in enumerate(rasters):

    points = []
    which_class = []
    which_raster = []
    print('STARTED COLLECTION FOR RASTER ', i)

    for c in cats:
        n_samples = dist[c][i]
        if n_samples == 0:
            continue

        # row (y) and column (x) indices of every pixel of class c
        class_idx = np.where(raster == c)
        rows, cols = class_idx[0], class_idx[1]

        # Draw positions into the index arrays instead of materialising a
        # Python list of (row, col) tuples for every pixel of the class:
        # same draw for the same random state, but memory stays O(n_samples).
        picked = np.asarray(sample(range(len(rows)), n_samples))

        # x and y coordinates in raster CRS corresponding to sampled pixels
        x_coord = raster.x.values[cols[picked]]
        y_coord = raster.y.values[rows[picked]]

        # make points
        points.extend(Point(m, n) for m, n in zip(x_coord, y_coord))

        which_class.extend([c] * n_samples)
        which_raster.extend([i] * n_samples)
        print('sampled cat ', c)

    # skip rasters that received no samples at all
    if points:
        points_df = gpd.GeoDataFrame({'geometry': points,
                                      'class' : which_class,
                                      'which_raster' : which_raster},
                                     crs = raster.rio.crs)
        all_points.append(points_df.to_crs(CRS.from_epsg(4326)))  # change to lat/lon crs
    print(' ')

# assemble points in dataframe compatible with Collect Earth Online
points_df = pd.concat(all_points, ignore_index=True)
points_df['LON'] = points_df.geometry.x
points_df['LAT'] = points_df.geometry.y
points_df = points_df.drop(['geometry'], axis=1)

# randomize the order of the points
points_df = points_df.sample(frac=1).reset_index(drop=True)

In [None]:
# display the shuffled sample table (class, which_raster, LON, LAT)
points_df

In [None]:
# sanity check: number of sampled points per pixel class
np.unique(points_df['class'], return_counts=True)

In [None]:
# sanity check: number of sampled points per raster
np.unique(points_df['which_raster'], return_counts=True)

In [None]:
# Write the sample to csv only when requested; the dataframe index is
# exported as the PLOTID column.
if save_pts:
    out_name = f'{prefix}_validation_sample_{year}.csv'
    points_df.to_csv(out_name, index_label='PLOTID')