This notebook draws a stratified random sample of points from the three rasters that make up the Santa Barbara Coast. 
The number of samples per class is specified by the user. 
The output points are given in the EPSG:4326 CRS and can be used directly as input for the validation platform Collect Earth Online.

In [1]:
import os
import numpy as np
import pandas as pd

import geopandas as gpd
import rioxarray as rioxr
from random import sample

from shapely.geometry import Point 

from rasterio.crs import CRS

In [2]:
# --- run configuration ---

# model run whose predictions are sampled, and the year of the imagery
prefix = 'modelAE5_FP_2020'
year = 2020

# folder holding the filtered and clipped prediction rasters for this run
root = f'/home/jovyan/msai4earth-esa/iceplant_detection/processing_results/{prefix}/{prefix}_filter_clip_preds_{year}'

# class values present in the rasters:
#   0 = other vegetation, 1 = iceplant, 2 = low ndvi, 3 = water
cats = [0, 1, 2, 3]

# how many random samples to draw for each class (same order as cats)
samples_per_class = [200, 200, 110, 85]

# set to False to skip writing the sampled points to disk
save_pts = True

In [3]:
# Open the three merged prediction rasters (two in CRS 26910, one in CRS 26911).
# squeeze() drops any length-1 dimensions from each opened raster.
raster_suffixes = ['crs26910_S', 'crs26910_W', 'crs26911']
raster_paths = [os.path.join(root, f'{prefix}_merged_{suffix}_{year}.tif')
                for suffix in raster_suffixes]

r_26910_S, r_26910_W, r_26911 = (rioxr.open_rasterio(path).squeeze()
                                 for path in raster_paths)

rasters = [r_26910_S, r_26910_W, r_26911]

# Per-raster pixel counts for each class, computed beforehand and stored
# as a CSV in the current working directory.
pixel_count_path = os.path.join(os.getcwd(), f'{prefix}_rasters_{year}_pixel_counts.csv')
pixel_count = pd.read_csv(pixel_count_path)
pixel_count

Unnamed: 0,n_nonice_2020,n_ice_2020,n_ground_2020,n_water_2020,raster
0,36271293,5382187,111150412,62968690,modelAE5_FP_2020_merged_crs26910_S_2020
1,1122203,30004,1891593,2893071,modelAE5_FP_2020_merged_crs26910_W_2020
2,89669636,1123921,62587031,69125241,modelAE5_FP_2020_merged_crs26911_2020


In [4]:
# STRATIFIED RANDOM SAMPLING

# Decide how many of the requested samples of each class come from each raster.
# dist is a nested list with one row per class and one entry per raster:
# dist[c][i] = number of samples of class c to draw from raster i
#
# NOTE: a class value c is used directly as a position into samples_per_class and
# into the columns of pixel_count, so this relies on cats being 0, 1, 2, ... in the
# same order as the count columns of pixel_count.
# NOTE: no random seed is set here, so the allocation changes from run to run.
dist = []

# pixel_count has one row per raster
n_rasters = len(pixel_count)

for c in cats:

    # number of pixels of this class in each raster
    class_counts = pixel_count.iloc[:, c].to_numpy(dtype=float)
    total = class_counts.sum()

    # without any pixels the probabilities below would be NaN
    if total <= 0:
        raise ValueError(f'class {c} has no pixels in any raster, cannot allocate samples')

    # allocate the samples of this class to rasters:
    # each pixel from the class has the same probability of being selected,
    # regardless of the raster it is in, so a raster is chosen with probability
    # proportional to its share of the class pixels
    where_to_sample = np.random.choice(n_rasters,
                                       size=samples_per_class[c],
                                       p=class_counts / total)

    # count how many samples landed on each raster (rasters with none get 0)
    dist.append(np.bincount(where_to_sample, minlength=n_rasters).tolist())

dist

[[56, 2, 142], [165, 2, 33], [65, 5, 40], [36, 1, 48]]

In [5]:
# Sample points from the rasters.
# For every raster, draw dist[c][i] random pixels of each class c, turn them into
# points in the raster CRS and reproject them to EPSG(4326).
# NOTE: no random seed is set, so the sampled pixels and the final shuffle differ
# between runs.
all_points = []

for i, raster in enumerate(rasters):

    points = []
    which_class = []
    which_raster = []
    print(' STARTED COLLECTING FOR RASTER ', i)

    # pull the pixel values and axis coordinates out of xarray once per raster
    # (assumes each raster is 2-D (y, x) after the squeeze done at load time)
    raster_values = raster.values
    x_values = raster.x.values
    y_values = raster.y.values

    for c in cats:
        n_samples = dist[c][i]
        if n_samples == 0:
            continue

        # row (y) and column (x) positions of every pixel of class c
        rows, cols = np.nonzero(raster_values == c)

        # sample positions into rows/cols rather than building a Python tuple
        # for every pixel of the class, which is prohibitively large for big rasters
        picked = np.asarray(sample(range(len(rows)), n_samples))

        # x and y coordinates in raster CRS corresponding to sampled pixels
        x_coord = x_values[cols[picked]]
        y_coord = y_values[rows[picked]]

        # make points
        points.extend(Point(px, py) for px, py in zip(x_coord, y_coord))

        which_class.extend([c] * n_samples)
        which_raster.extend([i] * n_samples)
        print('sampled cat ', c)

    # only build a frame when something was sampled from this raster
    if points:
        raster_points = gpd.GeoDataFrame({'geometry': points,
                                          'class': which_class,
                                          'which_raster': which_raster},
                                         crs=raster.rio.crs)
        all_points.append(raster_points.to_crs(CRS.from_epsg(4326)))  # change to lat/lon crs
    print(' ')

# assemble points in dataframe compatible with Collect Earth Online
points_df = pd.concat(all_points, ignore_index=True)
points_df['LON'] = points_df.geometry.x
points_df['LAT'] = points_df.geometry.y
points_df = points_df.drop(['geometry'], axis=1)

# randomize the order of the points so classes and rasters are interleaved
points_df = points_df.sample(frac=1).reset_index(drop=True)

 STARTED COLLECTING FOR RASTER  0
sampled cat  0
sampled cat  1
sampled cat  2
sampled cat  3
 
 STARTED COLLECTING FOR RASTER  1
sampled cat  0
sampled cat  1
sampled cat  2
sampled cat  3
 
 STARTED COLLECTING FOR RASTER  2
sampled cat  0
sampled cat  1
sampled cat  2
sampled cat  3
 


In [6]:
# preview the sampled points: class, index of the source raster, and lon/lat of each point
points_df

Unnamed: 0,class,which_raster,LON,LAT
0,1,0,-120.465232,34.462486
1,1,0,-119.997217,34.459656
2,3,2,-119.969810,34.438493
3,1,0,-120.469599,34.465779
4,2,0,-120.266262,34.471258
...,...,...,...,...
590,0,2,-119.864898,34.410694
591,1,0,-120.472786,34.472076
592,1,0,-120.463615,34.450510
593,0,2,-119.798882,34.422402


In [7]:
# sanity check: number of sampled points per class (compare against samples_per_class)
np.unique(points_df['class'], return_counts=True)

(array([0, 1, 2, 3]), array([200, 200, 110,  85]))

In [8]:
# number of sampled points that came from each raster (values are positions in the rasters list)
np.unique(points_df['which_raster'], return_counts=True)

(array([0, 1, 2]), array([322,  10, 263]))

In [9]:
# write the sample to a CSV in the working directory when saving is enabled;
# the row index is written out under the column name PLOTID
if save_pts:
    output_name = f'{prefix}_validation_sample_{year}.csv'
    points_df.to_csv(output_name, index_label='PLOTID')