In [1]:
import os
import numpy as np
import pandas as pd

import geopandas as gpd
import rioxarray as rioxr
from random import sample

from shapely.geometry import Point 

from rasterio.crs import CRS

In [2]:
# Load the merged prediction rasters for the selected year.
year = 2018

# Folder holding this year's rasters.
root = os.path.join(
    '/home/jovyan/msai4earth-esa/iceplant_detection/processing_results/',
    'LS_filter_clip_preds_' + str(year),
)

# File-name stems of the three rasters; the year and '.tif' are appended below.
stems = ['LS_merged_crs26910_S_', 'LS_merged_crs26910_W_', 'LS_merged_crs26911_']

rasters = []
for stem in stems:
    fp = os.path.join(root, stem + str(year) + '.tif')
    # squeeze() drops length-1 dimensions from the opened raster
    rasters.append(rioxr.open_rasterio(fp).squeeze())

# Individual handles, in the same order as `rasters`.
r_26910_S, r_26910_W, r_26911 = rasters

In [3]:
# Pixel counts for this year's rasters, read from a CSV in the current working directory.
counts_csv = os.path.join(os.getcwd(), 'rasters_' + str(year) + '_pixel_counts.csv')
pixel_count = pd.read_csv(counts_csv)
pixel_count

Unnamed: 0,n_nonice_2018,n_ice_2018,n_ground_2018,n_water_2018,raster
0,35863561,5897584,113626707,59950901,LS_merged_crs26910_S_2018
1,6730191,899027,24887840,16301218,LS_merged_crs26910_W_2018
2,79013822,3880728,73808926,66038443,LS_merged_crs26911_2018


In [4]:
# STRATIFIED RANDOM SAMPLING
# Decide how many points of each class to draw from each raster. For every
# class, each of its samples is assigned to a raster at random, with a raster's
# probability equal to its share of that class's pixels.
cats = [0, 1, 2, 3]                       # class codes; class c is read from column position c of pixel_count
samples_per_class = [100, 100, 150, 100]  # number of points per class, indexed by class code

# Seeded generator so the allocation is identical on every Restart & Run All.
alloc_rng = np.random.default_rng(42)

dist = []  # dist[c][i] = number of class-c points to draw from raster i

for c in cats:

    # pixels of class c in each raster (one row of pixel_count per raster)
    class_pixels = pixel_count.iloc[:, c].to_numpy(dtype=float)
    n_rasters = len(class_pixels)
    total = class_pixels.sum()
    if total <= 0:
        # without any pixels there are no valid sampling probabilities
        raise ValueError('no pixels of class ' + str(c) + ' in any raster; cannot allocate samples')

    # for each sample of this class, pick the raster it will come from
    where_to_sample = alloc_rng.choice(n_rasters, size=samples_per_class[c], p=class_pixels / total)

    # tally samples per raster; minlength keeps zero entries for rasters never picked
    dist.append(np.bincount(where_to_sample, minlength=n_rasters).tolist())

dist

[[35, 3, 62], [55, 9, 36], [82, 16, 52], [43, 11, 46]]

In [5]:
# Draw the allocated number of pixels of every class from every raster and turn
# them into lat/lon points. all_points collects one GeoDataFrame per raster that
# contributed at least one point.
all_points = []

# Seeded generator so the same pixels are drawn on every Restart & Run All.
pixel_rng = np.random.default_rng(42)

for i, raster in enumerate(rasters):

    points = []
    which_class = []
    which_raster = []
    print(' STARTED COLLECTING FOR RASTER ', i)

    for c in cats:
        n_samples = dist[c][i]
        if n_samples == 0:
            continue

        # index arrays (first axis -> y, second axis -> x) of the pixels equal to c
        hits = np.where(raster == c)
        rows, cols = hits[0], hits[1]

        # Sample positions into the index arrays rather than materialising a
        # Python list with one (y, x) tuple per matching pixel: a class can
        # cover a very large number of pixels. Drawn without replacement, so
        # this raises ValueError if fewer than n_samples pixels match.
        picked = pixel_rng.choice(rows.size, size=n_samples, replace=False)

        # x and y coordinates in raster CRS corresponding to sampled pixels
        x_coord = raster.x.values[cols[picked]]
        y_coord = raster.y.values[rows[picked]]

        # make points
        points.extend(Point(px, py) for px, py in zip(x_coord, y_coord))
        which_class.extend([c] * n_samples)
        which_raster.extend([i] * n_samples)
        print('sampled cat ', c)

    # skip rasters that received no samples for any class
    if points:
        points_df = gpd.GeoDataFrame({'geometry': points,
                                      'class' : which_class,
                                      'which_raster' : which_raster},
                                     crs = raster.rio.crs)
        all_points.append(points_df.to_crs(CRS.from_epsg(4326)))  # change to lat/lon crs
    print(' ')
        

 STARTED COLLECTING FOR RASTER  0
sampled cat  0
sampled cat  1
sampled cat  2
sampled cat  3
 
 STARTED COLLECTING FOR RASTER  1
sampled cat  0
sampled cat  1
sampled cat  2
sampled cat  3
 
 STARTED COLLECTING FOR RASTER  2
sampled cat  0
sampled cat  1
sampled cat  2
sampled cat  3
 


In [6]:
# Stack the per-raster points into one table with a fresh 0..n-1 index.
merged_points = pd.concat(all_points, ignore_index=True)

# Replace the geometry column with plain LON / LAT columns.
points_df = (
    merged_points
    .assign(LON=merged_points.geometry.x, LAT=merged_points.geometry.y)
    .drop(columns=['geometry'])
)
points_df

Unnamed: 0,class,which_raster,LON,LAT
0,0,0,-120.382097,34.459662
1,0,0,-120.046124,34.464234
2,0,0,-120.118280,34.471577
3,0,0,-120.022476,34.458078
4,0,0,-120.443720,34.454352
...,...,...,...,...
445,3,2,-119.720796,34.394071
446,3,2,-119.919905,34.427792
447,3,2,-119.920651,34.427844
448,3,2,-119.593934,34.415511


In [7]:
# randomize points
# Shuffle the rows and renumber the index from 0. The seed is fixed so the row
# order, and therefore the index values, is the same on every run for the same
# input table.
points_df = points_df.sample(frac=1, random_state=42).reset_index(drop=True)
points_df

Unnamed: 0,class,which_raster,LON,LAT
0,0,2,-119.742562,34.407042
1,3,2,-119.505269,34.384257
2,3,2,-119.639609,34.413251
3,3,2,-119.866794,34.409054
4,2,0,-120.488476,34.495967
...,...,...,...,...
445,2,2,-119.749731,34.403776
446,2,0,-120.047737,34.466079
447,0,0,-120.471065,34.467183
448,1,2,-119.644511,34.417208


In [8]:
# Write the sample to the working directory; the row index is saved as the PLOTID column.
out_csv = 'validation_sample_' + str(year) + '.csv'
points_df.to_csv(out_csv, index_label='PLOTID')

In [9]:
# Sanity check: distinct class codes in the sample and how many points each has.
class_codes = points_df['class']
np.unique(class_codes, return_counts=True)

(array([0, 1, 2, 3]), array([100, 100, 150, 100]))

In [10]:
# Sanity check: distinct raster indices in the sample and how many points each contributed.
raster_ids = points_df['which_raster']
np.unique(raster_ids, return_counts=True)

(array([0, 1, 2]), array([215,  39, 196]))