### Import packages, connect to openEO, and define aoi

In [1]:
import openeo
import folium
import json
import shapely.geometry
from openeo.processes import if_, is_valid
from functools import reduce
import os

In [2]:
# connect to openeo
conn = openeo.connect("https://openeo.dataspace.copernicus.eu").authenticate_oidc()

Authenticated using refresh token.


In [3]:
# output directory
outdir = "./SVM_output"

# aoi
aoi = json.load(open('auxiliary_data/senales/senales_wgs84.geojson'))

# define time period
time_period = ['2023-02-01', '2023-02-28']

# check the aoi
region = aoi['features'][0]['geometry']
geom = shapely.geometry.shape(region)
centroid = geom.centroid
center_latlon = [centroid.y, centroid.x]

m = folium.Map(location=center_latlon, zoom_start=10)

folium.GeoJson(aoi).add_to(m)
m

### retrieve data

In [4]:
# get s2
s2 = conn.load_collection(
    'SENTINEL2_L1C',
    spatial_extent=region, 
    temporal_extent=time_period,
    bands=['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B10', 'B11', 'B12']).mask_polygon(mask=aoi)

In [8]:
# green band
green = s2.filter_bands("B03")

Retrieve previously generated auxiliary data using jobID

In [5]:
def cube_from_jobId(jobId):
    job = conn.job(jobId)
    stac_result = job.get_results().get_metadata()
    stac_result_url = [x["href"] for x in stac_result["links"] if x["rel"]=="canonical"][0]
    return conn.load_stac(stac_result_url)


list the jobIDs

In [6]:
job_titles = [
    "shad_idx",
    "diff_B_NIR",
    "ndsi",
    "water_mask",
    "solar_incidence_angle",
    "ndvi",
    "cloud_mask",
    "scaled_distance_index"
]

for job_info in conn.list_jobs():
    title = job_info.get("title", "")
    if title in job_titles:
        print(f"{title}: {job_info['id']}")


shad_idx: j-25082813185543919daf1744e769510b
ndsi: j-2508281024534d9a82eda347ab270074
cloud_mask: j-25082113255449a9a01abe9ac679241f
scaled_distance_index: j-2508211256334a64abd68fe06634c401
diff_B_NIR: j-2507281102154142b0342e69fd6e920f
ndsi: j-250728105822466f8855457e0ef1271d
water_mask: j-25072810575948c3820debe741f281e2
ndvi: j-2507281053414939a87dbd4b7a50a3d7
solar_incidence_angle: j-2507281053234398b67d65abbc1efa1a
shad_idx: j-2507221142444b158d4777ce2855a7f4
shad_idx: j-2507170830034ec3aec851ee7c6847af
solar_incidence_angle: j-25071708041540ca987cbf7aa16fdde5


In [7]:
shad_idx = cube_from_jobId('j-25082813185543919daf1744e769510b')
diff_B_NIR = cube_from_jobId('j-2507281102154142b0342e69fd6e920f')
ndsi = cube_from_jobId('j-250728105822466f8855457e0ef1271d')
water_mask = cube_from_jobId('j-25072810575948c3820debe741f281e2')
solar_incidence_angle = cube_from_jobId('j-2507281053234398b67d65abbc1efa1a')
ndvi = cube_from_jobId('j-2507281053414939a87dbd4b7a50a3d7')
cloud_mask = cube_from_jobId('j-25082113255449a9a01abe9ac679241f')
scaled_distance_index = cube_from_jobId('j-2508211256334a64abd68fe06634c401')

### Combine masks

In [9]:
# make a mask from valid pixels in distance index. in distance mask non valid = 255, valid = no data
scaled_distance_index = scaled_distance_index.mask_polygon(mask=aoi) # this works
distance_mask = scaled_distance_index.mask(scaled_distance_index != 255) 

# combine masks (where 1 = valid, 0 = not valid, 129 = nodata)
scene_valid = (cloud_mask != 1) & (water_mask != 1) & (distance_mask != 255)

# reduce band dimension and add back (spark wants this with the merged datacube for some reason)
scene_valid = scene_valid.reduce_dimension(dimension='bands', reducer='first').add_dimension(name="bands", label="valid", type='bands')

In [10]:
# scene_valid.download(os.path.join(outdir, 'scene_valid.nc'))

In [11]:
# # trying out scene_valid
# green_masked = green.mask(scene_valid != 1)
# green_masked.download(os.path.join(outdir, 'green_masked.nc'))

### Prepare the datacubes for merging

In [12]:
# reduce dimension and add it back with correct label (otherwise spark gives error of bands in metadata 1/1, but in reality 11/11)

ndsi = (ndsi.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="NDSI", type='bands'))

ndvi = (ndvi.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="NDVI", type='bands'))

green = (green.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="GREEN", type='bands'))

diff_B_NIR = (diff_B_NIR.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="DIFF_B_NIR", type='bands'))

shad_idx = (shad_idx.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="SHAD_IDX", type='bands'))

solar_incidence_angle = (solar_incidence_angle.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="SOLAR_ANGLE", type='bands'))

scaled_distance_index = (scaled_distance_index.reduce_dimension(dimension="bands", reducer="first").add_dimension(name="bands", label="DIST_IDX", type='bands'))


In [13]:
print(green.metadata)
print(shad_idx.metadata)
print(diff_B_NIR.metadata)
print(ndsi.metadata)
print(solar_incidence_angle.metadata)
print(ndvi.metadata)
print(scene_valid.metadata)
print(scaled_distance_index.metadata)

CollectionMetadata({'spatial': {'bbox': [[-180, -56, 180, 83]]}, 'temporal': {'interval': [['2015-07-04T00:00:00Z', None]]}} - ['GREEN'] - ['t', 'x', 'y', 'bands'])
CubeMetadata(['SHAD_IDX'] - ['x', 'y', 't', 'bands'])
CubeMetadata(['DIFF_B_NIR'] - ['x', 'y', 't', 'bands'])
CubeMetadata(['NDSI'] - ['x', 'y', 't', 'bands'])
CubeMetadata(['SOLAR_ANGLE'] - ['x', 'y', 't', 'bands'])
CubeMetadata(['NDVI'] - ['x', 'y', 't', 'bands'])
CubeMetadata(['valid'] - ['x', 'y', 't', 'bands'])
CubeMetadata(['DIST_IDX'] - ['x', 'y', 't', 'bands'])


### Merge and mask data

In [14]:
merged = (
    green
    .merge_cubes(ndvi)
    .merge_cubes(ndsi)
    .merge_cubes(diff_B_NIR)
    .merge_cubes(shad_idx)
    .merge_cubes(scaled_distance_index)
    .merge_cubes(solar_incidence_angle)
    .mask(scene_valid != 1)
)

In [15]:
print(merged.metadata)

CollectionMetadata({'spatial': {'bbox': [[-180, -56, 180, 83]]}, 'temporal': {'interval': [['2015-07-04T00:00:00Z', None]]}} - ['GREEN', 'NDVI', 'NDSI', 'DIFF_B_NIR', 'SHAD_IDX', 'DIST_IDX', 'SOLAR_ANGLE'] - ['t', 'x', 'y', 'bands'])


In [16]:
# job=merged.execute_batch(out_format="netCDF",title="merged")
# job.get_results().download_files()

In [50]:
normalize_udf = openeo.UDF.from_file("normalize_udf.py")

In [51]:
# mask the valid solar angle ranges
def extract_range_mask(range_min, range_max):
    angle_mask = (solar_incidence_angle >= range_min) & (solar_incidence_angle < range_max) 
    return angle_mask

# test on one range
lo, hi = 45, 70

# create angle mask where 1 = within the range, 0 = outside of range, 129 = nodata
angle_mask = extract_range_mask(lo, hi).mask_polygon(aoi)

In [52]:
# combine masks where 1 = valid pixels, 0 = not valid
combined_mask = (scene_valid != 255) & (angle_mask != 0)

In [53]:
combined_mask = combined_mask.filter_bands('snow_sure')

In [54]:
# combined_mask.download(os.path.join(outdir,'combined_mask.nc'))

In [55]:
green = s2.filter_bands('B03')

green_mask = green.merge_cubes(combined_mask)

In [61]:
ndvi_mask = ndvi.merge_cubes(combined_mask)

In [62]:
print(ndvi_mask.metadata)

CubeMetadata(['snow_sure'] - ['x', 'y', 'bands', 't'])


In [57]:
# # mask
# masked_ndsi = ndsi.mask(combined_mask != 1)
# masked_ndvi = ndvi.mask(combined_mask != 1)
# masked_green = s2.filter_bands('B03').mask(combined_mask != 1)
# masked_shad = shad_idx.mask(combined_mask != 1)
# masked_diff = diff_B_NIR.mask(combined_mask != 1)

In [58]:
normalized_ndsi = masked_ndsi.apply_polygon(geometries=aoi, process=normalize_udf)
normalized_ndvi = masked_ndvi.apply_polygon(geometries=aoi, process=normalize_udf)
normalized_green = green_mask.apply_polygon(geometries=aoi, process=normalize_udf)
normalized_shad = masked_shad.apply_polygon(geometries=aoi, process=normalize_udf)
normalized_diff = masked_diff.apply_polygon(geometries=aoi, process=normalize_udf)


In [59]:
# submit job with a name
job = normalized_green.create_job(
    title="normalized_green")

job_id = job.job_id
print(f"Job ID: {job_id}")

job.start_and_wait()

# download
job.download_results(target=os.path.join(outdir))

Job ID: j-2508071503054b8b9ba31f482ff4200e
0:00:00 Job 'j-2508071503054b8b9ba31f482ff4200e': send 'start'
0:00:19 Job 'j-2508071503054b8b9ba31f482ff4200e': created (progress 0%)
0:00:24 Job 'j-2508071503054b8b9ba31f482ff4200e': created (progress 0%)
0:00:31 Job 'j-2508071503054b8b9ba31f482ff4200e': created (progress 0%)
0:00:39 Job 'j-2508071503054b8b9ba31f482ff4200e': created (progress 0%)
0:00:50 Job 'j-2508071503054b8b9ba31f482ff4200e': created (progress 0%)
0:01:02 Job 'j-2508071503054b8b9ba31f482ff4200e': created (progress 0%)
0:01:19 Job 'j-2508071503054b8b9ba31f482ff4200e': running (progress N/A)
0:01:38 Job 'j-2508071503054b8b9ba31f482ff4200e': running (progress N/A)
0:02:03 Job 'j-2508071503054b8b9ba31f482ff4200e': running (progress N/A)
0:02:33 Job 'j-2508071503054b8b9ba31f482ff4200e': running (progress N/A)
0:03:11 Job 'j-2508071503054b8b9ba31f482ff4200e': running (progress N/A)
0:03:58 Job 'j-2508071503054b8b9ba31f482ff4200e': running (progress N/A)
0:04:56 Job 'j-250807150

  job.download_results(target=os.path.join(outdir))


{PosixPath('SVM_output/openEO_2023-02-02Z.tif'): {'bands': [{'eo:center_wavelength': 0.5598,
    'eo:common_name': 'green',
    'name': 'B03'}],
  'eo:bands': [{'center_wavelength': 0.5598,
    'common_name': 'green',
    'name': 'B03'}],
  'href': 'https://openeo.dataspace.copernicus.eu/openeo/1.2/jobs/j-2508071503054b8b9ba31f482ff4200e/results/assets/ZDc2YTQ5NmItNmIyMy00ZGYxLWIxNzMtODdmNGYyNGQxYzEx/9484621fe29128a86fdcf5f1c44ca99f/openEO_2023-02-02Z.tif?expires=1755184143',
  'proj:bbox': [631910, 5167410, 656060, 5184560],
  'proj:epsg': 32632,
  'proj:shape': [1715, 2415],
  'raster:bands': [{'name': 'B03', 'statistics': {'valid_percent': 0}}],
  'roles': ['data'],
  'title': 'openEO_2023-02-02Z.tif',
  'type': 'image/tiff; application=geotiff'},
 PosixPath('SVM_output/openEO_2023-02-05Z.tif'): {'bands': [{'eo:center_wavelength': 0.5598,
    'eo:common_name': 'green',
    'name': 'B03'}],
  'eo:bands': [{'center_wavelength': 0.5598,
    'common_name': 'green',
    'name': 'B03'}],


In [94]:
# testing angle range 45,70 , so sunlit and solar incidence angle < 90
snow_score = normalized_ndsi - normalized_ndvi + normalized_green

In [64]:
# percentile udf to get the 95th percentile
percentile_udf = openeo.UDF.from_file("percentile_udf.py")

# snow mask of the top 5% snowy pixels
snow_mask = snow_score.apply_polygon(geometries=aoi, process=percentile_udf)

In [65]:
snow_mask.download(os.path.join(outdir, 'snow_mask.nc'))

OpenEoApiError: [500] Internal: Server error: A part of your process graph failed multiple times. Simply try submitting again, or use batch job logs to find more detailed information in case of persistent failures. Increasing executor memory may help if the root cause is not clear from the logs. (ref: r-2507241603434ec69c40dfee937228d7)

In [None]:
angle_ranges = [(0,20), (20,45), (45,70), (70,90), (90,180)] 
range_masks = [(solar_angle >= lo) & (solar_angle < hi) for lo, hi in angle_ranges]

In [25]:
# mask the valid solar angle ranges
def extract_range_mask(range_min, range_max):
    angle_mask = (solar_incidence_angle >= range_min) & (solar_incidence_angle < range_max) 
    return angle_mask
    
# # angle range
angle_ranges = [(0,20), (20,45), (45,70), (70,90), (90,180)] 
range_masks = [extract_range_mask(lo, hi).mask_polygon(mask=aoi) for lo, hi in angle_ranges]

In [26]:
# range_masks[0].download(os.path.join(outdir,'range_mask_0.nc'))

In [55]:
print(ndsi.metadata)
print(ndvi.metadata)
print(scaled_distance_index.metadata)
print(shad_idx.metadata)
print(diff_B_NIR.metadata)
print(s2.metadata)

CubeMetadata([] - ['x', 'y', 'bands', 't'])
CubeMetadata([] - ['x', 'y', 'bands', 't'])
CubeMetadata(['snow_sure'] - ['x', 'y', 'bands', 't'])
CubeMetadata([] - ['x', 'y', 'bands', 't'])
CubeMetadata([] - ['x', 'y', 'bands', 't'])
CollectionMetadata({'spatial': {'bbox': [[-180, -56, 180, 83]]}, 'temporal': {'interval': [['2015-07-04T00:00:00Z', None]]}} - ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09', 'B10', 'B11', 'B12'] - ['bands', 't', 'x', 'y'])


### Original function from SNOWflakes

https://github.com/bare92/SnowFLAKES/blob/main/training_collection.py#L403

In [None]:
def collect_trainings(curr_acquisition, curr_aux_folder, SVM_folder_name, no_data_mask, bands, PCA=False, total_samples = 500):
    
    scf_folder = os.path.join(curr_acquisition, SVM_folder_name)
    if not os.path.exists(scf_folder):
        os.makedirs(scf_folder)
        
    sensor = get_sensor(os.path.basename(curr_acquisition))
    
    path_cloud_mask = glob.glob(os.path.join(curr_aux_folder, 'cloud_mask.nc'))[0]
    path_water_mask = glob.glob(os.path.join(curr_aux_folder, 'water_mask.nc'))[0]
    solar_incidence_angle_path = glob.glob(os.path.join(curr_aux_folder, 'solar_incidence_angle.nc'))[0]
    NDSI_path = glob.glob(os.path.join(curr_aux_folder, 'ndsi.nc'))[0]
    NDVI_path = glob.glob(os.path.join(curr_aux_folder, 'ndvi.nc'))[0]
    diff_B_NIR_path = glob.glob(os.path.join(curr_aux_folder, 'diff_B_NIR.nc'))[0]
    shad_idx_path = glob.glob(os.path.join(curr_aux_folder, 'shad_idx.nc'))[0]
    distance_index_path = glob.glob(os.path.join(curr_aux_folder, 'scaled_distance_index.nc'))[0]
        
    
    bands_path = glob.glob(os.path.join(curr_acquisition, '*scf.vrt'))
    
    if bands_path == []:
        bands_path = [f for f in glob.glob(curr_acquisition + os.sep + "PRS*.tif") if 'PCA' not in f][0]
    else:
        bands_path = bands_path[0]
        
    valid_mask = np.logical_not(no_data_mask)
    
    # Load masks and other necessary data
    cloud_mask = open_image(path_cloud_mask)[0]
    water_mask = open_image(path_water_mask)[0]
    solar_incidence_angle = open_image(solar_incidence_angle_path)[0]
    curr_scene_valid = np.logical_not(np.logical_or.reduce((cloud_mask == 2, water_mask == 1, no_data_mask)))
    
    ranges = ((0,20), (20, 45), (45, 70), (70, 90), (90, 180))
    range_samples = calculate_training_samples(solar_incidence_angle, ranges, total_samples)
    #ranges = ((70, 90))
    
    #ranges = ((0,20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 180))
    empty = np.zeros(curr_scene_valid.shape, dtype='uint8')
    
    percentage_per_angles_list = []
    for curr_range, sample_count in range_samples.items():
        print(curr_range)
        print(sample_count)
        
        # Initialize as empty arrays
        representative_pixels_mask_snow = np.array([])
        representative_pixels_mask_noSnow = np.array([])
    
        curr_angle_valid = np.logical_and(curr_scene_valid, np.logical_and(solar_incidence_angle >= curr_range[0], solar_incidence_angle < curr_range[1]))
        
        percentage_of_scene_valid =  np.sum(curr_angle_valid) / np.sum(curr_scene_valid)
        
        percentage_per_angles_list.append(percentage_of_scene_valid)
    
        curr_NDSI = read_masked_values(NDSI_path, curr_angle_valid)
        curr_NDVI = read_masked_values(NDVI_path, curr_angle_valid)
        curr_green = read_masked_values(bands_path, curr_angle_valid, bands=[2])
        curr_bands = read_masked_values(bands_path, curr_angle_valid)
        curr_diff_B_NIR = read_masked_values(diff_B_NIR_path, curr_angle_valid)
        curr_shad_idx = read_masked_values(shad_idx_path, curr_angle_valid)
        curr_distance_idx = read_masked_values(distance_index_path, curr_angle_valid)
    
        # SNOW TRAINING
        if curr_range[0] >= 90:
            # Normalize indices and compute shadow metric
            diff_B_NIR_low_perc, diff_B_NIR_high_perc = np.percentile(curr_diff_B_NIR, [2, 95])
            shad_idx_low_perc, shad_idx_high_perc = np.percentile(curr_shad_idx, [2, 95])
            curr_diff_B_NIR_norm = np.clip((curr_diff_B_NIR - diff_B_NIR_low_perc) / (diff_B_NIR_high_perc - diff_B_NIR_low_perc), 0, 1)
            curr_shad_idx_norm = np.clip((curr_shad_idx - shad_idx_low_perc) / (shad_idx_high_perc - shad_idx_low_perc), 0, 1)
            curr_score_snow_shadow = curr_diff_B_NIR_norm - curr_shad_idx_norm
            threshold_shadow = np.percentile(curr_score_snow_shadow, 95)
            curr_valid_snow_mask_shadow = np.logical_and.reduce((curr_score_snow_shadow >= threshold_shadow, curr_NDSI > 0.7, curr_distance_idx != 255)).flatten()
            if np.sum(curr_valid_snow_mask_shadow) > 10:
                representative_pixels_mask_snow = get_representative_pixels(curr_bands, curr_valid_snow_mask_shadow, sample_count = int(sample_count/2), k=5, n_closest='auto')
        else:
            # Normalize indices and compute sun metric
            NDSI_low_perc, NDSI_high_perc = np.percentile(curr_NDSI[np.logical_not(np.isnan(curr_NDSI))], [1, 99])
            NDVI_low_perc, NDVI_high_perc = np.percentile(curr_NDVI[np.logical_not(np.isnan(curr_NDVI))], [1, 99])
            green_low_perc, green_high_perc = np.percentile(curr_green, [1, 99])
            curr_NDSI_norm = np.clip((curr_NDSI - NDSI_low_perc) / (NDSI_high_perc - NDVI_low_perc), 0, 1)
            curr_NDVI_norm = np.clip((curr_NDVI - NDVI_low_perc) / (NDVI_high_perc - NDVI_low_perc), 0, 1)
            curr_green_norm = np.clip((curr_green - green_low_perc) / (green_high_perc - green_low_perc), 0, 1)
            curr_score_snow_sun = curr_NDSI_norm - curr_NDVI_norm + curr_green_norm
            threshold = np.percentile(curr_score_snow_sun, 95)
            curr_valid_snow_mask = np.logical_and.reduce((curr_score_snow_sun >= threshold, curr_NDSI > 0.7, curr_distance_idx != 255)).flatten()
            
            if np.sum(curr_valid_snow_mask) > 10:
                representative_pixels_mask_snow = get_representative_pixels(curr_bands, curr_valid_snow_mask, sample_count = int(sample_count/2), k=5, n_closest='auto')
    
        ## NO snow TRAINING
        if curr_range[0] >= 90:
            threshold_shadow_no_snow = np.percentile(curr_score_snow_shadow, 5)
            curr_valid_no_snow_mask_shadow = (curr_score_snow_shadow <= threshold_shadow_no_snow).flatten()
            
            if np.sum(curr_valid_no_snow_mask_shadow) > 10:
                representative_pixels_mask_noSnow = get_representative_pixels(curr_bands, curr_valid_no_snow_mask_shadow, sample_count = int(sample_count/2), k=5, n_closest='auto') * 2
        else:
            curr_valid_no_snow_mask = (curr_NDSI < 0).flatten()
            
            if np.sum(curr_valid_no_snow_mask) > 10:
                representative_pixels_mask_noSnow = get_representative_pixels(curr_bands, curr_valid_no_snow_mask, sample_count = int(sample_count/2), k=10, n_closest='auto') * 2
    
        # Check if masks have been assigned; if not, set as zeros
        if representative_pixels_mask_snow.size == 0:
            representative_pixels_mask_snow = np.zeros(curr_angle_valid.sum(), dtype='uint8')
        if representative_pixels_mask_noSnow.size == 0:
            representative_pixels_mask_noSnow = np.zeros(curr_angle_valid.sum(), dtype='uint8')
            
        representative_pixels_mask = representative_pixels_mask_noSnow + representative_pixels_mask_snow
        empty[curr_angle_valid] = representative_pixels_mask
        
        print(str(np.sum(representative_pixels_mask_snow.flatten())) + ' SNOW PIXELS')
        print(str(np.sum(representative_pixels_mask_noSnow.flatten() / 2)) + ' NO SNOW PIXELS')

    
    # Convert points where result == 1 or 2 to a shapefile
    points = []
    values = []
    with rasterio.open(NDSI_path) as src:
        for row, col in zip(*np.where((empty == 1) | (empty == 2))):
            x, y = src.xy(row, col)
            points.append(Point(x, y))
            values.append(empty[row, col])

    gdf = gpd.GeoDataFrame({"value": values}, geometry=points, crs=src.crs)
    svm_folder_path = os.path.join(curr_acquisition, SVM_folder_name)
    
    plot_valid_pixels_percentage(ranges, percentage_per_angles_list, svm_folder_path)
    
    shapefile_path = os.path.join(svm_folder_path, 'representative_pixels_for_training_samples.shp')
    gdf.to_file(shapefile_path, driver="ESRI Shapefile")
    
    training_mask_path = os.path.join(svm_folder_path, 'representative_pixels_for_training_samples.tif')
    
    # Update the profile and save the representative mask
    with rasterio.open(NDSI_path) as src:
        profile = src.profile
    profile.update(dtype='uint8', count=1, compress='lzw', nodata=0)
    
    with rasterio.open(training_mask_path, 'w', **profile) as dst:
        dst.write(empty, 1)

    return shapefile_path , training_mask_path   