In [None]:
#@title # Run to Setup
import datetime as dt
import ftplib
import os
import shutil
#import random

import cv2
import geopy.distance as geoDist
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from IPython.display import clear_output

try:
  import sentinelsat
except:
  !pip install sentinelsat
  import sentinelsat
  
try:
  import netCDF4 as nc
except:
  !pip install netCDF4
  import netCDF4 as nc

# Mount drive
clear_output()
print("Mount Google Drive")
from google.colab import drive
drive.mount('/content/gdrive')

# Global Variables
# Side length (in pixels) of the square resampled images and label patches
image_dimension = 128

# OData Authentication
# Enter your credentials here (clear them again before sharing this notebook)
odata_username = ""
odata_password = ""
search_api = sentinelsat.SentinelAPI(odata_username, odata_password)

# CMEMS Authentication
# Enter your credentials here (clear them again before sharing this notebook)
cmems_username = ""
cmems_password = ""

# Dataset folder names
# Every dataset lives in its own sub-folder of this Google Drive location
dataset_location_path   = "/content/gdrive/My Drive/AKM05/DataSets"
#/content/drive/My Drive/location_of_the_file

#SENTINEL-1 Quicklook File Save Name
# The *_name templates are relative to the folder named just above them
sentinel_1_folder         = "SENTINEL-1 SAR"
sentinel_1_quicklook_name = "Quick Look Images/{0}/{1}/{2}/{0}_{1}_{2}_{3}.png" # 0:YEAR, 1:MONTH, 2:DAY, 3:UUID
sentinel_1_resampled_name = "Resampled Images/{0}/{1}/{2}/{0}_{1}_{2}_{3}.png" # 0:YEAR, 1:MONTH, 2:DAY, 3:UUID

# CMEMS File Save Name
cmems_label_folder = "CMEMS Concentration"
cmems_name         = "NC Files/{0}/{1}/{0}_{1}_{2}.nc" # 0:YEAR, 1:MONTH, 2:DAY
concentration_name = "Concentration Labels/{0}/{1}/{2}/{0}_{1}_{2}_{3}.png" # 0:YEAR, 1:MONTH, 2:DAY, 3:UUID
uncertainty_name   = "Uncertainty Labels/{0}/{1}/{2}/{0}_{1}_{2}_{3}.png" # 0:YEAR, 1:MONTH, 2:DAY, 3:UUID

# Validation
validations_folder    = "Image Processing Validation"
interpolation_name    = "Interpolation Validation/{0}/{1}/{2}/{0}_{1}_{2}_{3}.png" # 0:YEAR, 1:MONTH, 2:DAY, 3:UUID
label_comparison_name = "Patch Label Validation/{0}/{1}/{2}/{0}_{1}_{2}_{3}.png" # 0:YEAR, 1:MONTH, 2:DAY, 3:UUID

# Batch size for training and testing
batch_size = 16
seed = 1

# Show the available datasets
# (dataset_location_path is defined once, above; it is not redefined here so
# that the two values can never drift apart)
clear_output()

print("The following datasets are available:")
for name in sorted(os.listdir(dataset_location_path)):
  # Datasets are folders, so loose files in the Drive location are not listed
  if os.path.isdir(os.path.join(dataset_location_path, name)):
    print("\t" + name)
print("Please make sure that the new dataset name is unique.")

The following datasets are available:
	Southern_Freezing_3_20190901_20191130
	Southern_Melting_3_20200101_20200331
	Northern_Freezing_6_20200101_20200630
	Southern_Freezing_6_20200401_20200930
	Data Information.gdoc
	Northern_Melting_3_20200801_20201029
	Southern_Melting_6_20200101_20200630
Please make sure that the new dataset name is unique.


In [None]:
#@title # Step 1: Dataset Parameters
#@markdown ---
#@markdown ###Choose a name for this dataset
dataset_name = "Southern_Melting_6" #@param {type:"string"}
#@markdown ---
#@markdown ###Select the start date (included)
start_date = "2020-01-01" #@param {type:"date"}
#@markdown ###Select the end date (excluded)
end_date   = "2020-06-30" #@param {type:"date"}
#@markdown ---
#@markdown ###Choose region of interest
ROI = "Antarctic Circle" #@param ["Arctic Circle", "Antarctic Circle", "South Sandwich Islands", "Other North", "Other South"]
#@markdown ---
#------------------------------------------------------------------------------#

# Dataset Information
# The form supplies dates as YYYY-MM-DD; the rest of the notebook slices them
# as YYYYMMDD, so the dashes are stripped here. The full dataset folder name
# is "<name>_<start>_<end>".
start_date, end_date = (chosen.replace("-", "") for chosen in (start_date, end_date))
dataset_name = "_".join([dataset_name, start_date, end_date])

# Work from the datasets location and create the dataset folder on first use
os.chdir(dataset_location_path)
if dataset_name in os.listdir():
  print("Directory already exists.")
else:
  # Make the dataset folder
  os.mkdir(dataset_name)

  # Make the folders for label comparisons
  # os.mkdir(os.path.join(dataset_name, validations_folder))
  # os.mkdir(os.path.join(dataset_name, validations_folder, "Interpolation Validation"))
  # os.mkdir(os.path.join(dataset_name, validations_folder, "Patch Label Validation"))

  # Done making new folders.
  print("Made new directory.")

Directory already exists.


# Step 2: SENTINEL-1 Data Acquisition

**Search**

Query the Copernicus database using the Sentinelsat API (which uses OpenSearch)

This produces a dictionary of all search results.

In [None]:
#@title Find SENTINEL-1 Data
#13

#Create directory for SENTINEL data
os.chdir(os.path.join(dataset_location_path, dataset_name))
if sentinel_1_folder not in os.listdir(): os.mkdir(sentinel_1_folder)

# Define ROI
# Useful site for generating and checking these polygons: https://arthur-e.github.io/Wicket/sandbox-gmaps3.html
roi_polygons = {"Arctic Circle"           : "POLYGON((-180 66,-180 85,0 85,180 85,180 66,0 66,-180 66))",
                "Antarctic Circle"        : "POLYGON((-180 -66,-180 -85,0 -85,180 -85,180 -66,0 -66,-180 -66))",
                "South Sandwich Islands"  : "POLYGON((-29.30712890625 -55.480494204910514,-25.966796875 -55.975418279377394,-25.36279296875 -59.589097877324384,-28 -60,-29.30712890625 -55.480494204910514))",
                "Other North"             : "<Enter a Northern hemisphere polygon here if you want to define some other region>",
                "Other South"             : "<Enter a Southern hemisphere polygon here if you want to define some other region>"}

# The "Other ..." entries are placeholders until the user replaces them with a
# real polygon. Querying with the placeholder text can never succeed, and the
# retry loop below would then spin forever, so stop early with a clear message.
if roi_polygons[ROI].startswith("<"):
  raise ValueError("No polygon has been defined for the ROI '{}'. Edit roi_polygons first.".format(ROI))

# Define additional keyword search items
kwargs = {'platformname':'Sentinel-1',
          'producttype':'GRD',
          'sensoroperationalmode':'EW',
          'polarisationmode':'HH+HV'
         }

# Query for matching products
while (True):
  try:
    # This often times out, so try is as many times as is necessary, unless the user cancels
    products = search_api.query(area=roi_polygons[ROI],
                                date=(start_date, end_date),
                                # date=("20190101", "20190301"),
                                limit=None,
                                offset=0,
                                **kwargs)
    break
  except KeyboardInterrupt:
    # Cancelling must stop the whole cell: carrying on would either fail on an
    # undefined 'products' or silently reuse the results of an earlier search
    raise
  except Exception as error:
    # Show the real reason, so that permanent problems (e.g. wrong credentials)
    # can be told apart from time-outs and the user knows when to cancel
    print("Query failed ({}: {}) - trying again.".format(type(error).__name__, error))

print("\nThere are {} results matching the search criteria.".format(len(products)))

# Write all search results uuids to txt file
os.chdir(sentinel_1_folder)
with open("Search Results.txt", 'w') as f:
  for uuid in products.keys():
    f.write(uuid + "\n")

**Download**

Save all quicklook files to Google Drive (retry failed downloads)

In [None]:
#@title Download SENTINEL-1 Data
# Root access path
# service_root_URI = "https://scihub.copernicus.eu/dhus/odata/v1/"
service_root_URI = "https://scihub.copernicus.eu/apihub/odata/v1/"

# Move into the SENTINEL-1 folder
os.chdir(os.path.join(dataset_location_path, dataset_name, sentinel_1_folder))

# Remove the failed downloads file so that those items can be retried
if "Failed_Downloads.txt" in os.listdir(): os.remove("Failed_Downloads.txt")

# Lists to track downloads
successful_downloads = []
needs_downloading = {}

# Do not want to re-download files already on the drive, so remove them from the list
if "Successful_Downloads.txt" in os.listdir():
  with open("Successful_Downloads.txt", 'r') as f:
    successful_downloads = [item.split("_")[-1] for item in f.read().splitlines()] # Extract the UUID from the file name

# All failed downloads will be retried automatically if they are contained in 'products'
already_downloaded = set(successful_downloads) # set gives constant-time membership checks
for uuid in products:
  if uuid not in already_downloaded:
    needs_downloading[uuid] = products[uuid]

# Download all items which haven't already been downloaded
for product_number, (uuid, value) in enumerate(needs_downloading.items(), start=1):
  # clear_output()
  print("Downloading product {} of {}: {}\n".format(product_number, len(needs_downloading), uuid))
  # quicklook_url = os.path.join(service_root_URI, "Products('{}')/Products('Quicklook')/$value".format(uuid))
  quicklook_url = value['link_icon']
  print(quicklook_url)

  # The acquisition date is needed for the log entry on EVERY outcome, so it
  # is read before the request rather than only after a good response
  date = value['endposition']
  log_entry = "{}_{}_{}_{}\n".format(date.year, date.month, date.day, uuid)

  try:
    response = requests.get(quicklook_url, auth=(odata_username, odata_password))
  except requests.RequestException as error:
    # A network problem for one product should not abort the whole run
    print("Request failed: {}".format(error))
    with open("Failed_Downloads.txt", 'a+') as f:
      f.write(log_entry)
    continue

  if (response.status_code == 200):
    quicklook_path = sentinel_1_quicklook_name.format(date.year, date.month, date.day, uuid)
    resampled_path = sentinel_1_resampled_name.format(date.year, date.month, date.day, uuid)
    os.makedirs(os.path.dirname(quicklook_path), exist_ok=True)
    os.makedirs(os.path.dirname(resampled_path), exist_ok=True)
    try:
      # Save the quicklook image (closed before it is read back by OpenCV)
      with open(quicklook_path, 'wb') as image_file:
        image_file.write(response.content)
      original_image = cv2.imread(quicklook_path)

      # Save a resampled version of the image
      # cv2.imread gives None for an unreadable file, which makes cv2.resize
      # raise; cv2.imwrite reports failure through its return value instead
      if not cv2.imwrite(resampled_path, cv2.resize(original_image, (image_dimension, image_dimension))):
        raise IOError("Could not write " + resampled_path)

      # Add this uuid to the list of successful downloads
      with open("Successful_Downloads.txt", 'a+') as f:
        f.write(log_entry)
    except KeyboardInterrupt:
      raise
    except Exception as error:
      # Failed when trying to save the image files
      print("Saving failed: {}".format(error))
      with open("Failed_Downloads.txt", 'a+') as f:
        f.write(log_entry)
  else:
    # Bad response from server
    print("Bad response from server")
    with open("Failed_Downloads.txt", 'a+') as f:
      f.write(log_entry)

# Summarise
print("Done saving search data.")
if "Failed_Downloads.txt" in os.listdir():
  with open("Failed_Downloads.txt", 'r') as f:
    failed_downloads = f.read().splitlines()
else:
  failed_downloads = []
print("Tried to save {} images, {} failed.".format(len(needs_downloading), len(failed_downloads)))

## Delete all SENTINEL-1 data - Be careful!

In [None]:
# Deletes the search/download logs and both image trees of the current dataset,
# then removes the SENTINEL-1 folder itself if nothing else is left in it.
os.chdir(os.path.join(dataset_location_path, dataset_name))
if sentinel_1_folder in os.listdir():
  os.chdir(sentinel_1_folder)

  # Log files written by the search and download cells
  for log_name in ("Search Results.txt", "Successful_Downloads.txt", "Failed_Downloads.txt"):
    if os.path.isfile(log_name): os.remove(log_name)

  # Image trees (year/month/day/file). rmtree removes whatever is inside, so an
  # unexpected extra file or folder level can no longer stop the clean-up half way.
  for image_folder in ("Quick Look Images", "Resampled Images"):
    if os.path.isdir(image_folder): shutil.rmtree(image_folder)

  # Only remove the SENTINEL-1 folder when it holds nothing else
  if len(os.listdir()) == 0:
    os.chdir("..")
    os.rmdir(sentinel_1_folder)

  print("All files should have been deleted.")
else:
  print("No SENTINEL-1 data to delete.")

# Step 3: CMEMS Data Acquisition

**Download**

Connect to the FTP server, and save the required .nc files to Google Drive.

In [None]:
#@title Download CMEMS Data
# Create directory for CMEMS data
os.chdir(os.path.join(dataset_location_path, dataset_name))
if cmems_label_folder not in os.listdir(): os.mkdir(cmems_label_folder)
os.chdir(cmems_label_folder)

# These ROIs will be treated as northern locations, all others will be assumed to be in the South
north_keys = ["Arctic Circle", "Other North"]

# Product information (Adjust directory names based on which hemisphere has been selected in the ROI)
server_name    = "nrt.cmems-du.eu"
product_family = "SEAICE_GLO_SEAICE_L4_NRT_OBSERVATIONS_011_001"
product_name   = "METNO-GLO-SEAICE_CONC-{}-L4-NRT-OBS".format("NORTH" if ROI in north_keys else "SOUTH")
# Formatted in two steps: the hemisphere letter is filled in now, while the
# year/month/day placeholders are left in place for the download loop
file_on_server = "{0}/{1}/ice_conc_{3}h_polstere-100_multi_{0}{1}{2}1200.nc".format("{0:04d}", "{1:02d}", "{2:02d}", "n" if ROI in north_keys else "s")

# Start date (inclusive)
start_datetime = dt.datetime(int(start_date[:4]), int(start_date[4:6]), int(start_date[6:]))

# End date (exclusive: the loop below stops before reaching it)
end_datetime   = dt.datetime(int(end_date[:4]), int(end_date[4:6]), int(end_date[6:]))

# Lists to track downloads
successful_downloads = []
failed_downloads = []

# Don't want to repeat already downloaded products
if "Successful_Downloads.txt" in os.listdir():
  with open("Successful_Downloads.txt", 'r') as file:
    successful_downloads = file.read().splitlines()
already_downloaded = set(successful_downloads)

download_count = 0
try:
  # The context manager closes the FTP connection even if something fails
  with ftplib.FTP() as server:
    # Connect to the CMEMS FTP server
    server.connect(server_name)
    server.login(cmems_username, cmems_password)

    # Locate the desired product
    server.cwd("Core")
    server.cwd(product_family)
    server.cwd(product_name)

    # Iterate through the desired files
    date = start_datetime
    while date < end_datetime:
      date_key = "{}_{}_{}".format(date.year, date.month, date.day)
      if date_key not in already_downloaded:
        # File has not already been downloaded, so download it now
        # clear_output()
        local_path = cmems_name.format(date.year, date.month, date.day)
        print("Downloading file number {0}: {1:04d}-{2:02d}-{3:02d}".format(download_count + 1, date.year, date.month, date.day))
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        try:
          with open(local_path, 'wb') as nc_output:
            server.retrbinary("RETR " + file_on_server.format(date.year, date.month, date.day), nc_output.write)
          print ("File saved: " + local_path)
          successful_downloads.append(date_key)
          download_count += 1
        except ftplib.all_errors as error:
          print("Download failed: " + local_path + " ({})".format(error))
          failed_downloads.append(date_key)
          # Do not leave an empty/partial file behind that looks like real data
          if os.path.exists(local_path): os.remove(local_path)
      date += dt.timedelta(days=1)
finally:
  # Save the success log even when the run is interrupted, so that finished
  # files are not downloaded again next time
  with open("Successful_Downloads.txt", 'w') as f:
    for item in successful_downloads:
      f.write(item + "\n")

# Save the failure log, or clear a stale one left by an earlier run
if len(failed_downloads) > 0:
  with open("Failed_Downloads.txt", 'w') as f:
    for item in failed_downloads:
      f.write(item + "\n")
elif os.path.exists("Failed_Downloads.txt"):
  os.remove("Failed_Downloads.txt")

# Update the user
if download_count > 0:
  print("Done Saving Files! {} new files downloaded.".format(download_count))
else:
  print("No new files were downloaded.")

Downloading file number 1: 2020-01-01
File saved: NC Files/2020/1/2020_1_1.nc
Downloading file number 2: 2020-01-02
File saved: NC Files/2020/1/2020_1_2.nc
Downloading file number 3: 2020-01-03
File saved: NC Files/2020/1/2020_1_3.nc
Downloading file number 4: 2020-01-04
File saved: NC Files/2020/1/2020_1_4.nc
Downloading file number 5: 2020-01-05
File saved: NC Files/2020/1/2020_1_5.nc
Downloading file number 6: 2020-01-06
File saved: NC Files/2020/1/2020_1_6.nc
Downloading file number 7: 2020-01-07
File saved: NC Files/2020/1/2020_1_7.nc
Downloading file number 8: 2020-01-08
File saved: NC Files/2020/1/2020_1_8.nc
Downloading file number 9: 2020-01-09
File saved: NC Files/2020/1/2020_1_9.nc
Downloading file number 10: 2020-01-10
File saved: NC Files/2020/1/2020_1_10.nc
Downloading file number 11: 2020-01-11
File saved: NC Files/2020/1/2020_1_11.nc
Downloading file number 12: 2020-01-12
File saved: NC Files/2020/1/2020_1_12.nc
Downloading file number 13: 2020-01-13
File saved: NC File

## Delete all CMEMS data - Be careful!

In [None]:
# Deletes the CMEMS download logs and the downloaded .nc files of the current
# dataset, then removes the CMEMS folder itself if nothing else is left in it.
os.chdir(os.path.join(dataset_location_path, dataset_name))
if cmems_label_folder in os.listdir():
  os.chdir(cmems_label_folder)

  # Log files written by the download cell
  for log_name in ("Successful_Downloads.txt", "Failed_Downloads.txt"):
    if os.path.isfile(log_name): os.remove(log_name)

  # Downloaded files (year/month/file). rmtree removes whatever is inside, so an
  # unexpected extra file or folder level can no longer stop the clean-up half way.
  if os.path.isdir("NC Files"): shutil.rmtree("NC Files")

  # Only remove the CMEMS folder when it holds nothing else (e.g. no labels)
  if len(os.listdir()) == 0:
    os.chdir("..")
    os.rmdir(cmems_label_folder)

  print("All files should have been deleted.")
else:
  print("No CMEMS data to delete.")

# Step 4: Patch-wise Labelling

In [None]:
#@title Labelling
def _wrap_longitude(lon):
  """Wrap a longitude given in degrees into the interval [-180, 180)."""
  return lon - 360 * ((lon + 180) // 360)


def _nearest_grid_index(grid_lat, grid_lon, lat, lon):
  """Return the (row, column) of the cell of a 2-D lat/lon grid closest to (lat, lon).

  Closeness is the sum of the squared differences in degrees. This is not
  Euclidian distance, but should be faster and still preserve order.
  """
  squared_distance = (grid_lat - lat)**2 + (grid_lon - _wrap_longitude(lon))**2
  return np.unravel_index(squared_distance.argmin(), squared_distance.shape)


def generate_label(year, month, day, image, odata, nc_file):
  """Create and save the concentration and uncertainty label patches for one image.

  The coordinates of every pixel are interpolated linearly between the four
  footprint corners, and each non-zero pixel receives the value of the closest
  grid cell of the concentration map.

  Args:
    year, month, day: Date parts used to build the output file names.
    image: Grayscale image, indexed as image[row, column], with at least
      image_dimension rows and columns. Pixels equal to 0 are not labelled.
    odata: Product information; the keys 'footprint', 'Pass direction' and
      'id' are read.
    nc_file: Open concentration file; the variables 'ice_conc',
      'total_uncertainty', 'lat' and 'lon' are read.

  Side effects:
    Changes the working directory to the CMEMS folder of the dataset and
    writes one concentration and one uncertainty image there.
  """
  # Get the footprint information, so that we can interpolate the coordinates of each pixel
  # NOTE: This uses the 'footprint' provided by OData, in which the order of the points has been consistently
  # manipulated along with the image, so the order of the points is consistent in the satellite frame of reference.
  # This is different to the OpenSearch 'footprint', where the points are ordered in the global frame, and thus
  # do not follow any flips/rotations performed to the image as part of the ESA preprocessing.
  # Each point is stored below as a (lat, lon) tuple.
  footprint = odata['footprint'][9:-2].split(",")
  footprint = [(float(footprint[i].split(" ")[1]), float(footprint[i].split(" ")[0])) for i in range(4)]

  # The OData 'footprint' is always ordered according to the satellite frame
  # of reference. When the images are flipped/rotated in preprocessing, the
  # 'footprint' points remain ordered in the satellite frame orientation. The
  # 'footprint' consists of 5 points: [(0), (1), (2), (3), (0)], and they
  # are ordered as follows (in the satellite frame):
  #
  #                 (0)                 (1)
  #                     +-------------+
  #                     |             |
  #                     |             |
  #                     |             |
  #                     |             |
  #                     +-------------+
  #                 (3)                 (2)
  #
  # So, if the latitude of (1) is greater than the latitude of (2), then the
  # image was acquired during an ASCENDING pass, otherwise it was during a
  # DESCENDING pass. This can be used to determine the pass direction without
  # relying on the 'orbit direction' entry which seems to be unreliable in both
  # OData and OpenSearch APIs.
  ascending_over_date_line  = footprint[1][0] > footprint[2][0] and footprint[2][1] < footprint[0][1]
  descending_over_date_line = footprint[2][0] > footprint[1][0] and footprint[3][1] < footprint[1][1]
  if ascending_over_date_line or descending_over_date_line:
    # The footprint straddles the date line: move the positive longitudes
    # down by 360 so that all four are continuous for the interpolation
    footprint = [(lat, lon + (lon // -360) * 360) for lat, lon in footprint]

  # OData 'pass direction' works reliably here, because any mistakes happen twice
  # and effectively cancel out (although the image and label will be 180 degrees
  # rotated from the global frame).
  first_corner = 0 if odata['Pass direction'] == "ASCENDING" else 2
  top_left_lat,     top_left_lon     = footprint[first_corner]
  top_right_lat,    top_right_lon    = footprint[(first_corner + 1) % 4]
  bottom_right_lat, bottom_right_lon = footprint[(first_corner + 2) % 4]
  bottom_left_lat,  bottom_left_lon  = footprint[(first_corner + 3) % 4]

  # Create linspace objects for left and right edges so that the rows can be iterated later
  left_edge  = [np.linspace(top_left_lat,  bottom_left_lat,  image_dimension), np.linspace(top_left_lon,  bottom_left_lon,  image_dimension)]
  right_edge = [np.linspace(top_right_lat, bottom_right_lat, image_dimension), np.linspace(top_right_lon, bottom_right_lon, image_dimension)]

  # Extract the relevant tables from the .nc file
  full_concentration = nc_file.variables['ice_conc'][0, :, :]
  full_uncertainty   = nc_file.variables['total_uncertainty'][0, :, :]
  full_lat = np.array(nc_file.variables['lat'])
  full_lon = np.array(nc_file.variables['lon'])

  # Compute the bounds of the image on the concentration map to reduce computational requirements later on
  corners = [(top_left_lat, top_left_lon), (top_right_lat, top_right_lon),
             (bottom_left_lat, bottom_left_lon), (bottom_right_lat, bottom_right_lon)]
  corner_indices = np.array([_nearest_grid_index(full_lat, full_lon, lat, lon) for lat, lon in corners])
  min_y, min_x = (int(value) for value in corner_indices.min(axis=0))
  max_y, max_x = (int(value) for value in corner_indices.max(axis=0))

  # Slice the full arrays. The upper bounds are made inclusive (+ 1) so that the
  # cells closest to the far corners stay available, and so that the slice can
  # never be empty when two corners share a row or a column.
  reduced_concentration = full_concentration[min_y:max_y + 1, min_x:max_x + 1]
  reduced_uncertainty   = full_uncertainty[min_y:max_y + 1, min_x:max_x + 1]
  reduced_lat = full_lat[min_y:max_y + 1, min_x:max_x + 1]
  reduced_lon = full_lon[min_y:max_y + 1, min_x:max_x + 1]

  # getmaskarray always gives a full boolean array, also for data without any mask
  concentration_mask = np.ma.getmaskarray(reduced_concentration)
  uncertainty_mask   = np.ma.getmaskarray(reduced_uncertainty)

  # Create the label arrays (np.float64 is what the removed np.float_ alias meant)
  concentration_label = np.zeros((image_dimension, image_dimension), dtype=np.float64)
  uncertainty_label   = np.zeros((image_dimension, image_dimension), dtype=np.float64)

  # Iterate through each pixel and find the corresponding value for the label
  for y_pixel in range(image_dimension):
    row_lat = np.linspace(left_edge[0][y_pixel], right_edge[0][y_pixel], image_dimension)
    row_lon = np.linspace(left_edge[1][y_pixel], right_edge[1][y_pixel], image_dimension)
    for x_pixel in range(image_dimension):
      if image[y_pixel, x_pixel] != 0:
        # Only compute the concentration for non-zero pixels in the image (i.e. ignore black edges)
        # Find the closest coordinate to this pixel, and use its value
        y, x = _nearest_grid_index(reduced_lat, reduced_lon, row_lat[x_pixel], row_lon[x_pixel])
        # Masked cells are labelled with 100 (concentration) and 0 (uncertainty)
        concentration_label[y_pixel, x_pixel] = 100 if concentration_mask[y, x] else reduced_concentration[y, x]
        uncertainty_label[y_pixel, x_pixel]   = 0 if uncertainty_mask[y, x] else reduced_uncertainty[y, x]

  # Save the label patches
  os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
  concentration_path = concentration_name.format(year, month, day, odata['id'])
  uncertainty_path   = uncertainty_name.format(year, month, day, odata['id'])
  os.makedirs(os.path.dirname(concentration_path), exist_ok=True)
  os.makedirs(os.path.dirname(uncertainty_path), exist_ok=True)
  cv2.imwrite(concentration_path, concentration_label)
  cv2.imwrite(uncertainty_path, uncertainty_label)

#------------------------------------------------------------------------------#

# Read in the list of successfully downloaded images
os.chdir(os.path.join(dataset_location_path, dataset_name, sentinel_1_folder))
if "Successful_Downloads.txt" in os.listdir():
  with open("Successful_Downloads.txt", 'r') as file:
    needs_labelling = file.read().splitlines()
  print("Found list of all successfully downloaded images.")
else:
  needs_labelling = []
  print("Cannot find Successful_Downloads file for SENTINEL-1 images.")

# Check for successful label file
os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
if "Successful_Labels.txt" in os.listdir():
  # If some labelling has been done, remove the already labelled images from the list to avoid reprocessing
  with open("Successful_Labels.txt", 'r') as file:
    already_labelled = file.read().splitlines()
  labelled_lookup = set(already_labelled) # set gives constant-time membership checks
  needs_labelling = [item for item in needs_labelling if item not in labelled_lookup]
  print("Removed all images which were already labelled.")
else:
  already_labelled = []
  print("No images have been labelled yet.")

# Delete the failed labels file so that they can be retried and updated as necessary
if "Failed_Labels.txt" in os.listdir(): os.remove("Failed_Labels.txt")

if len(needs_labelling) > 0:
  print("Attempting to label {} remaining images.\n".format(len(needs_labelling)))
  failed_count = 0
  # Iterate through each image in needs_labelling
  for image_number, filename in enumerate(needs_labelling, start=1):
    try:
      # Get the odata info (date, footprint, etc)
      # Log entries are named YEAR_MONTH_DAY_UUID
      year, month, day, uuid = filename.split("_")[:4]
      odata = search_api.get_product_odata(uuid, full=True)

      # Load the resampled image as grayscale
      os.chdir(os.path.join(dataset_location_path, dataset_name, sentinel_1_folder))
      image = cv2.cvtColor(cv2.imread(sentinel_1_resampled_name.format(year, month, day, uuid)), cv2.COLOR_BGR2GRAY)

      # Load the appropriate concentration map
      os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
      # The context manager closes the file again, so that a long run does not
      # keep one open file handle per labelled image
      with nc.Dataset(cmems_name.format(year, month, day), 'r', format="NETCDF3") as nc_file:
        # Generate the label from the available information
        generate_label(year, month, day, image, odata, nc_file)

      # Log the success
      os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
      with open("Successful_Labels.txt", 'a+') as f:
        f.write(filename + "\n")
      clear_output()
      print("Finished labelling {} of {}: {}".format(image_number, len(needs_labelling), filename))
    except KeyboardInterrupt:
      raise
    except Exception as error:
      # Log the failure
      failed_count += 1
      os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
      with open("Failed_Labels.txt", 'a+') as f:
        f.write(filename + "\n")
      clear_output()
      # Include the reason, otherwise a failure cannot be diagnosed afterwards
      print("Labelling Failed: {} ({}: {})".format(filename, type(error).__name__, error))

  clear_output()
  print("Finished trying to generate all required labels.")
  if failed_count > 0:
    print("{} of {} images could not be labelled (see Failed_Labels.txt).".format(failed_count, len(needs_labelling)))

else:
  # Needs_labelling is empty
  print("\nNo more images need to be labelled.")

Finished trying to generate all required labels.


## Delete all labelling patches - Be careful!

In [None]:
# Deletes the labelling logs and both label trees of the current dataset, then
# removes the CMEMS folder itself if nothing else (e.g. .nc files) is left in it.
os.chdir(os.path.join(dataset_location_path, dataset_name))
if cmems_label_folder in os.listdir():
  os.chdir(cmems_label_folder)

  # Log files written by the labelling cell
  for log_name in ("Successful_Labels.txt", "Failed_Labels.txt"):
    if os.path.isfile(log_name): os.remove(log_name)

  # Label trees (year/month/day/file). rmtree removes whatever is inside, so an
  # unexpected extra file or folder level can no longer stop the clean-up half way.
  for label_folder in ("Concentration Labels", "Uncertainty Labels"):
    if os.path.isdir(label_folder): shutil.rmtree(label_folder)

  # Only remove the CMEMS folder when it holds nothing else
  if len(os.listdir()) == 0:
    os.chdir("..")
    os.rmdir(cmems_label_folder)

  print("All files should have been deleted.")
else:
  print("No CMEMS data to delete.")

KeyboardInterrupt: ignored

# Step 5: Curate Samples

In [None]:
#@title Curate Samples
# Function to decide whether or not to keep a sample
def keepSample(date_uuid):
  """Decide whether a labelled sample should be kept in the curated dataset.

  A sample is rejected when its original quick-look image is far from square
  (width / height outside 0.8 - 1.2), or when its median-blurred concentration
  label, divided by 100, has a variance below 0.05.

  Args:
    date_uuid: Sample identifier of the form "YEAR_MONTH_DAY_UUID".

  Returns:
    True if the sample passes both checks, False otherwise.

  Raises:
    FileNotFoundError: If the quick-look image or the concentration label
      cannot be read from disk.

  Note:
    Changes the current working directory as a side effect.
  """
  # Split the identifier once instead of once per format argument
  parts = date_uuid.split("_")
  year, month, day, uuid = parts[0], parts[1], parts[2], parts[3]

  # Load the original image, to check the aspect ratio
  os.chdir(os.path.join(dataset_location_path, dataset_name, sentinel_1_folder))
  quicklook_path = sentinel_1_quicklook_name.format(year, month, day, uuid)
  original = cv2.imread(quicklook_path)
  if original is None:
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising, which would otherwise surface as an opaque IndexError
    raise FileNotFoundError("Could not read quick-look image: " + quicklook_path)
  aspect_ratio = np.shape(original)[1] / np.shape(original)[0]
  if aspect_ratio < 0.8 or aspect_ratio > 1.2:
    return False # don't do unnecessary checks

  # Load the concentration label, to check the variance
  os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
  label_path = concentration_name.format(year, month, day, uuid)
  label = cv2.imread(label_path, cv2.IMREAD_GRAYSCALE)
  if label is None:
    raise FileNotFoundError("Could not read concentration label: " + label_path)
  concentration = cv2.medianBlur(label, 5) / 100
  return not (np.var(concentration) < 0.05)
#------------------------------------------------------------------------------#

# Check if a list of samples is already there. Either file may be missing on its
# own (e.g. no sample has been ignored yet), so each one is read independently;
# otherwise a re-run would re-evaluate and re-append samples already sorted.
os.chdir(os.path.join(dataset_location_path, dataset_name))
included_samples = []
ignored_samples = []
if "Samples.txt" in os.listdir():
  with open("Samples.txt", 'r') as f:
    included_samples = f.read().splitlines()
if "Ignored Samples.txt" in os.listdir():
  with open("Ignored Samples.txt", 'r') as f:
    ignored_samples = f.read().splitlines()
already_sorted = included_samples + ignored_samples
if already_sorted:
  print("Found already sorted samples.")
else:
  print("Nothing previously sorted.")

# Get list of labeled images
os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
if "Successful_Labels.txt" in os.listdir():
  with open("Successful_Labels.txt", 'r') as f:
    date_uuids = f.read().splitlines()

  # Remove samples already sorted. Filtering against a set keeps the original
  # order, is linear in the number of samples, and does not fail when a sorted
  # sample is no longer present in Successful_Labels.txt.
  already_sorted_lookup = set(already_sorted)
  date_uuids = [date_uuid for date_uuid in date_uuids if date_uuid not in already_sorted_lookup]

  # Decide which samples to keep
  for sample_number, date_uuid in enumerate(date_uuids, start=1):
    clear_output()
    print("Evaluating sample {} of {}: {}".format(sample_number, len(date_uuids), date_uuid))
    target_list = "Samples.txt" if keepSample(date_uuid) else "Ignored Samples.txt"
    # keepSample changes the working directory, so move back before writing
    os.chdir(os.path.join(dataset_location_path, dataset_name))
    with open(target_list, 'a+') as f:
      f.write(date_uuid + "\n")

  # Update the user
  os.chdir(os.path.join(dataset_location_path, dataset_name))
  print("\nSamples.txt file saved with all included samples.")
  # A list file only exists once something has been written to it
  samples = []
  ignored_samples = []
  if "Ignored Samples.txt" in os.listdir():
    with open("Ignored Samples.txt", 'r') as f:
      ignored_samples = f.read().splitlines()
  if "Samples.txt" in os.listdir():
    with open("Samples.txt", 'r') as f:
      samples = f.read().splitlines()
  print("{} samples included, out of {} in total.".format(len(samples), len(samples) + len(ignored_samples)))
  print("All excluded samples were listed in the Ignored Samples txt file.")
else:
  print("Successful_Labels.txt could not be found.")

Evaluating sample 5293 of 5293: 2020_1_1_ec3c6c6a-e47f-4e1b-9b26-303cf14a4cad

Samples.txt file saved with all included samples.
1867 samples included, out of 5293 in total.
All excluded samples were listed in the Ignored Samples txt file.


### Delete curated sample lists - Be careful!

In [None]:
# Both curated lists live directly inside the dataset folder
os.chdir(os.path.join(dataset_location_path, dataset_name))
for curated_list in ("Samples.txt", "Ignored Samples.txt"):
  if curated_list in os.listdir():
    os.remove(curated_list)

print("All curated sample lists should have been deleted.")

All curated sample lists should have been deleted.


# Step 6: Generate Batch Sets

In [None]:
#@title ## Get all existing batch sets
#seed = 1
# Function to split the dataset and save the txt files
def split_dataset():
  """Split the curated samples into train/test lists and save them as txt files.

  Reads Samples.txt and Ignored Samples.txt from the dataset folder, holds out
  20% of the included samples for testing, draws up to 100 of the ignored
  samples, and writes Train.txt, Test.txt and Test_Ignored.txt into the
  "Batches" folder. Sampling is driven by the notebook-level `seed`.

  The working directory is left at the "Batches" folder.
  """
  print("Splitting dataset now.")
  dataset_dir = os.path.join(dataset_location_path, dataset_name)
  os.chdir(dataset_dir)

  # 'Good' samples: 20% held out for testing, the remainder used for training
  included = pd.read_csv("Samples.txt", header=None, names=["date_uuid"])
  held_out = included.sample(frac=0.2, random_state=seed)
  training = included.drop(held_out.index)

  # 'Bad' samples: keep at most 100 of them for testing later on
  ignored = pd.read_csv("Ignored Samples.txt", header=None, names=["date_uuid"])
  ignored_count = min(100, len(ignored['date_uuid']))
  ignored_subset = ignored.sample(n=ignored_count, random_state=seed)

  # Print a summary
  summary = "Dataset has just been split.\n{} for training, {} for testing, and {} from the ignored samples."
  print(summary.format(len(training['date_uuid']), len(held_out['date_uuid']), len(ignored_subset['date_uuid'])))

  # Save the split lists to file
  os.chdir(os.path.join(dataset_dir, "Batches"))
  for frame, target in ((training, "Train.txt"), (held_out, "Test.txt"), (ignored_subset, "Test_Ignored.txt")):
    frame.to_csv(target, header=False, index=False, mode='w')
  print("The split sets have been saved to file for future use.")

# Function to generate and save the image batches
def generate_batches(augmentation_args):
  """Build the image/label/uncertainty generators and save every batch to disk.

  Relies on notebook-level state: `train_dataframe`, `test_dataframe` and
  `test_ignored_dataframe` (each with a 'date_uuid' column; a 'path' column is
  added to them here), plus `batch_name`, `batch_size`, `seed`,
  `image_dimension` and the dataset folder settings.

  Args:
    augmentation_args: Keyword arguments forwarded to the ImageDataGenerator
      objects of the training split only. Pass {} for no augmentation.

  Each batch is written to Batches/<batch_name>/<segment>/<batch number>.npz
  holding the arrays "images", "labels" and "uncertainties". A batch whose file
  already exists is not overwritten. The working directory is changed as a side
  effect.
  """
  #batch_size = 50
  # Image preprocessing functions applied at batch time
  def image_preprocessing(img):
    return img

  def concentration_preprocessing(img):
    img[:, :, 0] = cv2.medianBlur(img, 5)
    return img

  def uncertainty_preprocessing(img):
    img[:, :, 0] = cv2.medianBlur(img, 5)
    return img

  # Relative file path of a sample, generated from the file name
  def relative_path(name):
    date_parts = name.split("_")
    return "{}/{}/{}/{}.png".format(date_parts[0], date_parts[1], date_parts[2], name)

  # Add a column for the file paths
  for dataframe in (train_dataframe, test_dataframe, test_ignored_dataframe):
    dataframe["path"] = [relative_path(name) for name in dataframe['date_uuid']]

  # One entry per data source: (directory, rescale, color mode, preprocessing)
  dataset_dir = os.path.join(dataset_location_path, dataset_name)
  sources = (
    (os.path.join(dataset_dir, sentinel_1_folder, "Resampled Images"),      1./255, "rgb",       image_preprocessing),
    (os.path.join(dataset_dir, cmems_label_folder, "Concentration Labels"), 1./100, "grayscale", concentration_preprocessing),
    (os.path.join(dataset_dir, cmems_label_folder, "Uncertainty Labels"),   1./100, "grayscale", uncertainty_preprocessing),
  )

  # Build the (image, label, uncertainty) generators for one split. The same
  # datagen arguments and seed are used for all three sources of a split.
  def build_generators(dataframe, datagen_args, flow_seed):
    split_generators = []
    for directory, rescale, color_mode, preprocessing in sources:
      datagen = ImageDataGenerator(rescale=rescale, **datagen_args, preprocessing_function=preprocessing)
      split_generators.append(datagen.flow_from_dataframe(dataframe,
                                                          directory=directory,
                                                          x_col="path",
                                                          target_size=(image_dimension, image_dimension),
                                                          color_mode=color_mode,
                                                          class_mode=None,
                                                          batch_size=batch_size,
                                                          shuffle=False,
                                                          seed=flow_seed))
    return tuple(split_generators)

  # Only the training split is augmented and seeded
  generators = {"Train"        : build_generators(train_dataframe, augmentation_args, seed),
                "Test"         : build_generators(test_dataframe, {}, None),
                "Test_Ignored" : build_generators(test_ignored_dataframe, {}, None)}

  # Update the user
  print("Finished preparing DataGenerator objects.")

  # Iterate through each segment (train, test, etc...)
  for segment, (image_generator, label_generator, uncertainty_generator) in generators.items():
    # Update the user
    print("\nDealing with {} now.".format(segment))

    # Move into the folder for this segment
    os.chdir(os.path.join(dataset_dir, "Batches", batch_name))
    if segment not in os.listdir(): os.mkdir(segment)
    os.chdir(segment)

    # Number of digits needed for the largest batch number (batch_count - 1).
    # Counting characters avoids log10(0), which fails for an empty split.
    batch_count = len(image_generator)
    digits = len(str(max(batch_count - 1, 0)))

    # Iterate through each batch, and save the batch's data. The batch is always
    # drawn, even if its file exists, so that the three generators stay in step.
    for batch_number in range(batch_count):
      batch_data = {"images"        : np.array(next(image_generator)),
                    "labels"        : np.array(next(label_generator)),
                    "uncertainties" : np.array(next(uncertainty_generator))}
      filename = "{:0{width}}.npz".format(batch_number, width=digits)
      if filename not in os.listdir():
        np.savez(filename, **batch_data)
        print("Saved batch {} of {}".format(batch_number + 1, batch_count))
      else:
        print("Batch already saved: {} of {}".format(batch_number + 1, batch_count))

  print("\nFinished saving batches.")

#------------------------------------------------------------------------------#

# Names for each batch segment
batch_segments = ["Train", "Test", "Test_Ignored"]

# Set data augmentation parameters
augmentation_args = {"rotation_range" : 10,
                     "fill_mode"        : "constant",
                     "cval"             : 0,
                     "horizontal_flip"  : True,
                     "vertical_flip"    : True}

# Get list of available batch sets
os.chdir(os.path.join(dataset_location_path, dataset_name))
if "Batches" in os.listdir():
  os.chdir("Batches")
  # Batch sets are the sub-folders of "Batches"; the split txt files sit next to
  # them. Looking for folders directly (rather than comparing the entry count
  # with the number of split files) stays correct when a split file is missing
  # or when other files are present.
  available_batch_sets = [d for d in os.listdir() if os.path.isdir(d)]
  if available_batch_sets:
    print("Here is a list of all available batch sets:")
    for d in available_batch_sets:
      print("\t" + d)
    print("Please choose one and enter its name into the following cell. Otherwise create a new batch set below.")
  else:
    print("No batches available. Please create a new batch set below.")
else:
  os.mkdir("Batches")
  split_dataset()
  print("\nNo batches available. Please create a new batch set below.")

Splitting dataset now.
Dataset has just been split.
1494 for training, 373 for testing, and 100 from the ignored samples.
The split sets have been saved to file for future use.

No batches available. Please create a new batch set below.


In [None]:
#@title ## Create new batch set
batch_name = "Southern_Melting_3" #@param {type:"string"}
use_augmentation = False #@param {type:"boolean"}
if use_augmentation: batch_name += "_A"
#------------------------------------------------------------------------------#

# Batch set names must be unique inside the "Batches" folder
os.chdir(os.path.join(dataset_location_path, dataset_name, "Batches"))
if batch_name not in os.listdir():
  # The three split files are required; regenerate them if any is missing
  split_files_present = all("{}.txt".format(s) in os.listdir() for s in batch_segments)
  if split_files_present:
    print("Dataset has already been split. To re-split, delete the txt files and run this code again.")
  else:
    print("Warning: One or more of the split files was missing. It will be generated now.")
    split_dataset()

  # Load the dataset splits (read by generate_batches as notebook-level variables)
  train_dataframe, test_dataframe, test_ignored_dataframe = [
    pd.read_csv(split_file, header=None, names=['date_uuid'])
    for split_file in ("Train.txt", "Test.txt", "Test_Ignored.txt")
  ]

  # Generate the batch set, augmenting only when requested
  os.mkdir(batch_name)
  chosen_augmentation = augmentation_args if use_augmentation else {}
  generate_batches(chosen_augmentation)
else:
  print("This batch set name already exists. Please chose a unique name if you want to create a new batch set.")

Dataset has already been split. To re-split, delete the txt files and run this code again.
Found 1494 validated image filenames.
Found 1494 validated image filenames.
Found 1494 validated image filenames.
Found 373 validated image filenames.
Found 373 validated image filenames.
Found 373 validated image filenames.
Found 100 validated image filenames.
Found 100 validated image filenames.
Found 100 validated image filenames.
Finished preparing DataGenerator objects.

Dealing with Train now.
Saved batch 1 of 94
Saved batch 2 of 94
Saved batch 3 of 94
Saved batch 4 of 94
Saved batch 5 of 94
Saved batch 6 of 94
Saved batch 7 of 94
Saved batch 8 of 94
Saved batch 9 of 94
Saved batch 10 of 94
Saved batch 11 of 94
Saved batch 12 of 94
Saved batch 13 of 94
Saved batch 14 of 94
Saved batch 15 of 94
Saved batch 16 of 94
Saved batch 17 of 94
Saved batch 18 of 94
Saved batch 19 of 94
Saved batch 20 of 94
Saved batch 21 of 94
Saved batch 22 of 94
Saved batch 23 of 94
Saved batch 24 of 94
Saved batch

# Validation Procedures (Optional)

## Patch Comparison

In [None]:
#@title Compare Patches
def get_height_width(odata):
  """Estimate the height and width, in km, of a product from its footprint.

  Args:
    odata: Mapping providing a 'footprint' string and a 'Pass direction' entry.

  Returns:
    Tuple (height, width): the mean length of the left/right edges and the mean
    length of the top/bottom edges, both in km.
  """
  # Drop the 9 leading and 2 trailing wrapper characters, leaving "lon lat" points
  points = odata['footprint'][9:-2].split(",")

  # For ascending passes the first point is the top-left corner; for any other
  # pass direction the corner order is rotated by two positions.
  shift = 0 if odata['Pass direction'] == "ASCENDING" else 2

  # Return the (lat, lon) pair of the corner at the given clockwise position
  def corner(position):
    lon_lat = points[(position + shift) % 4].split(" ")
    return (float(lon_lat[1]), float(lon_lat[0]))

  # Corner positions run clockwise, starting at the top-left
  top_left     = corner(0)
  top_right    = corner(1)
  bottom_right = corner(2)
  bottom_left  = corner(3)

  # Assume the image is rectangular, and average the opposite edges
  left_edge   = geoDist.distance(top_left, bottom_left).km
  right_edge  = geoDist.distance(top_right, bottom_right).km
  top_edge    = geoDist.distance(top_left, top_right).km
  bottom_edge = geoDist.distance(bottom_left, bottom_right).km

  height = (left_edge + right_edge) / 2.0
  width  = (top_edge + bottom_edge) / 2.0
  return (height, width)

# Create directory for Validation data
os.chdir(os.path.join(dataset_location_path, dataset_name))
if validations_folder not in os.listdir(): os.mkdir(validations_folder)

# Flag for everything being ready to make the comparison
ok_flag = True

# Load list of sentinel images
os.chdir(os.path.join(dataset_location_path, dataset_name, sentinel_1_folder))
if "Successful_Downloads.txt" in os.listdir():
  with open("Successful_Downloads.txt") as f:
    successful_downloads = f.read().splitlines()
else:
  print("No Image Downloads!")
  ok_flag = False

# Load list of labels completed
os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
if "Successful_Labels.txt" in os.listdir():
  with open("Successful_Labels.txt") as f:
    successful_labels = f.read().splitlines()
else:
  print("No Labels Generated!")
  ok_flag = False

# Check that every image has a label. A set makes each lookup constant time, and
# the count tells the user why the flag was tripped.
if ok_flag:
  labelled = set(successful_labels)
  unlabelled = [filename for filename in successful_downloads if filename not in labelled]
  if unlabelled:
    print("{} downloaded image(s) do not have a label yet.".format(len(unlabelled)))
    ok_flag = False

# Make the comparison if everything is ok so far
if ok_flag:
  # Read in the list of ids which have already been processed
  os.chdir(os.path.join(dataset_location_path, dataset_name, validations_folder))
  already_processed = []
  if "Patch_Label_Validations.txt" in os.listdir():
    with open("Patch_Label_Validations.txt", 'r') as f:
      already_processed = f.read().splitlines()

  # Only process ids which have not yet been processed. If a curated sample list
  # exists, it replaces the full download list as the set of candidates.
  os.chdir(os.path.join(dataset_location_path, dataset_name))
  if "Samples.txt" in os.listdir():
    with open("Samples.txt", 'r') as f:
      successful_downloads = f.read().splitlines()
  processed_lookup = set(already_processed)
  needs_processing = [filename for filename in successful_downloads if filename not in processed_lookup]

  # Process each id (at most 20 per run)
  for file_number, filename in enumerate(needs_processing[:20], start=1):
    # Get date information
    name_parts = filename.split("_")
    y, m, d, uuid = name_parts[0], name_parts[1], name_parts[2], name_parts[3]

    # Load the image
    os.chdir(os.path.join(dataset_location_path, dataset_name, sentinel_1_folder))
    # full_image  = cv2.cvtColor(cv2.imread(sentinel_1_quicklook_name.format(y, m, d, uuid)), cv2.COLOR_BGR2RGB)
    small_image = cv2.cvtColor(cv2.imread(sentinel_1_resampled_name.format(y, m, d, uuid)), cv2.COLOR_BGR2RGB)

    # Load the labels
    os.chdir(os.path.join(dataset_location_path, dataset_name, cmems_label_folder))
    concentration = cv2.imread(concentration_name.format(y, m, d, uuid), cv2.IMREAD_GRAYSCALE)
    uncertainty   = cv2.imread(uncertainty_name.format(y, m, d, uuid), cv2.IMREAD_GRAYSCALE)

    # Plot on a fresh figure, so that nothing carries over between samples
    fig = plt.figure(figsize=(20, 5))
    plt.suptitle(filename)
    # plt.subplot(1, 4, 1)
    # plt.imshow(full_image)
    # plt.title("Original SAR Image")
    # plt.axis('off')
    plt.subplot(1, 3, 1)
    plt.imshow(small_image)
    plt.title("Resampled SAR Image")
    plt.axis('off')
    plt.subplot(1, 3, 2)
    plt.imshow(concentration, cmap='hot', vmin=0, vmax=100)
    plt.title("Concentration Patch Label")
    plt.axis('off')
    plt.colorbar()
    plt.subplot(1, 3, 3)
    plt.imshow(uncertainty, cmap='hot', vmin=0, vmax=100)
    plt.title("Uncertainty Patch Label")
    plt.axis('off')
    plt.colorbar()

    # Save to Drive
    os.chdir(os.path.join(dataset_location_path, dataset_name, validations_folder))
    os.makedirs("Patch Label Validation/{}/{}/{}".format(y, m, d), exist_ok=True)
    # NOTE(review): bbox_inches=0 is falsy, so presumably no cropping is applied - confirm
    plt.savefig(label_comparison_name.format(y, m, d, uuid), bbox_inches=0)
    # clear_output()
    plt.show()
    plt.close(fig) # release the figure; a no-op if showing it already closed it

    odata = search_api.get_product_odata(uuid, full=True)
    size = get_height_width(odata)
    print("Done saving file {} of {}. Pixel Spacing (rg x az): {:.2f}km x {:.2f}km".format(file_number, len(needs_processing), size[1] / image_dimension, size[0] / image_dimension))

    # Save the id to the text file for progress tracking
    with open("Patch_Label_Validations.txt", 'a+') as f:
      f.write(filename + "\n")
else:
  print("'ok_flag' tripped. Check that all previous steps have been completed correctly.")

Output hidden; open in https://colab.research.google.com to view.

### Delete all patch comparisons - Be careful!

In [None]:
# Delete every saved patch comparison together with its progress-tracking list.
os.chdir(os.path.join(dataset_location_path, dataset_name))
if validations_folder in os.listdir():
  os.chdir(validations_folder)

  if "Patch_Label_Validations.txt" in os.listdir(): os.remove("Patch_Label_Validations.txt")

  # Walk bottom-up (topdown=False) so that every directory is already empty
  # when it is removed. Unlike a fixed year/month/day loop, this also copes with
  # stray files or unexpected nesting inside the tree.
  if "Patch Label Validation" in os.listdir():
    for folder, subfolders, files in os.walk("Patch Label Validation", topdown=False):
      for name in files:
        os.remove(os.path.join(folder, name))
      for name in subfolders:
        os.rmdir(os.path.join(folder, name))
    os.rmdir("Patch Label Validation")

  # Only remove the validation folder itself when nothing else is left inside it
  if len(os.listdir()) == 0:
    os.chdir("..")
    os.rmdir(validations_folder)

  print("All files should have been deleted.")
else:
  print("No validation data to delete.")

All files should have been deleted.
