# Open Source training data import

This is a demo notebook showing how to import training data from the Radiant MLHub for use in the BEAM tool. Selectively adding training data from other countries can improve the performance of the BEAM models when it supplements training data from the area of interest. To use this notebook, you need to sign up for an API key on the Radiant MLHub: https://mlhub.earth/

Notebook for demonstration purposes only.

In [None]:
%pip install radiant_mlhub
%mlhub configure

In [None]:
# Import packages
import os
import shutil
from pathlib import Path

import cv2
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import rasterio
from PIL import Image
from rasterio.features import rasterize
from shapely.geometry import mapping, Point, Polygon, MultiPolygon
from shapely.ops import unary_union
# The star import stays ahead of tqdm so that the explicit tqdm import below wins on any name clash
from fastai.vision.all import *
from tqdm import tqdm


If you are using Google Colab, uncomment and run the next cell to mount your Google Drive:

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')


In [None]:
# Root folder for all downloads and outputs. It is made absolute so that the
# paths derived from it keep working if the working directory changes.
path = Path("your_path").resolve()


Enter the Radiant MLHub dataset IDs of the cities you would like to use for training:

In [None]:
# Dataset IDs to fetch, one entry per city
city_list = [
    'ramp_mzuzu_malawi',
    'ramp_accra_ghana',
]

In [None]:
from radiant_mlhub import Dataset

# Fetch each requested dataset, download it into `path`,
# and print the IDs of the collections it is made of.
for dataset_id in city_list:
  dataset = Dataset.fetch(dataset_id)
  dataset.download(output_dir=path)
  for collection in dataset.collections:
    print(collection.id)

Define helper functions:

In [None]:

def generate_mask(raster_path, shape_path, output_path = None, file_name = None):
    '''Function that generates a binary mask from a vector file (shp or geojson)
    raster_path = path to the .tif; its size, transform and CRS define the mask grid.
    shape_path = path to the shapefile or GeoJson.
    output_path = Directory to save the binary mask in.
    file_name = Name of the file.

    When both output_path and file_name are given, the mask is written as a
    single-band raster (values 0/255) to output_path/file_name and None is returned.
    Otherwise the uint8 mask (values 0/1) is returned.
    Raises TypeError if a geometry is neither a Polygon nor a MultiPolygon.'''

    # Only the metadata is needed; capture it while the dataset is still open
    # so nothing below touches the closed dataset.
    with rasterio.open(raster_path, "r") as src:
        raster_meta = src.meta.copy()
        raster_crs = src.crs

    transform = raster_meta['transform']
    im_size = (raster_meta['height'], raster_meta['width'])

    # Load shapefile
    train_df = gpd.read_file(shape_path)

    # Bring the vector data into the raster CRS when they differ
    if train_df.crs != raster_crs:
        print(f'Raster CRS: {raster_crs}, Vector CRS: {train_df.crs}.\n Convert vector and raster to the same CRS.')
        train_df = train_df.to_crs(raster_crs)

    # Inverse affine transform: map coordinates -> pixel coordinates
    to_pixel = ~transform

    def poly_from_utm(polygon):
        '''Convert a polygon's exterior ring to pixel coordinates (interior rings are not used).'''
        poly = unary_union(polygon)
        return Polygon([to_pixel * tuple(pt) for pt in np.array(poly.exterior.coords)])

    # Collect every polygon, flattening MultiPolygons into their parts
    poly_shp = []
    for geometry in train_df['geometry']:
        # Features without a usable geometry contribute nothing to the mask
        if geometry is None or geometry.is_empty:
            continue
        if geometry.geom_type == 'MultiPolygon':
            parts = geometry.geoms
        elif geometry.geom_type == 'Polygon':
            parts = [geometry]
        else:
            raise TypeError("Invalid geometry type")
        poly_shp.extend(poly_from_utm(part) for part in parts)

    # Polygons are already in pixel coordinates, so no transform is passed to rasterize
    if poly_shp:
        mask = rasterize(shapes=poly_shp, out_shape=im_size)
    else:
        mask = np.zeros(im_size, dtype=np.uint8)

    # apply erosion (3x3 kernel, one pass) to shrink each footprint slightly
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.erode(mask, kernel, iterations=1)
    mask = mask.astype('uint8')

    # Save or return mask
    if output_path is not None and file_name is not None:
        bin_mask_meta = raster_meta
        bin_mask_meta.update({'count': 1})
        # Write by full path instead of os.chdir so the working directory is left untouched
        with rasterio.open(os.path.join(output_path, file_name), 'w', **bin_mask_meta) as dst:
            dst.write(mask * 255, 1) # Change 255 to 1 if classes need to be 0 and 1
    else:
        return mask


def save_masks(images, mask, maskdir):
    '''Generate a mask file in `maskdir` for every .tif/.TIF entry of `images`.

    images = iterable of path objects (each must expose `.name`).
    mask = path to the vector file passed on to generate_mask.
    maskdir = output directory; created when it does not exist yet.'''
    if not os.path.exists(maskdir):
        os.makedirs(maskdir)
    for tile in tqdm(images):
        tile_name = tile.name
        # anything that is not a TIFF is left alone
        if not tile_name.endswith(('.TIF', '.tif')):
            continue
        # the mask is saved under the same file name as its source image
        generate_mask(raster_path=tile, shape_path=mask, output_path=maskdir, file_name=tile_name)

def copy_img_msk(image_path):
  '''Copies and saves images and masks as PNG tiles.

  image_path = iterable of image paths (pathlib-style objects).

  For each image, the PNGs are written two levels above the file, into
  'image_tiles' and 'mask_tiles'. The mask is looked up by replacing
  'os_images' with 'os_raster_masks' in the image path. Output names are the
  full source file name plus '.png' (e.g. 'tile.tif.png'). Existing outputs
  are left untouched.'''

  for fn in tqdm(image_path):
    output_path = fn.parent.parent

    # Create 'image_tiles' and 'mask_tiles' directories if they don't already exist
    os.makedirs(output_path/'image_tiles', exist_ok=True)
    os.makedirs(output_path/'mask_tiles', exist_ok=True)

    img_out = f'{output_path}/image_tiles/{fn.name}.png'
    msk_out = f'{output_path}/mask_tiles/{fn.name}.png'

    # Decode and save only what is missing, so a re-run skips finished tiles cheaply
    if not os.path.exists(img_out):
      img = np.array(PILImage.create(fn))
      Image.fromarray(img).save(img_out)
    if not os.path.exists(msk_out):
      msk_fn = str(fn).replace('os_images', 'os_raster_masks')
      msk = np.array(PILMask.create(msk_fn))
      Image.fromarray(msk).save(msk_out)

Now copy the source images and labels into folders with consistent names:

In [None]:
image_dir = f'{path}/os_images' # change this to your image directory
label_dir = f'{path}/os_labels' # change this to your label directory

# create the image and label directories if they do not exist
os.makedirs(image_dir, exist_ok=True)
os.makedirs(label_dir, exist_ok=True)


def _copy_renamed(src_dir, dst_dir, prefix, marker, extensions, new_extension):
  '''Copy matching files found under src_dir into dst_dir under a flat, consistent name.

  The new name is prefix + <folder suffix> + new_extension, where <folder suffix>
  is the part of the containing folder's name that follows `marker`.
  extensions = suffix (or tuple of suffixes) a file must end with to be copied.
  Files whose destination already exists are skipped.'''
  for root, _dirs, files in os.walk(src_dir):
    image_code = os.path.basename(root).partition(marker)[-1]
    new_file_path = os.path.join(dst_dir, prefix + image_code + new_extension)
    for file in files:
      if file.endswith(extensions) and not os.path.exists(new_file_path):
        shutil.copy(os.path.join(root, file), new_file_path)


for i in city_list:
  raster_dir = f'{path}/{i}/{i}_source' # change this to your raster directory
  vector_dir = f'{path}/{i}/{i}_labels' # change this to your vector directory

  # copy and rename all the raster files to the image directory
  _copy_renamed(raster_dir, image_dir, i, "source", (".tif", ".tiff"), ".tif")

  # copy and rename all the vector files to the label directory
  _copy_renamed(vector_dir, label_dir, i, "labels", ".geojson", ".geojson")

In [None]:
# Number of entries gathered in the image directory
image_count = len(os.listdir(image_dir))
image_count

4687

In [None]:
# Number of entries gathered in the label directory (should match the image count)
label_count = len(os.listdir(label_dir))
label_count

4687

In [None]:
# The masks must land in the 'os_raster_masks' folder next to 'os_images':
# copy_img_msk finds each mask by swapping 'os_images' for 'os_raster_masks' in the image path.
output_path = f"{path}/os_raster_masks"
os.makedirs(output_path, exist_ok=True)

And generate masks:

In [None]:
for i in city_list:

  raster_dir = f'{path}/{i}/{i}_source' # change this to your raster directory
  vector_dir = f'{path}/{i}/{i}_labels' # change this to your vector directory

  for folder, _subdirs, filenames in os.walk(raster_dir):
    # the tile code is the part of the folder name that follows "source"
    image_code = os.path.basename(folder).partition("source")[-1]
    file_name = i + image_code + ".tif"
    # the labels for a tile sit in the label folder carrying the same tile code
    shape_path = os.path.join(vector_dir, f"{i}_labels{image_code}", "vector_labels.geojson") # change this to match your geojson files

    for raster_name in filenames:
      if not raster_name.endswith((".tif", ".tiff")): # change this to match your raster files
        continue
      # skip tiles whose mask was already written by an earlier run
      if os.path.exists(os.path.join(output_path, file_name)):
        continue
      print(f"Processing {file_name}")
      generate_mask(os.path.join(folder, raster_name), shape_path, output_path, file_name)

Finally, export every image and its mask as PNG files into the `image_tiles` and `mask_tiles` folders:

In [None]:
# Gather every image file under image_dir, then write the image/mask PNG pairs
images_list = get_image_files(image_dir)
copy_img_msk(image_path=images_list)

100%|██████████| 4687/4687 [02:53<00:00, 27.07it/s]
