# Chapter-2: Prepare Training Data

In this chapter, we will go through how to download high latitude dust (HLD) data from S3 buckets, preprocess it, and split it for training an ML model.
After completing this chapter, you will be familiar with the process of transferring images into ImageLabeler for analysis. The following objectives are part of this chapter:
- Review and execute code that will download pre-prepared high latitude dust data from an S3 bucket.
- Learn the types of constant variables and the format of each, which are needed to complete the data download.
- Review and execute the code necessary to prepare the environment for data transfer.
- Review and execute code that generates helper methods that facilitate the download and the visualization of the data.
- Review and execute code that checks the downloaded data and prepares the splits.

The flow of the process is illustrated in this diagram:

<img src="workflow.png">



## Install requirements

In [None]:
# Notebook shell escape: install the packages listed in chapter-3's requirements file.
!pip install -r ../chapter-3/src/requirements.txt

## Import Packages

In [2]:
import boto3
import fiona

import math
import numpy as np
import os
import random
import rasterio.features
import cv2
import numpy.ma as ma

import re
import requests
import shutil

from datetime import datetime
from glob import glob
from io import BytesIO
from IPython.display import Image as Display
from PIL import Image
from rasterio.warp import calculate_default_transform
from matplotlib import pyplot as plt

## Setup Constant variables

In [3]:
# Placeholder: replace <account number> with the AWS account number that owns the role.
ACCOUNT_NUMBER = <account number>
ROLE_NAME = "notebookAccessRole"
# ARN handed to STS assume_role in assumed_role_session().
ROLE_ARN = f"arn:aws:iam::{ACCOUNT_NUMBER}:role/{ROLE_NAME}"
SOURCE_BUCKET = "impact-datashare" # identifier for IMPACT's s3 buckets

# NOTE: URL below requests the MODIS Aqua TrueColor layer. If the shapefiles were not
# generated from Aqua TrueColor imagery, use the image_url helper (defined later in
# this notebook) to create a valid url.
# Local root folder; the shapefile, image and split folders are created beneath it.
DATA_FOLDER = "../chapter-3/data"
# S3 key prefix listed by download_dataset().
EVENT = "hld-labeled"
# Sub-folders of DATA_FOLDER for downloaded images/bitmaps and for shapefiles.
IMAGE_FOLDER = "images"
SHAPEFILE_FOLDER = "shapefiles"
# GIBS WMS GetMap template; modis_url() fills the placeholders in the order
# width, height, bbox, time.
URL = "https://gibs.earthdata.nasa.gov/wms/epsg4326/best/wms.cgi?SERVICE=WMS&REQUEST=GetMap&layers=MODIS_Aqua_CorrectedReflectance_TrueColor&version=1.3.0&crs=EPSG:4326&transparent=false&width={}&height={}&bbox={}&format=image/tiff&time={}"
# Kilometres per degree at the equator; calculate_width_height() uses it to turn
# degree spans into kilometres.
KM_PER_DEG_AT_EQ = 111.
# Pixel resolution (km/pixel) that download_image() passes to modis_url().
RESOLUTION = 0.25

## Setup environment for Notebook Access

In [4]:
def assumed_role_session():
    """Return a boto3 session that runs under the "notebookAccessRole" role.

    Asks STS for temporary credentials for ROLE_ARN (the role created using
    AWS CDK in chapter-0) and builds a session in 'us-east-1' from them.
    """
    sts_client = boto3.client('sts')
    response = sts_client.assume_role(RoleArn=ROLE_ARN, RoleSessionName=ROLE_NAME)
    credentials = response['Credentials']
    return boto3.session.Session(
        region_name='us-east-1',
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken'],
    )


### Create and Delete folders

In [5]:
def mkdir(foldername):
    """
    Create 'foldername' (including missing parent folders) unless it already
    exists; prints which of the two happened.
    """
    if not os.path.exists(foldername):
        os.makedirs(foldername)
        print(f"Created folder: {foldername}")
    else:
        print(f"'{foldername}' folder already exists.")

    
def delete_folder(foldername):
    """Remove 'foldername' with everything inside it; report when it is missing."""
    if not os.path.exists(foldername):
        print(f"Folder {foldername} doesn't exist.")
        return
    shutil.rmtree(foldername)
    
    

## Image Querying Operations

This set of helper functions is used to query GIBS for MODIS imagery with the required query parameters, calculate the image dimensions of a given extent, and convert shapefiles into binary rasters (0 outside the shape, 255 inside) corresponding to the shape.

In [None]:
def image_url(query_date, bbox, sensor, product, width, height):
    """
    Create a GIBS WMS GetMap url that returns a JPEG image for the query parameters.
    query_date: value sent as the WMS TIME parameter
    bbox: value sent as the WMS BBOX parameter (the request uses CRS EPSG:4326)
    sensor: name inserted into the layer name, EG: Aqua
    product: product inserted into the layer name, EG: TrueColor
    width: requested image width in pixels
    height: requested image height in pixels
    returns: the url as a string, with the query parameters url-encoded
    """
    # Imported here because `urlencode` is not among the notebook's imports;
    # without it this function raises NameError.
    from urllib.parse import urlencode

    BASE_URL = 'https://gibs.earthdata.nasa.gov/wms/epsg4326/best/wms.cgi'
    param_dict = {
        "BBOX": bbox,
        "CRS": "EPSG:4326",
        "FORMAT": "image/jpeg",
        "HEIGHT": height,
        "LAYERS": f"MODIS_{sensor}_CorrectedReflectance_{product}",
        "REQUEST": "GetMap",
        "SERVICE": "WMS",
        "TIME": query_date,
        "TRANSPARENT": "false",
        "VERSION": "1.3.0",
        "WIDTH": width,
    }

    return f"{BASE_URL}?{urlencode(param_dict)}"


def calculate_width_height(extent, resolution):
    """
    Compute the pixel size of an image that covers `extent` at `resolution`.
    extent: [lower_latitude, left_longitude, higher_latitude, right_longitude], EG: [51.46162974683544,-22.94768591772153,53.03698575949367,-20.952234968354432]
    resolution: represents the pixel resolution, i.e. km/pixel. Should be a value from this list: [0.03, 0.06, 0.125, 0.25, 0.5, 1, 5, 10]
    returns: (width, height) in whole pixels
    """
    latitudes, longitudes = extent[::2], extent[1::2]
    # a degree of longitude shrinks with the cosine of the (mean) latitude
    mean_latitude_rad = np.pi * np.mean(latitudes) / 180.
    km_per_deg_longitude = KM_PER_DEG_AT_EQ * np.cos(mean_latitude_rad)
    longitude_span = longitudes[1] - longitudes[0]
    latitude_span = latitudes[1] - latitudes[0]
    return (
        int(longitude_span * km_per_deg_longitude / resolution),
        int(latitude_span * KM_PER_DEG_AT_EQ / resolution),
    )


def modis_url(time, extent, resolution):
    """
    Build the GIBS request url for an extent, together with the image size.
    time: utc time in iso format EG: 2020-02-19T00:00:00Z
    extent: [lower_latitude, left_longitude, higher_latitude, right_longitude], EG: [51.46162974683544,-22.94768591772153,53.03698575949367,-20.952234968354432]
    resolution: represents the pixel resolution, i.e. km/pixel. Should be a value from this list: [0.03, 0.06, 0.125, 0.25, 0.5, 1, 5, 10]
    returns: (width, height, url)
    """
    width, height = calculate_width_height(extent, resolution)
    bbox = ','.join(str(value) for value in extent)
    url = URL.format(width, height, bbox, time)
    return (width, height, url)


def bitmap_from_shp(fiona_shape, transform, img_shape, filename):
    """ Rasterize the shapefile's geometries into a bitmap and save it as a PNG.

    Pixels covered by a geometry are burned with the value 255. The result is
    written to DATA_FOLDER/IMAGE_FOLDER, named after `filename` with '.shp'
    replaced by '_bitmap.png'.
    Args:
        fiona_shape (Collection): fiona shape collection obtained by fiona.open()
        transform (rasterio.transform.Affine): rasterio transform object
        img_shape (tuple): (width, height) of the image in pixels
        filename (str): file name of the shapefile
    """
    bitmap_filename = filename.replace('.shp', '_bitmap.png')
    # raster the geoms onto a bitmap
    geom_map = [(shape["geometry"], 255) for shape in fiona_shape]
    y_mtx = rasterio.features.rasterize(
        geom_map,
        # rasterize expects (rows, cols), i.e. (height, width)
        out_shape=(img_shape[1], img_shape[0]),
        transform=transform
    )
    img = Image.fromarray(y_mtx)
    print(f"Preparing Bitmap: {filename}")
    img.save(f"{DATA_FOLDER}/{IMAGE_FOLDER}/{bitmap_filename}")
    

def explode(coords):
    """
    Walk a GeoJSON geometry's (arbitrarily nested) coordinates object and yield
    each innermost coordinate sequence. Works for any geometry type as long as
    the input is conforming.
    """
    for item in coords:
        if not isinstance(item, (float, int)):
            # still a container: descend one level
            yield from explode(item)
            continue
        # a number means `coords` itself is a coordinate; emit it once and stop
        yield coords
        return


def extract_bbox(fiona_shape, offset=0):
    """
    Extract the bounding box of a shape's coordinates, widened by `offset` on
    every side. Returns (min_y, min_x, max_y, max_x).
    """
    points = list(explode(fiona_shape['geometry']['coordinates']))
    xs, ys = zip(*points)
    return (
        min(ys) - offset,
        min(xs) - offset,
        max(ys) + offset,
        max(xs) + offset,
    )


def download_image(date, bounding_box, shapefile_name):
    """
    Download images from gibs (https://earthdata.nasa.gov/eosdis/science-system-description/eosdis-components/gibs)
    date: date of event
    bounding_box: [lower_latitude, left_longitude, higher_latitude, right_longitude], EG: [51.46162974683544,-22.94768591772153,53.03698575949367,-20.952234968354432]
    shapefile_name: file name of the shapefile; the image is stored in
        DATA_FOLDER/IMAGE_FOLDER under the same name with a '.tiff' extension
    returns: (width, height, file_name) - image size in pixels and the path written
    raises: requests.HTTPError when the server answers with an error status
    """
    width, height, url = modis_url(date, bounding_box, RESOLUTION)
    print(url)
    response = requests.get(url)
    response.raise_for_status()
    # Swap only the extension: replacing every 'shp' in the name would also
    # rewrite other parts of it, and the tiff would no longer pair with the
    # '<name>_bitmap.png' label derived from the same shapefile name.
    base_name, _ = os.path.splitext(shapefile_name)
    file_name = f"{DATA_FOLDER}/{IMAGE_FOLDER}/{base_name}.tiff"
    print(f"Downloading Image: {file_name}")
    with open(file_name, 'wb') as img_file:
        img_file.write(response.content)
    return width, height, file_name

## Create training, Validation and Test Splits

When training an ML model, the danger is that your model may overfit to the training set. That is, the model might learn an overly specific function that performs well on your training data but does not generalize to images it has never seen. This is called overfitting.

The train, validation, and testing splits are built to combat overfitting.


### Train Split
The training set is the largest portion of your dataset, which you reserve for training your model. After training, inference on these images should be taken with a grain of salt, since the model has already had a chance to look at and memorize the correct output.

### Validation Split
The validation set is a separate section of your dataset that you will use during training to get a sense of how well your model is doing on images that are not being used in training.

### Test Split
After all of the training experiments have concluded, you have probably gotten a sense of how your model might do on the validation set. But it is important to remember that the validation set metrics may have influenced you during the creation of the model, and in this sense you might, as a designer, overfit the new model to the validation set. The test set is therefore held out and used only after all experiments are finished, to give an unbiased estimate of how the model performs on images it has never seen.

In [7]:
def create_split(split, files):
    """
    Rebuild the folder of one split from scratch and copy each image together
    with its '_bitmap.png' label into it.
    split: choice of "train", "test", and "val"
    files: list of tiff file paths

    """
    print(f'Preparing {split} split with {len(files)} examples.')
    folder_name = f"{DATA_FOLDER}/{split}"
    # start from an empty folder so files from an earlier run don't linger
    if os.path.exists(folder_name):
        delete_folder(folder_name)
    mkdir(folder_name)
    for image_path in files:
        label_path = image_path.replace('.tiff', '_bitmap.png')
        for source_path in (image_path, label_path):
            target_name = source_path.split('/')[-1]
            shutil.copyfile(source_path, f"{folder_name}/{target_name}")
         
# prepare train, val, and test splits
def prepare_splits(source_folder, splits=None):
    """ Creates training, validation and test splits from `source folder`

    source_folder: folder that holds the '.tiff' images (and their bitmaps)
    splits: fractions per split with keys 'train' and 'val'; defaults to
        {'train': 0.6, 'val': 0.2, 'test': 0.2}. The test split receives
        whatever is left after train and val, so a 'test' value is not read.
    """
    if splits is None:
        splits = {'train': 0.6, 'val': 0.2, 'test': 0.2}
    files = glob(f"{source_folder}/*.tiff")
    print(f"Total examples found: {len(files)}")
    random.shuffle(files)
    length = len(files)
    train_limit = math.ceil(length * splits['train'])
    # the validation slice is sized by the 'val' fraction, not 'train'
    val_limit = train_limit + math.ceil(length * splits['val'])
    create_split('train', files[:train_limit])
    create_split('val', files[train_limit:val_limit])
    # the remainder goes to test, so test never overlaps validation
    create_split('test', files[val_limit:])
    

## Download shapefiles from S3 bucket and images from WorldView
- This function downloads the shapefiles that we previously labeled using ImageLabeler.
- The date-time and bounds of the shapefiles are found, and the corresponding GIBS imagery is downloaded.
- The shapefiles are converted into bitmaps, which will serve as training labels.
- Finally, the images are stored in `DATA_FOLDER`.


In [8]:
def prepare_datasets(boto_session):
    """
    Download and prepare images from available shapefiles
    boto_session: Boto session currently in use.

    Steps:
    1. download every object under the 'hld/' prefix of SOURCE_BUCKET into
       DATA_FOLDER/SHAPEFILE_FOLDER (files already present are skipped)
    2. for each '.shp' file download the matching GIBS image and rasterize the
       shapefile into a '_bitmap.png' label next to it
    An image whose bitmap cannot be produced is deleted again.
    """
    s3_connection = boto_session.resource('s3')
    bucket = s3_connection.Bucket(SOURCE_BUCKET)
    objects = list(bucket.objects.filter(Prefix="hld/"))
    foldername = f"{DATA_FOLDER}/{SHAPEFILE_FOLDER}"
    mkdir(foldername)
    for iter_object in objects:
        print(iter_object.key)
        splits = iter_object.key.split('/')
        # one local folder per second-level key component
        local_foldername = f"{foldername}/{splits[1]}"
        mkdir(local_foldername)
        filename = f"{local_foldername}/{splits[-1]}"
        if os.path.exists(filename):
            print(f"File already exists. {filename}")
        else:
            bucket.download_file(iter_object.key, filename)
    mkdir(f"{DATA_FOLDER}/{IMAGE_FOLDER}")
    for shapefilename in glob(f"{foldername}/*/*.shp"):
        filename = shapefilename.split('/')[-1]
        # read the date from the file name only, so underscores in the folder
        # part of the path cannot shift the field
        date = filename.split('_')[1]
        with fiona.open(shapefilename, "r") as shapefile:
            bounds = shapefile.bounds
            # download_image expects [lower_lat, left_lon, higher_lat, right_lon];
            # swap each x/y pair
            bounds = [bounds[1], bounds[0], bounds[3], bounds[2]]
            width, height, image_filename = download_image(date, bounds, filename)
            try:
                with rasterio.open(image_filename) as src:
                    bitmap_from_shp(shapefile, src.transform, (width, height), filename)
            except Exception as error:
                # Exception (not a bare except) keeps Ctrl-C / kernel interrupts
                # working; the cause is printed instead of being hidden
                print(f"Unable to download file: {image_filename} ({error})")
                os.remove(image_filename)


## Download processed images that are stored in the S3 bucket, in case the above processing step fails

In [9]:
def download_dataset(boto_session):
    """
    Download and store data in folders.
    boto_session: Boto session currently in use.

    Every object under the '<EVENT>/' prefix of SOURCE_BUCKET is saved into
    DATA_FOLDER/IMAGE_FOLDER under its base name.
    """
    # use the session that was passed in rather than a notebook-level global
    s3_connection = boto_session.resource('s3')
    bucket = s3_connection.Bucket(SOURCE_BUCKET)
    objects = list(bucket.objects.filter(Prefix=f"{EVENT}/"))
    foldername = f"{DATA_FOLDER}/{IMAGE_FOLDER}"
    mkdir(foldername)
    for iter_object in objects:
        print(iter_object.key)
        base_name = iter_object.key.split('/')[-1]
        # keys ending in '/' have an empty base name and carry no file to save
        if base_name:
            bucket.download_file(iter_object.key, f"{foldername}/{base_name}")


## Putting it all together - Creating a session with permissions and preparing the dataset

In [10]:
session = assumed_role_session()  # Create an AWS session with appropriate permissions
# Alternative to download_dataset: uncomment to build the dataset from the raw shapefiles instead.
# prepare_datasets(session) # preprocess shapefiles and images to ML ready format
download_dataset(session) # download preprocessed dataset
prepare_splits(f"{DATA_FOLDER}/{IMAGE_FOLDER}")  # create train/val/test split folders under DATA_FOLDER

Created folder: ../chapter-3/data/images
hld-labeled/
hld-labeled/high-latitude-dust_2002-10-22_276.tiff
hld-labeled/high-latitude-dust_2002-10-22_276_bitmap.png
hld-labeled/high-latitude-dust_2002-10-23_274.tiff
hld-labeled/high-latitude-dust_2002-10-23_274_bitmap.png
hld-labeled/high-latitude-dust_2003-01-19_53.tiff
hld-labeled/high-latitude-dust_2003-01-19_53_bitmap.png
hld-labeled/high-latitude-dust_2003-01-19_77.tiff
hld-labeled/high-latitude-dust_2003-01-19_77_bitmap.png
hld-labeled/high-latitude-dust_2003-01-19_admin_77.tiff
hld-labeled/high-latitude-dust_2003-01-19_admin_77_bitmap.png
hld-labeled/high-latitude-dust_2003-03-09_270.tiff
hld-labeled/high-latitude-dust_2003-03-09_270_bitmap.png
hld-labeled/high-latitude-dust_2003-03-11_268.tiff
hld-labeled/high-latitude-dust_2003-03-11_268_bitmap.png
hld-labeled/high-latitude-dust_2003-03-12_263.tiff
hld-labeled/high-latitude-dust_2003-03-12_263_bitmap.png
hld-labeled/high-latitude-dust_2003-03-12_265.tiff
hld-labeled/high-latitude

## Visualize downloads

In [None]:
def get_test_data(num_samples=5):
    """ Samples 'num_samples' # of test datasets from the test data split,
    returns the images and the labels

    num_samples: maximum number of image/label pairs to return
    returns: (images, labels) - two lists of arrays as read by cv2.imread
    """
    # read from the folder the test split is written to (DATA_FOLDER/test),
    # not from a 'data/test' folder relative to the working directory
    all_images = glob(f'{DATA_FOLDER}/test/*.tif*')
    random.shuffle(all_images)
    test_array = []
    bmp_array = []
    for test_image in all_images[:num_samples]:
        # NOTE(review): cv2.imread returns channels in BGR order; confirm
        # whether callers that display these arrays expect RGB.
        test_array.append(cv2.imread(test_image))
        bmp_array.append(cv2.imread(test_image.replace('.tiff', '_bitmap.png')))
    return test_array, bmp_array

modis_batch, bmp_batch = get_test_data()

# One figure per sample: the image on the left, the image with the label
# layer drawn on top on the right.
for modis_image, bmp_data in zip(modis_batch, bmp_batch):
    rgb_image = modis_image.astype('uint8')
    fig, axes = plt.subplots(1, 2, constrained_layout=True, dpi=100)
    for axis in axes:
        axis.imshow(rgb_image)
        axis.xaxis.set_ticks([])
        axis.yaxis.set_ticks([])
    axes[0].set_title('RGB Image')
    axes[1].set_title('SME label overlay')
    # NOTE(review): the mask hides the non-zero (labelled) pixels and draws the
    # zero pixels - confirm this is the intended overlay.
    overlay = ma.masked_where(bmp_data != 0, bmp_data)[:, :, 0]
    axes[1].imshow(overlay, alpha=0.35, cmap='Purples')

# Note the trained model name in the DESTINATION_BUCKET dashboard

This will be used in the next chapter to deploy and infer from the model.

In [None]:
# Folder (key prefix) inside the bucket that holds the TensorBoard logs.
LOG_FOLDER = "tensorboard_logs"
# NOTE(review): BUCKET_NAME and sagemaker_session are not defined in the visible
# part of this notebook - define them before running this cell.
TENSORFLOW_LOGS_PATH = f"s3://{BUCKET_NAME}/{LOG_FOLDER}"

aws_region = sagemaker_session.boto_region_name
# Notebook shell escape: start TensorBoard on the S3 log path with AWS_REGION set.
!AWS_REGION={aws_region} tensorboard --logdir {TENSORFLOW_LOGS_PATH}