In [1]:
import logging
import os
from multiprocessing import Pool, Manager
import pandas as pd
from geopy import distance
from datetime import timedelta
import planetary_computer as pc
from pystac_client import Client
import rioxarray
import numpy as np
import cv2

In [2]:
# Root URL of the Planetary Computer STAC API queried by this notebook.
STAC_API_URL = "https://planetarycomputer.microsoft.com/api/stac/v1"

# Open the catalog once; get_images() searches it for every sample.
# pc.sign_inplace is handed to the client as its modifier -- presumably so
# that returned items carry signed asset URLs (confirm in the
# planetary_computer documentation).
catalog = Client.open(STAC_API_URL, modifier=pc.sign_inplace)

In [3]:
# get our bounding box to search latitude and longitude coordinates
def get_bounding_box(latitude, longitude, meter_buffer=3000):
    """
    Build a bounding box centred on a point.

    The box edges lie ``meter_buffer`` metres due south, west, north and
    east of ``(latitude, longitude)``, measured as ground distance.

    Returns a list of [minx, miny, maxx, maxy], i.e.
    [min longitude, min latitude, max longitude, max latitude].
    """
    centre = (latitude, longitude)
    reach = distance.distance(meters=meter_buffer)

    # Travel from the centre along each cardinal bearing; index 0 of a
    # destination point is its latitude and index 1 its longitude.
    south, west, north, east = (
        reach.destination(centre, bearing=heading) for heading in (180, 270, 0, 90)
    )

    return [west[1], south[0], east[1], north[0]]

In [4]:
# get our date range to search, and format correctly for query
def get_date_range(date, time_buffer_days=15):
    """Build the date-range string used to query the planetary computer.

    The range ends on the sample's date and begins ``time_buffer_days``
    days before it, so the sample date itself is included.

    Returns a string of the form "YYYY-MM-DD/YYYY-MM-DD"."""
    range_end = pd.to_datetime(date)
    range_start = range_end - timedelta(days=time_buffer_days)

    return "/".join(
        moment.strftime("%Y-%m-%d") for moment in (range_start, range_end)
    )

In [5]:
def crop_sentinel_image(item, bounding_box, band):
    """
    Crop one asset of a Sentinel-2 STAC item to a bounding box.

    Parameters
    ----------
    item : STAC item whose ``assets`` mapping contains ``band``.
    bounding_box : sequence ``(minx, miny, maxx, maxy)`` expressed in
        EPSG:4326 coordinates.
    band : key of the asset to read (get_images() passes "visual", "SCL"
        and "B08").

    Returns the cropped image as a numpy array with dimensions
    (band, height, width).
    """
    (minx, miny, maxx, maxy) = bounding_box

    # Open the raster as a context manager so the underlying file handle is
    # released after every crop instead of lingering in long-running workers.
    with rioxarray.open_rasterio(pc.sign(item.assets[band].href)) as raster:
        clipped = raster.rio.clip_box(
            minx=minx,
            miny=miny,
            maxx=maxx,
            maxy=maxy,
            crs="EPSG:4326",
        )
        # Materialise the pixels while the file is still open.
        return clipped.to_numpy()

In [6]:
# Get images
def get_images(row):
    """
    Given a row from the metadata, return 3 cropped images from Sentinel-2,
    taken from the most recent item found in the search window that ends on
    the sample date:

    1. True color image, dimensions (height, width, band)
    2. NIR image (asset "B08"), dimensions (height, width, band)
    3. Water mask with the height/width of the true color image and 3
       identical channels: 255 where the SCL asset equals 6, else 0

    ``row`` must expose ``latitude``, ``longitude`` and ``date``.

    Raises ValueError when the search returns nothing, or when none of the
    returned items contains the sample point.
    """
    bbox = get_bounding_box(row.latitude, row.longitude, meter_buffer=3000)
    date_range = get_date_range(row.date)

    # search the planetary computer sentinel-2-l2a collection
    search = catalog.search(
        collections=["sentinel-2-l2a"],
        bbox=bbox,
        datetime=date_range,
    )

    items = list(search.item_collection())
    if not items:
        # An empty frame would have no columns and fail below with an
        # opaque AttributeError; fail with an explicit reason instead.
        raise ValueError(
            f"no sentinel-2-l2a items found for {date_range} in bbox {bbox}"
        )

    # get details of all of the items returned
    item_details = pd.DataFrame(
        [
            {
                "datetime": item.datetime.strftime("%Y-%m-%d"),
                "platform": item.properties["platform"],
                "min_long": item.bbox[0],
                "max_long": item.bbox[2],
                "min_lat": item.bbox[1],
                "max_lat": item.bbox[3],
                "item_obj": item,
            }
            for item in items
        ]
    )

    # keep only the items whose bbox actually contains the sample location
    contains_sample_point = (
        (item_details.min_lat < row.latitude)
        & (item_details.max_lat > row.latitude)
        & (item_details.min_long < row.longitude)
        & (item_details.max_long > row.longitude)
    )
    candidates = item_details[
        contains_sample_point & item_details.platform.str.contains("Sentinel")
    ]
    if candidates.empty:
        raise ValueError(
            f"none of the {len(items)} items found contains the sample point "
            f"({row.latitude}, {row.longitude})"
        )

    # best item = most recent acquisition date
    item = candidates.sort_values(by="datetime", ascending=False).iloc[0].item_obj

    true_color = crop_sentinel_image(item, bbox, "visual")
    scl = crop_sentinel_image(item, bbox, "SCL")[0]
    nir = crop_sentinel_image(item, bbox, "B08")

    # (band, height, width) -> (height, width, band)
    nir = np.transpose(nir, axes=[1, 2, 0])
    visual = np.transpose(true_color, axes=[1, 2, 0])

    # SCL holds categorical class codes and is resized to the visual image's
    # height/width. Use nearest-neighbour: cv2.resize defaults to bilinear
    # interpolation, which blends neighbouring codes and can fabricate (or
    # erase) the value 6 along class boundaries.
    scl_resized = cv2.resize(
        scl,
        (visual.shape[1], visual.shape[0]),  # dsize is (width, height)
        interpolation=cv2.INTER_NEAREST,
    )
    water_mask = np.stack([scl_resized] * 3, -1) == 6
    water_mask = np.where(water_mask, 255, 0)

    # Return images
    return visual, nir, water_mask

In [7]:
# Save Image
def save_image(row, failed_points, output_dir='../data/downloaded/sentinel/'):
    """
    Download the images for one metadata row and save them as .npy files.

    Writes ``<uid>_NIR.npy``, ``<uid>_WaterMask.npy`` and
    ``<uid>_TrueColor.npy`` into ``output_dir`` (created if missing).

    Parameters
    ----------
    row : metadata row exposing ``uid`` plus the fields get_images() reads.
    failed_points : list-like with ``append``; receives ``row.uid`` when the
        download or the save raises. Such failures are logged and never
        propagate, so one bad sample cannot abort a batch run.
    output_dir : directory the arrays are written to.
    """
    uid = row.uid
    try:
        # images
        true_color, nir, water_mask = get_images(row)

        # save (np.save does not create missing directories)
        os.makedirs(output_dir, exist_ok=True)
        np.save(os.path.join(output_dir, f'{uid}_NIR.npy'), nir)
        np.save(os.path.join(output_dir, f'{uid}_WaterMask.npy'), water_mask)
        np.save(os.path.join(output_dir, f'{uid}_TrueColor.npy'), true_color)
    except Exception as e:
        # Best-effort by design, but report why the sample failed instead of
        # discarding the exception silently.
        logging.warning("sentinel download failed for %s: %r", uid, e)
        failed_points.append(uid)

In [8]:
# Make sure the failed-points file exists before the download run, seeding it
# with an empty array the first time. np.save does not create missing
# directories, so the parent directory is created first.
FAILED_POINTS_PATH = '../data/downloaded/failed/sentinel.npy'
if not os.path.exists(FAILED_POINTS_PATH):
    os.makedirs(os.path.dirname(FAILED_POINTS_PATH), exist_ok=True)
    np.save(FAILED_POINTS_PATH, np.array([]))

In [9]:
# One row per sample; the download step reads uid, latitude, longitude and
# date from each row.
METADATA_PATH = '../data/metadata.csv'
metadata = pd.read_csv(METADATA_PATH)

In [10]:
%%time
# Run the downloads in 16 worker processes. Using the pool as a context
# manager guarantees the workers are terminated even if task submission or
# the wait is interrupted; close()/join() inside the block still let every
# queued download finish on the normal path.
with Pool(processes=16) as p:
    # Manager-backed list: a proxy the worker processes can append failed
    # uids to. The manager is deliberately left running because
    # failed_points is still read in later cells.
    manager = Manager()
    failed_points = manager.list()

    for i, r in metadata.iterrows():
        p.apply_async(save_image, (r, failed_points, ))
    p.close()
    p.join()

ERROR 1: Request for 324644053-326073302 failed with response_code=206
ERROR 1: Request for 259143870-260328652 failed with response_code=206
ERROR 1: Request for 339245321-340673442 failed with response_code=206
ERROR 1: Request for 109288333-110041330 failed with response_code=206
ERROR 1: Request for 238243280-239419883 failed with response_code=206
ERROR 1: Request for 115127395-116092634 failed with response_code=206
ERROR 1: Request for 241735477-243079580 failed with response_code=206
ERROR 1: Request for 195398952-197316259 failed with response_code=206


CPU times: user 7.32 s, sys: 1.75 s, total: 9.07 s
Wall time: 2h 20min 3s


In [11]:
# number of samples whose download failed (uids appended by save_image)
len(failed_points)

7304

In [12]:
# share of all metadata rows whose download failed
len(failed_points)/len(metadata)

0.309885447602885

In [13]:
# persist the failed uids; this replaces whatever sentinel.npy held before
np.save('../data/downloaded/failed/sentinel.npy', failed_points)