In [1]:
import os
from multiprocessing import Pool, Manager
import pandas as pd
from geopy import distance
from datetime import timedelta
import planetary_computer as pc
from pystac_client import Client
import rioxarray
import numpy as np
import cv2
from tqdm import tqdm
import warnings

In [2]:
catalog = Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1", modifier=pc.sign_inplace
)

In [3]:
# get our bounding box to search latitude and longitude coordinates
def get_bounding_box(latitude, longitude, meter_buffer=3000):
    """
    Given a latitude, longitude, and buffer in meters, returns a bounding
    box around the point with the buffer on the left, right, top, and bottom.

    Returns a list of [minx, miny, maxx, maxy]
    """
    distance_search = distance.distance(meters=meter_buffer)

    # calculate the lat/long bounds based on ground distance
    # bearings are cardinal directions to move (south, west, north, and east)
    min_lat = distance_search.destination((latitude, longitude), bearing=180)[0]
    min_long = distance_search.destination((latitude, longitude), bearing=270)[1]
    max_lat = distance_search.destination((latitude, longitude), bearing=0)[0]
    max_long = distance_search.destination((latitude, longitude), bearing=90)[1]

    return [min_long, min_lat, max_long, max_lat]

In [4]:
# get our date range to search, and format correctly for query
def get_date_range(date, time_buffer_days=15):
    """Get a date range to search for in the planetary computer based
    on a sample's date. The time range will include the sample date
    and time_buffer_days days prior

    Returns a string"""
    datetime_format = "%Y-%m-%d"
    range_start = pd.to_datetime(date) - timedelta(days=time_buffer_days)
    date_range = f"{range_start.strftime(datetime_format)}/{pd.to_datetime(date).strftime(datetime_format)}"

    return date_range

In [5]:
def crop_sentinel_image(item, bounding_box, band):
    """
    Given a STAC item from Sentinel-2 and a bounding box tuple in the format
    (minx, miny, maxx, maxy), return a cropped portion of the item's visual
    imagery in the bounding box.

    Returns the image as a numpy array with dimensions (color band, height, width)
    """
    (minx, miny, maxx, maxy) = bounding_box

    image = rioxarray.open_rasterio(pc.sign(item.assets[band].href)).rio.clip_box(
        minx=minx,
        miny=miny,
        maxx=maxx,
        maxy=maxy,
        crs="EPSG:4326",
    )

    return image.to_numpy()

In [6]:
# Get images
def get_images(row):
    """
    Given a row from the metadada, return 2 cropped images from sentinel
    1. True color image
    2. Water mask
    """
    bbox = get_bounding_box(row.latitude, row.longitude, meter_buffer=3000)
    date_range = get_date_range(row.date)
    
    # search the planetary computer sentinel-l2a and landsat level-2 collections
    search = catalog.search(
        collections=["sentinel-2-l2a"], 
        bbox=bbox, 
        datetime=date_range,
        query={"eo:cloud_cover": {"lt": 10}}
    )
    
    # get items
    items = [item for item in search.item_collection()]
    
    # get details of all of the items returned
    item_details = pd.DataFrame(
        [
            {
                "datetime": item.datetime.strftime("%Y-%m-%d"),
                "platform": item.properties["platform"],
                "min_long": item.bbox[0],
                "max_long": item.bbox[2],
                "min_lat": item.bbox[1],
                "max_lat": item.bbox[3],
                "bbox": item.bbox,
                "item_obj": item,
            }
            for item in items
        ]
    )
    
    # check which rows actually contain the sample location
    item_details["contains_sample_point"] = (
        (item_details.min_lat < row.latitude)
        & (item_details.max_lat > row.latitude)
        & (item_details.min_long < row.longitude)
        & (item_details.max_long > row.longitude)
    )
    item_details = item_details[item_details["contains_sample_point"]]
    item_details[["datetime", "platform", "contains_sample_point", "bbox"]].sort_values(
        by="datetime"
    )
    
    #Get best item
    best_item = (
    item_details[item_details.platform.str.contains("Sentinel")]
    .sort_values(by="datetime", ascending=False)
    .iloc[0]
    )
    item = best_item.item_obj

    bbox = get_bounding_box(row.latitude, row.longitude, meter_buffer=1000)
    true_color = crop_sentinel_image(item, bbox, "visual")
    scl = crop_sentinel_image(item, bbox, "SCL")
    
    # transpose
    visual = np.transpose(true_color, axes=[1, 2, 0]).astype(np.uint8)
    
    # Return images
    return visual, scl[0]

In [7]:
def get_features(row):
    img, wm = get_images(row)
    water_scaled = np.stack([cv2.resize(wm, (img.shape[1], img.shape[0]))] * 3, -1) == 6
    f = {}
    if water_scaled.sum() == 0:
        f['uid'] = row.uid
        f['r'] = np.nan
        f['g'] = np.nan
        f['b'] = np.nan
        f['gmax'] = np.nan
        f['gmin'] = np.nan
        f['gvr'] = np.nan
        f['gvb'] = np.nan
        f['rvb'] = np.nan
        f['gmaxvb'] = np.nan
        f['gminvb'] = np.nan
    else:
        f['uid'] = row.uid
        f['r'] = img[:, :, 0][water_scaled[:, :, 0]].mean()
        f['g'] = img[:, :, 1][water_scaled[:, :, 1]].mean()
        f['b'] = img[:, :, 2][water_scaled[:, :, 2]].mean()
        f['gmax'] = np.percentile(img[:, :, 1][water_scaled[:, :, 1]], 95)
        f['gmin'] = np.percentile(img[:, :, 1][water_scaled[:, :, 1]], 5)
        f['gvr'] = f['g'] / f['r']
        f['gvb'] = f['g'] / f['b']
        f['rvb'] = f['r'] / f['b']
        f['gmaxvb'] = f['gmax'] / f['b']
        f['gminvb'] = f['gmin'] / f['b']
    
    return f

In [8]:
# Save Image
def save_features(row, failed_points, features):
    """
    Given a row, save the features generated
    """
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            features.append(get_features(row))
    except Exception as e:
        #print(e)
        failed_points.append({'uid': row['uid']})

In [9]:
metadata = pd.read_csv('../data/metadata.csv')

In [10]:
%%time
def save_features_wrapper(args):
    r, failed_points, features = args
    save_features(r, failed_points, features)

# Utilizamos una lista compartida para almacenar los puntos válidos
manager = Manager()
failed_points = manager.list()
features = manager.list()

head = len(metadata.head(100))

# Obtener el número total de filas
total_rows = len(metadata.head(head))

# Crear un iterable de argumentos para el método map
args = [(r, failed_points, features) for _, r in metadata.head(head).iterrows()]

# Crear un Pool de procesos
with Pool(processes=32) as pool:
    # Utilizar tqdm para la barra de progreso
    with tqdm(total=total_rows) as pbar:
        # Mapear la función sobre los argumentos
        for _ in pool.imap_unordered(save_features_wrapper, args):
            pbar.update(1)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.15it/s]

CPU times: user 77.6 ms, sys: 136 ms, total: 214 ms
Wall time: 16.5 s





In [11]:
features_df = pd.DataFrame.from_records(features)
#features_df.to_csv('../data/downloaded/sentinel/features.csv')

In [12]:
failed_df = pd.DataFrame.from_records(failed_points)
#failed_df.to_csv('../data/downloaded/failed/sentinel.csv')

In [13]:
len(failed_points)

50

In [14]:
len(features)

50

In [15]:
len(failed_points)/len(metadata.head(head))

0.5

In [16]:
len(features)/len(metadata.head(head))

0.5

In [17]:
for i in features:
    print(i)

{'uid': 'abau', 'r': nan, 'g': nan, 'b': nan, 'gmax': nan, 'gmin': nan, 'gvr': nan, 'gvb': nan, 'rvb': nan, 'gmaxvb': nan, 'gminvb': nan}
{'uid': 'aabm', 'r': 33.81514873921219, 'g': 52.5508413287691, 'b': 38.32712983944844, 'gmax': 62.0, 'gmin': 47.0, 'gvr': 1.5540621079046422, 'gvb': 1.3711134005834378, 'rvb': 0.8822770940809592, 'gmaxvb': 1.617653089592587, 'gminvb': 1.2262854066266384}
{'uid': 'aajk', 'r': 60.40432623186511, 'g': 78.22376159979088, 'b': 54.2003659652333, 'gmax': 87.0, 'gmin': 72.0, 'gvr': 1.295002634406101, 'gvb': 1.4432330890527072, 'rvb': 1.1144634386899035, 'gmaxvb': 1.6051552134501443, 'gminvb': 1.3284043145794298}
{'uid': 'aarq', 'r': 47.59121621621622, 'g': 41.600429975429975, 'b': 31.330773955773957, 'gmax': 163.0, 'gmin': 5.0, 'gvr': 0.8741199171383028, 'gvb': 1.327781753305951, 'rvb': 1.5189926774038602, 'gmaxvb': 5.202552615843079, 'gminvb': 0.15958750355346868}
{'uid': 'aalr', 'r': 188.94818652849742, 'g': 182.5181347150259, 'b': 165.0777202072539, 'gmax