# Fill out missing chips

The initial extended evoland chips were gotten from AWS Element84 S2 COGs. This dataset is missing a bunch of recently reprocessed sentinel 2 data from the beginning of the data availability.

This notebook aims to fill in any gaps using data from the official CDSE repository. 

To do this we follow a few steps. 

1. Get all dates of the chips available already
2. Cross-reference with CDSE stac catalog
3. Use SH Processing API to query missing chips

To find the best way to do this we need to find out approximate numbers of how many chips are missing.

In [1]:
from pathlib import Path
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import math
import traceback

from tqdm import tqdm
import rasterio
import geopandas as gpd
from sentinelhub import (
    geo_utils,
    CRS,
    BBox,
    DataCollection,
    MimeType,
    MosaickingOrder,
    SentinelHubRequest,
    SHConfig,
    SentinelHubCatalog,
    SentinelHubDownloadClient,
)
from affine import Affine
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
samples = gpd.read_parquet("../data/samples.parquet")

In [3]:
config = SHConfig("default-profile")

In [4]:
evalscript_body = """
AVAILABLE = new Set();
NOT_AVAILABLE = new Set();
TO_INCLUDE_SET = new Set(TO_INCLUDE);

function setup() {
    return {
        input: [{
            bands: ["B02","B03","B04","B05","B06","B07","B08","B8A","B11","B12","SCL"],
            units: "DN"
        }],
        output: TO_INCLUDE.map(date => ({
            id: date,
            bands: 11,
            sampleType: "INT16"
        })),
        mosaicking: "ORBIT"
    };
}

function preProcessScenes(collections) {
    collections.scenes.orbits = collections.scenes.orbits.filter(function(orbit) {
        var orbitDateFrom = orbit.dateFrom.split("T")[0];
        var toGet = TO_INCLUDE_SET.has(orbitDateFrom);
        if(toGet){
          AVAILABLE.add(orbitDateFrom)
        }
        return toGet
    });
    return collections;
}

function updateOutputMetadata(scenes, inputMetadata, outputMetadata){
  outputMetadata.userData = Array.from(AVAILABLE)
}

function evaluatePixel(samples) {
    let dataOutputs = {};
    let sampleIndex = 0;
    
    for(let date of TO_INCLUDE){
        if(AVAILABLE.has(date) && sampleIndex < samples.length){
            let sample = samples[sampleIndex];
            dataOutputs[date] = [
                sample.B02, sample.B03, sample.B04, sample.B05, sample.B06,
                sample.B07, sample.B08, sample.B8A, sample.B11, sample.B12, sample.SCL
            ];
            sampleIndex++;
        } else {
            dataOutputs[date] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        }
    }
    return dataOutputs;
}
"""

In [5]:
def process_sample(sample, evalscript_body, config):
    """Process a single sample - designed to be run in parallel."""
    centroid = sample.geometry
    crs = CRS.get_utm_from_wgs84(centroid.x, centroid.y)
    x, y = geo_utils.transform_point((centroid.x, centroid.y), CRS.WGS84, crs)
    x_round = math.floor(x / 10) * 10
    y_round = math.ceil(y / 10) * 10
    resolution = 10
    size = 32
    bounds = (
        x_round - (size * resolution) // 2,
        y_round - (size * resolution) // 2,
        x_round + (size * resolution) // 2,
        y_round + (size * resolution) // 2,
    )

    bbox = BBox(bounds, crs=crs)
    time_interval = "2015-01-01", "2025-01-01"

    catalog = SentinelHubCatalog(config=config)
    search_iterator = catalog.search(
        DataCollection.SENTINEL2_L2A,
        bbox=bbox,
        time=time_interval,
        filter="eo:cloud_cover < 80",
        fields={
            "include": ["id", "properties.datetime", "properties.eo:cloud_cover"],
            "exclude": [],
        },
    )

    results = list(search_iterator)
    available_timestamps = list(
        set(item["properties"]["datetime"][0:10] for item in results)
    )

    available_timestamps.sort()

    list_of_requests = []
    for year in range(15, 25):
        year_ts = [ts for ts in available_timestamps if ts[2:4] == str(year)]
        evalscript = (
            "//VERSION 3\n" + "TO_INCLUDE = " + json.dumps(year_ts) + evalscript_body
        )

        request_all_bands = SentinelHubRequest(
            evalscript=evalscript,
            input_data=[
                SentinelHubRequest.input_data(
                    data_collection=DataCollection.SENTINEL2_L2A.define_from(
                        "s2l2a", service_url=config.sh_base_url
                    ),
                    time_interval=(f"20{year}-01-01", f"20{year + 1}-01-01"),
                    mosaicking_order=MosaickingOrder.LEAST_RECENT,
                )
            ],
            responses=[
                SentinelHubRequest.output_response(output, MimeType.TIFF)
                for output in year_ts
            ]
            + [SentinelHubRequest.output_response("userdata", MimeType.JSON)],
            bbox=bbox,
            size=(size, size),
            config=config,
        )
        list_of_requests.append(request_all_bands)

    download_list = [request.download_list[0] for request in list_of_requests]
    # download data with multiple threads
    data = SentinelHubDownloadClient(config=config).download(
        download_list, max_threads=5
    )

    out_path = Path(f"../data/tiffs/{sample.sample_id}")
    out_path.mkdir(parents=True, exist_ok=True)
    for all_bands_response in data:
        to_save = list(all_bands_response["userdata.json"].values())
        profile = {
            "driver": "GTiff",
            "dtype": "uint16",
            "nodata": 0.0,
            "width": 32,
            "height": 32,
            "count": 11,
            "crs": crs.epsg,
            "transform": Affine(10.0, 0.0, bounds[0], 0.0, -10.0, bounds[3]),
            "blockxsize": 512,
            "blockysize": 512,
            "tiled": True,
            "compress": "zstd",
            "interleave": "pixel",
        }

        for date in to_save:
            filename = f"{date}.tif"
            with rasterio.open(out_path / filename, "w", **profile) as dst:
                dst.write(np.transpose(all_bands_response[filename], (2, 0, 1)))

    return sample.sample_id, len(to_save), None

In [6]:
to_get = []
for sample_id in samples.sample_id:
    if not Path(f"../data/tiffs/{sample_id}").exists():
        to_get.append(sample_id)

In [7]:
to_get

[3714, 3770, 3780, 3790]

In [12]:
to_get = [3536, 3565, 3738, 3743, 3773, 3787]

In [13]:
samples = samples.query("sample_id in @to_get")

In [15]:
# Main execution with ThreadPoolExecutor
max_workers = 10  # Adjust based on your system and API rate limits

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {
        executor.submit(
            process_sample, sample, evalscript_body, config
        ): sample.sample_id
        for _, sample in samples.iterrows()
    }

    for future in tqdm(as_completed(futures)):
        sample_id = futures[future]
        try:
            result_id, count, error = future.result()
            if error:
                print(f"Sample {sample_id} failed: {error}")
            else:
                print(f"Sample {sample_id} succeeded: {count} files written")
        except Exception:
            # Now this will catch real exceptions with full stack traces
            print(f"Sample {sample_id} failed with exception:")
            traceback.print_exc()

1it [00:27, 27.61s/it]

Sample 3565 succeeded: 56 files written


2it [00:32, 14.44s/it]

Sample 3773 succeeded: 69 files written


3it [00:36,  9.74s/it]

Sample 3536 succeeded: 68 files written


4it [00:39,  7.06s/it]

Sample 3787 succeeded: 86 files written


5it [00:41,  5.02s/it]

Sample 3738 succeeded: 64 files written


6it [00:42,  7.01s/it]

Sample 3743 succeeded: 73 files written



