### Import required libraries

In [1]:
import s3fs
import rioxarray
import rasterio
import rio_cogeo.cogeo
import xarray as xr
import re
from datetime import datetime
import pandas as pd
import boto3
import tempfile
import json

In [2]:
# One boto3 session/client pair is shared by every cell below.
session = boto3.Session()
s3_client = session.client("s3")

# Destination of the transformed COGs: bucket name and folder (key prefix).
bucket_name = "veda-data-store-staging"
FOLDER_NAME = "fldas_anomalies_SoilMoi00_10cm_tavg_cog"

# Log of (source netCDF key, generated COG name) pairs, filled by the conversion loop.
files_processed = pd.DataFrame(columns=["file_name", "COGs_created"])

In [3]:
def get_all_s3_keys(bucket, prefix="FLDAS/FLDAS_NOAH01_C_GL_MA.001/", client=None):
    """Return the keys of every ``.nc`` object stored under ``prefix`` in ``bucket``.

    The listing is paginated: requests are repeated with the continuation
    token of the previous response until no further token is returned.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Only keys starting with this prefix are listed.
        client: Object exposing ``list_objects_v2(**kwargs)``. When omitted,
            the notebook-level ``s3_client`` is used.

    Returns:
        A list of object keys ending in ``.nc``, in listing order. The list
        is empty when nothing matches.
    """
    if client is None:
        client = s3_client

    keys = []
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    while True:
        resp = client.list_objects_v2(**kwargs)

        # "Contents" is missing from a page that holds no objects, so fall
        # back to an empty list instead of raising KeyError.
        keys.extend(
            obj["Key"]
            for obj in resp.get("Contents", [])
            if obj["Key"].endswith(".nc")
        )

        # No continuation token means this was the last page.
        if "NextContinuationToken" not in resp:
            break
        kwargs["ContinuationToken"] = resp["NextContinuationToken"]

    return keys

# Fetch the keys of all netCDF files in the prod bucket that are to be transformed.
keys = get_all_s3_keys("gesdisc-cumulus-prod-protected")

In [4]:

# Name of the data variable that is read from each netCDF file and transformed.
var = "SoilMoi00_10cm_tavg"

### Reading all the files from S3, transforming them into COGs and storing them in the desired location

In [5]:
# Convert every netCDF file listed in `keys` into one COG per time step and
# upload each COG to s3://{bucket_name}/{FOLDER_NAME}/.

# A single filesystem handle is enough for all source files.
fs = s3fs.S3FileSystem(anon=False)

# Rows for the tracking dataframe; collected here and added in one go after
# the loop instead of growing the dataframe row by row.
new_records = []

for name in keys:
    filename = name.split("/")[-1]

    # Context managers make sure the S3 file object and the dataset are
    # closed once all time steps of this file have been written.
    with fs.open(f"gesdisc-cumulus-prod-protected/{name}") as fileobj, xr.open_dataset(
        fileobj, engine="h5netcdf"
    ) as source_ds:
        # Wrap longitudes into [-180, 180) and order the data by longitude.
        xds = source_ds.assign_coords(
            lon=(((source_ds.lon + 180) % 360) - 180)
        ).sortby("lon")

        for time_increment in range(len(xds.time)):
            data = xds[var].isel(time=time_increment)
            # Reverse the latitude axis before writing the raster.
            data = data.isel(lat=slice(None, None, -1))
            data.rio.set_spatial_dims("lon", "lat", inplace=True)
            data.rio.write_crs("epsg:4326", inplace=True)

            # Build the COG name from the source file name: drop the file
            # extension, replace the last remaining element with the date of
            # THIS time step, and insert the variable name at position 2.
            filename_elements = re.split("[_ .]", filename)
            filename_elements.pop()
            filename_elements[-1] = pd.to_datetime(
                xds.time.values[time_increment]
            ).strftime("%Y%m%d")
            filename_elements.insert(2, var)
            cog_filename = "_".join(filename_elements)
            # add extension
            cog_filename = f"{cog_filename}.tif"

            with tempfile.NamedTemporaryFile() as temp_file:
                data.rio.to_raster(
                    temp_file.name,
                    driver="COG",
                )
                s3_client.upload_file(
                    Filename=temp_file.name,
                    Bucket=bucket_name,
                    Key=f"{FOLDER_NAME}/{cog_filename}",
                )

            # Keep track of the files that were transformed.
            new_records.append({"file_name": name, "COGs_created": cog_filename})

            print(f"Generated and saved COG: {cog_filename}")

# Add all new rows to the tracking dataframe through the public pandas API.
files_processed = pd.concat(
    [files_processed, pd.DataFrame(new_records, columns=files_processed.columns)],
    ignore_index=True,
)

Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198201_19820101.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198202_19820201.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198203_19820301.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198204_19820401.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198205_19820501.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198206_19820601.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198207_19820701.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198208_19820801.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198209_19820901.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198210_19821001.tif
Generated and saved COG: FLDAS_NOAH01_SoilMoi00_10cm_tavg_C_GL_MA_ANOM198211_19821101.tif
Generated 

### Fetch the metadata from the netCDF files, save it as a JSON file, and upload it along with a CSV file listing all the transformed files

In [8]:
# Collect the metadata of `xds` into a single JSON object. `xds` is the
# dataset left over from the conversion loop, i.e. the last file processed.
# Dumping several objects back to back into one file would not produce a
# valid JSON document, so everything is merged into one dict and dumped once.
metadata = {
    **xds.attrs,
    "data_dimensions": dict(xds.sizes),
    "data_variables": list(xds.data_vars),
}

with tempfile.NamedTemporaryFile(mode="w+") as fp:
    json.dump(metadata, fp)
    # Flush so the content is on disk before the file is uploaded by name.
    fp.flush()

    s3_client.upload_file(
        Filename=fp.name,
        Bucket=bucket_name,
        Key=f"{FOLDER_NAME}/metadata.json",
    )

# Upload the list of transformed files next to the COGs.
files_processed.to_csv(
    f"s3://{bucket_name}/{FOLDER_NAME}/files_converted.csv",
)
print("Done generating COGs")