In [None]:
import os
import json
import s3fs
import numpy as np
import xarray as xr
import hashlib
from datetime import datetime, timezone
from typing import Union, List, Optional
import math

In [None]:
# Connection settings for the MinIO/S3 object store.
# Each value can be overridden through an environment variable of the same name,
# so real credentials never need to be committed with the notebook. The fallbacks
# are the values that were previously hard-coded here.
MINIO_ENDPOINT_URL = os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000")  # Your MinIO endpoint
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME", "fusion-lake")

In [None]:
# Keyword arguments for s3fs.S3FileSystem (also passed as xarray storage options),
# assembled from the connection settings defined above.
s3_options = dict(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    client_kwargs=dict(endpoint_url=MINIO_ENDPOINT_URL),
    # Path-style addressing: important for MinIO
    config_kwargs=dict(s3=dict(addressing_style='path')),
)

In [None]:
# Shared filesystem handle built from `s3_options`; the helper functions defined
# in later cells read this module-level `s3` when they open or check objects.
s3 = s3fs.S3FileSystem(**s3_options)

In [None]:
# List the objects under the STAC prefix and keep only the JSON documents.
prefix = 'stac/'  # Replace with your prefix
stac_files = [
    listed_path
    for listed_path in s3.ls(f's3://{S3_BUCKET_NAME}/{prefix}')
    if listed_path.endswith('.json')
]
# Echo every retained path, one per line.
for obj in stac_files:
    print(obj)

In [None]:
# Shallow copy of the connection options, so that top-level changes to one dict do
# not affect the other (presumably intended for intake/xarray readers; it is not
# referenced by the other visible cells).
s3_storage_options_for_intake_xr = dict(s3_options)

In [None]:

def normalize(arr):
    """Min-max scale ``arr`` so its minimum maps to 0 and its maximum to just under 1.

    A small epsilon (1e-6) is added to the value range so that a constant array
    yields zeros instead of a division by zero.
    """
    lowest = arr.min()
    value_range = arr.max() - lowest + 1e-6
    return (arr - lowest) / value_range

def compute_cloud_mask(chunk):
    """Flag potentially cloudy pixels in a band-first image block.

    Parameters
    ----------
    chunk : array-like
        Block whose first axis indexes bands. Indices 1, 3, 4, 5 and 9 are read
        as the blue, red, NIR, SWIR1 and thermal bands respectively
        (NOTE(review): the sensor / band ordering is not visible here — confirm).

    Returns
    -------
    Array of dtype uint8 with the shape of a single band: 1 where every cloud
    condition holds, 0 elsewhere.
    """
    def _as_float(band):
        # Integer bands (e.g. uint16 digital numbers) wrap around in
        # `nir - red` / `nir + red`, which corrupts NDVI; promote them to float.
        # Floating-point input is passed through untouched.
        if np.issubdtype(band.dtype, np.floating):
            return band
        return band.astype(np.float64)

    # Extract relevant bands
    blue = _as_float(chunk[1])
    red = _as_float(chunk[3])
    nir = _as_float(chunk[4])
    swir1 = _as_float(chunk[5])
    thermal = _as_float(chunk[9])

    # Normalized difference vegetation index; the epsilon guards against 0/0.
    ndvi = (nir - red) / (nir + red + 1e-6)

    # Blue and SWIR1 are min-max scaled within this block, so their 0.2
    # thresholds are relative to the block's own value range.
    blue_n = normalize(blue)
    swir1_n = normalize(swir1)

    # Cloud condition: bright in blue and SWIR1, low vegetation signal, and a
    # raw thermal value below 30000 (units of the thermal band not visible here).
    potential_cloud = (
        (blue_n > 0.2) &
        (swir1_n > 0.2) &
        (ndvi < 0.3) &
        (thermal < 30000)
    )

    return potential_cloud.astype(np.uint8)

def cloud_cover_posterior(arr):
    """Lazily compute the cloud fraction of every spatial dask chunk of ``arr``.

    Parameters
    ----------
    arr : xarray.DataArray
        Dask-backed, three-dimensional array whose first axis indexes bands
        (it is handed to ``compute_cloud_mask`` block by block). The two
        remaining axes are treated as the spatial grid.

    Returns
    -------
    xarray.DataArray
        Lazy float32 array named ``cloud_fraction`` with dims
        ``('chunk_y', 'chunk_x')`` holding one value per spatial chunk: the
        mean of that chunk's cloud mask.

    Raises
    ------
    ValueError
        If ``arr`` is not three-dimensional.
    """
    if arr.ndim != 3:
        raise ValueError(
            f"cloud_cover_posterior expects a 3-D (band, y, x) array, got {arr.ndim} dimension(s)"
        )

    def process_block(block):
        # Collapse one block to a single (1, 1) cell: the fraction of masked pixels.
        mask = compute_cloud_mask(block)
        return np.array([[mask.mean()]], dtype='float32')

    # Apply over chunks using map_blocks. Each input block collapses to a single
    # cell, so the output chunk shape must be declared explicitly; without
    # `chunks`, dask assumes the blocks keep their spatial extent and the
    # resulting array reports the pixel-grid shape instead of the chunk grid.
    cloud_fraction = arr.data.map_blocks(
        process_block,
        dtype='float32',
        drop_axis=0,
        chunks=(1, 1),
    )

    cloud_cover_da = xr.DataArray(
        cloud_fraction,
        dims=['chunk_y', 'chunk_x'],
        name='cloud_fraction'
    )

    return cloud_cover_da

In [None]:
def update_ledger(output_s3_bucket,summary_json_content,summary_data,scene_id_for_summary):
    """Append one row describing a summary JSON to ``ledgers.csv`` in the bucket.

    The row contains the scene id, the SHA-256 of the summary JSON text, the
    summary's timestamp, the fixed stage label ``Cloud Posterior`` and the
    mean / variance values. The file is created with a header row when it does
    not exist yet; otherwise the new row is appended.

    Parameters
    ----------
    output_s3_bucket : str
        Bucket that holds ``ledgers.csv`` at its root.
    summary_json_content : str
        Serialized summary; hashed (UTF-8) for the ledger row.
    summary_data : dict
        Must provide ``calculation_timestamp_utc``, ``scene_id``, ``mean`` and
        ``variance``.
    scene_id_for_summary : str
        Only used in the failure message.

    Notes
    -----
    Uses the module-level ``s3`` filesystem. Any exception is caught, reported
    with a traceback and not re-raised; the closing message is printed either way.
    """
    fs_for_ledger = s3

    ledger_s3_key = "ledgers.csv"
    # Always address the ledger by its bucket-qualified path. Using the bare key
    # for the existence check / append made the check miss the real file, so the
    # ledger was re-created (and previous rows lost) on every call.
    ledger_full_s3_path = f"s3://{output_s3_bucket}/{ledger_s3_key}"

    print(f"Updating placeholder ledger at: {ledger_full_s3_path}")

    try:
        summary_json_bytes = summary_json_content.encode('utf-8')
        sha256_hash = hashlib.sha256(summary_json_bytes).hexdigest()
        timestamp_utc_str = summary_data["calculation_timestamp_utc"] # Use timestamp from summary

        scene_id_ledger = summary_data["scene_id"]
        mean_val_ledger = summary_data["mean"]
        var_val_ledger = summary_data["variance"]
        stage_ledger = "Cloud Posterior"

        new_ledger_line = f"{scene_id_ledger},{sha256_hash},{timestamp_utc_str},{stage_ledger},{mean_val_ledger},{var_val_ledger}\n"

        if fs_for_ledger.exists(ledger_full_s3_path):
            print(f"Ledger file {ledger_s3_key} exists, appending.")
            with fs_for_ledger.open(ledger_full_s3_path, "ab") as f:
                f.write(new_ledger_line.encode("utf-8"))
        else:
            print(f"Ledger file {ledger_s3_key} does not exist, creating with header.")
            header = "scene_id,summary_json_sha256,timestamp_utc,stage,mean,variance\n"
            content = header + new_ledger_line
            with fs_for_ledger.open(ledger_full_s3_path, "wb") as f:
                f.write(content.encode("utf-8"))
        print(f"Ledger updated successfully for placeholder scene {scene_id_ledger}.")

    except Exception as e:
        # Best-effort: a ledger failure is reported but must not abort the caller.
        print(f"Failed to update ledger for placeholder scene {scene_id_for_summary}: {e}")
        import traceback
        traceback.print_exc()

    print("\nPlaceholder summary JSON and ledger update complete.")

In [None]:

def save_json(cloud_cover_da,input_zarr_name,zarr_catalog_url,output_s3_path,output_s3_bucket):
    """Upload a JSON summary (mean / variance of the chunk cloud fractions) and log it.

    Parameters
    ----------
    cloud_cover_da : xarray.DataArray
        Per-chunk cloud fractions; its ``.data`` may be lazy (dask) or in memory.
    input_zarr_name : str
        Scene identifier, used in the summary and in the output file name.
    zarr_catalog_url : str
        URL of the input Zarr store, recorded in the summary.
    output_s3_path : str
        URL of the output Zarr store, recorded in the summary.
    output_s3_bucket : str
        Bucket receiving ``posterior/ndvi_summary_<scene>.json`` and the ledger.

    Notes
    -----
    Uses the module-level ``s3`` filesystem. An upload failure is reported and
    the ledger is then left untouched, so the ledger never references a summary
    that was not stored.
    """
    scene_id_for_summary = input_zarr_name

    # Materialise the (small) per-chunk array exactly once. Calling `.compute()`
    # separately for mean and variance evaluated the whole lazy graph twice.
    fractions = np.asarray(cloud_cover_da.data)
    # NaN-skipping reductions mirror xarray's default for floating-point data.
    mean = np.nanmean(fractions)
    variance = np.nanvar(fractions)

    # NOTE(review): the "NDVI" label, the "output_ndvi_zarr" key and the
    # "ndvi_summary_" file name look inherited from an NDVI workflow although the
    # product here is a cloud fraction. They are kept unchanged because consumers
    # may depend on them — confirm before renaming.
    summary_data = {
        "scene_id": scene_id_for_summary,
        "processed_product": "NDVI",
        "source_zarr": zarr_catalog_url, # Link back to the input
        "output_ndvi_zarr": output_s3_path,
        "mean": float(mean), # Ensure it's a standard float
        "variance": float(variance),
        "calculation_timestamp_utc": datetime.now(timezone.utc).isoformat()
    }

    summary_json_content = json.dumps(summary_data, indent=2)
    print(f"Summary JSON content: \n{summary_json_content}")

    summary_json_s3_path = f"s3://{output_s3_bucket}/posterior/ndvi_summary_{scene_id_for_summary}.json"

    try:
        print(f"Uploading summary JSON to {summary_json_s3_path}...")
        with s3.open(summary_json_s3_path, 'wb') as f: # bucket-qualified s3:// path
            f.write(summary_json_content.encode('utf-8'))
        print("Summary JSON uploaded successfully.")
    except Exception as e:
        print(f"Error uploading summary JSON: {e}")
        print("Skipping ledger update because the summary JSON was not uploaded.")
        return

    update_ledger(output_s3_bucket,summary_json_content,summary_data,scene_id_for_summary)

In [None]:
def save_zarr_json(cloud_cover_da,id,zarr_catalog_url,write_zarr=False):
    """Derive the output locations for a scene, optionally write the Zarr store, then save the summary.

    Parameters
    ----------
    cloud_cover_da : xarray.DataArray
        Per-chunk cloud fractions for the scene.
    id : str
        Scene identifier; names the output store ``<id>_cloud_cover.zarr``.
        (The name shadows the builtin but is kept for caller compatibility.)
    zarr_catalog_url : str
        ``s3://<bucket>/...`` URL of the input store; the bucket is taken from
        its third ``/``-separated component and reused for the outputs.
    write_zarr : bool, default False
        When true, the DataArray is written to
        ``s3://<bucket>/posterior/<id>_cloud_cover.zarr`` through the
        module-level ``s3`` filesystem. The default keeps the write disabled,
        as it was in the previous revision, but no longer reports a write that
        did not happen.

    Raises
    ------
    Exception
        Any error raised while writing the Zarr store is printed and re-raised.
    """
    input_zarr_name = id
    output_s3_bucket = zarr_catalog_url.split('/')[2]
    output_zarr_prefix = "posterior"
    output_zarr_name = f"{input_zarr_name}_cloud_cover.zarr"
    output_s3_path = f"s3://{output_s3_bucket}/{output_zarr_prefix}/{output_zarr_name}"

    if write_zarr:
        # Use the shared module-level filesystem, like the other helpers do. The
        # old `'s3' not in locals()` guard was always true (because `s3` was then
        # assigned locally), so a new filesystem was built on every call.
        output_s3_map_root = f"{output_s3_bucket}/{output_zarr_prefix}/{output_zarr_name}"
        s3_map_for_writing = s3fs.S3Map(root=output_s3_map_root, s3=s3, check=False)

        # --- Save the DataArray as a new Zarr store (Chunk-wise write) ---
        cloud_ds_to_save = cloud_cover_da.to_dataset()
        print("Saving Zarr to S3...")
        try:
            cloud_ds_to_save.to_zarr(
                store=s3_map_for_writing,
                mode='w',
                consolidated=True
            )
        except Exception as e:
            print(f"Error saving  Zarr to S3: {e}")
            import traceback
            traceback.print_exc()
            raise
        print(f" Zarr store successfully written to {output_s3_path}")
    else:
        print(f"Zarr write is disabled; nothing written to {output_s3_path}")

    save_json(cloud_cover_da,input_zarr_name,zarr_catalog_url,output_s3_path,output_s3_bucket)

    if write_zarr:
        print("\nCloud cover calculation and Zarr upload complete.")
    else:
        print("\nCloud cover calculation complete (Zarr upload skipped).")

In [None]:
def save_posterior(cloud_cover_da,id,zarr_catalog_url):
    """Persist the posterior for one scene by delegating to ``save_zarr_json``.

    All three arguments are forwarded unchanged; nothing is returned.
    """
    save_zarr_json(
        cloud_cover_da=cloud_cover_da,
        id=id,
        zarr_catalog_url=zarr_catalog_url,
    )

In [None]:
# Process every STAC item found earlier: read the item, open the Zarr store it
# points to, derive the per-chunk cloud fractions and persist the results.
for obj in stac_files:
    stac_catalog_url = f's3://{obj}'

    with s3.open(stac_catalog_url, 'r') as f:
        stac_item_dict = json.load(f)
    print(f"Successfully read JSON content from {stac_catalog_url}")

    # The asset href is appended to the bucket, i.e. it is treated as a key
    # relative to the bucket root (NOTE(review): confirm hrefs are never absolute).
    zarr_link = stac_item_dict['assets']['data_zarr']['href']
    zarr_catalog_url = f's3://{S3_BUCKET_NAME}/{zarr_link}'

    # Named `scene_id` rather than `id`, so the builtin `id()` is not shadowed
    # for the rest of the kernel session.
    scene_id = stac_item_dict['id']

    # Open lazily and release the dataset once the scene has been processed.
    with xr.open_zarr(
        store=zarr_catalog_url, # Path to the S3 Zarr store root
        storage_options=s3_options,
        consolidated=True,
        chunks={}
    ) as ds_lazy:
        print("Loaded Zarr dataset (lazy)")

        # The data variable is looked up under the item's id.
        arr = ds_lazy[scene_id]

        cloud_cover_da = cloud_cover_posterior(arr)
        print("Posterior Calculation Done")

        # Everything that evaluates the lazy array happens while the dataset is open.
        save_posterior(cloud_cover_da,scene_id,zarr_catalog_url)

    print('--------------')