In [1]:
import json

# Load and pretty-print the content of geocroissant.json.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding (not UTF-8 on many Windows setups), which can
# fail on or mis-decode non-ASCII characters in the metadata.
with open("geocroissant.json", "r", encoding="utf-8") as f:
    croissant_data = json.load(f)

# Pretty-print JSON to console
print(json.dumps(croissant_data, indent=2))

{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "geocr": "http://mlcommons.org/croissant/geocr/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "dct": "http://purl.org/dc/terms/",
    "sc": "https://schema.org/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "dataBiases": "cr:dataBiases",
    "dataCollection": "cr:dataCollection",
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
   

In [2]:
import torch
import mlcroissant as mlc
from torch.utils.data import Dataset, DataLoader
import rasterio
import numpy as np
from pathlib import Path

# Step 1: Location of the local Croissant metadata file
url = "geocroissant.json"
# Step 2: Parse that file into an mlcroissant Dataset
dataset = mlc.Dataset(url)

# Step 3: Read the data location from the Croissant metadata
metadata = dataset.metadata
base_path = Path(metadata.distribution[0].content_url)
image_pattern = metadata.distribution[1].includes

# Recursively collect every file below base_path whose name ends in
# "_merged.tif" (a hard-coded glob pattern; image_pattern is not consulted
# here), then put the results in sorted order.
image_files = list(base_path.glob("**/*_merged.tif"))
image_files.sort()

print(
    f"Found {len(image_files)} image files",
    f"Base path: {base_path}",
    sep="\n",
)

# Step 4: Create a PyTorch Dataset
class CroissantDataset(Dataset):
    """PyTorch Dataset pairing each ``*_merged.tif`` image with its mask file.

    Args:
        image_paths: Iterable of image file paths (``str`` or ``pathlib.Path``).
            It is copied into a list, so generators are accepted and later
            changes to the caller's list do not affect the dataset.
        split: Optional split name. When truthy, only paths that contain a
            path component exactly equal to ``split`` (for example a
            ``training`` directory) are kept. When ``None`` or empty, every
            path is kept.
    """

    # File-name suffixes used to derive a mask path from an image path.
    IMAGE_SUFFIX = "_merged.tif"
    MASK_SUFFIX = ".mask.tif"

    def __init__(self, image_paths, split=None):
        # Materialize once: supports any iterable and avoids aliasing the
        # caller's list.
        paths = list(image_paths)
        if split:
            # Compare whole path components instead of doing a substring search
            # on the full path, so that e.g. a parent folder named
            # "training_runs" is not mistaken for the "training" split.
            paths = [p for p in paths if split in Path(p).parts]
        self.image_paths = paths
        print(f"Dataset initialized with {len(self.image_paths)} images for split: {split}")

    def __len__(self):
        """Return the number of images kept after split filtering."""
        return len(self.image_paths)

    @classmethod
    def _mask_path_for(cls, image_path):
        """Return the mask path (as ``str``) that belongs to ``image_path``.

        Only the trailing ``_merged.tif`` is swapped for ``.mask.tif``.

        Raises:
            ValueError: If ``image_path`` does not end in ``_merged.tif``.
                Without this check the derived mask path would equal the image
                path and the image would silently be read as its own mask.
        """
        path_str = str(image_path)
        if not path_str.endswith(cls.IMAGE_SUFFIX):
            raise ValueError(
                f"Cannot derive mask path: {path_str!r} does not end with "
                f"{cls.IMAGE_SUFFIX!r}"
            )
        return path_str[: -len(cls.IMAGE_SUFFIX)] + cls.MASK_SUFFIX

    def __getitem__(self, idx):
        """Load one image/mask pair.

        Returns:
            dict with keys ``'image'`` (float32 tensor), ``'mask'`` (int64
            tensor), ``'image_path'`` and ``'mask_path'`` (both ``str``).

        Raises:
            ValueError: If the image path does not end in ``_merged.tif``.
        """
        image_path = self.image_paths[idx]
        mask_path = self._mask_path_for(image_path)

        # Read the GeoTIFF image (read() with no band index; presumably
        # returns all bands -- confirm against the rasterio docs)
        with rasterio.open(image_path) as src:
            image = src.read().astype(np.float32)

        # Read band 1 of the mask file
        with rasterio.open(mask_path) as src:
            mask = src.read(1).astype(np.int64)

        return {
            'image': torch.from_numpy(image),
            'mask': torch.from_numpy(mask),
            'image_path': str(image_path),
            'mask_path': str(mask_path)
        }

# Step 5: Build one dataset per split from the shared list of image files
train_data = CroissantDataset(image_files, split="training")
val_data = CroissantDataset(image_files, split="validation")

# Step 6: Wrap each split in a DataLoader (one sample per batch, fixed order)
train_loader = DataLoader(train_data, batch_size=1, shuffle=False)
val_loader = DataLoader(val_data, batch_size=1, shuffle=False)

# Step 7: Preview the first three training batches
print("\n=== Training Data Samples ===")
for i, example in enumerate(train_loader):
    if i >= 3:
        break
    batch_image = example['image']
    batch_mask = example['mask']
    summary_lines = [
        f"\nSample {i+1}:",
        f"  Image path: {example['image_path'][0]}",
        f"  Image shape: {batch_image.shape}",
        f"  Mask shape: {batch_mask.shape}",
        f"  Image value range: [{batch_image.min():.2f}, {batch_image.max():.2f}]",
        f"  Mask unique values: {torch.unique(batch_mask).tolist()}",
    ]
    print("\n".join(summary_lines))


Found 6 image files
Base path: ..\..\hls_burn_scars
Dataset initialized with 3 images for split: training
Dataset initialized with 3 images for split: validation

=== Training Data Samples ===

Sample 1:
  Image path: ..\..\hls_burn_scars\training\subsetted_512x512_HLS.S30.T10SDH.2020248.v1.4_merged.tif
  Image shape: torch.Size([1, 6, 512, 512])
  Mask shape: torch.Size([1, 512, 512])
  Image value range: [0.00, 0.45]
  Mask unique values: [-1, 0, 1]

Sample 2:
  Image path: ..\..\hls_burn_scars\training\subsetted_512x512_HLS.S30.T10SEH.2018245.v1.4_merged.tif
  Image shape: torch.Size([1, 6, 512, 512])
  Mask shape: torch.Size([1, 512, 512])
  Image value range: [0.00, 1.00]
  Mask unique values: [-1, 0, 1]

Sample 3:
  Image path: ..\..\hls_burn_scars\training\subsetted_512x512_HLS.S30.T10SEH.2018280.v1.4_merged.tif
  Image shape: torch.Size([1, 6, 512, 512])
  Mask shape: torch.Size([1, 512, 512])
  Image value range: [0.00, 0.55]
  Mask unique values: [0, 1]


In [3]:
# Run the mlcroissant command-line `validate` command on geocroissant.json,
# the same metadata file that the earlier cells load.
!mlcroissant validate --jsonld=geocroissant.json

I0122 15:48:24.178021 17212 validate.py:53] Done.
