In [1]:
import time
import s3fs
import xarray as xr
import os
import s3fs

In [2]:
bucket = 'nasa-waterinsight'
directory = 'NLDAS3/forcing/temp-chunk/timechunk01'

In [3]:
s3fsfs = s3fs.S3FileSystem()

**TODO:** This is faster if the file with the closest matching chunk shape (at least in the time dimension) to rechunk and write the data.

In [4]:
%%time
files = s3fsfs.glob(f's3://{bucket}/{directory}/*.nc')
ds = xr.open_dataset(s3fsfs.open(f's3://{files[0]}'), chunks={})
da = ds['Tair']

CPU times: user 5.5 s, sys: 1.99 s, total: 7.49 s
Wall time: 19.9 s


In [5]:
def create_chunked_file(
    da: xr.DataArray, 
    s3fsfs: s3fs.S3FileSystem,
    bucket: str = 'nasa-eodc-public',
    time_chunk=24, 
    lat=50, 
    lon=90, 
    variable_name='Tair',
    output_dir='test_files',
    upload_to_s3=True,
    verbose=True,
):
    """
    Create a NetCDF file with specified chunking and optionally upload to S3.
    
    Parameters:
    -----------
    da : xarray.DataArray
        The input data array to be chunked.
    bucket : str
        S3 bucket name.
    s3fsfs : s3fs.S3FileSystem
        S3 filesystem object.
    time_chunk : int, default 24
        Number of time steps per chunk.
    lat : int
        Latitude chunk size.
    lon : int
        Longitude chunk size.
    variable_name : str, default 'tair'
        Name of the variable in the file.
    output_dir : str, default 'test_files'
        Directory in S3 bucket to store the file.
    upload_to_s3 : bool, default True
        Whether to upload the file to S3.
    verbose : bool, default True
        Whether to print progress information.
        
    Returns:
    --------
    dict
        Dictionary with file paths and timing information.
    """
    
    # Create chunks dictionary
    chunks = {'time': time_chunk, 'lat': lat, 'lon': lon}
    chunk_shape_as_string = ('_').join([f"{k}{v}" for k, v in chunks.items()])
    
    # Generate filename
    filename = f"{chunk_shape_as_string}_{variable_name}.nc"
    
    if verbose:
        print(f"Creating file with chunks: {chunks}")
        print(f"Output filename: {filename}")
    
    # Time the chunking and file writing process
    start_time = time.time()
    
    # Apply chunking
    chunked_da = da.chunk(chunks=chunks)
    encoding = da.encoding.copy()
    del encoding['source']
    del encoding['original_shape']
    del encoding['preferred_chunks']
    encoding['chunksizes'] = tuple(chunks.values())
    
    # Write to NetCDF file
    chunked_da.to_netcdf(filename, mode='w', encoding={variable_name: encoding})
    
    # Calculate elapsed time
    elapsed_time = time.time() - start_time
    
    if verbose:
        print(f"File created in {elapsed_time:.2f} seconds")
    
    # Upload to S3 if requested
    s3_path = None
    
    if upload_to_s3:
        s3_path = f's3://{bucket}/NLDAS/netcdf/{output_dir}/{filename}'
        
        if verbose:
            print(f"Uploading to {s3_path}...")
        
        s3fsfs.put(filename, s3_path)
    
    # Return info about the file
    return {
        'local_path': os.path.abspath(filename),
        's3_path': s3_path,
        'chunks': chunks,
        'file_size_mb': os.path.getsize(filename) / (1024 * 1024),
        'processing_time': elapsed_time,
    }

In [6]:
chunk_configurations = [
    # 1 chunk for all time steps
    dict(time_chunk=24, lat=50, lon=90),
    dict(time_chunk=24, lat=100, lon=180),
    dict(time_chunk=24, lat=500, lon=900),
    dict(time_chunk=24, lat=1000, lon=1800),
    # 6 timesteps per chunk
    dict(time_chunk=6, lat=100, lon=180),
    dict(time_chunk=6, lat=200, lon=360),
    dict(time_chunk=6, lat=1000, lon=1800),
    dict(time_chunk=6, lat=2000, lon=3600),
    # 1 timestep per chunk
    dict(time_chunk=1, lat=250, lon=450),
    dict(time_chunk=1, lat=500, lon=900),
    dict(time_chunk=1, lat=2500, lon=4500),
    dict(time_chunk=1, lat=5000, lon=9000),
]

In [7]:
for config in chunk_configurations[0]:
    create_chunked_file(da=da, s3fsfs=s3fsfs, **config)

Creating file with chunks: {'time': 1, 'lat': 250, 'lon': 450}
Output filename: time1_lat250_lon450_Tair.nc
File created in 594.83 seconds
Uploading to s3://nasa-eodc-scratch/NLDAS/netcdf/test_files/time1_lat250_lon450_Tair.nc...
