In [20]:
import time
import s3fs
import xarray as xr
import os

import sys
sys.path.append("..")
import helpers

In [21]:
# S3 bucket and key prefix that the notebook globs for source NetCDF files.
bucket = "nasa-eodc-scratch"
# Held as a string: it is only ever concatenated into the key prefix below.
timechunks = "24"
directory = "NLDAS/netcdf/.timechunk" + timechunks

In [47]:
# Build the S3 filesystem handle through the project-local `helpers` module.
# `s3fsfs` is the object the rest of the notebook uses for S3 listing, reads
# and uploads. NOTE(review): how credentials are sourced is defined in
# `helpers` and is not visible here.
credentials = helpers.get_credentials()
s3fsfs = helpers.create_s3filesystem(credentials)

In [48]:
%%time
# List the source NetCDF objects under the configured bucket/prefix.
files = s3fsfs.glob(f's3://{bucket}/{directory}/*.nc')
if not files:
    # Fail with a clear message instead of an opaque IndexError on files[0].
    raise FileNotFoundError(
        f'No NetCDF files found under s3://{bucket}/{directory}/'
    )

# Open the first match (glob results are re-prefixed with s3:// here).
# NOTE(review): the S3 file object is never closed; presumably it has to stay
# open while `ds`/`da` are read lazily later on — confirm.
ds = xr.open_dataset(s3fsfs.open(f's3://{files[0]}'))

# Subset the 'Tair' variable to lat 40..62 and lon -125..-103.
da = ds['Tair'].sel(lat=slice(40,62), lon=slice(-125, -103))

CPU times: user 3.41 s, sys: 1.88 s, total: 5.29 s
Wall time: 32.4 s


In [44]:
def create_chunked_file(
    da: xr.DataArray,
    s3fsfs: s3fs.S3FileSystem,
    bucket: str = 'nasa-eodc-scratch',
    time_chunk: int = 24,
    lat: int = 50,
    lon: int = 90,
    variable_name: str = 'Tair',
    output_dir: str = 'test_files',
    upload_to_s3: bool = True,
    verbose: bool = True,
) -> dict:
    """
    Create a NetCDF file with specified chunking and optionally upload to S3.

    The file is written to the current working directory as
    ``time{time_chunk}_lat{lat}_lon{lon}_{variable_name}.nc`` and is left
    there after the (optional) upload.

    Parameters:
    -----------
    da : xarray.DataArray
        The input data array to be chunked. It must have 'time', 'lat' and
        'lon' dimensions.
        NOTE(review): ``da.name`` presumably has to equal ``variable_name``
        for the encoding to be applied on write — confirm.
    s3fsfs : s3fs.S3FileSystem
        S3 filesystem object used for the upload.
    bucket : str, default 'nasa-eodc-scratch'
        S3 bucket name.
    time_chunk : int, default 24
        Number of time steps per chunk.
    lat : int, default 50
        Latitude chunk size.
    lon : int, default 90
        Longitude chunk size.
    variable_name : str, default 'Tair'
        Name of the variable in the file; also used in the output filename.
    output_dir : str, default 'test_files'
        Sub-directory under ``NLDAS/netcdf/`` in the S3 bucket to store the
        file.
    upload_to_s3 : bool, default True
        Whether to upload the file to S3.
    verbose : bool, default True
        Whether to print progress information.

    Returns:
    --------
    dict
        Dictionary with keys 'local_path', 's3_path' (None when not
        uploaded), 'chunks' (the requested chunk sizes), 'file_size_mb' and
        'processing_time' (seconds spent chunking and writing; the upload is
        not included).
    """

    # Requested chunk sizes, keyed by dimension name.
    chunks = {'time': time_chunk, 'lat': lat, 'lon': lon}
    chunk_shape_as_string = '_'.join(f"{k}{v}" for k, v in chunks.items())

    # Generate filename
    filename = f"{chunk_shape_as_string}_{variable_name}.nc"

    if verbose:
        print(f"Creating file with chunks: {chunks}")
        print(f"Output filename: {filename}")

    # Time the chunking and file writing process
    start_time = time.time()

    # Apply chunking
    chunked_da = da.chunk(chunks=chunks)

    # Start from the source encoding, dropping the keys that describe the
    # original file. pop() tolerates arrays whose encoding lacks them (e.g.
    # arrays not freshly opened from a file), where `del` raised KeyError.
    encoding = da.encoding.copy()
    for stale_key in ('source', 'original_shape', 'preferred_chunks'):
        encoding.pop(stale_key, None)

    # On-disk chunk sizes must follow the array's own dimension order and
    # must not exceed the dimension lengths, so build the tuple from
    # `da.dims` (not from dict order) and clamp each entry. Dimensions
    # without a requested chunk size are stored as a single chunk.
    encoding['chunksizes'] = tuple(
        min(chunks.get(dim, da.sizes[dim]), da.sizes[dim]) for dim in da.dims
    )

    # Write to NetCDF file
    chunked_da.to_netcdf(filename, mode='w', encoding={variable_name: encoding})

    # Calculate elapsed time
    elapsed_time = time.time() - start_time

    if verbose:
        print(f"File created in {elapsed_time:.2f} seconds")

    # Upload to S3 if requested
    s3_path = None

    if upload_to_s3:
        s3_path = f's3://{bucket}/NLDAS/netcdf/{output_dir}/{filename}'

        if verbose:
            print(f"Uploading to {s3_path}...")

        s3fsfs.put(filename, s3_path)

    # Return info about the file
    return {
        'local_path': os.path.abspath(filename),
        's3_path': s3_path,
        'chunks': chunks,
        'file_size_mb': os.path.getsize(filename) / (1024 * 1024),
        'processing_time': elapsed_time,
    }

In [34]:
# Chunk shapes to benchmark, grouped by number of time steps per chunk.
# Trailing comments give the approximate uncompressed size of one chunk
# (time_chunk * lat * lon * 4 bytes; assumes 4-byte values — TODO confirm dtype).
chunk_configurations = [
    # 24 timesteps per chunk
    dict(time_chunk=24, lat=45, lon=90), # 0.39 MB
    dict(time_chunk=24, lat=350, lon=700), # 23.52 MB
    dict(time_chunk=24, lat=500, lon=1000), # 48 MB
    dict(time_chunk=24, lat=1100, lon=2200), # 232.32 MB
    # 6 timesteps per chunk
    dict(time_chunk=6, lat=90, lon=180), # 0.39 MB
    dict(time_chunk=6, lat=700, lon=1400), # 23.52 MB
    dict(time_chunk=6, lat=2200, lon=2200), # 116.16 MB
    # 1 timestep per chunk
    dict(time_chunk=1, lat=225, lon=450), # 0.4 MB
    dict(time_chunk=1, lat=1000, lon=2000), # 8 MB
    dict(time_chunk=1, lat=2200, lon=2200), # 19.36 MB
]

In [45]:
# Write and upload the first configuration; the returned summary dict is the
# cell's displayed output.
create_chunked_file(da=da, s3fsfs=s3fsfs, **chunk_configurations[0])

Creating file with chunks: {'time': 24, 'lat': 45, 'lon': 90}
Output filename: time24_lat45_lon90_Tair.nc
File created in 60.15 seconds
Uploading to s3://nasa-eodc-scratch/NLDAS/netcdf/test_files/time24_lat45_lon90_Tair.nc...


{'local_path': '/home/jovyan/veda-odd/nldas_benchmarking/01_create_intake_stac/time24_lat45_lon90_Tair.nc',
 's3_path': 's3://nasa-eodc-scratch/NLDAS/netcdf/test_files/time24_lat45_lon90_Tair.nc',
 'chunks': {'time': 24, 'lat': 45, 'lon': 90},
 'file_size_mb': 216.09884071350098,
 'processing_time': 60.152421712875366}

In [51]:
# Process only the last three configurations (the 1-timestep-per-chunk group);
# the returned summary dicts are discarded, so only the printed progress shows.
for config in chunk_configurations[-3:]:
    create_chunked_file(da=da, s3fsfs=s3fsfs, **config)

Creating file with chunks: {'time': 1, 'lat': 225, 'lon': 450}
Output filename: time1_lat225_lon450_Tair.nc
File created in 244.13 seconds
Uploading to s3://nasa-eodc-scratch/NLDAS/netcdf/test_files/time1_lat225_lon450_Tair.nc...
Creating file with chunks: {'time': 1, 'lat': 1000, 'lon': 2000}
Output filename: time1_lat1000_lon2000_Tair.nc
File created in 578.12 seconds
Uploading to s3://nasa-eodc-scratch/NLDAS/netcdf/test_files/time1_lat1000_lon2000_Tair.nc...
Creating file with chunks: {'time': 1, 'lat': 2200, 'lon': 2200}
Output filename: time1_lat2200_lon2200_Tair.nc
File created in 356.01 seconds
Uploading to s3://nasa-eodc-scratch/NLDAS/netcdf/test_files/time1_lat2200_lon2200_Tair.nc...


In [49]:
# filename = 'time6_lat2200_lon2200_Tair.nc'
# output_dir = 'test_files'
# s3_path = f's3://{bucket}/NLDAS/netcdf/{output_dir}/{filename}'

# s3fsfs.put(filename, s3_path)

[None]

In [52]:
# List the locally written *Tair.nc files, oldest first, with human-readable
# sizes, to compare on-disk file size across chunk configurations.
!ls -ltr -h *Tair.nc

-rw-r--r-- 1 jovyan jovyan 217M May 16 19:09 time24_lat45_lon90_Tair.nc
-rw-r--r-- 1 jovyan jovyan 219M May 16 19:15 time24_lat350_lon700_Tair.nc
-rw-r--r-- 1 jovyan jovyan 219M May 16 19:25 time24_lat500_lon1000_Tair.nc
-rw-r--r-- 1 jovyan jovyan 219M May 16 19:35 time24_lat1100_lon2200_Tair.nc
-rw-r--r-- 1 jovyan jovyan 218M May 16 19:37 time6_lat90_lon180_Tair.nc
-rw-r--r-- 1 jovyan jovyan 220M May 16 19:47 time6_lat700_lon1400_Tair.nc
-rw-r--r-- 1 jovyan jovyan 219M May 16 19:58 time6_lat2200_lon2200_Tair.nc
-rw-r--r-- 1 jovyan jovyan 219M May 16 20:11 time1_lat225_lon450_Tair.nc
-rw-r--r-- 1 jovyan jovyan 221M May 16 20:21 time1_lat1000_lon2000_Tair.nc
-rw-r--r-- 1 jovyan jovyan 219M May 16 20:27 time1_lat2200_lon2200_Tair.nc
