### Use fsspec to glob a public cloud dataset (NetCDF `*.nc` files)

In [13]:
import fsspec

# Filesystem handle for AWS S3, obtained through fsspec.
# anon=True means anonymous access: no login and no credentials are required.
fs = fsspec.filesystem(
    's3',
    anon=True,
)


In [14]:
# Take one month (202408) of the open-access NOAA OISST v2.1 AVHRR files.
OISST_GLOB = 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.*.nc'
oisst_keys = fs.glob(OISST_GLOB)

# Fail early with a readable message; otherwise the summary print below
# would die with an opaque IndexError on oisst_files[0].
if not oisst_keys:
    raise FileNotFoundError(f'No files matched {OISST_GLOB}')

# The glob results are turned into full URLs by prefixing 's3://', then sorted
# so that the first/last entries are the first/last day of the month.
oisst_files = sorted('s3://' + key for key in oisst_keys)
# Resulting list looks like:
#['s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240801.nc',
# 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240802.nc',
#...
#]
print(f'Dataset info:\n  Total number: {len(oisst_files)}\n  First file: {oisst_files[0]}\n  Last file: {oisst_files[-1]}')

Dataset info:
  Total number: 31
  First file: s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240801.nc
  Last file: s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240831.nc


In [19]:
def get_s3_files(bucket_path, pattern, anonymous=True):
    """Return the sorted S3 URLs under ``bucket_path`` that match ``pattern``.

    Args:
        bucket_path (str): Bucket name plus optional key prefix, e.g.
            ``'my-bucket/some/prefix'``. Trailing slashes are ignored.
        pattern (str): Glob pattern relative to ``bucket_path``, e.g.
            ``'202408/*.nc'``. Leading slashes are ignored.
        anonymous (bool): Whether to use anonymous access.

    Returns:
        list: Matched paths, each prefixed with ``'s3://'``, in sorted order.
            Empty when nothing matches.
    """
    fs = fsspec.filesystem('s3', anon=anonymous)
    # Normalise the join so 'prefix/' + '/pattern' does not yield 'prefix//pattern',
    # which would address a different (most likely non-existent) key prefix.
    glob_path = f"{bucket_path.rstrip('/')}/{pattern.lstrip('/')}"
    # Each glob result gets the 's3://' scheme prepended to form a full URL.
    return sorted('s3://' + key for key in fs.glob(glob_path))

# Usage example: list the .nc files in the 202408 folder of the OISST bucket.
files = get_s3_files(
    bucket_path='noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr',
    pattern='202408/*.nc',
)

files

['s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240801.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240802.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240803.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240804.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240805.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240806.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240807.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240808.nc',
 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/20240

In [15]:
from virtualizarr import open_virtual_dataset


def create_virtual_datasets(file_urls: list[str], storage_options=None):
    """Create virtual datasets from a list of URLs.

    Args:
        file_urls (list[str]): URLs to create virtual datasets from. A single
            URL given as a plain string is also accepted and treated as a
            one-element list.
        storage_options (dict, optional): Storage options for data access.
            Defaults to {'anon': True} for anonymous access.

    Returns:
        list: One virtual dataset object per URL, in input order.
    """
    # A bare string would otherwise be iterated character by character,
    # issuing one bogus open call per character.
    if isinstance(file_urls, str):
        file_urls = [file_urls]

    if storage_options is None:
        # anonymous access (no login), enough for public datasets
        storage_options = {'anon': True}

    return [
        open_virtual_dataset(
            url,
            # NOTE(review): empty indexes mapping kept exactly as before;
            # presumably avoids building in-memory indexes - confirm in VirtualiZarr docs.
            indexes={},
            reader_options={'storage_options': storage_options},
        )
        for url in file_urls
    ]



# Open every globbed OISST file as a virtual dataset (default anonymous access).
oisst_datasets = create_virtual_datasets(file_urls=oisst_files)


In [22]:
# Quick check of the container type returned by create_virtual_datasets.
type(oisst_datasets)

list

In [16]:
import xarray as xr

# Combine the per-file virtual datasets into one dataset along the 'time' dimension.
# NOTE(review): the 'minimal' / 'override' settings are passed to xr.concat unchanged;
# see the xarray.concat documentation for their exact semantics.
virtual_ds = xr.concat(
    oisst_datasets, dim='time',
    coords='minimal', compat='override', combine_attrs='override',
)

In [17]:
# Display the concatenated virtual dataset as the cell output.
virtual_ds