# Test the speed of reading GFS data at 0.25-degree and 1.00-degree resolution into an xarray `Dataset` from four sources (`NOMADS server`, `Azure Blob Storage`, `AWS S3 Bucket`, and `Google Cloud Storage`) in a local environment.

In [1]:
import urllib.request
from datetime import datetime, timedelta, timezone

import xarray as xr

In [2]:
# GFS run directories are dated in UTC, so "yesterday" is computed in UTC
# rather than in the machine's local timezone. Yesterday's 12Z cycle (UTC) is
# then always at least 12 hours old when the download cells below request it.
yesterday = datetime.now(timezone.utc) - timedelta(days=1)
# Rebind to the YYYYMMDD string that is interpolated into the download URLs.
yesterday = yesterday.strftime("%Y%m%d")

yesterday

'20240805'

### 0.25-degree resolution (515 MB per file)
- NOMADS
- Azure Blob Storage
- AWS S3 Bucket
- Google Cloud Storage

In [3]:
%%time
# NOMADS, 0.25 degree: download the t12z / f108 GRIB2 file to a temporary
# location (urlretrieve picks the path), then open the
# 'pressureFromGroundLayer' messages through the cfgrib engine.
URL = (
    "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 2.45 s
Wall time: 1min 6s


In [4]:
%%time
# Azure Blob Storage, 0.25 degree: download the t12z / f108 GRIB2 file to a
# temporary location, then open the 'pressureFromGroundLayer' messages
# through the cfgrib engine.
URL = (
    "https://noaagfs.blob.core.windows.net/gfs/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 1.72 s
Wall time: 50.2 s


In [5]:
%%time
# AWS S3 bucket, 0.25 degree: download the t12z / f108 GRIB2 file to a
# temporary location, then open the 'pressureFromGroundLayer' messages
# through the cfgrib engine.
URL = (
    "https://noaa-gfs-bdp-pds.s3.amazonaws.com/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 1.88 s
Wall time: 43.2 s


In [6]:
%%time
# Google Cloud Storage, 0.25 degree: download the t12z / f108 GRIB2 file to a
# temporary location, then open the 'pressureFromGroundLayer' messages
# through the cfgrib engine.
URL = (
    "https://storage.googleapis.com/global-forecast-system/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 1.48 s
Wall time: 44.2 s


### 1.00-degree resolution (42.5 MB per file)
- NOMADS
- Azure Blob Storage
- AWS S3 Bucket
- Google Cloud Storage

In [7]:
%%time
# NOMADS, 1.00 degree: download the t12z / f108 GRIB2 file to a temporary
# location, then open the 'pressureFromGroundLayer' messages through the
# cfgrib engine.
URL = (
    "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 406 ms
Wall time: 6.34 s


In [8]:
%%time
# Azure Blob Storage, 1.00 degree: download the t12z / f108 GRIB2 file to a
# temporary location, then open the 'pressureFromGroundLayer' messages
# through the cfgrib engine.
URL = (
    "https://noaagfs.blob.core.windows.net/gfs/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 312 ms
Wall time: 5.07 s


In [9]:
%%time
# AWS S3 bucket, 1.00 degree: download the t12z / f108 GRIB2 file to a
# temporary location, then open the 'pressureFromGroundLayer' messages
# through the cfgrib engine.
URL = (
    "https://noaa-gfs-bdp-pds.s3.amazonaws.com/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 266 ms
Wall time: 4.34 s


In [10]:
%%time
# Google Cloud Storage, 1.00 degree: download the t12z / f108 GRIB2 file to a
# temporary location, then open the 'pressureFromGroundLayer' messages
# through the cfgrib engine.
URL = (
    "https://storage.googleapis.com/global-forecast-system/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    backend_kwargs={"errors": "ignore"},
)

CPU times: total: 219 ms
Wall time: 4.31 s


### Concatenate 25 GFS files at 1.00-degree resolution (forecast hours 0 to 72 at 3-hour intervals)

- NOMADS
- Azure Blob Storage
- AWS S3 Bucket
- Google Cloud Storage

In [21]:
import os
import warnings
# Silence every warning category for the rest of the session so the timing
# cells below print only their results.
warnings.simplefilter("ignore")

def file_path(
    source: str,
    cycle_runtime: int,
    forecast_hour: int,
    year: int,
    month: int,
    day: int,
    resolution_degree: float,
) -> str:
    """Build the download URL of a single GFS ``pgrb2`` file.

    Args:
        source: Which mirror to use: ``'nomads'``, ``'az'``, ``'aws'`` or
            ``'gcs'``.
        cycle_runtime: Model cycle, zero-padded to two digits in the URL
            (e.g. ``12`` -> ``t12z``).
        forecast_hour: Forecast step, zero-padded to three digits in the URL
            (e.g. ``3`` -> ``f003``).
        year: Year of the run date.
        month: Month of the run date, zero-padded to two digits.
        day: Day of the run date, zero-padded to two digits.
        resolution_degree: Grid resolution in degrees, e.g. ``0.25`` or
            ``1.0``. Integers such as ``1`` are accepted as well.

    Returns:
        The full URL, e.g.
        ``https://.../gfs.20240805/12/atmos/gfs.t12z.pgrb2.1p00.f003``.

    Raises:
        ValueError: If ``source`` is not one of the supported mirrors.
    """
    prefixes = {
        'nomads': "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/",
        'az': "https://noaagfs.blob.core.windows.net/gfs/",
        'aws': "https://noaa-gfs-bdp-pds.s3.amazonaws.com/",
        'gcs': "https://storage.googleapis.com/global-forecast-system/",
    }
    # Validate with an explicit raise: an `assert` disappears under
    # `python -O`, which would turn a bad source into an UnboundLocalError.
    if source not in prefixes:
        raise ValueError("input source must be one of ['nomads', 'az', 'aws', 'gcs']")
    prefix_path = prefixes[source]

    product_name = "gfs"
    # Going through float() guarantees a "." in the text, so an integer
    # resolution (1 -> "1.0") no longer fails for lack of a fractional part.
    whole_part, _, fraction_part = str(float(resolution_degree)).partition(".")

    relative_path = (
        f"{product_name}.{year}{month:>02}{day:>02}/"
        f"{cycle_runtime:>02}/atmos/{product_name}.t{cycle_runtime:>02}z."
        f"pgrb2.{whole_part}p{fraction_part:<02}.f{forecast_hour:>03}"
    )

    # Every prefix already ends with "/", so plain concatenation yields the
    # URL; os.path.join is meant for filesystem paths and is OS-dependent.
    return prefix_path + relative_path

In [12]:
%%time
# NOMADS: download forecast hours 0..72 (every 3 h, 25 files) of the 12Z
# 1.00-degree run and concatenate them along 'step'.
# The run date comes from `yesterday` (YYYYMMDD string), like the
# single-file cells, instead of a hard-coded day that goes stale on re-runs.
run_date = datetime.strptime(yesterday, "%Y%m%d")

ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(
        source='nomads',
        cycle_runtime=12,
        forecast_hour=i,
        year=run_date.year,
        month=run_date.month,
        day=run_date.day,
        resolution_degree=1.,
    )
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"},
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

CPU times: total: 7.12 s
Wall time: 2min 28s


In [13]:
%%time
# Azure Blob Storage: download forecast hours 0..72 (every 3 h, 25 files) of
# the 12Z 1.00-degree run and concatenate them along 'step'.
# The run date comes from `yesterday` (YYYYMMDD string), like the
# single-file cells, instead of a hard-coded day that goes stale on re-runs.
run_date = datetime.strptime(yesterday, "%Y%m%d")

ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(
        source='az',
        cycle_runtime=12,
        forecast_hour=i,
        year=run_date.year,
        month=run_date.month,
        day=run_date.day,
        resolution_degree=1.,
    )
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"},
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

CPU times: total: 10.1 s
Wall time: 2min 2s


In [14]:
%%time
# AWS S3 bucket: download forecast hours 0..72 (every 3 h, 25 files) of the
# 12Z 1.00-degree run and concatenate them along 'step'.
# The run date comes from `yesterday` (YYYYMMDD string), like the
# single-file cells, instead of a hard-coded day that goes stale on re-runs.
run_date = datetime.strptime(yesterday, "%Y%m%d")

ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(
        source='aws',
        cycle_runtime=12,
        forecast_hour=i,
        year=run_date.year,
        month=run_date.month,
        day=run_date.day,
        resolution_degree=1.,
    )
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"},
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

CPU times: total: 9.83 s
Wall time: 1min 52s


In [15]:
%%time
# Google Cloud Storage: download forecast hours 0..72 (every 3 h, 25 files)
# of the 12Z 1.00-degree run and concatenate them along 'step'.
# The run date comes from `yesterday` (YYYYMMDD string), like the
# single-file cells, instead of a hard-coded day that goes stale on re-runs.
run_date = datetime.strptime(yesterday, "%Y%m%d")

ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(
        source='gcs',
        cycle_runtime=12,
        forecast_hour=i,
        year=run_date.year,
        month=run_date.month,
        day=run_date.day,
        resolution_degree=1.,
    )
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"},
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

CPU times: total: 9.23 s
Wall time: 1min 49s
