# Test the speed of reading GFS data at 0.25-degree and 1.00-degree resolution into an `xarray` dataset from four sources: `NOMADS server`, `Azure Blob Storage`, `AWS S3 Bucket`, and `Google Cloud Storage`, in a local environment.

### **Result**:
|GFS Data|NOMADS|Azure|AWS|GCS|
|----|----|-----|---|---|
|1 file with 0.25 res|1 m 32 s|1 m 1 s  |51.8 s|51.1 s |
|1 file with 1.00 res|7.09 s|5.27 s| 5.06 s |5.21 s  |
|25 files with 0.25 res|34 m 7 s|24 m 3 s|21 m 5 s|23 m 22 s  |
|25 files with 1.00 res|3 m 19 s|2 m 21 s |2 m 14 s|2 m 19 s |



In [1]:
import urllib.request
from datetime import datetime, timedelta, timezone

import xarray as xr

In [2]:
# Use yesterday's date for every source so all benchmarks fetch the same run.
# The date is taken in UTC because the GFS directory name (gfs.YYYYMMDD) is
# keyed by the UTC date of the model cycle; with the local clock, a timezone
# far ahead of UTC could select a 12z cycle that has not been published yet.
yesterday = datetime.now(timezone.utc) - timedelta(days=1)
# Keep only the compact YYYYMMDD form that the URLs below interpolate.
yesterday = yesterday.strftime("%Y%m%d")

yesterday

'20240813'

### 0.25-degree resolution = 515 MB / file

In [3]:
%%time
# NOMADS, 0.25 degree: fetch forecast file f108 of the 12z cycle, then open
# its pressureFromGroundLayer messages through the cfgrib engine.
URL = (
    "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 4.94 s
Wall time: 1min 32s


In [4]:
%%time
# Azure Blob Storage, 0.25 degree: fetch forecast file f108 of the 12z cycle,
# then open its pressureFromGroundLayer messages through the cfgrib engine.
URL = (
    "https://noaagfs.blob.core.windows.net/gfs/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 3.67 s
Wall time: 1min 1s


In [5]:
%%time
# AWS S3, 0.25 degree: fetch forecast file f108 of the 12z cycle, then open
# its pressureFromGroundLayer messages through the cfgrib engine.
URL = (
    "https://noaa-gfs-bdp-pds.s3.amazonaws.com/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 3.56 s
Wall time: 51.8 s


In [6]:
%%time
# Google Cloud Storage, 0.25 degree: fetch forecast file f108 of the 12z
# cycle, then open its pressureFromGroundLayer messages through cfgrib.
URL = (
    "https://storage.googleapis.com/global-forecast-system/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 3.69 s
Wall time: 51.1 s


### 1.00-degree resolution = 42.5 MB / file

In [7]:
%%time
# NOMADS, 1.00 degree: fetch forecast file f108 of the 12z cycle, then open
# its pressureFromGroundLayer messages through the cfgrib engine.
URL = (
    "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 750 ms
Wall time: 7.09 s


In [8]:
%%time
# Azure Blob Storage, 1.00 degree: fetch forecast file f108 of the 12z cycle,
# then open its pressureFromGroundLayer messages through the cfgrib engine.
URL = (
    "https://noaagfs.blob.core.windows.net/gfs/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 719 ms
Wall time: 5.27 s


In [9]:
%%time
# AWS S3, 1.00 degree: fetch forecast file f108 of the 12z cycle, then open
# its pressureFromGroundLayer messages through the cfgrib engine.
URL = (
    "https://noaa-gfs-bdp-pds.s3.amazonaws.com/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 344 ms
Wall time: 5.06 s


In [10]:
%%time
# Google Cloud Storage, 1.00 degree: fetch forecast file f108 of the 12z
# cycle, then open its pressureFromGroundLayer messages through cfgrib.
URL = (
    "https://storage.googleapis.com/global-forecast-system/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)
# No target path is given, so urlretrieve stores the download in a temporary
# file and returns that file's name.
filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
    filename,
    engine="cfgrib",
    filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
    backend_kwargs=dict(errors="ignore"),
)

CPU times: total: 344 ms
Wall time: 5.21 s


### Concatenate 25 GFS files at 0.25-degree resolution (steps 0-24)

In [11]:
import os
import warnings
warnings.filterwarnings('ignore')

def file_path(source: str, cycle_runtime: int, forecast_hour: int, year: int, month: int, day: int, resolution_degree: float) -> str:
    """Build the download URL of one GFS GRIB2 file.

    Args:
        source: Which mirror to use; one of 'nomads', 'az', 'aws', 'gcs'.
        cycle_runtime: Model cycle hour; zero-padded to two digits
            (e.g. 6 -> "06").
        forecast_hour: Forecast step; zero-padded to three digits
            (e.g. 3 -> "f003").
        year: Year of the run, inserted as given.
        month: Month of the run; zero-padded to two digits.
        day: Day of the run; zero-padded to two digits.
        resolution_degree: Grid resolution. Rendered as
            "<integer part>p<fraction>", with the fraction right-padded
            with zeros to two digits: 0.25 -> "0p25", 0.5 -> "0p50",
            1.0 or the integer 1 -> "1p00".

    Returns:
        The complete URL of the requested file on the chosen mirror.

    Raises:
        ValueError: If ``source`` is not one of the supported mirrors.
    """
    # Every prefix ends with "/", so the relative path can be appended directly.
    prefix_by_source = {
        'nomads': "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/",
        'az': "https://noaagfs.blob.core.windows.net/gfs/",
        'aws': "https://noaa-gfs-bdp-pds.s3.amazonaws.com/",
        'gcs': "https://storage.googleapis.com/global-forecast-system/",
    }

    # Validate before doing any work, with a real exception rather than an
    # assert: asserts are stripped when Python runs with -O.
    if source not in prefix_by_source:
        raise ValueError("input source must be one of ['nomads', 'az', 'aws', 'gcs']")

    product_name = "gfs"

    # partition() copes with values whose text form has no "." (e.g. the
    # integer 1): the fraction is then empty and is padded to "00" below.
    whole_part, _, fraction_part = str(resolution_degree).partition(".")

    relative_path = (
        f"{product_name}.{year}{month:0>2}{day:0>2}/"
        f"{cycle_runtime:0>2}/atmos/{product_name}.t{cycle_runtime:0>2}z."
        f"pgrb2.{whole_part}p{fraction_part:0<2}.f{forecast_hour:0>3}"
    )

    # Plain concatenation instead of os.path.join: this is a URL, and
    # os.path.join follows the host OS's path-separator rules.
    return prefix_by_source[source] + relative_path

In [19]:
%%time
# NOMADS, 0.25 degree: download forecast hours 0 through 24 (25 files) one at
# a time, open each with cfgrib, then join them along the "step" dimension.
ds_list = []
for i in range(25):
    URL = file_path(
        source="nomads",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=0.25,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 1min 26s
Wall time: 34min 7s


In [20]:
%%time
# Azure Blob Storage, 0.25 degree: download forecast hours 0 through 24
# (25 files) one at a time, open each with cfgrib, then join them along the
# "step" dimension.
ds_list = []
for i in range(25):
    URL = file_path(
        source="az",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=0.25,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

In [21]:
%%time
# AWS S3, 0.25 degree: download forecast hours 0 through 24 (25 files) one at
# a time, open each with cfgrib, then join them along the "step" dimension.
ds_list = []
for i in range(25):
    URL = file_path(
        source="aws",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=0.25,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 1min 25s
Wall time: 21min 5s


In [22]:
%%time
# Google Cloud Storage, 0.25 degree: download forecast hours 0 through 24
# (25 files) one at a time, open each with cfgrib, then join them along the
# "step" dimension.
ds_list = []
for i in range(25):
    URL = file_path(
        source="gcs",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=0.25,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 1min 18s
Wall time: 23min 22s


### Concatenate 25 GFS files at 1.00-degree resolution (steps 0-72 at 3-hour intervals)

In [12]:
%%time
# NOMADS, 1.00 degree: download forecast hours 0, 3, ..., 72 (25 files) one at
# a time, open each with cfgrib, then join them along the "step" dimension.
ds_list = []
for i in range(0, 73, 3):
    URL = file_path(
        source="nomads",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=1.0,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 13.4 s
Wall time: 3min 19s


In [13]:
%%time
# Azure Blob Storage, 1.00 degree: download forecast hours 0, 3, ..., 72
# (25 files) one at a time, open each with cfgrib, then join them along the
# "step" dimension.
ds_list = []
for i in range(0, 73, 3):
    URL = file_path(
        source="az",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=1.0,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 13.8 s
Wall time: 2min 21s


In [14]:
%%time
# AWS S3, 1.00 degree: download forecast hours 0, 3, ..., 72 (25 files) one at
# a time, open each with cfgrib, then join them along the "step" dimension.
ds_list = []
for i in range(0, 73, 3):
    URL = file_path(
        source="aws",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=1.0,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 14.2 s
Wall time: 2min 14s


In [15]:
%%time
# Google Cloud Storage, 1.00 degree: download forecast hours 0, 3, ..., 72
# (25 files) one at a time, open each with cfgrib, then join them along the
# "step" dimension.
ds_list = []
for i in range(0, 73, 3):
    URL = file_path(
        source="gcs",
        cycle_runtime=12,
        forecast_hour=i,
        year=int(yesterday[:4]),
        month=int(yesterday[4:6]),
        day=int(yesterday[6:8]),
        resolution_degree=1.0,
    )
    # No target path is given, so each download lands in its own temporary file.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys=dict(typeOfLevel="pressureFromGroundLayer"),
        backend_kwargs=dict(errors="ignore"),
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim="step")

CPU times: total: 14 s
Wall time: 2min 19s
