# Test the speed of reading GFS data at 0.25-degree and 1.00-degree resolution into an `xarray` dataset from four sources (`NOMADS server`, `Azure Blob Storage`, `AWS S3 Bucket`, and `Google Cloud Storage`) in an Azure Databricks environment.
<br>

## The compute cost for this notebook speed test:
<img src="Databricks_Instance_Cost.png" width="100%"/>

### **Result**:
|GFS Data|NOMADS|Azure|AWS|GCS|
|----|----|-----|---|---|
|1 file with 0.25 res|6.3 s|9.24 s  |22.6 s|8.06 s |
|1 file with 1.00 res|1.58 s|1.43 s| 2.24 s |1.58 s  |
|25 files with 0.25 res|5 m 50 s|3 m 31 s|6 m 11 s|3 m 20 s  |
|25 files with 1.00 res|1 m 19 s|38.7 s|1 m 6 s|49.7 s |

---

In [None]:
# Upgrade pip first, then install xarray (with its "complete" extras) and the
# GRIB-reading stack: eccodes, ecmwflibs and cfgrib.
!python -m pip install --upgrade pip
!pip install xarray[complete]
!pip install eccodes
!pip install ecmwflibs
!pip install cfgrib
# numpy is pinned last, so this install replaces whatever numpy version the
# packages above pulled in.
# NOTE(review): the reason for the 1.23.0 pin is not stated in this notebook — confirm.
!pip install numpy==1.23.0

Collecting pip
  Downloading pip-24.2-py3-none-any.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m28.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.2.2
    Uninstalling pip-22.2.2:
      Successfully uninstalled pip-22.2.2
Successfully installed pip-24.2
Collecting xarray[complete]
  Downloading xarray-2024.7.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy>=1.23 (from xarray[complete])
  Downloading numpy-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014

In [None]:
# Restart the Python process via the Databricks utilities; presumably so the
# packages installed in the previous cell are the ones imported below — TODO confirm.
dbutils.library.restartPython()

In [None]:
import urllib.request
from datetime import datetime, timedelta, timezone

import xarray as xr

# Date of the GFS run to benchmark, as a "YYYYMMDD" string for the day before
# today. The timestamp is taken in UTC instead of the machine's local time so
# the result does not depend on the cluster's time-zone setting.
# NOTE(review): GFS run directories are expected to be named by UTC date — confirm.
yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y%m%d")

# Display the value as the cell output.
yesterday

'20240813'

### 0.25-degree resolution = 515 MB / file

In [None]:
%%time
URL = f"https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
filename, _ = urllib.request.urlretrieve(URL)

ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 3.82 s, sys: 1.84 s, total: 5.66 s
Wall time: 6.3 s


In [None]:
%%time
URL = f"https://noaagfs.blob.core.windows.net/gfs/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 2.23 s, sys: 1.56 s, total: 3.8 s
Wall time: 9.24 s


In [None]:
%%time
URL = f"https://noaa-gfs-bdp-pds.s3.amazonaws.com/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 2.21 s, sys: 1.64 s, total: 3.84 s
Wall time: 22.6 s


In [None]:
%%time
URL = f"https://storage.googleapis.com/global-forecast-system/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 2.1 s, sys: 1.74 s, total: 3.84 s
Wall time: 8.06 s


### 1.00-degree resolution = 42.5 MB / file

In [None]:
%%time
URL = f"https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 1.17 s, sys: 139 ms, total: 1.31 s
Wall time: 1.58 s


In [None]:
%%time
URL = f'https://noaagfs.blob.core.windows.net/gfs/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108'

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 1.19 s, sys: 148 ms, total: 1.33 s
Wall time: 1.43 s


In [None]:
%%time
URL = f"https://noaa-gfs-bdp-pds.s3.amazonaws.com/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 1.17 s, sys: 160 ms, total: 1.33 s
Wall time: 2.24 s


In [None]:
%%time
URL = f"https://storage.googleapis.com/global-forecast-system/gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"

filename, _ = urllib.request.urlretrieve(URL)
ds = xr.open_dataset(
        filename,
        engine="cfgrib",
        filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
        backend_kwargs={"errors": "ignore"}
)

CPU times: user 1.17 s, sys: 166 ms, total: 1.34 s
Wall time: 1.58 s


---

### Concatenate 25 GFS files at 0.25-degree resolution (steps 0 - 24)

In [None]:
import os
import warnings
# Ignore every Python warning from this point on; presumably to keep the
# benchmark cell outputs uncluttered — TODO confirm.
warnings.filterwarnings('ignore')

def file_path(source: str, cycle_runtime: int, forecast_hour: int, year: int, month: int, day: int, resolution_degree: float) -> str:
    """Build the download URL of one GFS ``pgrb2`` file.

    The result has the form
    ``<prefix>gfs.YYYYMMDD/CC/atmos/gfs.tCCz.pgrb2.RpRR.fHHH``.

    Args:
        source: Which host to build the URL for: ``'nomads'``, ``'az'``,
            ``'aws'`` or ``'gcs'``.
        cycle_runtime: Cycle hour, zero-padded to two digits (``CC``).
        forecast_hour: Forecast hour, zero-padded to three digits (``HHH``).
        year: Run year, used as given.
        month: Run month, zero-padded to two digits.
        day: Run day, zero-padded to two digits.
        resolution_degree: Grid resolution in degrees, e.g. ``0.25`` ->
            ``0p25``, ``0.5`` -> ``0p50``, ``1.0`` or ``1`` -> ``1p00``.

    Returns:
        The complete URL as a string.

    Raises:
        ValueError: If ``source`` is not one of the supported names.
    """
    prefixes = {
        'nomads': "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/",
        'az': "https://noaagfs.blob.core.windows.net/gfs/",
        'aws': "https://noaa-gfs-bdp-pds.s3.amazonaws.com/",
        'gcs': "https://storage.googleapis.com/global-forecast-system/",
    }

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if source not in prefixes:
        raise ValueError("input source must be one of ['nomads', 'az', 'aws', 'gcs']")
    prefix_path = prefixes[source]

    product_name = "gfs"

    # Going through float() guarantees a "." in the text, so an integer
    # resolution such as 1 is formatted like 1.0 instead of failing.
    whole_part, _, fraction_part = str(float(resolution_degree)).partition(".")

    relative_path = (
        f"{product_name}.{year}{month:>02}{day:>02}/"
        f"{cycle_runtime:>02}/atmos/{product_name}.t{cycle_runtime:>02}z."
        f"pgrb2.{whole_part}p{fraction_part:<02}.f{forecast_hour:>03}"
    )

    # Every prefix already ends with "/", so plain concatenation yields the
    # URL; os.path.join is meant for filesystem paths, not URLs.
    return prefix_path + relative_path

In [None]:
%%time
ds_list = []
for i in range(0, 24 + 1):
    URL = file_path(source='nomads', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=.25)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 1min 10s, sys: 46.6 s, total: 1min 57s
Wall time: 5min 50s


In [None]:
%%time
ds_list = []
for i in range(0, 24 + 1):
    URL = file_path(source='az', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=.25)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 59.3 s, sys: 43.4 s, total: 1min 42s
Wall time: 3min 31s


In [None]:
%%time
ds_list = []
for i in range(0, 24 + 1):
    URL = file_path(source='aws', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=.25)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 1min 12s, sys: 45.4 s, total: 1min 58s
Wall time: 6min 11s


In [None]:
%%time
ds_list = []
for i in range(0, 24 + 1):
    URL = file_path(source='gcs', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=.25)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 1min 5s, sys: 43.5 s, total: 1min 49s
Wall time: 3min 20s


### Concatenate 25 GFS files at 1.00-degree resolution (steps 0 - 72, every 3 forecast hours)

In [None]:
%%time
ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(source='nomads', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=1.)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 30.2 s, sys: 3.98 s, total: 34.2 s
Wall time: 1min 19s


In [None]:
%%time
ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(source='az', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=1.)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 30.2 s, sys: 3.68 s, total: 33.9 s
Wall time: 38.7 s


In [None]:
%%time
ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(source='aws', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=1.)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 30.9 s, sys: 3.96 s, total: 34.8 s
Wall time: 1min 6s


In [None]:
%%time
ds_list = []
for i in range(0, 72 + 1, 3):
    URL = file_path(source='gcs', cycle_runtime=12, forecast_hour=i, year=int(yesterday[:4]), month=int(yesterday[4:6]), day=int(yesterday[6:8]), resolution_degree=1.)
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
         filename,
         engine="cfgrib",
         filter_by_keys={'typeOfLevel': 'pressureFromGroundLayer'},
         backend_kwargs={"errors": "ignore"}
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list,  dim='step')

CPU times: user 31.1 s, sys: 3.73 s, total: 34.8 s
Wall time: 49.7 s
