# Test the speed of reading GFS data at 0.25-degree and 1.00-degree resolution into an `Xarray dataset` from four sources: `NOMADS server`, `Azure Blob Storage`, `AWS S3 Bucket`, and `Google Cloud Storage`, in an Azure Databricks environment.
<br>

## The compute cost for this notebook speed test:
<img src="../img/Databricks_Instance_Cost.png" width="100%"/>

---

In [None]:
# Notebook setup: upgrade pip, then install the packages this notebook uses.
!python -m pip install --upgrade pip
# xarray together with its "complete" extra.
!pip install xarray[complete]
# NOTE(review): eccodes / ecmwflibs are presumably the GRIB decoding libraries
# that cfgrib relies on — confirm both are still needed.
!pip install eccodes
!pip install ecmwflibs
# cfgrib is the engine passed to xr.open_dataset in the cells below.
!pip install cfgrib
# NOTE(review): numpy is pinned to 1.23.0; the reason is not recorded in this
# notebook — confirm the pin is still required.
!pip install numpy==1.23.0

In [None]:
# Restart the notebook's Python process.
# NOTE(review): presumably so the freshly pip-installed packages are picked up — confirm.
dbutils.library.restartPython()

In [None]:
import xarray as xr
import urllib.request
from datetime import datetime, timedelta

from datetime import timezone

# Date of the previous day as "YYYYMMDD", used to build the gfs.<date> folder
# name in the download URLs below.
# The date is taken in UTC rather than local time: the runs are addressed by
# their "z" (UTC) cycle hour, so a local-time date can point at a run folder
# that has not been published yet when the cluster is not on UTC.
yesterday = datetime.now(timezone.utc) - timedelta(days=1)
yesterday = yesterday.strftime("%Y%m%d")

yesterday

### 0.25-degree resolution = 515 MB / file
- NOMADS: 55s
- Azure Blob Storage: 11s
- AWS S3 Bucket: 9s
- Google Cloud Storage: 6s

In [None]:
# NOMADS server, 0.25-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

In [None]:
# Azure Blob Storage, 0.25-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://noaagfs.blob.core.windows.net/gfs/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

In [None]:
# AWS S3 bucket, 0.25-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://noaa-gfs-bdp-pds.s3.amazonaws.com/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

In [None]:
# Google Cloud Storage, 0.25-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://storage.googleapis.com/global-forecast-system/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.0p25.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds


### 1.00-degree resolution = 42.5 MB / file
- NOMADS: 2s
- Azure Blob Storage: 2s
- AWS S3 Bucket: 3s
- Google Cloud Storage: 2s

In [None]:
# NOMADS server, 1.00-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

In [None]:
# Azure Blob Storage, 1.00-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://noaagfs.blob.core.windows.net/gfs/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

In [None]:
# AWS S3 bucket, 1.00-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://noaa-gfs-bdp-pds.s3.amazonaws.com/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

In [None]:
# Google Cloud Storage, 1.00-degree grid: yesterday's 12z run, forecast hour 108.
URL = (
    "https://storage.googleapis.com/global-forecast-system/"
    f"gfs.{yesterday}/12/atmos/gfs.t12z.pgrb2.1p00.f108"
)

# With no target path given, urlretrieve stores the download in a temporary
# file and returns that file's path.
filename, _ = urllib.request.urlretrieve(URL)

# Open the GRIB file through cfgrib, keeping only the messages whose
# typeOfLevel is 'pressureFromGroundLayer'.
ds = xr.open_dataset(
    filename,
    backend_kwargs={"errors": "ignore"},
    filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
    engine="cfgrib",
)

ds

---

### Concatenate 25 GFS files at 1.00-degree resolution (forecast steps 0-72 at 3-hour intervals)
- NOMADS: 1m
- Azure Blob Storage: 37s
- AWS S3 Bucket: 50s
- Google Cloud Storage: 42s

In [None]:
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# data-decoding warnings — confirm that is intended beyond keeping output tidy.
warnings.filterwarnings('ignore')

def file_path(source: str, cycle_runtime: int, forecast_hour: int, year: int, month: int, day: int, resolution_degree: float) -> str:
    """Build the download URL of a single GFS ``pgrb2`` file.

    Args:
        source: Key of the data host: ``'nomad'``, ``'az'``, ``'aws'`` or
            ``'gcs'``.
        cycle_runtime: Model cycle hour; zero-padded to two digits in the URL.
        forecast_hour: Forecast step; zero-padded to three digits
            (e.g. ``3`` -> ``f003``).
        year: Year of the model run.
        month: Month of the model run; zero-padded to two digits.
        day: Day of the model run; zero-padded to two digits.
        resolution_degree: Grid resolution in degrees, e.g. ``0.25``, ``0.5``
            or ``1`` / ``1.0``. Rendered with two decimals (``0p25``,
            ``0p50``, ``1p00``).

    Returns:
        The full URL of the requested file on the chosen host.

    Raises:
        ValueError: If ``source`` is not one of the supported keys.
    """
    prefix_by_source = {
        'nomad': "https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod/",
        'az': "https://noaagfs.blob.core.windows.net/gfs/",
        'aws': "https://noaa-gfs-bdp-pds.s3.amazonaws.com/",
        'gcs': "https://storage.googleapis.com/global-forecast-system/",
    }
    try:
        prefix_path = prefix_by_source[source]
    except KeyError:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown source {source!r}; expected one of {sorted(prefix_by_source)}"
        ) from None

    product_name = "gfs"
    # Fixed two-decimal formatting so that 1, 1.0 and 1.00 all yield "1p00";
    # splitting str(value) on "." breaks for ints, which have no decimal part.
    resolution_tag = f"{resolution_degree:.2f}".replace(".", "p")

    relative_path = (
        f"{product_name}.{year}{month:02d}{day:02d}/"
        f"{cycle_runtime:02d}/atmos/{product_name}.t{cycle_runtime:02d}z."
        f"pgrb2.{resolution_tag}.f{forecast_hour:03d}"
    )

    # Every prefix already ends with "/", and URLs always use "/" whatever the
    # operating system, so plain concatenation is used instead of os.path.join.
    return prefix_path + relative_path

In [None]:
# NOMADS: download the 1.00-degree files for forecast hours 0, 3, ..., 72 of
# the 12z run and stack them along the 'step' dimension.
# NOTE(review): the run date is hard-coded to 2024-08-05 — confirm this host
# still serves that run before re-timing.
ds_list = []
for hour in range(0, 73, 3):
    URL = file_path(
        source='nomad',
        cycle_runtime=12,
        forecast_hour=hour,
        year=2024,
        month=8,
        day=5,
        resolution_degree=1.,
    )
    # Each file is saved to its own temporary path by urlretrieve.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        backend_kwargs={"errors": "ignore"},
        filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
        engine="cfgrib",
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

ds_merged

In [None]:
# Azure Blob Storage: download the 1.00-degree files for forecast hours
# 0, 3, ..., 72 of the 12z run on 2024-08-05 and stack them along 'step'.
ds_list = []
for hour in range(0, 73, 3):
    URL = file_path(
        source='az',
        cycle_runtime=12,
        forecast_hour=hour,
        year=2024,
        month=8,
        day=5,
        resolution_degree=1.,
    )
    # Each file is saved to its own temporary path by urlretrieve.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        backend_kwargs={"errors": "ignore"},
        filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
        engine="cfgrib",
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

ds_merged

In [None]:
# AWS S3 bucket: download the 1.00-degree files for forecast hours
# 0, 3, ..., 72 of the 12z run on 2024-08-05 and stack them along 'step'.
ds_list = []
for hour in range(0, 73, 3):
    URL = file_path(
        source='aws',
        cycle_runtime=12,
        forecast_hour=hour,
        year=2024,
        month=8,
        day=5,
        resolution_degree=1.,
    )
    # Each file is saved to its own temporary path by urlretrieve.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        backend_kwargs={"errors": "ignore"},
        filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
        engine="cfgrib",
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

ds_merged

In [None]:
# Google Cloud Storage: download the 1.00-degree files for forecast hours
# 0, 3, ..., 72 of the 12z run on 2024-08-05 and stack them along 'step'.
ds_list = []
for hour in range(0, 73, 3):
    URL = file_path(
        source='gcs',
        cycle_runtime=12,
        forecast_hour=hour,
        year=2024,
        month=8,
        day=5,
        resolution_degree=1.,
    )
    # Each file is saved to its own temporary path by urlretrieve.
    filename, _ = urllib.request.urlretrieve(URL)
    ds = xr.open_dataset(
        filename,
        backend_kwargs={"errors": "ignore"},
        filter_by_keys={"typeOfLevel": "pressureFromGroundLayer"},
        engine="cfgrib",
    )
    ds_list.append(ds)

ds_merged = xr.concat(ds_list, dim='step')

ds_merged