# Exploring "ERA5" using tensorlakehouse

### Data description
* Collection/dataset: "ERA5"
* Data Source: GeoTIFF on Cloud Object Store (COS)

### Functionalities
* tensorlakehouse (openEO) functionalities:
    - describe collection
    - load collection

### Setup

1. Create a Python virtualenv (Python 3.11.7 is the recommended version).
2. Clone the repository:
```
git clone https://github.com/IBM/tensorlakehouse-openeo-driver.git
``` 

3. Go to the tutorials directory:
```
cd tensorlakehouse-openeo-driver/tutorials
```

4. Install dependencies:
```
pip install -r tutorial_requirements.txt
```

5. Run Jupyter Notebook or JupyterLab:
```
jupyter lab .
```


In [1]:
import rioxarray as rxr
import os
# NOTE(review): presumably this has to be set before geopandas is imported (below)
# so that geopandas selects shapely rather than pygeos as its geometry backend — confirm.
os.environ['USE_PYGEOS'] = '0'
import matplotlib.pyplot as plt
from pathlib import Path
# openeo is the client library used to query the openEO service
import openeo
# pystac_client is the client library used to query the STAC service
from pystac_client import Client
# os was already imported above; this repeated import is a no-op
import os
# load_dotenv is called in the next cell, before the credentials are read from os.environ
from dotenv import load_dotenv
import pandas as pd
import xarray as xr
# shape() turns the GeoJSON geometry dict of a STAC item into a shapely geometry
from shapely.geometry import shape
import geopandas
import folium
# time.sleep is used while polling the batch job status
import time

Set the URLs of the STAC and openEO services and the credentials used to authenticate with openEO

In [2]:
# load_dotenv() runs first so that the os.environ lookups below can see the
# values it provides; the credentials themselves are never written in this notebook.
load_dotenv()

# Service endpoints
STAC_URL = "https://stac-fastapi-pgstac-geospatial-be.apps.fmaas-backend.fmaas.res.ibm.com"
OPENEO_URL = "https://tensorlakehouse-openeo-driver-geospatial-be.apps.fmaas-backend.fmaas.res.ibm.com"

# Required settings: a missing variable raises KeyError right here
OPENEO_AUTH_CLIENT_ID = os.environ["OPENEO_AUTH_CLIENT_ID"]
OPENEO_AUTH_CLIENT_SECRET = os.environ["OPENEO_AUTH_CLIENT_SECRET"]
APPID_USERNAME = os.environ["APPID_USERNAME"]
APPID_PASSWORD = os.environ["APPID_PASSWORD"]

# Open a connection to tensorlakehouse (openEO) and authenticate with the
# OIDC resource-owner password-credentials flow against the "app_id" provider
tensorlakehouse_conn = (
    openeo.connect(OPENEO_URL)
    .authenticate_oidc_resource_owner_password_credentials(
        username=APPID_USERNAME,
        password=APPID_PASSWORD,
        client_id=OPENEO_AUTH_CLIENT_ID,
        client_secret=OPENEO_AUTH_CLIENT_SECRET,
        provider_id="app_id",
    )
)

# STAC client used to search the items behind the collection
catalog = Client.open(STAC_URL)

### Setting input parameters: time range, area of interest, collection ID, filter

In [3]:
# Time interval of the query; both ends are combined into a single
# "start/end" string that is passed to the STAC search
start = '2024-01-01T00:00:00Z'
end = '2024-01-01T12:00:00Z'
time_range = f"{start}/{end}"

# Bounding box of the area of interest
west = -1.0
east = 0.0
south = 51.0
north = 52.0

# Identifier of the collection that is searched, described and loaded below
collection_id = "ibm-eis-ga-1-era5-pr-wdc"

# Directory where downloaded files are saved. mkdir(exist_ok=True) replaces the
# exists()-then-mkdir() check: it is safe to re-run, has no check/create race,
# and raises if the path exists but is not a directory.
data_dir = Path("test_data")
data_dir.mkdir(parents=True, exist_ok=True)

### Searching STAC items 

In [4]:
# STAC search restricted to the collection, bounding box and time range set above
bbox = [west, south, east, north]
print(f"Searching {collection_id=} {bbox=} datetime={time_range}")
result = catalog.search(collections=[collection_id], bbox=bbox, datetime=time_range)

# One entry per STAC item, collected column-wise
ids = []
geometry = []
timestamp = []
bands = []

for item in result.items_as_dicts():
    properties = item["properties"]
    ids.append(item["id"])
    # item["geometry"] is a GeoJSON dict; shape() converts it to a shapely geometry
    geometry.append(shape(item["geometry"]))
    timestamp.append(pd.Timestamp(properties["datetime"]))
    # the keys of "cube:variables" are used as the band names of the item
    bands.append(list(properties["cube:variables"].keys()))

# STAC item geometries are GeoJSON (lon/lat, WGS84), so the CRS is declared
# explicitly; without it the frame has no CRS for reprojection or map display.
gdf = geopandas.GeoDataFrame(
    data={"id": ids, "datetime": timestamp, "bands": bands},
    geometry=geometry,
    crs="EPSG:4326",
)
gdf



Searching collection_id='ibm-eis-ga-1-era5-pr-wdc' bbox=[-1.0, 51.0, 0.0, 52.0] datetime=2024-01-01T00:00:00Z/2024-01-01T12:00:00Z


Unnamed: 0,id,datetime,bands,geometry
0,ERA5P_20240101T120000__ttrc_120000_0_0,2024-01-01 12:00:00+00:00,[ttrc],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
1,ERA5P_20240101T120000__ttr_120000_0_0,2024-01-01 12:00:00+00:00,[ttr],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
2,ERA5P_20240101T120000__tsrc_120000_0_0,2024-01-01 12:00:00+00:00,[tsrc],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
3,ERA5P_20240101T120000__tsr_120000_0_0,2024-01-01 12:00:00+00:00,[tsr],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
4,ERA5P_20240101T120000__tp_120000_0_0,2024-01-01 12:00:00+00:00,[tp],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
...,...,...,...,...
619,ERA5P_20240101T000000__10v_000000_0_0,2024-01-01 00:00:00+00:00,[10v],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
620,ERA5P_20240101T000000__10u_000000_0_0,2024-01-01 00:00:00+00:00,[10u],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
621,ERA5P_20240101T000000__10fg_000000_0_0,2024-01-01 00:00:00+00:00,[10fg],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."
622,ERA5P_20240101T000000__100v_000000_0_0,2024-01-01 00:00:00+00:00,[100v],"POLYGON ((-180.00000 -90.00000, -180.00000 90...."


In [18]:
# Distinct item timestamps found by the search, in ascending order
available_timestamps = sorted(gdf["datetime"].unique())
print(f"Available timestamps = {available_timestamps}")

# Union of the band names of all items. The per-item lists are only read:
# calling list.pop() on them would empty the "bands" column of gdf and make
# this cell raise IndexError when it is run a second time.
available_bands = {band for item_bands in gdf["bands"] for band in item_bands}
print(f"Available bands: {available_bands}")

Available timestamps = [Timestamp('2024-01-01 00:00:00+0000', tz='UTC'), Timestamp('2024-01-01 01:00:00+0000', tz='UTC'), Timestamp('2024-01-01 02:00:00+0000', tz='UTC'), Timestamp('2024-01-01 03:00:00+0000', tz='UTC'), Timestamp('2024-01-01 04:00:00+0000', tz='UTC'), Timestamp('2024-01-01 05:00:00+0000', tz='UTC'), Timestamp('2024-01-01 06:00:00+0000', tz='UTC'), Timestamp('2024-01-01 07:00:00+0000', tz='UTC'), Timestamp('2024-01-01 08:00:00+0000', tz='UTC'), Timestamp('2024-01-01 09:00:00+0000', tz='UTC'), Timestamp('2024-01-01 10:00:00+0000', tz='UTC'), Timestamp('2024-01-01 11:00:00+0000', tz='UTC'), Timestamp('2024-01-01 12:00:00+0000', tz='UTC')]
Available bands: {'ttrc', 'sp', 'swvl2', '2t', 'mn2t', '10v', 'ptype', 'tcwv', '100v', '100u', 'msl', 'mxtpr', 'ssrd', 'tcw', 'stl2', 'ssrdc', 'sf', '2d', 'tp', 'mcc', 'mwp', 'fdir', 'tsrc', 'tcrw', 'rsn', '10fg', 'swvl4', 'tsr', 'strd', 'swh', 'hcc', 'tcc', 'ttr', 'mwd', 'swvl3', 'lcc', 'stl1', 'i10fg', 'strdc', '10u', 'sd', 'stl4', 'sw

### Describing the collection using Tensorlakehouse

In [7]:
# Ask the openEO backend for the metadata of the collection; the bare name on
# the last line makes the notebook render it as the cell output.
collection_metadata = tensorlakehouse_conn.describe_collection(collection_id)
collection_metadata

## Loading tensorlakehouse datacube

In [8]:
# Band to load; "2t" is one of the bands reported by the STAC search above
bands = ["2t"]

# Same area of interest and time interval as the STAC search, in the
# keyword form expected by load_collection
spatial_extent = dict(west=west, south=south, east=east, north=north)
temporal_extent = [start, end]

print(f"{collection_id=} {spatial_extent=} {temporal_extent=} {bands=}")

# Builds the datacube on the client side; nothing is downloaded in this cell
load_kwargs = {
    "collection_id": collection_id,
    "spatial_extent": spatial_extent,
    "temporal_extent": temporal_extent,
    "bands": bands,
}
cube = tensorlakehouse_conn.load_collection(**load_kwargs)


collection_id='ibm-eis-ga-1-era5-pr-wdc' spatial_extent={'west': -1.0, 'south': 51.0, 'east': 0.0, 'north': 52.0} temporal_extent=['2024-01-01T00:00:00Z', '2024-01-01T12:00:00Z'] bands=['2t']


In [9]:
# File format requested for the result of the batch job
output_format = "netCDF"
# Append a save_result step to the datacube; `cube` is rebound to the extended
# cube, so running this cell again appends another save_result step.
cube = cube.save_result(format=output_format)

### Submit a batch job and wait until it finishes

In [10]:
# Seconds to wait between two status polls
POLL_INTERVAL_SECONDS = 30
# Statuses after which the job will not change any more
TERMINAL_STATUSES = ("canceled", "finished", "error")

# Create the batch job on the backend.
# NOTE(review): `cube` already ends in the save_result step added in the previous
# cell; some openeo client releases may reject an explicit out_format in that
# case — confirm against the client version pinned in tutorial_requirements.txt.
job = cube.create_job(out_format=output_format)
job_status = job.status()
job_id = job.job_id

# Creating a job does not by itself queue it for processing. Start it only while
# it is still in "created" state, so a backend that queues jobs on creation is
# left alone and the polling loop below cannot spin forever on an unstarted job.
if job_status == "created":
    job.start()
    job_status = job.status()

# Directory that receives the result files of this job; parents=True also
# creates data_dir when it does not exist yet, exist_ok=True makes re-runs safe
output_dir = data_dir / f"test_batch_jobs_{job_id}"
output_dir.mkdir(parents=True, exist_ok=True)

# Poll until the job reaches a terminal status
while job_status not in TERMINAL_STATUSES:
    time.sleep(POLL_INTERVAL_SECONDS)
    job_status = job.status()
    print(f"{job_status=}")

# Only a finished job has results; fail with a clear message otherwise instead
# of attempting a download that cannot succeed.
if job_status != "finished":
    raise RuntimeError(
        f"Batch job {job_id} ended with status {job_status!r}; there are no results to download"
    )

# Fetch the result metadata and download every result file
results = job.get_results()
results.download_files(output_dir)
print(f"Downloaded to {output_dir} directory")

Preflight process graph validation raised: [Internal] 'TensorLakehouseProcessRegistry' object has no attribute 'get_function'


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### List files, open the netcdf file and plot the image

In [None]:
# Everything that was downloaded into the job directory. The paths are sorted
# because glob() yields them in arbitrary order and a later cell opens the
# first netCDF file of this list.
downloaded_files = sorted(output_dir.glob("*"))
downloaded_files

In [None]:
# Keep only the netCDF files among the downloaded files
netcdf_files = [f for f in downloaded_files if f.suffix == ".nc"]
# Fail with an explicit message when the job produced no netCDF file, instead
# of the bare IndexError that netcdf_files[0] would raise.
if not netcdf_files:
    raise FileNotFoundError(f"No .nc file found in {output_dir}")
# Open the first netCDF file as an xarray Dataset and display it
ds = xr.open_dataset(netcdf_files[0])
ds

In [None]:
# Values of the "time" coordinate of the dataset
ds["time"].values

In [None]:
# Pick the variable of the first requested band from the dataset
da = ds[bands[0]]
# Select the first time step and plot it
first_time_step = da.isel(time=0)
first_time_step.plot()