## Initial Steps for Working with GRIB Dataset in Python

### Step 1: Install Required Libraries
You will need pygrib, xarray, numpy, pandas, and matplotlib.
Use the following command to install them:
!pip install pygrib xarray numpy pandas matplotlib

In [1]:
import pygrib
import os
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

### Step 2: Load and Explore the GRIB Data

In [2]:
# Load the GRIB file (Replace with the path to your GRIB file)
grib_file_path = './era5_spain.grib'
backend_kwargs = {"filter_by_keys": {"shortName": "t2m"}}  # Replace "t2m" with the desired variable
grib_data = pygrib.open(grib_file_path)

In [4]:
import xarray as xr

# Load the GRIB dataset
data = xr.open_dataset('./era5_spain.grib', engine="cfgrib")

# Print available coordinates
print("Coordinates of the dataset:")
print(data.coords)

# Optionally, print specific coordinate values
print("\nLatitude values:")
print(data["latitude"].values)

print("\nLongitude values:")
print(data["longitude"].values)

# If the dataset has time:
if "time" in data.coords:
    print("\nTime values:")
    print(data["time"].values)

skipping variable: paramId==228 shortName='tp'
Traceback (most recent call last):
  File "/home/jonas/.local/lib/python3.10/site-packages/cfgrib/dataset.py", line 721, in build_dataset_components
    dict_merge(variables, coord_vars)
  File "/home/jonas/.local/lib/python3.10/site-packages/cfgrib/dataset.py", line 639, in dict_merge
    raise DatasetBuildError(
cfgrib.dataset.DatasetBuildError: key present and new value is different: key='time' value=Variable(dimensions=('time',), data=array([1546300800, 1546344000, 1546387200, ..., 1703937600, 1703980800,
       1704024000])) new_value=Variable(dimensions=('time',), data=array([1546279200, 1546322400, 1546365600, ..., 1703916000, 1703959200,
       1704002400]))
skipping variable: paramId==182 shortName='e'
Traceback (most recent call last):
  File "/home/jonas/.local/lib/python3.10/site-packages/cfgrib/dataset.py", line 721, in build_dataset_components
    dict_merge(variables, coord_vars)
  File "/home/jonas/.local/lib/python3.10/sit

Coordinates of the dataset:
Coordinates:
    number               int64 8B ...
  * time                 (time) datetime64[ns] 29kB 2019-01-01 ... 2023-12-31...
    step                 timedelta64[ns] 8B ...
    surface              float64 8B ...
  * latitude             (latitude) float64 72B 44.0 43.0 42.0 ... 37.0 36.0
  * longitude            (longitude) float64 120B -10.0 -9.0 -8.0 ... 3.0 4.0
    valid_time           (time) datetime64[ns] 29kB ...
    depthBelowLandLayer  float64 8B ...

Latitude values:
[44. 43. 42. 41. 40. 39. 38. 37. 36.]

Longitude values:
[-10.  -9.  -8.  -7.  -6.  -5.  -4.  -3.  -2.  -1.   0.   1.   2.   3.
   4.]

Time values:
['2019-01-01T00:00:00.000000000' '2019-01-01T12:00:00.000000000'
 '2019-01-02T00:00:00.000000000' ... '2023-12-30T12:00:00.000000000'
 '2023-12-31T00:00:00.000000000' '2023-12-31T12:00:00.000000000']


In [None]:
# Step 1: Extract temperature data (e.g., variable "t2m")
temperature = data["t2m"]

# Step 2: Filter measurements at 12:00
data_12 = data.sel(time=temperature["time"].dt.hour == 12)

# Step 3: Restrict to May–August
data_summer = data_12.sel(time=temp_12["time"].dt.month.isin([5, 6, 7, 8]))

temp_july_aug2 = data_summer["t2m"].sel(time=temp_summer["time"].dt.month.isin([7, 8]))

# Step 4: Extract July–August for percentile calculation
temp_july_aug = temp_summer.sel(time=temp_summer["time"].dt.month.isin([7, 8]))

# Step 5: Compute the 95th percentile for each location in July–August
percentile_95 = temp_july_aug.quantile(0.95, dim="time")

# Step 6: Label hot days (May–August) based on the 95th percentile
hot_days = temp_summer > percentile_95

# Step 7: Add the "hot_day" label to the dataset without modifying other variables
data_labeled = data.assign(hot_day=hot_days)

# Save the labeled dataset for further analysis
# data_labeled.to_netcdf("labeled_dataset.nc")

# Optional: Verify the results
print(data)
print(data_labeled.where(data_labeled["hot_day"], drop=True))




In [31]:
# Filter dataset to retain only hot days (where hot_day == True)
hot_days = data_labeled["hot_day"].where(data_labeled["hot_day"], drop=True)

# Stack latitude and longitude to create a single dimension
hot_days_stacked = hot_days.stack(points=["latitude", "longitude"])

# Drop NaNs (non-hot days)
hot_days_stacked = hot_days_stacked.dropna("points")

# Reset the index to include 'time', 'latitude', and 'longitude'
hot_days_stacked = hot_days_stacked.reset_index(["time", "latitude", "longitude"])

# Convert to a DataFrame
hot_days_df = hot_days_stacked.to_dataframe(name="hot_day")

# Display the result (date and coordinates)
print(hot_days_df[["time", "latitude", "longitude"]])


<xarray.DataArray 'hot_day' (time: 3267, points: 135)> Size: 4MB
array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])
Coordinates:
  * time                 (time) datetime64[ns] 26kB 2019-01-01 ... 2023-12-31...
    number               int64 8B 0
    step                 timedelta64[ns] 8B 00:00:00
    surface              float64 8B 0.0
    valid_time           (time) datetime64[ns] 26kB 2019-01-01 ... 2023-12-31...
    depthBelowLandLayer  float64 8B 0.0
    quantile             float64 8B 0.95
  * points               (points) object 1kB MultiIndex
  * latitude             (points) float64 1kB 44.0 44.0 44.0 ... 36.0 36.0 36.0
  * longitude            (points) float64 1kB -10.0 -9.0 -8.0 ... 2.0 3.0 4.0


In [20]:
import pandas as pd

# Filter the dataset for hot days
hot_day_data = data_labeled["hot_day"].where(data_labeled["hot_day"], drop=True)

# Stack only latitude and longitude to create a 1D representation
hot_days = hot_day_data.stack(points=["latitude", "longitude"]).dropna("points")

# Ensure no duplicate dimensions or conflicts during conversion
hot_days_df = hot_days.reset_index(["latitude", "longitude", "time"]).to_dataframe(name="hot_day")

# Print 50 hot days with dates and coordinates
print("Sample of 50 hot days with dates and coordinates:")
print(hot_days_df.head(50))


Sample of 50 hot days with dates and coordinates:
Empty DataFrame
Columns: [number, step, surface, valid_time, depthBelowLandLayer, quantile, latitude, longitude, hot_day]
Index: []


In [14]:
# Inspect the 95th percentile values
print(percentile_95-273.15)

# Compare these values with the summer temperatures
print(temp_summer.max()-273.15)


<xarray.DataArray 't2m' (latitude: 9, longitude: 15)> Size: 1kB
array([[20.38129883, 20.55343018, 20.83851318, 21.68621826, 22.30810547,
        22.59553223, 22.82792969, 23.69127197, 24.07894287, 35.16433105,
        36.30611572, 36.59200439, 34.18879395, 30.74462891, 34.67364502],
       [20.44396973, 26.47862549, 28.12498779, 28.24962158, 26.27336426,
        27.31981201, 30.13179932, 31.76206055, 30.45926514, 29.80283203,
        27.5637085 , 30.4756958 , 31.76092529, 34.75413818, 26.07731934],
       [20.81104736, 19.79202881, 29.82106934, 30.27561035, 34.40535889,
        34.08448486, 35.16856689, 31.85689697, 34.50469971, 36.51617432,
        36.6618042 , 34.1029541 , 32.05683594, 32.06547852, 26.5920166 ],
       [21.12652588, 20.13623047, 31.01103516, 33.77158203, 33.54681396,
        34.86575928, 32.60421143, 34.82915039, 35.00170898, 34.67012939,
        36.78917236, 30.81690674, 27.69758301, 27.76158447, 27.8460083 ],
       [21.12706299, 21.09835205, 33.28282471, 36.369030

In [16]:
print(temp_summer-273)


<xarray.DataArray 't2m' (time: 615, latitude: 9, longitude: 15)> Size: 332kB
array([[[12.492676, 12.529785, 12.541504, ..., 19.512207, 17.535645,
         21.35205 ],
        [13.021973, 15.27002 , 14.38916 , ..., 18.305176, 22.762207,
         15.477051],
        [13.57666 , 15.20166 , 15.531738, ..., 18.557129, 20.035645,
         16.256348],
        ...,
        [15.10791 , 16.406738, 25.105957, ..., 16.904785, 16.885254,
         16.631348],
        [15.76416 , 15.693848, 19.436035, ..., 17.322754, 17.123535,
         16.586426],
        [16.141113, 17.141113, 18.350098, ..., 24.03955 , 22.721191,
         22.416504]],

       [[13.045654, 12.633545, 12.395264, ..., 16.24878 , 13.014404,
         19.957764],
        [12.68042 , 15.338623, 14.531982, ..., 12.567139, 19.444092,
         15.692139],
        [13.446045, 15.99292 , 16.864014, ..., 17.04956 , 17.703857,
         14.969482],
...
        [19.854492, 18.94043 , 29.00879 , ..., 25.879883, 25.838867,
         25.555664],
    