# Appendix 02 - Data preprocessing: *ERA5 data*

In [1]:
import xarray as xr
import os
import numpy as np

import xarray as xr
import rioxarray
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import mapping

import pandas as pd

Paths to the complete (raw) input files:
- Temperature:      `../data/raw/ERA5/2m_temperature`
- Precipitation:    `../data/raw/ERA5/total_precipitation`

## STEP 1: regridding

- Define a target grid (saved as `../data/preprocessed/ERA5/target_grid.txt`, the file passed to `cdo remapbil` below):


gridtype = lonlat   <br />
xsize    = 360      <br />
ysize    = 180      <br />
xfirst   = -179.5   <br />
xinc     = 1        <br />
yfirst   = -89.5    <br />
yinc     = 1        <br />


In [2]:
def _run_cdo(command):
    """Run a shell command and raise if it reports a non-zero status.

    `os.system` returns the command's status instead of raising, so without
    this check a failed CDO call would go unnoticed until a later step breaks.
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(f'Command failed with status {status}: {command}')


# Temperature: select 1960-2019, remap bilinearly onto the target grid, then
# split the result into one file per year.
_run_cdo('cdo remapbil,../data/preprocessed/ERA5/target_grid.txt -selyear,1960/2019 ../data/raw/ERA5/2m_temperature/temperature_1950-2019.nc ../data/temp/ERA5/2m_temperature/temperature_temp.nc')
_run_cdo('cdo splityear ../data/temp/ERA5/2m_temperature/temperature_temp.nc ../data/temp/ERA5/2m_temperature/temperature_')
# The intermediate file sits in the same folder as the yearly files, which are
# later opened with a '*.nc' glob, so it must be removed.
os.remove('../data/temp/ERA5/2m_temperature/temperature_temp.nc')

# Precipitation: same three steps.
_run_cdo('cdo remapbil,../data/preprocessed/ERA5/target_grid.txt -selyear,1960/2019 ../data/raw/ERA5/total_precipitation/precipitation_1950-2023.nc ../data/temp/ERA5/total_precipitation/precipitation_temp.nc')
_run_cdo('cdo splityear ../data/temp/ERA5/total_precipitation/precipitation_temp.nc ../data/temp/ERA5/total_precipitation/precipitation_')
os.remove('../data/temp/ERA5/total_precipitation/precipitation_temp.nc')

cdo(1) selyear: Process started
cdo    remapbil: Bilinear weights from lonlat (141x141) to lonlat (41x36) grid
cdo(1) selyear: Processed 435692115 values from 1 variable over 25567 timesteps.
cdo    remapbil: Processed 435692115 values from 1 variable over 21915 timesteps [6.58s 110MB].
cdo    splityear: Processed 32346540 values from 1 variable over 21915 timesteps [2.19s 90MB].
cdo(1) selyear: Process started
cdo    remapbil: Bilinear weights from lonlat (141x141) to lonlat (41x36) grid
cdo(1) selyear: Processed 434459493 values from 1 variable over 26966 timesteps.
cdo    remapbil: Processed 434459493 values from 1 variable over 21853 timesteps [6.77s 118MB].


In [None]:
# Open the yearly files written by `cdo splityear` as one dataset per variable.
temperature_dataset = xr.open_mfdataset('../data/temp/ERA5/2m_temperature/*.nc')

# Precipitation is additionally rechunked so that the whole time axis sits in
# a single chunk (time=-1).
precipitation_files = '../data/temp/ERA5/total_precipitation/*.nc'
precipitation_dataset = xr.open_mfdataset(precipitation_files)
precipitation_dataset = precipitation_dataset.chunk({'time': -1})



In [None]:
### FILLING MISSING TIME STEPS IN THE PRECIPITATION (TP) DATASET

# Build a gap-free daily time axis running from the first to the last
# timestamp present in the precipitation dataset.
tp_time = precipitation_dataset.time
full_daily_index = pd.date_range(
    start=tp_time.min().item(),
    end=tp_time.max().item(),
    freq='D',
)

# Reindex onto that axis (dates absent from the data become NaN), then fill
# the NaNs by linear interpolation along time.
precipitation_dataset = (
    precipitation_dataset
    .reindex(time=full_daily_index)
    .interpolate_na(dim='time', method='linear')
)

In [None]:
# Read the 'ADM_1' layer of the GADM world GeoPackage; its geometries are used
# later to clip the gridded datasets.
world_boundary_file = '../data/raw/external/GADM41_WORLD.gpkg'
world_boundary = gpd.read_file(filename=world_boundary_file, layer='ADM_1')

In [None]:
def clean_cut(dataset_ori, boundary):
    """Convert a dataset to the 'noleap' calendar and clip it to boundary polygons.

    Parameters
    ----------
    dataset_ori : xarray.Dataset
        Dataset with a ``time`` coordinate and ``lon`` / ``lat`` spatial
        dimensions. Its coordinates are tagged as EPSG:4326.
    boundary : geopandas.GeoDataFrame
        Polygons to keep. ``boundary.geometry`` and ``boundary.crs`` are
        handed to ``rio.clip``.

    Returns
    -------
    xarray.Dataset
        The calendar-converted dataset clipped to ``boundary``; with
        ``drop=True``, data outside the clip extent is dropped.
    """
    # Converting calendar and removing useless dimensions
    prepared = dataset_ori.convert_calendar('noleap')
    # Only drop the bounds dimension when it exists, so datasets written
    # without it do not make `drop_dims` raise.
    if 'bnds' in prepared.dims:
        prepared = prepared.drop_dims('bnds')

    # Setting the dataset up for masking: rioxarray needs to know which
    # dimensions are spatial and which CRS the coordinates are in.
    prepared.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
    prepared.rio.write_crs("epsg:4326", inplace=True)

    # Masking the dataset with the boundary geometries
    return prepared.rio.clip(boundary.geometry.apply(mapping), boundary.crs, drop=True)
    

In [None]:
# Clip each dataset to the world boundaries, smooth it with a centred rolling
# mean over 15 time steps, and keep only 1961-2018.
analysis_period = slice('1961', '2018')

t2m_clipped = clean_cut(temperature_dataset, world_boundary)
dataset_t2m_ori = t2m_clipped.rolling(time=15, center=True).mean().sel(time=analysis_period)

tp_clipped = clean_cut(precipitation_dataset, world_boundary)
dataset_tp_ori = tp_clipped.rolling(time=15, center=True).mean().sel(time=analysis_period)

In [None]:
# Write the preprocessed temperature and precipitation datasets to NetCDF.
t2m_output_path = '../data/preprocessed/ERA5/2m_temperature/temperature_preprocessed.nc'
tp_output_path = '../data/preprocessed/ERA5/total_precipitation/precipitation_preprocessed.nc'

dataset_t2m_ori.to_netcdf(t2m_output_path)
dataset_tp_ori.to_netcdf(tp_output_path)