# Appendix 02 - Data preprocessing: *ERA5 data*

In [1]:
import xarray as xr
import os
import numpy as np
import cftime 
import xarray as xr
import rioxarray
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import mapping
import dask 

# dask.config.set(**{'array.slicing.split_large_chunks': True})
import pandas as pd

In [2]:
# Boundary polygons used later as the clipping mask: layer 'ADM_0' of the
# world GeoPackage.
world_boundary_file = '../data/raw/external/GADM41_WORLD.gpkg'
world_boundary = gpd.read_file(
    world_boundary_file,
    layer='ADM_0',
)

Paths to the complete files:
- Temperature:      `../data/raw/ERA5/2m_temperature`
- Precipitation:    `../data/raw/ERA5/total_precipitation`

## STEP 1: regridding

- Define a target grid:


gridtype = lonlat   <br />
xsize    = 360      <br />
ysize    = 180      <br />
xfirst   = -179.5   <br />
xinc     = 1        <br />
yfirst   = -89.5    <br />
yinc     = 1        <br />


In [3]:
def sel_years(dataset, start, end):
    """Restrict ``dataset`` to the ``start``..``end`` window of its time axis.

    Parameters
    ----------
    dataset : object exposing ``.sel`` (an xarray dataset in this notebook)
    start, end : bounds handed to ``slice(start, end)`` for the ``time``
        selection; the pipeline in this notebook passes year strings.

    Returns
    -------
    The selected data, re-chunked with ``time=-1`` (one chunk spanning the
    whole time axis).
    """
    time_window = slice(start, end)
    selected = dataset.sel(time=time_window)
    return selected.chunk({'time': -1})


def interpolate_na(dataset):
    """Fill missing days on the time axis by linear interpolation.

    A gap-free daily index is built from the first to the last time stamp of
    ``dataset``; the data are reindexed onto it (missing dates become NaN) and
    the NaNs are then filled by linear interpolation along ``time``.

    If the daily index cannot be built from the time axis as-is, the dataset
    is converted to the 'standard' calendar and the index is built again
    (presumably needed for non-standard calendars -- TODO confirm which
    inputs hit this path).

    Parameters
    ----------
    dataset : xarray dataset with a ``time`` coordinate

    Returns
    -------
    The reindexed, interpolated dataset (calendar-converted when the fallback
    path was taken).
    """

    def _daily_index(ds):
        # Complete daily index covering the full time span of ``ds``.
        time = ds.time
        return pd.date_range(start=time.min().item(), end=time.max().item(), freq='D')

    try:
        complete_time_index = _daily_index(dataset)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are never swallowed here.
        dataset = dataset.convert_calendar('standard')
        complete_time_index = _daily_index(dataset)

    # Reindex so that every date is present, including the missing ones.
    ds_reindexed = dataset.reindex(time=complete_time_index)

    # Interpolate to fill the values introduced as NaN by the reindexing.
    return ds_reindexed.interpolate_na(dim='time', method='linear')



def clean_cut(dataset, boundary = None, window = None, remove_empty = True):
    """Normalise the calendar, optionally clip, smooth and trim a dataset.

    Steps, in order:

    1. convert the time axis to the 'noleap' calendar and drop the ``bnds``
       dimension if the dataset has one;
    2. if ``boundary`` is given, clip the data to its geometries;
    3. if ``window`` is given, replace the data by a centered rolling mean
       over ``window`` time steps;
    4. if ``remove_empty`` is true, drop the first and the last calendar year.

    Parameters
    ----------
    dataset : xarray dataset with ``time``, ``lon`` and ``lat`` dimensions
    boundary : optional object with ``.geometry`` and ``.crs`` (the notebook
        passes a GeoDataFrame); ``None`` skips the clipping step
    window : optional rolling-window length in time steps; ``None`` skips the
        smoothing step
    remove_empty : when true, the first and last year of the record are cut

    Returns
    -------
    The processed dataset.
    """

    # Converting calendar and removing useless dimensions.
    # errors='ignore' keeps inputs without a 'bnds' dimension from failing.
    dataset = dataset.convert_calendar('noleap')
    dataset = dataset.drop_dims('bnds', errors='ignore')

    if boundary is not None:
        # Declare the spatial dims and CRS so rioxarray can clip. The in-place
        # calls touch only the local object created by the steps above.
        dataset.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
        dataset.rio.write_crs("epsg:4326", inplace=True)

        # Masking the dataset with the boundary geometries
        dataset = dataset.rio.clip(boundary.geometry.apply(mapping), boundary.crs, drop=True)

    if window is not None:
        dataset = dataset.rolling(time=window, center=True).mean()

    if remove_empty:
        # Drop the first and last calendar year of the record (presumably to
        # discard the edge steps that a centered rolling mean leaves
        # incomplete -- TODO confirm).
        start_year = dataset.time.dt.year.min().values
        end_year = dataset.time.dt.year.max().values
        dataset = dataset.sel(time=slice(str(start_year+1),str(end_year-1)))

    return dataset


def remap_cdo(in_file, out_file, grid_file):
    """Bilinearly remap ``in_file`` onto the grid in ``grid_file`` via ``cdo``.

    Runs ``cdo remapbil,<grid_file> <inputs...> <out_file>`` as a subprocess.

    Parameters
    ----------
    in_file : input path; may contain glob wildcards (``*``, ``?``, ``[]``),
        which are expanded here in sorted order
    out_file : path of the remapped file to write
    grid_file : path of the cdo target-grid description

    Raises
    ------
    subprocess.CalledProcessError
        If cdo exits with a non-zero status (previously ignored, which let
        later steps run on a missing or stale output file).
    FileNotFoundError
        If the ``cdo`` executable is not found.
    """
    import glob
    import subprocess

    # No shell is involved, so the wildcard is expanded here; sorting gives a
    # deterministic order. An unmatched pattern is passed through unchanged so
    # that cdo itself reports the missing input.
    inputs = sorted(glob.glob(in_file)) or [in_file]

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are handed to cdo verbatim.
    subprocess.run(['cdo', f'remapbil,{grid_file}', *inputs, out_file], check=True)




def standard_preprocess(in_path, temp_path, out_path, start_year, end_year, grid_file, boundary = None, window = None, remove_empty = True, out_filename = 'final'):
    """Run the full preprocessing chain for one variable directory.

    The ``*.nc`` input under ``in_path`` is remapped with cdo to
    ``<temp_path>/temp.nc``, restricted to ``start_year``..``end_year``,
    gap-filled, passed through ``clean_cut`` and written to
    ``<out_path>/<out_filename>.nc``. The temporary file is removed afterwards,
    also when a step fails.

    Parameters
    ----------
    in_path : directory holding the raw ``*.nc`` input
    temp_path : directory for the intermediate remapped file (created if missing)
    out_path : directory for the final NetCDF file (created if missing)
    start_year, end_year : bounds of the time selection; converted with ``str``
    grid_file : cdo target-grid description passed to ``remap_cdo``
    boundary, window, remove_empty : forwarded unchanged to ``clean_cut``
    out_filename : stem of the output file name (``.nc`` is appended)
    """

    # A fresh checkout may not have the working/output directories yet.
    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(out_path, exist_ok=True)

    temp_file = f'{temp_path}/temp.nc'

    try:
        remap_cdo(f'{in_path}/*.nc', temp_file, grid_file)

        # The context manager closes the file handle before the temp file is
        # deleted; the write happens inside the block, while the lazily loaded
        # data are still readable.
        with xr.open_mfdataset(temp_file) as raw_dataset:
            dataset_work = sel_years(raw_dataset, str(start_year),str(end_year))
            dataset_work = interpolate_na(dataset_work)
            dataset_work = clean_cut(dataset_work, boundary, window, remove_empty)

            dataset_work.to_netcdf(f'{out_path}/{out_filename}.nc')
    finally:
        # Never leave a stale temp.nc behind: a later run could otherwise pick
        # it up after a failed remap.
        if os.path.exists(temp_file):
            os.remove(temp_file)

### ERA5

In [4]:
# One entry per variable, in the same order in all three lists
# (the lists are zipped together when the pipeline is run).
raw_data_paths = [
    '../data/raw/ERA5/2m_temperature',
    '../data/raw/ERA5/total_precipitation',
]
temp_data_paths = [
    '../data/temp/ERA5/2m_temperature',
    '../data/temp/ERA5/total_precipitation',
]
preproc_data_paths = [
    '../data/preprocessed/ERA5/2m_temperature',
    '../data/preprocessed/ERA5/total_precipitation',
]

# cdo description of the target grid used for the remapping
target_grid_path = '../data/preprocessed/ERA5/target_grid.txt'


In [5]:
# Preprocess every ERA5 variable: years 1968-2019, clipped to the world
# boundaries, with a 15-step centered rolling mean and the first/last year cut.
for raw_path, temp_path, preprocess_path in zip(raw_data_paths, temp_data_paths, preproc_data_paths):
    standard_preprocess(
        in_path=raw_path,
        temp_path=temp_path,
        out_path=preprocess_path,
        start_year=1968,
        end_year=2019,
        grid_file=target_grid_path,
        boundary=world_boundary,
        window=15,
        remove_empty=True,
        out_filename='final',
    )

cdo    remapbil: Bilinear weights from lonlat (141x141) to lonlat (41x36) grid
cdo    remapbil: Processed 508297527 values from 1 variable over 25567 timesteps [18.83s 120MB].




cdo    remapbil: Bilinear weights from lonlat (141x141) to lonlat (41x36) grid
cdo    remapbil: Processed 536111046 values from 1 variable over 26966 timesteps [23.01s 124MB].


In [6]:
# # raw_data_paths       = ['../data/raw/CMIP6/EC-Earth3/ssp585/2m_temperature', '../data/raw/CMIP6/EC-Earth3/ssp585/total_precipitation']
# # temp_data_paths      = ['../data/temp/CMIP6/EC-Earth3/ssp585/2m_temperature', '../data/temp/CMIP6/EC-Earth3/ssp585/total_precipitation']
# # preproc_data_paths   = ['../data/preprocessed/CMIP6/EC-Earth3/ssp585/2m_temperature', '../data/preprocessed/CMIP6/EC-Earth3/ssp585/total_precipitation']

# raw_data_paths       = ['../data/raw/CMIP6/EC-Earth3/ssp585/total_precipitation']
# temp_data_paths      = ['../data/temp/CMIP6/EC-Earth3/ssp585/total_precipitation']
# preproc_data_paths   = ['../data/preprocessed/CMIP6/EC-Earth3/ssp585/total_precipitation']

# target_grid_path    = '../data/preprocessed/ERA5/target_grid.txt' 

In [7]:
# for raw_path, temp_path, preprocess_path in zip(raw_data_paths,temp_data_paths,preproc_data_paths):

#     standard_preprocess(raw_path, temp_path, preprocess_path, 2019, 2041, target_grid_path, world_boundary, 15, True, 'short_term_2020-2040')
#     standard_preprocess(raw_path, temp_path, preprocess_path, 2038, 2071, target_grid_path, world_boundary, 15, True, 'medium_term_2040-2070')
#     standard_preprocess(raw_path, temp_path, preprocess_path, 2069, 2199, target_grid_path, world_boundary, 15, True, 'long_term_2070-2100')