# Setting ERA5 Precipitation for Shetran

This notebook was created to demostrate the process of creating the precipitaiton file use in SHETRAN based in the catchment extent

---

#### Author: 
                LF Velasquez & I Rohrmueller - Newcastle University

#### Date:
                Nov 2022

#### Version:
                1.0

#### Notes:
                - To get jupyter env version type `!jupyter --version` in a python cell
                - Code source: https://github.com/planet-os/notebooks/blob/master/aws/era5-s3-via-boto.ipynb
            
#### Jupyter version:

#### Python version:

---

# Notebook set-up

## 1. Setting Python Modules

In [1]:
import os
from pathlib import Path
import xarray as xr
import pandas as pd
import geopandas as gpd
import rioxarray
import rasterstats
from datetime import datetime
import time

# libraries for working with year and months
import calendar # use to check for leap years
from calendar import monthrange

# ODC modules
import datacube
import numpy as np

# Custom modules
import utils

# Modules for config.ini
from configparser import ConfigParser
config = ConfigParser()


# Multiprocessing
import multiprocessing
from multiprocessing import get_start_method
from multiprocessing import Pool

# Set multiprocessing context
if get_start_method() is not None:
    print(f'Context is already set to: {get_start_method()}')
else:
    multiprocessing.set_start_method('fork') # this is necessary for ipython
    print(f'Context set to: {get_start_method()}')

Context is already set to: fork


### 1.1 Functions

In [2]:
def justify(a, invalid_val=0, axis=1, side='left'):    
    """
    Justifies a 2D array

    Parameters
    ----------
    A : ndarray
        Input array to be justified
    axis : int
        Axis along which justification is to be made
    side : str
        Direction of justification. It could be 'left', 'right', 'up', 'down'
        It should be 'left' or 'right' for axis=1 and 'up' or 'down' for axis=0.

    """

    if invalid_val is np.nan:
        #change to notnull
        mask = pd.notnull(a)
    else:
        mask = a!=invalid_val
    justified_mask = np.sort(mask,axis=axis)
    if (side=='up') | (side=='left'):
        justified_mask = np.flip(justified_mask,axis=axis)
    #change dtype to object
    out = np.full(a.shape, invalid_val, dtype=object)  
    if axis==1:
        out[justified_mask] = a[mask]
    else:
        out.T[justified_mask.T] = a.T[mask.T]
    return out


# Define function to use in the pool
def climate_values(climate_var, dc_data, df_grid, time_split, path_grid, path_main):    
    # VAR_NAME = 'tp'
    # ERA_VAR = climate_var
    lst_fileNames = [] # this will be used to delete the files at the end

    # Create list of timestamps
    lst_time = dc_data.coords['time'].values
        
    # # Loop inside each nc file using the timestamp
    # for count, value in enumerate(lst_time):
        
    # Temp dataframe to store rasterstats dataset
    grid_stats = pd.DataFrame() # need to make sure there are not errors
    grid_stats = df_grid[['uid']].copy()
    
    # Create file name using the date
    file_name = np.datetime_as_string(lst_time[time_split], unit='D')
    file_name = file_name.replace('-', "")
    
    # Read nc file as xarray and select one timestamp at the time
    # prep_array = lst_cds[count_cds].to_xarray()
    clim_tif = dc_data.isel(time=slice(time_split, time_split + 1))

    # Create tif file need for rasterstats
    _tif = Path(path_main / f'shetran_data/temp_data/{climate_var}_{file_name}.tif')
    clim_tif[climate_var].rio.to_raster(_tif)
    # prep_tif[VAR_NAME].rio.to_raster(f'Data/temp/{ERA_VAR}_{file_name}.tif')
    
    # Store file name
    lst_fileNames.append(f'{climate_var}_{file_name}.tif')
    
    ##################################################################################
    # RUNNING THE STATS FOR EACH GRID IN THE CATCHMENT
    ##################################################################################
    # Find the mean precipitation for each grid in the catchment
    # This produces a dict following the same order as the grid geodataframe
    catchm_stats_mean = rasterstats.zonal_stats(str(path_grid), str(_tif), stats="mean", all_touched=True, nodata=-9999)

    # Change the dictionary to a list to be added to the mask
    stats_value = [x['mean'] for x in catchm_stats_mean]

    # Add the climate information to grid geodataframe
    grid_stats[climate_var] = stats_value
    
    # Save to csv file ready for next stage
    _csv = Path(path_main / f'shetran_data/temp_data/{climate_var}_{file_name}.csv')
    grid_stats.to_csv(_csv, index=False)
    
    print(f'{climate_var}_{file_name} done')


## 2. Gobal Variables

In [26]:

# Setting the path to the work environment
dir_abs = Path().resolve().parent.parent

# Make sure temp folder directory is empty
utils.file_remove(Path(dir_abs / 'shetran_data/temp_data/'), 'all')

# Read config file values
config.read('config.ini')

# Setting CRS
crs_global = config.getint('crs_setting', 'GLB')
crs_local = config.getint('crs_setting', 'COL')

# Open Data Cube Product
dc_data = config.get('dc_product', 'LC')

# No data value
ND = config.getint('res_setting', 'NO_DATA')


# Set data cube product to read
''' [era5_reanalysis_tp_daily, era5_reanalysis_pev_daily, era5_reanalysis_tmean_daily,
era5_reanalysis_tmax_daily,era5_reanalysis_tmin_daily]'''
DC_PRODUCT = config.get('dc_product', 'lst_era_products').split(',')

# Set era5 var
''' [total_precipitation, potential_evaporation, 2m_temperature_mean,
2m_temperature_max,2m_temperature_min]'''
ERA_VAR = config.get('era_land', 'lst_era_var').split(',')

/home/geofelpave/Documents/1_PhD_SharePoint/LFPV - PhD - Documents/00_PhD_main/006_GitHub/0065_odc_hydro/shetran_data/temp_data/2m_temperature_min_20220106.tif DELETED!
/home/geofelpave/Documents/1_PhD_SharePoint/LFPV - PhD - Documents/00_PhD_main/006_GitHub/0065_odc_hydro/shetran_data/temp_data/2m_temperature_min_20220122.tif DELETED!
/home/geofelpave/Documents/1_PhD_SharePoint/LFPV - PhD - Documents/00_PhD_main/006_GitHub/0065_odc_hydro/shetran_data/temp_data/2m_temperature_min_20220524.tif DELETED!
/home/geofelpave/Documents/1_PhD_SharePoint/LFPV - PhD - Documents/00_PhD_main/006_GitHub/0065_odc_hydro/shetran_data/temp_data/2m_temperature_min_20220107.tif DELETED!
/home/geofelpave/Documents/1_PhD_SharePoint/LFPV - PhD - Documents/00_PhD_main/006_GitHub/0065_odc_hydro/shetran_data/temp_data/2m_temperature_min_20220518.tif DELETED!
/home/geofelpave/Documents/1_PhD_SharePoint/LFPV - PhD - Documents/00_PhD_main/006_GitHub/0065_odc_hydro/shetran_data/temp_data/2m_temperature_min_20220522

## 3. Start Proces

### 3.1. Reading shapefile

#### Create functions needed

In [27]:
def grid_latlon(soil_grid_df, crs_num):
    # Get the centroid for each grid
    temp_df = pd.DataFrame()
    temp_df = soil_grid_df.to_crs(crs_num)
    temp_df['centroid'] = temp_df['geometry'].centroid

    # Create lat (Y) and lon (X) columns
    temp_df['lat'] = temp_df['centroid'].y.astype(int)
    temp_df['lon'] = temp_df['centroid'].x.astype(int)

    # Return the columns needed
    df_grid = pd.DataFrame(temp_df[['lat', 'lon', 'geometry']].copy())

    
    return df_grid

#### Working with the mask - shapefile

In [35]:
# Read shp file to geopandas dataframe
grid_path = Path(dir_abs / 'shetran_data/active_data/final_mask_wgs84.shp')
grid = gpd.read_file(grid_path)

# Create lat and lot values using the function
'''3116 is the EPSG in Colombia'''
grid_centroids = grid_latlon(grid, crs_local)

# Get list of unique values of lon
'''This will be use to create the right order for the grid id'''
lst_lon = grid_centroids.lon.values
lst_lon = np.unique(lst_lon, axis=0)

# Create list to store the dataframes
lst_df = []

# Loop through list of longitude to create the unique values
for count, value in enumerate(lst_lon):
    temp = grid_centroids.copy()
    temp = temp.loc[grid_centroids['lon'] == value]

    # create list with uid values
    # set range values
    start = count
    end = len(temp.lat.to_list())*len(set(grid_centroids.lon.to_list()))
    increment = len(set(grid_centroids.lon.to_list()))
    
    # create list of numbers'
    '''this uses the dimensions of the grid'''
    lst_number = np.arange(start, end, increment).tolist()
    
    # Add prefix to numbers to make the final uid
    lst_uid = list(map(lambda x: f'grid_{str(x).zfill(5)}', lst_number))
    
    # Add uid as a column in dataframe
    temp['uid'] = lst_number
    
    # add temp to lst_df 
    lst_df.append(temp)


# combine all dataframe
grid_final = pd.concat(lst_df)
grid_final = grid_final.reset_index()

# create geodataframe
grid_final = gpd.GeoDataFrame(grid_final)

# Change geodataframe to WGS84
grid_wgs84 = grid_final.to_crs(crs_global)
grid_wgs84['uid'] = grid_wgs84['uid'] + 1

# # Get bbox
xmin, ymin, xmax, ymax = grid_wgs84.total_bounds
xmin, ymin, xmax, ymax

grid_final['uid'] = grid_final['uid'] + 1
grid_wgs84

Unnamed: 0,index,lat,lon,geometry,uid
0,0,888266,709018,"POLYGON ((-76.70463 3.59102, -76.68665 3.59107...",1
1,1,886266,709018,"POLYGON ((-76.70458 3.57295, -76.68660 3.57300...",39
2,2,884266,709018,"POLYGON ((-76.70453 3.55488, -76.68655 3.55493...",77
3,3,882266,709018,"POLYGON ((-76.70448 3.53681, -76.68650 3.53687...",115
4,4,880266,709018,"POLYGON ((-76.70443 3.51875, -76.68644 3.51880...",153
...,...,...,...,...,...
793,793,856266,783018,"POLYGON ((-76.03853 3.30348, -76.02054 3.30351...",646
794,794,854266,783018,"POLYGON ((-76.03849 3.28540, -76.02051 3.28544...",684
795,795,852266,783018,"POLYGON ((-76.03846 3.26733, -76.02047 3.26736...",722
796,796,850266,783018,"POLYGON ((-76.03842 3.24925, -76.02044 3.24929...",760


### Reading Climate data from ODC

In [36]:
# Set ODC application
dc = datacube.Datacube(app="era_five")

# # Set dask client
# client = dask.distributed.Client()
# display(client)

# Get data
# Load data from the datacube
buffer = 0.125
START_DATE = "2022-01-01"
END_DATE = "2022-07-31"
ds = dc.load(product=DC_PRODUCT[4],
             lat=(ymin - buffer, ymax + buffer),
             lon=(xmin - buffer, xmax + buffer),
             time=(START_DATE, END_DATE),
             dask_chunks={'time': 1, 'longitude': 200, 'latitude': 200}
             )

# Print output data
ds

Unnamed: 0,Array,Chunk
Bytes,66.25 kiB,320 B
Shape,"(212, 8, 10)","(1, 8, 10)"
Dask graph,212 chunks in 1 graph layer,212 chunks in 1 graph layer
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 66.25 kiB 320 B Shape (212, 8, 10) (1, 8, 10) Dask graph 212 chunks in 1 graph layer Data type float32 numpy.ndarray",10  8  212,

Unnamed: 0,Array,Chunk
Bytes,66.25 kiB,320 B
Shape,"(212, 8, 10)","(1, 8, 10)"
Dask graph,212 chunks in 1 graph layer,212 chunks in 1 graph layer
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Working with NC files
Create raster file for each time stamp and get statistics from it. The proces is as follow:

1. Create list of the hours in the dataset
2. Loop thourgh the xarray file slicing it for each time stamp
3. Create .tif file for each time stamp - store in temp folder
4. Find the average precipitation, temperature, evapotranspiration for each grid


The process above has been optimised using parallel processing

In [37]:
# Create list of timestamps to be used for the pool
lst_time = ds.coords['time'].values # list of date values
lst_index = list(range(len(lst_time))) # list index values
climate_var = ERA_VAR[4]

# Run parallel processing
def main():

    with Pool(8) as pool:
        start=time.time()
        result = pool.starmap(climate_values, [(climate_var, ds, grid_wgs84, t, grid_path, dir_abs) for t in lst_index])
        pool.terminate()
        print("Time Taken: ",str(time.time()-start)) 

# entry point
if __name__ == '__main__':
    main()
    print(f'-----\n {ERA_VAR[4]} hourly csv files created between {START_DATE} and {END_DATE}')

2m_temperature_min_20220219 done
2m_temperature_min_20220205 done
2m_temperature_min_20220122 done
2m_temperature_min_20220108 done
2m_temperature_min_20220129 done
2m_temperature_min_20220212 done
2m_temperature_min_20220101 done
2m_temperature_min_20220115 done
2m_temperature_min_20220109 done
2m_temperature_min_20220206 done
2m_temperature_min_20220220 done
2m_temperature_min_20220123 done
2m_temperature_min_20220116 done2m_temperature_min_20220130 done

2m_temperature_min_20220213 done
2m_temperature_min_20220102 done
2m_temperature_min_20220110 done
2m_temperature_min_20220207 done
2m_temperature_min_20220221 done
2m_temperature_min_20220131 done
2m_temperature_min_20220117 done
2m_temperature_min_20220124 done
2m_temperature_min_20220214 done
2m_temperature_min_20220103 done
2m_temperature_min_20220111 done
2m_temperature_min_20220208 done
2m_temperature_min_20220222 done
2m_temperature_min_20220201 done
2m_temperature_min_20220125 done
2m_temperature_min_20220118 done
2m_tempera

Delete .tif files from temp location

In [64]:
utils.file_remove(Path(dir_abs / 'shetran_data/temp_data/'), 'tiff')

All .tif files have been deleted from temp location


## 4. Create final CSV File
This process uses the csv file created from step 3

### 4.1. Set csv files into panda dataframe
Delete shapfile from temp to only work with csv files

In [38]:
# Add file to a list 
'''this is needed as when reding the files they might no be read in the right order'''
lst_files = []
for path in Path(Path(dir_abs / f'shetran_data/temp_data/')).glob(f'*.csv'):
        lst_files.append(path)

# Mkae sure the list is order by file name
lst_files.sort()

# Using the list create dataframe
df_prep = pd.concat((pd.read_csv(f) for f in lst_files), ignore_index=True)

# Make sure we only using the right columns
df_prep = df_prep[['uid',f'{ERA_VAR[4]}']].copy()
df_prep.columns = ['uid',ERA_VAR[4]]
df_prep

Unnamed: 0,uid,2m_temperature_min
0,1,18.192108
1,39,18.192108
2,77,18.192108
3,115,18.192108
4,153,18.192108
...,...,...
169171,646,8.762375
169172,684,9.267334
169173,722,9.267334
169174,760,9.267334


Pivot the table using the uid

In [39]:
df_prep = df_prep.pivot(columns='uid')[ERA_VAR[4]]
df_prep.reset_index()

uid,index,1,2,3,4,5,6,7,8,9,...,789,790,791,792,793,794,795,796,797,798
0,0,18.192108,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169171,169171,,,,,,,,,,...,,,,,,,,,,
169172,169172,,,,,,,,,,...,,,,,,,,,,
169173,169173,,,,,,,,,,...,,,,,,,,,,
169174,169174,,,,,,,,,,...,,,,,,,,,,


Shift all rows making sure that all NaN are at the bottom. This will ensure that the dataframe is in the order of the days we are interested in, producing the final dataframe

In [40]:
# Running the function
df_prep_final = pd.DataFrame(justify(df_prep.values, invalid_val=np.nan, side='up', axis=0), 
                  columns=df_prep.columns)

# Remove all rows where all values are NaN
df_prep_final = df_prep_final.dropna(axis=0, how='all')

df_prep_final

uid,1,2,3,4,5,6,7,8,9,10,...,789,790,791,792,793,794,795,796,797,798
0,18.192108,18.076019,18.076019,18.076019,18.076019,18.205933,18.335846,18.335846,18.335846,18.335846,...,17.512161,16.53717,16.53717,16.53717,16.53717,13.426315,10.31546,10.31546,10.31546,10.31546
1,17.521149,17.405762,17.405762,17.405762,17.405762,17.675934,17.946106,17.946106,17.946106,17.946106,...,17.163223,16.170959,16.170959,16.170959,16.170959,13.166504,10.162048,10.162048,10.162048,10.162048
2,18.054947,17.891541,17.891541,17.891541,17.891541,18.026627,18.161713,18.161713,18.161713,18.161713,...,16.908936,16.013397,16.013397,16.013397,16.013397,12.926727,9.840057,9.840057,9.840057,9.840057
3,17.641739,17.452759,17.452759,17.452759,17.452759,17.398163,17.343567,17.343567,17.343567,17.343567,...,17.195709,16.300873,16.300873,16.300873,16.300873,13.136108,9.971344,9.971344,9.971344,9.971344
4,17.19017,16.941406,16.941406,16.941406,16.941406,16.870239,16.799072,16.799072,16.799072,16.799072,...,16.068329,15.050842,15.050842,15.050842,15.050842,11.596558,8.142273,8.142273,8.142273,8.142273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,17.581757,17.249878,17.249878,17.249878,17.249878,17.175705,17.101532,17.101532,17.101532,17.101532,...,16.228577,15.476654,15.476654,15.476654,15.476654,12.399002,9.32135,9.32135,9.32135,9.32135
208,16.815201,16.462494,16.462494,16.462494,16.462494,16.654785,16.847076,16.847076,16.847076,16.847076,...,16.478775,15.561249,15.561249,15.561249,15.561249,12.232086,8.902924,8.902924,8.902924,8.902924
209,16.327469,16.270538,16.270538,16.270538,16.270538,16.148529,16.02652,16.02652,16.02652,16.02652,...,16.66066,15.520905,15.520905,15.520905,15.520905,12.105515,8.690125,8.690125,8.690125,8.690125
210,18.338242,18.138153,18.138153,18.138153,18.138153,17.989456,17.840759,17.840759,17.840759,17.840759,...,17.274628,16.022614,16.022614,16.022614,16.022614,12.180679,8.338745,8.338745,8.338745,8.338745


### 4.2. Saving the final outpt as csv file

In [68]:
# Creating the text file for min elevation
file_name = f'final_{ERA_VAR[4]}_{START_DATE.split("-")[0]}{START_DATE.split("-")[1]}_{END_DATE.split("-")[0]}{END_DATE.split("-")[1]}'
_csvFinal = Path(dir_abs / f'shetran_data/model_input/{file_name}.csv')
df_prep_final.to_csv(_csvFinal, index=False)
print(f'SHETRAN {ERA_VAR[4]} file created')


SHETRAN 2m_temperature_min file created


## 5. Create climate map
This is base in the uid

### 5.1. Setting dataframe for final file
This works with the geodatrame set up in section 3.1

In [69]:
pivoted = grid_wgs84.pivot(index='lat', columns='lon', values='uid')
pivoted_order = pivoted.sort_values('lat', ascending=False)
pivoted_order

lon,709018,711018,713018,715018,717018,719018,721018,723018,725018,727018,...,765018,767018,769018,771018,773018,775018,777018,779018,781018,783018
lat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
888266,grid_00000,grid_00001,grid_00002,grid_00003,grid_00004,grid_00005,grid_00006,grid_00007,grid_00008,grid_00009,...,grid_00028,grid_00029,grid_00030,grid_00031,grid_00032,grid_00033,grid_00034,grid_00035,grid_00036,grid_00037
886266,grid_00038,grid_00039,grid_00040,grid_00041,grid_00042,grid_00043,grid_00044,grid_00045,grid_00046,grid_00047,...,grid_00066,grid_00067,grid_00068,grid_00069,grid_00070,grid_00071,grid_00072,grid_00073,grid_00074,grid_00075
884266,grid_00076,grid_00077,grid_00078,grid_00079,grid_00080,grid_00081,grid_00082,grid_00083,grid_00084,grid_00085,...,grid_00104,grid_00105,grid_00106,grid_00107,grid_00108,grid_00109,grid_00110,grid_00111,grid_00112,grid_00113
882266,grid_00114,grid_00115,grid_00116,grid_00117,grid_00118,grid_00119,grid_00120,grid_00121,grid_00122,grid_00123,...,grid_00142,grid_00143,grid_00144,grid_00145,grid_00146,grid_00147,grid_00148,grid_00149,grid_00150,grid_00151
880266,grid_00152,grid_00153,grid_00154,grid_00155,grid_00156,grid_00157,grid_00158,grid_00159,grid_00160,grid_00161,...,grid_00180,grid_00181,grid_00182,grid_00183,grid_00184,grid_00185,grid_00186,grid_00187,grid_00188,grid_00189
878266,grid_00190,grid_00191,grid_00192,grid_00193,grid_00194,grid_00195,grid_00196,grid_00197,grid_00198,grid_00199,...,grid_00218,grid_00219,grid_00220,grid_00221,grid_00222,grid_00223,grid_00224,grid_00225,grid_00226,grid_00227
876266,grid_00228,grid_00229,grid_00230,grid_00231,grid_00232,grid_00233,grid_00234,grid_00235,grid_00236,grid_00237,...,grid_00256,grid_00257,grid_00258,grid_00259,grid_00260,grid_00261,grid_00262,grid_00263,grid_00264,grid_00265
874266,grid_00266,grid_00267,grid_00268,grid_00269,grid_00270,grid_00271,grid_00272,grid_00273,grid_00274,grid_00275,...,grid_00294,grid_00295,grid_00296,grid_00297,grid_00298,grid_00299,grid_00300,grid_00301,grid_00302,grid_00303
872266,grid_00304,grid_00305,grid_00306,grid_00307,grid_00308,grid_00309,grid_00310,grid_00311,grid_00312,grid_00313,...,grid_00332,grid_00333,grid_00334,grid_00335,grid_00336,grid_00337,grid_00338,grid_00339,grid_00340,grid_00341
870266,grid_00342,grid_00343,grid_00344,grid_00345,grid_00346,grid_00347,grid_00348,grid_00349,grid_00350,grid_00351,...,grid_00370,grid_00371,grid_00372,grid_00373,grid_00374,grid_00375,grid_00376,grid_00377,grid_00378,grid_00379


### 5.2. Saving the final outpt as csv file

In [70]:
# Creating the text file for min elevation
utils.shetran_csv_file(dir_abs, f'final_{ERA_VAR[4]}_map_SHETRAN', pivoted_order, 's')
print(f'SHETRAN {ERA_VAR[4]} map file created')

SHETRAN 2m_temperature_min map file created
