In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
import geopandas as gpd
import xarray as xr
import rioxarray
import matplotlib.gridspec as gridspec
import cartopy.crs as ccrs
import matplotlib as mpl
from cmcrameri import cm
from pymannkendall import original_test
from scipy.stats import entropy
# --- Configuration -----------------------------------------------------------
# Paths are assembled with os.path.join so they resolve on any OS; on Windows
# the resulting strings are identical to the former raw-string literals.
# Folder holding one `<dataset>.nc` precipitation file per entry of `datasets`.
folder_ts = os.path.join('Data', 'Datasets', 'amz', 'ts')
# Folder receiving the dry-season metric files (gridded and station outputs).
folder_metric = os.path.join('Data', 'Datasets', 'amz', 'dry')
# File stems of the gridded datasets to process.
datasets = ['cru', 'gpcc', 'chirps','imerg', 'terra', 'era_land', 'jra55','merra2']
# Presumably the display labels for `datasets`, in the same order — not used in
# the cells shown here; confirm against the plotting code.
datasets_names = ['CRU', 'GPCC', 'CHIRPS','IMERG-V6', 'TerraClimate', 'ERA5-Land', 'JRA55','MERRA2']
# Dry threshold: a time step counts as "dry" when its precipitation is below
# this value. NOTE(review): units presumably mm/month — confirm.
evapotranspiration = 100

In [18]:
def start_dry(x):
    """Return the position of the first dry time step in ``x``.

    Parameters
    ----------
    x : array-like of bool (or 0/1 values)
        Dry flags for one group of time steps. The values are converted with
        ``np.asarray(x, dtype=float)``, so a pandas Series is read by position
        and its index labels are ignored.

    Returns
    -------
    float
        Zero-based position of the first element equal to ``True`` (1.0), or
        ``np.nan`` when no element is flagged or ``x`` is empty.

    Notes
    -----
    A float is returned in every case. ``np.vectorize`` infers its output
    dtype from the first call, so mixing ``int`` results with ``None``/NaN
    fails as soon as one group has no dry step.
    """
    flags = np.asarray(x, dtype=float)
    # Positions whose flag is exactly 1.0, i.e. True after the float cast.
    hits = np.flatnonzero(flags == 1.0)
    return float(hits[0]) if hits.size else np.nan
def dryest_month(x):
    """Return the position of the smallest value in ``x``.

    Parameters
    ----------
    x : array-like of numbers
        Values for one group of time steps. They are converted with
        ``np.asarray(x, dtype=float)``, so a pandas Series is read by position
        and its index labels are ignored.

    Returns
    -------
    float
        Zero-based position of the minimum, ignoring NaN; the first position
        wins on ties. ``np.nan`` when ``x`` is empty or entirely NaN.

    Notes
    -----
    The result is positional, like ``start_dry``. A label-based ``idxmin``
    would return the caller's row label for Series input, which is not a
    position within the group. A float is returned in every case so that
    ``np.vectorize`` sees one consistent output dtype.
    """
    values = np.asarray(x, dtype=float)
    # np.nanargmin raises on all-NaN input (e.g. a masked grid cell).
    if values.size == 0 or np.isnan(values).all():
        return np.nan
    return float(np.nanargmin(values))

In [None]:
# Compute yearly dry-season metrics for every gridded dataset and save both the
# yearly fields and their median over all years.
# Create the output folders up front so to_netcdf cannot fail on a missing one.
os.makedirs(os.path.join(folder_metric, 'median'), exist_ok=True)
for dataset in datasets:
    # Each dataset is stored as `<folder_ts>/<dataset>.nc`.
    nc_path = os.path.join(folder_ts, dataset + '.nc')
    if not os.path.isfile(nc_path):
        raise FileNotFoundError(f'No time-series file for dataset {dataset!r}: {nc_path}')
    # The context manager closes the NetCDF handle once this dataset is done.
    with xr.open_dataset(nc_path) as ds:
        precip = ds['pr']
        # True where precipitation is below the dry threshold.
        is_dry = precip < evapotranspiration
        # Reduce the `time` axis of each yearly group to a single value.
        # output_dtypes fixes the dtype used by the vectorized call, so a
        # group without a result (None/NaN) cannot break dtype inference.
        reduce_time = dict(input_core_dims=[['time']],
                           output_core_dims=[[]],
                           vectorize=True, dask='parallelized',
                           output_dtypes=[float])
        # Position of the first dry time step in each year.
        first_dry = xr.apply_ufunc(start_dry, is_dry.groupby('time.year'), **reduce_time)
        # Position of the lowest precipitation value in each year.
        driest = xr.apply_ufunc(dryest_month, precip.groupby('time.year'), **reduce_time)
        # Number of dry time steps in each year.
        dry_count = is_dry.groupby('time.year').sum(dim='time')
        # Merge the three metrics into one dataset.
        ds_start = (
            first_dry.astype('float32').to_dataset(name='dry_start')
            .merge(dry_count.to_dataset(name='dsl'))
            .merge(driest.astype('float32').to_dataset(name='driest'))
        )
        ds_median = ds_start.median('year')
        ds_start.to_netcdf(os.path.join(folder_metric, dataset + '.nc'))
        ds_median.to_netcdf(os.path.join(folder_metric, 'median', dataset + '.nc'))


# Station observations: dry-season metrics per station and year

In [52]:
# Station locations; paths use os.path.join so they resolve on any OS.
stations = gpd.read_file(os.path.join('Data', 'Evaluation', 'stations_amz_ANA.geojson'))
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df_stat = (
    pd.read_pickle(os.path.join('Data', 'Evaluation', 'amz_01_20_20bet.pkl'))
    .sort_values(by=['Code', 'Date'])
    .reset_index(drop=True)
)
# Flag records whose total is below the dry threshold.
df_stat['dry'] = df_stat['Total'] < evapotranspiration
# One group per station (`Code`) and calendar year of `Date`.
by_station_year = df_stat.groupby(['Code', df_stat.Date.dt.year])
# Position of the first dry record within each station-year.
df_ts = (
    by_station_year['dry']
    .apply(start_dry)
    .reset_index()
    .rename({'dry': 'start'}, axis='columns')
)
# Position of the lowest total within each station-year. The group values are
# passed as a plain array so the result is a position inside the group, not a
# row label of df_stat. Taking the row label modulo 12 is only right when every
# station contributes a multiple of 12 rows before the year in question.
df_ts['driest'] = (
    by_station_year['Total']
    .apply(lambda totals: dryest_month(totals.to_numpy()))
    .reset_index(drop=True)
)

In [56]:
# Median of the yearly metrics for each station.
df_median = df_ts.groupby('Code')[['start', 'driest']].median().reset_index()
# Drop metric columns left by an earlier run of this cell before merging, so
# re-executing it does not produce suffixed duplicates (start_x / start_y).
stations = (
    stations
    .drop(columns=['start', 'driest'], errors='ignore')
    .merge(df_median, on='Code')
)

In [None]:
# Write the station layer and the per-station yearly table into the metrics
# folder.
stations_geojson_path = os.path.join(folder_metric, 'stations.geojson')
stations_csv_path = os.path.join(folder_metric, 'stations_ts.csv')
stations.to_file(stations_geojson_path, driver='GeoJSON')
df_ts.to_csv(stations_csv_path)