In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Directory holding the intermediate files produced by the pipeline
intermediate_path = '../../intermediate'
# Row of the task list to run (the SLURM array task index selects the
# variable/model combination)
task_id = 0
# CSV that lists every task explicitly
task_list = 'test_run_manager_explicit_list.csv'
# Read the task table and keep only the row belonging to this task
task_list_path = os.path.join(intermediate_path, task_list)
task_details = pd.read_csv(task_list_path).iloc[task_id]

In [3]:
task_details

ESM                                                                model-name
Variable                                                                 hurs
Scenario                                                        scenario-name
Ensemble                                                        ensemble-name
Reference_Dataset                                                    obs-hist
target_period                                                       1979-2014
application_period                                                  2065-2100
ESM_Input_Location          /Users/prim232/gitrepos/basd-pipeline/input/te...
Reference_Input_Location    /Users/prim232/gitrepos/basd-pipeline/input/te...
Output_Location             /Users/prim232/gitrepos/basd-pipeline/output/t...
daily                                                                    True
monthly                                                                  True
Name: 0, dtype: object

In [4]:
# Use logical `not`, not bitwise `~`: on a plain Python bool `~True` is -2 and
# `~False` is -1, both truthy, so the branch would always run. `not` gives the
# intended answer for Python and NumPy booleans alike.
if not task_details.monthly:
    print('yes')

In [5]:
task_details.ESM_Input_Location

'/Users/prim232/gitrepos/basd-pipeline/input/test_data/sim'

In [6]:
pd.isna(task_details.ESM_Input_Location)

False

In [7]:
# Break the 'start-end' application period into its two parts and show them
# joined by an underscore instead of a hyphen.
start, end = task_details.application_period.split('-')
print(f'{start}_{end}')

2065_2100


In [8]:
# Directory the input CSV files are read from
input_path = '../../input'
# Parameter table with one row per variable; blank cells are read as NaN
param_data = pd.read_csv(os.path.join(input_path, 'variable_parameters.csv'))
param_data

Unnamed: 0,variable,distribution,lower_bound,lower_threshold,upper_bound,upper_threshold,trend_preservation,n_iterations,detrend,if_all_invalid_use,...,halfwin_ubc,n_quantiles,randomization_seed,max_change_factor,max_adjustment_factor,parametric,unconditional_ccs_transfer,trendless_bound_frequency,p_value_eps,adjust_p_values
0,hurs,beta,0.0,0.01,100.0,99.99,bounded,10,,0,...,,,,,,,,,,
1,pr,gamma,0.0,1e-06,,,mixed,10,,0,...,,,,,,,,,,
2,tas,normal,,,,,additive,10,True,0,...,,,,,,,,,,
3,tasmax,,,,,,,10,,0,...,,,,,,,,,,
4,tasmin,,,,,,,10,,0,...,,,,,,,,,,
5,rlds,normal,,,,,additive,10,True,0,...,,,,,,,,,,
6,rsds,beta,0.0,0.0001,1.0,0.9999,bounded,10,,0,...,15.0,,,,,,,,,
7,sfcWind,weibull,0.0,0.01,,,mixed,10,,0,...,,,,,,,,,,
8,ps,,,,,,,10,,0,...,,,,,,,,,,
9,tasrange,rice,0.0,0.01,,,mixed,10,,0,...,,,,,,,,,,


In [9]:
pd.isna(param_data.iloc[0].detrend)

True

In [10]:
import basd

In [11]:
# Take the row for 'pr', discard every parameter that is blank (NaN) for it,
# and turn what is left into a plain dict.
pr_rows = param_data.loc[param_data['variable'] == 'pr']
param_dict = pr_rows.dropna(axis=1).to_dict(orient='records')[0]
# The variable name identifies the row; it is not itself a parameter.
param_dict.pop('variable')
param_dict

{'distribution': 'gamma',
 'lower_bound': 0.0,
 'lower_threshold': 1.16e-06,
 'trend_preservation': 'mixed',
 'n_iterations': 10,
 'if_all_invalid_use': 0}

In [12]:
# Force n_iterations to a built-in int when it is present.
# NOTE(review): presumably a guard against the CSV value being parsed as a
# float (e.g. when the column contains blanks) — confirm.
if 'n_iterations' in param_dict:
    param_dict['n_iterations'] = int(param_dict['n_iterations'])
param_dict

{'distribution': 'gamma',
 'lower_bound': 0.0,
 'lower_threshold': 1.16e-06,
 'trend_preservation': 'mixed',
 'n_iterations': 10,
 'if_all_invalid_use': 0}

In [13]:
basd.Parameters(**param_dict)

<basd.ba_params.Parameters at 0x16a5f9f10>

In [14]:
# Output encoding settings, read from encoding.csv in the input directory
encoding_data = pd.read_csv(os.path.join(input_path, 'encoding.csv'))
encoding_data

Unnamed: 0,zlib,shuffle,complevel,fletcher32,contiguous,time_chunk,lat_chunk,lon_chunk,dtype,missing_value,_FillValue
0,True,True,5,False,False,1,max,max,float32,1e+20,1e+20


In [15]:
# Convert the first row of the encoding table to a dict. Columns containing a
# blank (NaN) are dropped first, so their keys are absent from the dict.
encoding_data_dict = encoding_data.dropna(axis=1).to_dict(orient='records')[0]
encoding_data_dict

{'zlib': True,
 'shuffle': True,
 'complevel': 5,
 'fletcher32': False,
 'contiguous': False,
 'time_chunk': 1,
 'lat_chunk': 'max',
 'lon_chunk': 'max',
 'dtype': 'float32',
 'missing_value': 1e+20,
 '_FillValue': 1e+20}

In [16]:
# Fold the three per-dimension chunk settings into a single `chunksizes` tuple
# ordered (time, lat, lon), then remove the individual keys.
chunk_keys = ('time_chunk', 'lat_chunk', 'lon_chunk')

# encoding_data_dict was built with dropna(axis=1), so a blank chunk column is
# simply missing from it. `chunksizes` is only built when all three sizes are
# present and non-missing. Explicit numeric sizes are kept as given (previously
# they were discarded unless one of the three was 'max').
if all(key in encoding_data_dict and not pd.isna(encoding_data_dict[key])
       for key in chunk_keys):
    encoding_data_dict['chunksizes'] = tuple(encoding_data_dict[key] for key in chunk_keys)

# pop() with a default tolerates keys that are already gone (a blank column, or
# this cell being run a second time); `del` raised KeyError in those cases.
for key in chunk_keys:
    encoding_data_dict.pop(key, None)

# True when at least one chunk size is the 'max' placeholder. Derived from the
# stored tuple so that re-running the cell yields the same flag.
# NOTE(review): presumably 'max' is swapped for the real dimension length
# later on — confirm against the code that consumes this flag.
reset_encoding_chunks = 'max' in encoding_data_dict.get('chunksizes', ())

In [17]:
reset_encoding_chunks

True

In [18]:
encoding_data_dict

{'zlib': True,
 'shuffle': True,
 'complevel': 5,
 'fletcher32': False,
 'contiguous': False,
 'dtype': 'float32',
 'missing_value': 1e+20,
 '_FillValue': 1e+20,
 'chunksizes': (1, 'max', 'max')}

In [19]:
encoding_data_dict['chunksizes'][0] == 'max'

False

In [20]:
# Dask settings (chunk sizes per dimension and an optional temp directory),
# read from dask_parameters.csv in the input directory
dask_params = pd.read_csv(os.path.join(input_path, 'dask_parameters.csv'))
dask_params

Unnamed: 0,time_chunk_size,lat_chunk_size,lon_chunk_size,dask_temp_directory
0,50,5,5,


In [21]:
import utils

In [22]:
# get_chunk_sizes returns four values; only the last one is kept here, the
# first three are thrown away.
_,_,_,dask_temp_dir = utils.get_chunk_sizes(input_path)

In [23]:
dask_temp_dir

nan

In [24]:
# dask_temp_dir comes back as NaN here, so "not set" is detected with pd.isna
# rather than a comparison against None or ''.
if pd.isna(dask_temp_dir):
    print('it is na')

it is na


In [27]:
# Run manager table. Each column is an independent list of options, so columns
# have different numbers of filled-in rows and the rest are NaN.
run_manager_df = pd.read_csv(os.path.join(input_path, 'run_manager.csv'))
run_manager_df

Unnamed: 0,ESM,ESM_Input_Location,Output_Location,Reference_Dataset,Reference_Input_Location,Variable,Scenario,Ensemble,target_period,application_period,daily,monthly
0,GFDL-ESM4,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,W5E5v2,/rcfs/projects/gcims/data/climate/W5E5v2,pr,ssp245,r1i1p1f1,1970-2014,2015-2100,True,True
1,CanESM5,,/rcfs/projects/gcims/data/climate/basd/CanESM5,,,tas,ssp370,,,1950-2014,,
2,,,,,,hurs,,,,,,
3,,,,,,sfcWind,,,,,,
4,,,,,,rsds,,,,,,
5,,,,,,rlds,,,,,,
6,,,,,,tasmin,,,,,,
7,,,,,,tasmax,,,,,,


In [28]:
def remove_nas(x):
    """Return the entries of ``x`` that are not missing.

    ``x`` is indexed with a boolean mask produced by pandas' null check, so it
    has to support boolean-mask indexing (e.g. a NumPy array or pandas Series).
    The result has the same type as ``x`` with the NaN/None entries left out.
    """
    keep = pd.notnull(x)
    return x[keep]

In [29]:
# Each run-manager column is its own list of options: take the column, discard
# the blank (NaN) rows, and keep the remaining values as an array.
esms = run_manager_df['ESM'].dropna().values
esm_input_paths = run_manager_df['ESM_Input_Location'].dropna().values
output_paths = run_manager_df['Output_Location'].dropna().values
ref_datasets = run_manager_df['Reference_Dataset'].dropna().values
ref_datasets_paths = run_manager_df['Reference_Input_Location'].dropna().values
variables = run_manager_df['Variable'].dropna().values
scenarios = run_manager_df['Scenario'].dropna().values
ensembles = run_manager_df['Ensemble'].dropna().values
target_periods = run_manager_df['target_period'].dropna().values
application_periods = run_manager_df['application_period'].dropna().values
daily = run_manager_df['daily'].dropna().values
monthly = run_manager_df['monthly'].dropna().values

In [32]:
# Keep only tasmin and tasmax from the requested variables. np.intersect1d
# returns the common values sorted and without duplicates.
variables = np.intersect1d(variables, ['tasmin', 'tasmax'])

In [35]:
# Build every combination of the option lists: one row per combination, one
# column per option list (in the order given below).
mesh_axes = [
    esms,
    variables,
    scenarios,
    ensembles,
    ref_datasets,
    target_periods,
    application_periods,
]
mesh_grids = np.meshgrid(*mesh_axes)
mesh_array = np.array(mesh_grids).T.reshape(-1, len(mesh_axes))

In [36]:
# Column labels for the combination array, in the order the option lists were meshed
mesh_columns = ['ESM', 'Variable', 'Scenario', 'Ensemble', 'Reference_Dataset',
                'target_period', 'application_period']

mesh_df = (
    pd.DataFrame(mesh_array, columns=mesh_columns)
    # Attach the input location of each ESM
    .merge(run_manager_df[['ESM', 'ESM_Input_Location']], on='ESM', how='inner')
    # Attach the input location of each reference dataset
    .merge(run_manager_df[['Reference_Dataset', 'Reference_Input_Location']],
           on='Reference_Dataset', how='inner')
    # Attach the output location of each ESM
    .merge(run_manager_df[['ESM', 'Output_Location']], on='ESM', how='inner')
    # The first daily/monthly flag applies to every row
    .assign(daily=daily[0], monthly=monthly[0])
)
mesh_df

Unnamed: 0,ESM,Variable,Scenario,Ensemble,Reference_Dataset,target_period,application_period,ESM_Input_Location,Reference_Input_Location,Output_Location,daily,monthly
0,GFDL-ESM4,tasmax,ssp245,r1i1p1f1,W5E5v2,1970-2014,2015-2100,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
1,GFDL-ESM4,tasmin,ssp245,r1i1p1f1,W5E5v2,1970-2014,2015-2100,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
2,GFDL-ESM4,tasmax,ssp370,r1i1p1f1,W5E5v2,1970-2014,2015-2100,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
3,GFDL-ESM4,tasmin,ssp370,r1i1p1f1,W5E5v2,1970-2014,2015-2100,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
4,GFDL-ESM4,tasmax,ssp245,r1i1p1f1,W5E5v2,1970-2014,1950-2014,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
5,GFDL-ESM4,tasmin,ssp245,r1i1p1f1,W5E5v2,1970-2014,1950-2014,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
6,GFDL-ESM4,tasmax,ssp370,r1i1p1f1,W5E5v2,1970-2014,1950-2014,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
7,GFDL-ESM4,tasmin,ssp370,r1i1p1f1,W5E5v2,1970-2014,1950-2014,/rcfs/projects/gcims/data/climate/cmip6/GFDL-ESM4,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/GFDL-ESM4,True,True
8,CanESM5,tasmax,ssp245,r1i1p1f1,W5E5v2,1970-2014,2015-2100,,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/CanESM5,True,True
9,CanESM5,tasmin,ssp245,r1i1p1f1,W5E5v2,1970-2014,2015-2100,,/rcfs/projects/gcims/data/climate/W5E5v2,/rcfs/projects/gcims/data/climate/basd/CanESM5,True,True


In [37]:
# One summary line per task: variable, scenario, ESM, ensemble, application period
for row in mesh_df.itertuples(index=False):
    print(f'{row.Variable}, {row.Scenario}, {row.ESM}, {row.Ensemble}, {row.application_period}')

tasmax, ssp245, GFDL-ESM4, r1i1p1f1, 2015-2100
tasmin, ssp245, GFDL-ESM4, r1i1p1f1, 2015-2100
tasmax, ssp370, GFDL-ESM4, r1i1p1f1, 2015-2100
tasmin, ssp370, GFDL-ESM4, r1i1p1f1, 2015-2100
tasmax, ssp245, GFDL-ESM4, r1i1p1f1, 1950-2014
tasmin, ssp245, GFDL-ESM4, r1i1p1f1, 1950-2014
tasmax, ssp370, GFDL-ESM4, r1i1p1f1, 1950-2014
tasmin, ssp370, GFDL-ESM4, r1i1p1f1, 1950-2014
tasmax, ssp245, CanESM5, r1i1p1f1, 2015-2100
tasmin, ssp245, CanESM5, r1i1p1f1, 2015-2100
tasmax, ssp370, CanESM5, r1i1p1f1, 2015-2100
tasmin, ssp370, CanESM5, r1i1p1f1, 2015-2100
tasmax, ssp245, CanESM5, r1i1p1f1, 1950-2014
tasmin, ssp245, CanESM5, r1i1p1f1, 1950-2014
tasmax, ssp370, CanESM5, r1i1p1f1, 1950-2014
tasmin, ssp370, CanESM5, r1i1p1f1, 1950-2014
