In [None]:
import os
import re
import xarray as xr
import glob
import dask
import cftime
import netCDF4
import geopandas as gpd

from dask.distributed import LocalCluster, Client


In [None]:
# Start a dask.distributed LocalCluster; no arguments are passed, so the
# library's default settings are used.
cluster = LocalCluster()

In [None]:
# Attach a dask.distributed Client to the `cluster` object created above.
client = Client(cluster)

In [None]:
# Bare expression as the last line of the cell: the notebook displays the
# client's repr as the cell output.
client

In [None]:
# Define a function to standardize time
def standardize_time(ds):
    """
    Decode and standardize the time variable for a single dataset.

    The raw numeric ``time`` values are converted to cftime datetime objects
    using the ``units`` and ``calendar`` attributes stored on the time
    variable, and the result is written back as the ``time`` variable.

    Parameters
    ----------
    ds : dataset-like
        Object exposing ``variables`` and, when it has a ``time`` variable,
        ``ds.time.values`` and ``ds.time.attrs`` (the notebook passes
        datasets opened with ``decode_times=False``).

    Returns
    -------
    The same ``ds`` object that was passed in. It is modified in place when
    decoding happens, and returned untouched when there is no ``time``
    variable or the time variable carries no ``units`` attribute.
    """
    # Nothing to do for datasets without a time variable.
    if 'time' not in ds.variables:
        return ds

    time_attrs = ds.time.attrs
    time_units = time_attrs.get('units', None)

    # Without units the numeric values cannot be decoded. Return the dataset
    # unchanged rather than falling through to the assignment below with an
    # undefined `decoded_times`.
    if time_units is None:
        return ds

    # Fall back to the 'standard' calendar when none is declared.
    calendar = time_attrs.get('calendar', 'standard')

    # Decode the numeric offsets into datetime objects.
    decoded_times = cftime.num2date(ds.time.values, units=time_units, calendar=calendar)

    # Assign back to the dataset along the existing 'time' dimension.
    ds['time'] = ('time', decoded_times)

    return ds

In [1]:
# ClimEx ensemble member IDs, given as one whitespace-separated string.
string = 'kbi  kbh  kbg  kbf  kbe  kbd  kbc  kbb  kba  kcx  kcw  kcv  kcu  kct  kcs  kcr  kcq  kcp  kco  kcn  kcm  kcl  kck  kcj  kci  kch  kcg  kcf  kce  kcd  kcc  kcb  kca  kbz  kby  kbx  kbw  kbv  kbu  kbt  kbs  kbr  kbq  kbp  kbo  kbn  kbm  kbl  kbk  kbj  '

# Split on any run of whitespace and order the IDs alphabetically.
ensembles = sorted(string.split())

In [4]:
# Display the sorted list of ensemble member IDs as a quick sanity check.
ensembles

['kba',
 'kbb',
 'kbc',
 'kbd',
 'kbe',
 'kbf',
 'kbg',
 'kbh',
 'kbi',
 'kbj',
 'kbk',
 'kbl',
 'kbm',
 'kbn',
 'kbo',
 'kbp',
 'kbq',
 'kbr',
 'kbs',
 'kbt',
 'kbu',
 'kbv',
 'kbw',
 'kbx',
 'kby',
 'kbz',
 'kca',
 'kcb',
 'kcc',
 'kcd',
 'kce',
 'kcf',
 'kcg',
 'kch',
 'kci',
 'kcj',
 'kck',
 'kcl',
 'kcm',
 'kcn',
 'kco',
 'kcp',
 'kcq',
 'kcr',
 'kcs',
 'kct',
 'kcu',
 'kcv',
 'kcw',
 'kcx']

In [None]:
# Paths
# Root directory of the input files. The processing loop globs
# `path + ens + "/*/pr_*"`, so the trailing slash is required.
# NOTE(review): this is a hard-coded absolute drive path; it must exist on the
# machine running the notebook.
path = "O:/Man/Public/sharing-4270-CERM/VLYMI/CLIMEX/GlobusDownload/pr/" 

In [None]:
%%time
for i, ens in enumerate(ensembles):
    print(ens)
    
    #Grab files
    files = glob.glob(path + ens + "/*/pr_*")
    
    # Define your latitude and longitude bounds
    lat_min, lat_max = 3.5, 7.5
    lon_min, lon_max = -6, -1.5
    
    #Modify time variable and save datasets to list
    datasets = []
    for file in files:
        # Open dataset with chunking (lazy loading)
        ds = xr.open_dataset(file, decode_times=False, chunks={'time': 10000})

        # Modify the time variable
        ds = standardize_time(ds)

        # Subset the dataset by latitude and longitude
        ds = ds.sel(rlat=slice(lat_min, lat_max), rlon=slice(lon_min, lon_max))

        # Optional: Rechunk the dataset if needed
        ds = ds.chunk({'time': 10000})

        # Add to the list of datasets
        datasets.append(ds)

    #Concatenate files 
    ds = xr.concat(datasets, dim='time') 
    
    #Set output directory
    output_dir = f"O:\\Man\\Public\\sharing-4270-CERM\\VLYMI\\CLIMEX\\Climex DK Domain\\pr\\{ens}"

    #Create folder path in directroy
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    #Grab model name string
    match = re.search(r'pr_.*?_1h', files[0])
    
    name = match.group(0)
    
    #Save final output name
    output_path = os.path.join(output_dir, f"{name}_1955-2100.nc")
    write_delayed = ds.to_netcdf(output_path, engine="h5netcdf", compute = False)
    
    #Compute the netcdf
    dask.compute(write_delayed)
    
    break