# Processing ERA5 data

### Set up
#### Packages

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
from datetime import datetime
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import datetime as dt
from datetime import timedelta
xr.set_options(display_expand_data=False);
import dask
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.diagnostics import ProgressBar
from Processing_functions import AddCyclic, FixLongitude

#### Filepaths & name variables

In [2]:
## Input: glob pattern matching the downloaded ERA5 file(s)
# (earlier single-file download:
#  'adaptor.mars.internal-1724343246.1971328-17478-4-18f75df5-b6ea-4074-b3d0-ae2a89d93078.nc')
filename = 'a4c1c*.nc'

## Directories (input archive and output location are currently the same folder)
path_to_arch = "/glade/work/glydia/processed_ERA5_data/"
path_to_outdata = '/glade/work/glydia/processed_ERA5_data/'

## Selection: component key and index into that component's variable list
comp = 'atm'
var_ind = 3

## Variable lookup: each entry is [name(s) in the input file, name used for the output]
var_list = {
    'atm': [
        ['t2m', 'TREFHT'],
        ['msl', 'PSL'],
        [['mtnswrf', 'mtnlwrf'], 'RESTOM'],
        ['z', 'Z3'],
    ],
    'ice': [
        ['siconc', 'aice'],
    ],
}
var, var2 = var_list[comp][var_ind]

## Filename infix per component, used when building output file names
h_ext = {
    'atm': ['.h0.'],
    'ice': ['.h.'],
}

In [3]:
# Launch a PBS-backed Dask cluster and attach a client to it.
# `var` is a list of field names for multi-field variables (the RESTOM entry),
# so join it into a single string before building the job name; a plain string
# `var` yields exactly the same name as before.
job_suffix = var if isinstance(var, str) else '_'.join(var)
cluster = PBSCluster(cores    = 1,
                     memory   = '25GiB',
                     queue    = 'casper',
                     walltime = '02:00:00',
                     project  = 'UCUB0137',
                     name     = 'piControl_'+job_suffix)
# Ask the cluster to scale to 36 workers
cluster.scale(4*9)
client = Client(cluster)

In [4]:
# Display the client summary (dashboard link, worker count, threads, memory)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/glydia/Arctic_breakdown/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/glydia/Arctic_breakdown/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.95:44957,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/glydia/Arctic_breakdown/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
## Dask chunk sizes per component, keyed by the dimension names of the input files
time_ch = 72
chunks = {
    component: {'time': time_ch, 'latitude': 23, 'longitude': 480}
    for component in ('atm', 'ice')
}

### Load & modify data
#### ERA5 reanalysis data

In [6]:
%%time

yr_range = np.array([str(i) for i in np.arange(1950,2024)])
## Load data
# Open dataset
print(path_to_arch+filename)
#ds = dask.delayed(xr.open_dataset(path_to_arch+filename,chunks=chunks[comp]))
if var2 == 'RESTOM':
    ds = xr.open_mfdataset(path_to_arch+filename,chunks=chunks[comp])
    ds_sw = ds[var[0]]
    ds_lw = ds[var[1]]
    
else:
    ds = xr.open_mfdataset(path_to_arch+filename,chunks=chunks[comp])

    dsv = ds[var]

    dates = dsv.date.values
    times = []
    for d in dates:
        times.append(datetime.strptime(str(d), '%Y%m%d'))
    times = np.array(times)
    dsv = dsv.rename({'date':'time'})
    dsv = dsv.assign_coords({'time': times})

#del ds

processed_list = []
for i in range(0,len(yr_range)):
    startyr = yr_range[i]
    endyr = yr_range[i]
    ann_slice = dsv.sel(time=slice(startyr+'-01-01',endyr+'-12-17')) 
    print('sliced '+startyr+'-01-01 to '+endyr+'-12-17')

    if var == 'z':
        ann_slice = ann_slice/9.81
    
        fixedcoord_data = ann_slice.rename({'longitude':'lon','latitude':'lat', 'pressure_level':'lev'})
    else:
        fixedcoord_data = ann_slice.rename({'longitude':'lon','latitude':'lat'})
    fixedname_data = fixedcoord_data.rename(var2)
    print('   fixed coordinate and variable names')

    latfix_data = fixedname_data.reindex(lat=list(reversed(fixedname_data.lat)))

    # addcyc_data = dask.delayed(AddCyclic)(latfix_data)
    lonfix_data = FixLongitude(latfix_data, False)
    print('   added cyclic coordinate')
    # addcyc_data = AddCyclic(fixedname_data)

    # processed_list.append(addcyc_data)
    processed_list.append(lonfix_data)

# processed_comp = dask.compute(*processed_list)
print('computed list')

processed_out = xr.concat(processed_list,dim='time').chunk({'time':72})
# processed_out = xr.concat(processed_comp,dim='time').chunk({'time':88})
print('concatenated data')

if var2 != 'Z3':
    processed_out.to_netcdf(path_to_outdata+'ERA5'+h_ext[comp][0]+var2+'.195001-202312.'+'nc', 
                                format='NETCDF4',encoding={var2: {"zlib": True, "complevel": 1}})
else:
    processed_out = processed_out.chunk()
        
    processed_out.sel(time=slice('1950-01-01','1950-12-31')).to_zarr(path_to_outdata+'ERA5'+h_ext[comp][0]+var2+'.195001-202312.'+'zarr', 
                            group=var2)
    print('saved initial zarr store')
    for i in range(1,len(yr_range)):
        yr = str(yr_range[i])
        print('   saving year '+yr)
        
        processed_out.sel(time=slice(yr+'-01-01',yr+'-12-31')).to_zarr(path_to_outdata+'ERA5'+h_ext[comp][0]+var2+'.195001-202312.'+'zarr', 
                            append_dim='time', mode='a-',group=var2)
    
print('wrote data to disk')

/glade/work/glydia/processed_ERA5_data/a4c1c*.nc
sliced 1950-01-01 to 1950-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1951-01-01 to 1951-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1952-01-01 to 1952-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1953-01-01 to 1953-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1954-01-01 to 1954-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1955-01-01 to 1955-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1956-01-01 to 1956-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1957-01-01 to 1957-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1958-01-01 to 1958-12-17
   fixed coordinate and variable names
   added cyclic coordinate
sliced 1959-01-01 to 1959-12-17
   fixed coordinate and variable name

In [7]:
# Shut down the Dask client now that processing is finished
# (presumably also releases the scheduler/worker jobs it was connected to -- confirm)
client.shutdown()