# BARRA <> DEA Generate Daily Summaries

In [1]:
import datacube
import rasterio
import time
import os
import numpy as np
from datetime import datetime
from datacube.helpers import write_geotiff
from matplotlib import pyplot

## Optional helper functions

In [2]:
def write_barra_data(filename, target_dataset, source_affine, nodata=-1073741824):
    """Write ``target_dataset`` to ``filename`` through ``write_geotiff``.

    A profile override is passed along so the output uses the supplied
    transform and nodata value.

    Parameters
    ----------
    filename
        Destination path handed to ``write_geotiff``.
    target_dataset
        Dataset handed unchanged to ``write_geotiff``.
    source_affine
        Value stored under the ``'transform'`` key of the profile override.
    nodata
        Value stored under the ``'nodata'`` key of the profile override.
    """
    overrides = dict(nodata=nodata, transform=source_affine)
    write_geotiff(filename, target_dataset, profile_override=overrides)

In [3]:
def flip_barra_data(source_dataset, target_variable='accum_prcp'):
    """Return a dataset holding ``target_variable`` with its latitude axis reversed.

    The variable is reindexed against its own ``latitude`` coordinate taken in
    reverse order, converted back to a dataset, and the variable's attributes
    are copied onto that dataset.

    Parameters
    ----------
    source_dataset
        Dataset (or mapping) that contains ``target_variable``.
    target_variable
        Name of the variable to flip; defaults to ``'accum_prcp'``.

    Returns
    -------
    The dataset produced by ``to_dataset()`` on the reindexed variable, with
    ``attrs`` set to the reindexed variable's ``attrs``.
    """
    variable = source_dataset[target_variable]
    reversed_latitude = variable.latitude[::-1]
    flipped = variable.reindex(latitude=reversed_latitude)

    flipped_dataset = flipped.to_dataset()
    # to_dataset() does not carry the variable attrs up to the dataset level,
    # so they are assigned explicitly.
    flipped_dataset.attrs = flipped.attrs
    return flipped_dataset

## Before loading BARRA data...

In [4]:
# GDAL struggles with the BARRA NetCDF format, which leaves two options:
#   1. leave everything untouched and accept excruciatingly slow reads, or
#   2. set the directive below. It helps GDAL read the file metadata
#      correctly, but has the unfortunate side effect of flipping our data.
# Depending on which other data sources you are reading from, you may need to
# explicitly set this flag back to YES to avoid any unforeseen consequences.
os.environ.update({'GDAL_NETCDF_BOTTOMUP': 'NO'})

# A datacube configuration is required to connect to the index database.
config = dict(
    db_hostname='agdcdev-db.nci.org.au',
    db_port=6432,
    db_database='dg6911',
)
dc = datacube.Datacube(config=config)

## Load BARRA faster using DASK

In [5]:
import dask
import dask.distributed

# Start a local Dask cluster bound to localhost: eight workers with one thread
# each and a 3G memory limit.
client = dask.distributed.Client(
    ip='127.0.0.1',
    n_workers=8,
    threads_per_worker=1,
    memory_limit='3G',
)
# Leave the client as the last expression so the notebook renders its summary.
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:37286  Dashboard: http://127.0.0.1:39267/status,Cluster  Workers: 8  Cores: 8  Memory: 24.00 GB


In [6]:
def generate_daily_summary(day, month, year,
                           output_root='/g/data/u46/users/dg6911/BARRA_Daily/'):
    """Sum one day's ``accum_prcp`` data and write the total as a GeoTIFF.

    The ``accum_prcp`` product is loaded for the given date through the
    module-level ``dc`` datacube, negative values are masked out, the remaining
    values are summed over ``time``, the result is flipped with
    ``flip_barra_data`` and written with ``write_barra_data`` to
    ``<output_root>/<year>/<month>/<year><month><day>_total_accum_prcp.tiff``.

    Parameters
    ----------
    day, month, year : str
        Zero-padded date components (e.g. ``'01'``, ``'02'``, ``'1990'``);
        they are concatenated directly into the query and the file name.
    output_root : str, optional
        Directory under which the ``<year>/<month>/`` folders are created.
        Defaults to the location this notebook has always written to.

    Returns
    -------
    None
        If the load yields no ``accum_prcp`` variable, a message is printed
        and nothing is written.
    """
    date_label = year + '-' + month + '-' + day
    accum = dc.load(product='accum_prcp',
                    dask_chunks={'time': 6},
                    time=date_label,
                    skip_broken_datasets=True)

    # A query that matches nothing is expected to come back as an empty
    # Dataset (see the datacube load documentation). Indexing it would raise
    # KeyError, which the calling loop does not catch, so skip the day instead.
    if 'accum_prcp' not in accum.data_vars:
        print('No accum_prcp data found for {}; skipping'.format(date_label))
        return

    precip = accum['accum_prcp']
    affine = precip.affine
    attrs = precip.attrs
    loaded_accum = precip.compute()

    # Mask negative values so they do not contribute to the daily total.
    # drop=True is deliberately not used: it can trim rows/columns whose values
    # are all masked, and the affine captured above would then no longer match
    # the grid being written.
    filtered_sum = loaded_accum.where(loaded_accum >= 0).sum(dim='time')

    # Turn the summed array back into a dataset and restore the attributes
    # on both the variable and the dataset.
    total_day_accumprcp_dataset = filtered_sum.to_dataset()
    total_day_accumprcp_dataset['accum_prcp'].attrs = attrs
    total_day_accumprcp_dataset.attrs = attrs

    # Reverse the latitude axis before writing.
    flipped_accum_prcp = flip_barra_data(total_day_accumprcp_dataset)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    target_dir = os.path.join(output_root, year, month)
    os.makedirs(target_dir, exist_ok=True)

    # Write the daily total to disk.
    target_file = os.path.join(target_dir, year + month + day + '_total_accum_prcp.tiff')
    write_barra_data(target_file, flipped_accum_prcp, affine)

In [None]:
# Generate a daily summary for every calendar day from 1990 through 2019.
# Days 1-31 are tried for each month; combinations that are not real dates
# (e.g. 31/02) are reported and skipped.
for year in range(1990, 2020):
    for month in range(1, 13):
        for day in range(1, 32):
            day_str = str(day).zfill(2)
            month_str = str(month).zfill(2)
            year_str = str(year)
            date = day_str + '/' + month_str + '/' + year_str

            # Only the date validation lives in this try block, so a
            # ValueError raised during processing is never reported as an
            # invalid date.
            try:
                datetime.strptime(date, '%d/%m/%Y')
            except ValueError:
                print('The date {} is invalid'.format(date))
                continue

            # As before, a ValueError during processing does not stop the
            # batch, but it is now reported for what it is.
            try:
                generate_daily_summary(day_str, month_str, year_str)
            except ValueError as err:
                print('Failed to generate the daily summary for {}: {}'.format(date, err))