# Installations

# Imports

In [1]:
import io
import os
%matplotlib inline
import matplotlib.pyplot as plt

# data management
import xarray as xr
import numpy as np
import pandas as pd

# Azure Connectivity
import getpass
import azure.storage.blob
from azure.storage.blob import BlobClient

# utilities
import time
import urllib.parse


# Code to Generate Average and Record Temperature
* For the specified number of years

### Function to create the empty Results xarray dataset 

In [2]:
def CreateDatasetForResults(filename, start_year, end_year, cmip=6):
    """
    Builds the (still unfilled) results dataset, using one CMIP source file as the template.
    The lat / lon coordinates are taken from the template file; the 'time' dimension is replaced by a
    'day' dimension with coordinate values 1..365.
    The variables of the result will be ['tasmaxavg', 'tasmaxrec'] when the first data variable of the
    template is 'tasmax', otherwise ['tasminavg', 'tasminrec']. They are allocated with np.empty, so their
    content is undefined until the first year has been processed.
    Input Parameter:
    - filename   : a CMIP source data file used as the reference for dimensions and coordinates.
    - start_year : should be the integer value for the intended results date range start (e.g. 1950)
    - end_year   : should be the integer value for the intended results date range end (e.g. 1959)
    - cmip       : integer number to specify cmip version (5 or 6 or other). Default is 6
    Returns: an xarray dataset, or None when the validation of start_year / end_year fails
             (a message is printed in that case).
    """
    # validations
    if not (isinstance(start_year, int) and isinstance(end_year, int) and (end_year > start_year)):
        print('Validation Error: start_year and end_year should be integers; end year should be greater than start year.')
        return None

    days_per_year = 365   # one slot per day; matches the 'day' coordinate 1..365 assigned below

    # The file is only a template: close it as soon as the metadata has been read, because the caller
    # may delete the file afterwards.
    with xr.open_dataset(filename) as based_on:
        var_minmax = list(based_on.data_vars.keys())[0]  # will be tasmax or tasmin
        n_lat = based_on.sizes['lat']
        n_lon = based_on.sizes['lon']
        # Drop the (large) temperature variable BEFORE copying so that only coordinates and any small
        # remaining variables are read into memory.
        ds_rec = based_on.drop_vars(var_minmax).load().copy(deep = True)

    if var_minmax == 'tasmax':
        data_type = 'Average and record maximum temperature, based on tasmax'
        ds_variables = ['tasmaxavg', 'tasmaxrec']
    else:
        data_type = 'Average and record minimum temperature, based on tasmin'
        ds_variables = ['tasminavg', 'tasminrec']

    # grid size follows the template file instead of a hard-coded 720 x 1440
    grid_shape = (days_per_year, n_lat, n_lon)
    ds_data = {name : (['day','lat','lon'], np.empty(grid_shape)) for name in ds_variables}

    ds_rec = ds_rec.rename_dims({'time':'day'})
    ds_rec = ds_rec.assign(ds_data)
    ds_rec = ds_rec.assign_coords({"day": xr.DataArray(np.arange(1, days_per_year + 1), dims='day')})
    ds_rec = ds_rec.reset_coords('time', drop=True)

    n_years = (end_year - start_year) + 1

    # 'Data variables' and 'Years processed' are read back by the per-file processing step;
    # 'Store as' is the file name (without extension) used when the results are saved.
    new_attrs = {'Dataset' : 'Average temperature CMIP{}'.format(cmip),
                 'About dataset' : 'A dataset with average and record temparatures, across years',
                 'Original values' : var_minmax,
                 'Data variables' : ds_variables,
                 'Data description' : data_type,
                 'Range' : str(n_years) + ' years',
                 'Start year' : str(start_year),
                 'End year' : str(end_year),
                 'Years processed' : 0,
                 'Store as' : 'Avg_temp_{}_CMIP{}__{}_yrs__{}_to_{}'.format(var_minmax[-3:]
                                                                           ,cmip
                                                                           ,n_years
                                                                           ,start_year
                                                                           ,end_year)
                }

    ds_rec.attrs = new_attrs

    return ds_rec
        


### Function to process one file at a time

In [102]:
def GenerateAverageOneFileAtATime(results, lat_indices, lon_indices, start_year, end_year
    , filename):
    """
    Takes one file at a time and folds its values into the running average and the record (peak)
    values held in the existing 'results' dataset.
    Average and Record temperatures are captured for each cell, i.e. the lat, lon combination in the specified grid.
    The values are not converted, so the units of the source file are preserved.
    Input Parameters:
    - results     : xarray dataset with average, record temperature for 365 days. Its attributes
                    'Data variables' ([average name, record name]) and 'Years processed' are used.
    - lat_indices : applicable index values for latitudes. If None, will calculate for the complete grid
    - lon_indices : applicable index values for longitudes. If None, will calculate for the complete grid.
                    (the subset is only taken when BOTH lat_indices and lon_indices are given)
    - start_year  : start year of the range (not used by this function)
    - end_year    : end year of the range (not used by this function)
    - filename    : name of the .nc file to process
    Returns:  None. The input xarray dataset 'results' is updated in place and its
              'Years processed' attribute is incremented by one.
    """
    print(filename)

    # Read the values inside a context manager so the file handle is released before returning;
    # the caller may delete the file right after this call.
    with xr.open_dataset(filename) as xds:
        source = xds
        if not (lat_indices is None or lon_indices is None):
            # take applicable subset of lat and lon
            # NOTE(review): 'results' presumably needs lat/lon sizes matching this subset - confirm.
            source = xds[dict(lat=lat_indices, lon=lon_indices)]

        # max or min
        var_minmax = list(source.data_vars.keys())[0]  # will be tasmax or tasmin
        new_values = source[var_minmax][:].to_numpy()

    # note the variable names in the results dataset
    result_variables = results.attrs['Data variables']
    avg_name = result_variables[0]
    rec_name = result_variables[1]

    n_years = results.attrs['Years processed']

    if n_years == 0:
        # first year in the range: it is both the average and the record so far
        results.update({avg_name : (['day','lat','lon'], new_values)      # avg temperature
                        , rec_name : (['day','lat','lon'], new_values)})  # record temperature
    else:
        # NumPy ufuncs work directly on xarray objects, so there is no dependency on the
        # deprecated xr.ufuncs module.
        pick_record = np.minimum if var_minmax == "tasmin" else np.maximum
        record = pick_record(new_values, results[rec_name])                            # peak across years
        running_avg = ((results[avg_name] * n_years) + new_values) / (n_years + 1)     # avg across years
        results.update({avg_name : np.round(running_avg, 2)
                        , rec_name : np.round(record, 2)})

    # in any case...
    results.attrs['Years processed'] = n_years + 1


### Function to download a specified file

In [4]:
def download_file(sas_url, filename, overwrite_local_file):
    """
    Downloads the specified file from Azure blob storage
    If asked to not overwrite, then first checks if the file is available locally and does not download again in that case.
    The blob is streamed into a temporary '<filename>.part' file which is renamed to 'filename' only after
    the download has completed, so an interrupted download never leaves a truncated 'filename' behind
    (which a later call with overwrite_local_file=False would otherwise accept as complete).
    - sas_url   : url complete with sas token
    - filename  : name of the downloaded file
    - overwrite_local_file : if True, will overwrite, else, if already available locally, will not download again.
    Returns:  None
    Raises:   whatever the blob client or the file system raises; the partial file is removed first.
    """
    if not overwrite_local_file and os.path.isfile(filename):
        return   # already available locally

    blob_client = BlobClient.from_blob_url(sas_url)
    partial_name = filename + '.part'
    try:
        with open(partial_name, "wb") as my_blob:
            # readinto streams the blob into the file instead of holding it all in memory
            blob_client.download_blob().readinto(my_blob)
        os.replace(partial_name, filename)
    finally:
        # only still present when the download or the rename failed
        if os.path.exists(partial_name):
            os.remove(partial_name)

                  

### Function to prepare the average for the desired range of years

In [5]:
def PrepareAverageForRange(filename_prefix, lat_indices, lon_indices, start_year, end_year, cmip=6
                           , azure_url_prefix=None, sas_token=None
                           , overwrite_local_file=False, remove_after_use=True):
    """
    Prepares a results dataset then runs in a loop for the specified range of years:
    * downloads the file for each year (only when an azure url prefix is provided)
    * and calls the function that processes one file at time.
    * then deletes the file (unless remove_after_use is False)
    Average and Record temperatures are captured for each cell, i.e. the lat, lon combination in the specified grid.
    The results dataset is created from the file of the first year in the range.
    Input Parameters:
    - filename_prefix  : initial part of the filename. Year will be appended to it, along with file extension '.nc'
    - lat_indices : applicable index values for latitudes. If None, will calculate for the complete grid
    - lon_indices : applicable index values for longitudes. If None, will calculate for the complete grid.
    - start_year  : start year of the range
    - end_year    : end year of the range
    - cmip        : integer number to specify cmip version (5 or 6 or other). Default is 6.
                    It is passed on to the results dataset (attributes and 'Store as' name).
    - azure_url_prefix=None : if downloading from Azure blob storage, specify the url with the container and the folder, else specify None
    - sas_token   : A sas token with 'read' permissions to the azure blob container.
                    If None or empty, the url is used without a query string.
    - overwrite_local_file  : if True, will overwrite, else, if already available locally, will not download again.
    - remove_after_use      : default = True. Set to False to retain the files locally
    Returns:  The xarray dataset with the results, or None when the validation of the years fails
              (a message is printed in that case).
    """
    ds_results = None

    # validations
    if not (isinstance(start_year, int) and isinstance(end_year, int) and (end_year > start_year)):
        print('Validation Error: start_year and end_year should be integers; end year should be greater than start year.')
        return ds_results

    # url pieces do not change from year to year, so prepare them once
    if azure_url_prefix is not None and not azure_url_prefix.endswith('/'):
        azure_url_prefix += '/'
    sas_query = ("?" + sas_token) if sas_token else ''

    # prepare to time the operation
    start_time = time.time()
    for yr in range(start_year, (end_year + 1)):
        print(yr)
        filename = filename_prefix + str(yr) + '.nc'

        # if required, download the file so it is available locally
        if azure_url_prefix is not None:
            download_file(azure_url_prefix + filename + sas_query, filename, overwrite_local_file)

        # the first file doubles as the template for the results dataset
        if yr == start_year:
            ds_results = CreateDatasetForResults(filename = filename, start_year = start_year, end_year = end_year
                                                 , cmip=cmip)   # honour the caller's cmip version

        # process the file
        GenerateAverageOneFileAtATime(ds_results, lat_indices, lon_indices, start_year, end_year
                                      , filename)

        # delete the file
        if remove_after_use:
            os.remove(filename)

    # print out the time it took
    execution_time = (time.time() - start_time)
    print("Complete execution time | PrepareAverageForRange | (mins) {:0.2f}".format(execution_time/60.0))

    return ds_results

### Function to save the Results file in Azure blob

In [6]:
def SaveResult(results, azure_url_prefix = None, sas_token=None, local_copy=True):
    """
    Writes the xarray dataset 'results' to a NetCDF4 file in the working directory and, when a blob
    url prefix is given, uploads that file to Azure Blob Storage.
    The file name is the 'Store as' attribute of 'results' plus the extension '.nc'.
    Input parameters:
    - results   : the xarray dataset to store; must carry the 'Store as' attribute
    - azure_url_prefix : if uploading to blob, this will be the url to the container and the folder
    - sas_token : if uploading to blob, a sas token with 'write' permissions to the azure blob container
                  (it has to be passed in; this function does not ask for it)
    - local_copy: By default, True, i.e. local_copy will be retained. Setting to False will remove file only
                  after upload to Azure Blob Storage
    Returns: string. The name of the newly created file.
    """
    target_name = results.attrs['Store as'] + '.nc'

    # the dataset always goes to local disk first
    results.to_netcdf(target_name, mode='w', format='NETCDF4')

    # nothing else to do when no upload was requested
    if azure_url_prefix is None:
        return target_name

    # the timing covers the upload part only
    upload_started = time.time()

    # the blob gets the same name as the local file
    if azure_url_prefix[-1] != '/':
        azure_url_prefix = azure_url_prefix + '/'
    blob_url = azure_url_prefix + target_name + "?" + sas_token
    uploader = BlobClient.from_blob_url(blob_url)

    with open(target_name, "rb") as payload:
        uploader.upload_blob(payload)

    # drop the local file only when explicitly asked to
    if local_copy == False:
        os.remove(target_name)

    elapsed_minutes = (time.time() - upload_started) / 60.0
    print("Complete execution time | SaveResult | (mins) {:0.2f}".format(elapsed_minutes))

    return target_name

### Execute for the desired range of years

In [119]:
# Yearly source files are named <name_prefix><year>.nc and are fetched from <url_prefix>
name_prefix = 'tasmin_day_BCSD_historical_r1i1p1_inmcm4_'
url_prefix = 'https://nexdcp30.blob.core.windows.net/cmip5/historical/'
start_year, end_year = 1950, 1959

# hidden prompt, so the sas token never appears in the notebook
sas_token = getpass.getpass()

ds_results = PrepareAverageForRange(
    filename_prefix=name_prefix,
    lat_indices=None,
    lon_indices=None,
    start_year=start_year,
    end_year=end_year,
    cmip=5,
    azure_url_prefix=url_prefix,
    sas_token=sas_token,
    overwrite_local_file=False,
    remove_after_use=True,
)
ds_results

 ··············································································································································


1950
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1950.nc
1951
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1951.nc
1952
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1952.nc
1953
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1953.nc
1954
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1954.nc
1955
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1955.nc
1956
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1956.nc
1957
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1957.nc
1958
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1958.nc
1959
tasmin_day_BCSD_historical_r1i1p1_inmcm4_1959.nc
Complete execution time | PrepareAverageForRange | (mins) 5.71


### Save results and upload to Azure blob

In [8]:
# container/folder that receives the result files (file name is appended by SaveResult)
url_prefix = 'https://nexdcp30.blob.core.windows.net/cmip5/averages/'
filename = SaveResult(
    ds_results,
    azure_url_prefix=url_prefix,
    sas_token=sas_token,
    local_copy=False,
)
filename

Complete execution time | SaveResult | (mins) 1.96


'Avg_temp_min_CMIP5__10_yrs__1950_to_1959.nc'

In [10]:
# Sanity check: open a saved results file from local disk and display it.
# NOTE(review): no cell above writes this 'max' 1980-1989 file (the run above is 'min', 1950-1959, and
# its local copy is removed after upload) - presumably it comes from a separate run; confirm it
# exists locally before running this cell.
filename = 'Avg_temp_max_CMIP5__10_yrs__1980_to_1989.nc'
newds = xr.open_dataset(filename)
newds

### Function to aggregate pre-processed decade results into a multi-decade result

In [107]:
def _load_result_file(path, remove_after_use):
    """Reads a results file fully into memory, closes it and, if asked, deletes it from disk."""
    with xr.open_dataset(path) as ds_file:
        ds_loaded = ds_file.load()
    if remove_after_use:
        os.remove(path)
    return ds_loaded


def AggregateAverageForRange(start_year, end_year, cmip=6, result_type='max'
                           , azure_url_prefix=None, sas_token=None
                           , overwrite_local_file=False, remove_after_use=True):
    """
    Makes available Average and Record temperatures for multiple decades.
    * Date range should be a multiple of decades --- e.g. 1950 to 1969  OR 1951 to 1970.
    * Make sure the underlying data files, by decades are available. Else, an exception will be raised!

    Checks the preprocessed average data files locally and / or in Azure cloud and:
    * uses the file for the complete range if it already exists
    * else, uses the component files (one per decade): the average is the mean of the decade averages,
      the record is the minimum ('min') or maximum (any other result_type) of the decade records.
    * returns the results xarray dataset
    Files are named 'Avg_temp_<result_type>_CMIP<cmip>__<n>_yrs__<start>_to_<end>.nc'.
    Average and Record temperatures are captured for each cell, i.e. the lat, lon combination in the specified grid.
    Input Parameters:
    - start_year  : start year of the range
    - end_year    : end year of the range
    - cmip        : integer number to specify cmip version (5 or 6 or other). Default is 6
    - result_type : string 'max' or 'min'
    - azure_url_prefix=None : if downloading from Azure blob storage, specify the url with the container and the folder
                              , else specify None to check the files locally.
    - sas_token   : A sas token with 'read' permissions to the azure blob container
    - overwrite_local_file  : if True (and an azure url prefix is given), local files are ignored and downloaded again;
                              else, files already available locally are used.
    - remove_after_use      : default = True. Set to False to retain the files locally
    Returns:  The xarray dataset with the results, or None when the validation of the years fails
              (a message is printed in that case).
    Raises:   ValueError when a required file is available neither locally nor (if a prefix is given) on Azure.
    """
    # validations (types first, so that the arithmetic below cannot fail on non-integers)
    if not (isinstance(start_year, int) and isinstance(end_year, int) and (end_year > start_year)):
        print('Validation Error: start_year and end_year should be integers; end year should be greater than start year.')
        return None

    n_years = (end_year - start_year) + 1   # range
    if n_years % 10 != 0:
        print('Validation Error: The total range (end year - start year)+1 should be a multiple of 10.')
        return None

    n_components = n_years // 10             # number of decades in the range
    name_template = 'Avg_temp_{}_CMIP{}__{}_yrs__{}_to_{}.nc'

    # intended filename for the complete range
    filename = name_template.format(result_type, cmip, n_years, start_year, end_year)
    # intended component filenames, one per decade
    component_files = [name_template.format(result_type, cmip, 10, first_year, first_year + 9)
                       for first_year in range(start_year, end_year + 1, 10)]
    component_available = [False] * n_components   # initialized as none of the files are available
    complete_data = False                          # initialized as file / components are not available

    # prepare to time the operation
    start_time = time.time()

    # if local storage is to be checked first
    if azure_url_prefix is None or overwrite_local_file == False:
        if os.path.exists(filename):                # if the complete file exists
            complete_data = True
        else:
            component_available = [os.path.exists(cfilename) for cfilename in component_files]
            complete_data = all(component_available)

    # if complete data is not available locally, check if we can look in Azure blob storage
    if not complete_data:
        if azure_url_prefix is None:
            missing = [cfilename for cfilename, available in zip(component_files, component_available)
                       if not available]
            raise ValueError('Missing component files (locally): {}'.format(', '.join(missing)))

        if not azure_url_prefix.endswith('/'):
            azure_url_prefix += '/'

        # first the complete file...
        sas_url = azure_url_prefix + filename + "?" + sas_token
        if BlobClient.from_blob_url(sas_url).exists():
            download_file(sas_url, filename, overwrite_local_file)
        else:
            # ...else every component that is not usable from local disk
            for cfilename, available in zip(component_files, component_available):
                if available and not overwrite_local_file:
                    continue
                sas_url = azure_url_prefix + cfilename + "?" + sas_token
                if not BlobClient.from_blob_url(sas_url).exists():
                    raise ValueError('Missing component file (on Azure): {}'.format(cfilename))
                download_file(sas_url, cfilename, overwrite_local_file)

    # complete files should be available locally at this stage
    # start by checking for the complete file first
    if os.path.exists(filename):
        ds_results = _load_result_file(filename, remove_after_use)
    else:
        # prepare from component files
        ds_results = None
        pick_record = np.minimum if result_type == "min" else np.maximum
        for cfilename in component_files:
            ds_comp = _load_result_file(cfilename, remove_after_use)
            if ds_results is None:    # first file becomes the accumulator
                ds_results = ds_comp
                result_variables = ds_results.attrs['Data variables']
                avg_name = result_variables[0]
                rec_name = result_variables[1]
                continue
            ds_results.update({rec_name : pick_record(ds_comp[rec_name], ds_results[rec_name])   # peak across decades
                               , avg_name : ds_results[avg_name] + ds_comp[avg_name]})           # sum of decade averages
        # every component covers 10 years, so the plain mean of the decade averages is the overall average
        ds_results.update({avg_name : np.round((ds_results[avg_name] / n_components), 2)
                           , rec_name : np.round(ds_results[rec_name], 2)})

    # set attributes
    ds_results.attrs['Range'] = str(n_years) + ' years'
    ds_results.attrs['Start year'] = str(start_year)
    ds_results.attrs['End year'] = str(end_year)
    ds_results.attrs['Years processed'] = n_years
    ds_results.attrs['Store as'] = 'Avg_temp_{}_CMIP{}__{}_yrs__{}_to_{}'.format(result_type
                                                                           ,cmip, n_years, start_year, end_year)

    # print out the time it took
    execution_time = (time.time() - start_time)
    print("Complete execution time | AggregateAverageForRange | (mins) {:0.2f}".format(execution_time/60.0))

    return ds_results

### Call Aggregate Results to get a range of multiple decades

In [120]:
# container/folder holding the pre-processed decade files (file names are appended by the function)
url_prefix = 'https://nexdcp30.blob.core.windows.net/cmip5/averages/'
ds_results = AggregateAverageForRange(
    1950,
    1979,
    cmip=5,
    result_type='min',
    azure_url_prefix=url_prefix,
    sas_token=sas_token,
    overwrite_local_file=False,
    remove_after_use=True,
)

Complete execution time | PrepareAverageForRange | (mins) 0.00


In [115]:
# Display the aggregated multi-decade results dataset
ds_results

### Save aggregate results dataset and upload to Azure blob

In [121]:
# container/folder that receives the aggregated file (file name is appended by SaveResult)
url_prefix = 'https://nexdcp30.blob.core.windows.net/cmip5/averages/'
filename = SaveResult(
    ds_results,
    azure_url_prefix=url_prefix,
    sas_token=sas_token,
    local_copy=True,
)
filename

Complete execution time | SaveResult | (mins) 1.97


'Avg_temp_min_CMIP5__30_yrs__1950_to_1979.nc'

In [117]:
# Spot check: average value at position 0 / 549 / 1 along the variable's three dimensions
ds_results['tasminavg'][0, 549, 1]

In [105]:
# Spot check: record value at the same cell as the average above.
# The record variable name is read from the dataset attributes so the cell works for both 'min' and
# 'max' results (the aggregate built above uses result_type='min', which has no 'tasmaxrec').
record_var = ds_results.attrs['Data variables'][1]
ds_results[record_var][0][549][1]