In [1]:
import datreant.core as dtr
import xarray

from datreant.core.limbs import Limb
from datreant.core.agglimbs import AggLimb

Import some libraries and helper classes.

Architecturally, _datreant_ allows creation of a tree-based representation of the
filesystem. In addition, it allows addition of arbitrary functionality in the 
form of helper classes called **Limb** and **AggLimb**.

These classes can be attached to any node in the tree automatically, and certain
code can be executed. This allows me to read the NC files in these directories
and create metadata. _datreant_ already has infrastructure for reading and writing
metadata, and I just use that.

**Limb** is attached to every **Treant** object corresponding to a single directory.
**AggLimb** is attached to a **Bundle** of **Treant**s, and gives an aggregate
view of all directories below the root directory which contain **Treant** metadata.

In [2]:
class NcVariableTags(Limb):

    _name = 'variables'

    def __init__(self, treant):

        super(NcVariableTags, self).__init__(treant)
        print treant.leaves
        # init state if tags not already there;
        # if read-only, check that they are there,
        # and raise exception if they are not
        try:
            with self._treant._write:
                try:
                    self._treant._state['variables']
                except KeyError:
                    self._treant._state['variables'] = dict()

                #Create a new attribute for each Treant called
                #variables. This is a dictionary, which I will use
                # to store attributes of variables in netCDF files.
                # I use xarray to obtain the attributes, and do not
                # store duplicate attributes. I also store which
                # files contain this particular variable.
                variable_dict = self._treant._state['variables']
                for leaf in treant.leaves.abspaths:
                    try:
                        data_set = xarray.open_dataset(leaf)
                        for variable in data_set.variables:
                           
                            if variable not in variable_dict:
                                variable_dict[variable] = dict()

                            if 'file_name' not in variable_dict[variable]:
                                variable_dict[variable]['file_name'] = []
                            
                            if leaf not in variable_dict[variable]['file_name']:
                                variable_dict[variable]['file_name'].append(leaf)

                            for attribute in data_set.variables[variable].attrs:

                                if attribute not in variable_dict[variable]:
                                    variable_dict[variable][attribute] = []

                                if str(data_set[variable].attrs[attribute]) not in variable_dict[variable][attribute]:

                                    variable_dict[variable][attribute]\
                                        .append(str(data_set[variable].attrs[attribute]))
                        
                        data_set.close()

                    except IOError:
                        pass

        except (IOError, OSError):
            with self._treant._read:
                try:
                    self._treant._state['variables']
                except KeyError:
                    raise KeyError(
                            ("Missing 'variables' data, and cannot write to "
                             "Treant '{}'".format(self._treant.filepath)))


    
    def __repr__(self):
        '''Pretty printing the contents'''

        with self._treant._read:
            return str(self._dict().keys())

    def keys(self):

        return self._dict().keys()

    def __getitem__(self, keys):
        '''Allow a dict interface to the variable attribute'''

        variables = self._dict()

        return variables[keys]


    def _dict(self):

        with self._treant._read:
            return self._treant._state['variables']

In [3]:

class NcAggVariableTags(AggLimb):

    _name = 'variables'

    def __init__(self, collection):
        super(NcAggVariableTags, self).__init__(collection)

    def __repr__(self):
        return "NcVariables:<{}>".format(list(self.all))

    @property
    def all(self):

        variables = [set(member.variables.keys()) for member in self._collection]
        return variables

The following two lines attach my classes to the **Treant** and **Bundle** classes.
Therefore, everytime a new **Treant** or **Bundle** object is instantiated, my
classes are also bound to them.

In [5]:
dtr.Treant._attach_limb_class(NcVariableTags)
dtr.Bundle._attach_agglimb_class(NcAggVariableTags)

Just using some example datasets for the purpose of illustrating how things might work. No **datreant** metadata files are created yet, this is a clean folder.

In [7]:
ls ../data

[0m[01;34mAMSU[0m/  [01;34mCERES[0m/  [01;34mDailyFields[0m/  [01;34mERSST[0m/  [01;34mISCCPClouds[0m/  [01;34mmoistGill[0m/


In [8]:
data_dir = dtr.Treant('../data/AMSU')

(not very!) Pretty print what variables are contained in the NC files in this folder.

In [10]:
data_dir.variables

[u'target_factor_values', u'BTemp', u'ttt', u'brightness_temperature_anomaly', u'satellites_used', u'climatology_time_bounds', u'tmt', u'longitude_bounds', u'time_bounds', u'msu_amsu_offsets', u'lon', u'offset_values', u'latitude', u'satnum', u'latitude_bounds', u'bnds', u'brightness_temperature', u'tb_factor_values', u'climatology_time', u'lat', u'tls', u'brightness_temperature_climatology', u'tlt', u'longitude', u'time']

See what files have certain variables, and what attributes they define:

In [12]:
data_dir.variables['longitude']

{u'_CoordinateAxisType': [u'Lon'],
 u'axis': [u'X'],
 u'bounds': [u'longitude_bounds'],
 u'coordinate_defines': [u'center'],
 u'file_name': [u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtls_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtlt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtmt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtts_198701_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtls_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtlt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtmt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtts_198701_201406.nc3.nc'],
 u'long_name': [u'longitude'],
 u'standard_name': [u'longitude'],
 u'units': [u'degrees_east'],
 u'valid_max': [u'180.0'],
 u'valid_min': [u'-180.0']}

In [14]:
data_dir.variables['time']

{u'_CoordinateAxisType': [u'Time'],
 u'axis': [u'T'],
 u'bounds': [u'time_bounds'],
 u'file_name': [u'/home/joymm/github/data/AMSU/UWMSU_L3_absolute_merged.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtls_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtlt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtmt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_anom_chtts_198701_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtls_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtlt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtmt_197812_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/uat4_tb_v03r03_avrg_chtts_198701_201406.nc3.nc',
  u'/home/joymm/github/data/AMSU/xinyao.nc'],
 u'long_name': [u'time'],
 u'standard_name': [u'time'],
 u'valid_max': [u'20000.0'],
 u'valid_min': [u'0.0']}

More files contain the variable **time** than **longitude**. The others contain **lon** instead.

In [15]:
another_data_dir = dtr.Treant('../data/CERES/')

In [16]:
another_data_dir.variables

<- View ->
  ../data/CERES/CERES_EBAF-Surface_Ed2.8_Subset_200003-201311.nc
  ../data/CERES/CERES_EBAF-Surface_Ed2.8_Subset_CLIM01-CLIM12.nc
  ../data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_200003-201404.nc
  ../data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_200003-201410.nc
  ../data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_CLIM01-CLIM12.nc
  ../data/CERES/Pacific_SST_star_PC1.txt
  ../data/CERES/Treant.b7cf9539-da1a-4e30-9d7e-4c516730a817.json
  ../data/CERES/myCmap.py
  ../data/CERES/myCmap.pyc
  ../data/CERES/weightedMean-SFC.py
  ../data/CERES/weightedMean-TOA.py
<- ---- ->


[u'toa_lw_clr_mon', u'lon', u'sfc_net_tot_clr_mon', u'toa_sw_clr_mon', u'sc2', u'toa_sw_all_mon', u'toa_sw_all_clim', u'sfc_net_sw_clr_clim', u'toa_lw_all_clim', u'sfc_net_lw_clr_clim', u'sfc_net_lw_all_clim', u'toa_net_all_clim', u'climatology_bounds', u'sfc_net_lw_all_mon', u'toa_net_all_mon', u'lat', u'sc1', u'toa_lw_all_mon', u'sfc_net_tot_all_clim', u'ctime', u'sfc_net_tot_clr_clim', u'sfc_net_lw_clr_mon', u'sfc_net_tot_all_mon', u'solar_mon', u'sfc_net_sw_all_mon', u'toa_net_clr_mon', u'time', u'sfc_net_sw_all_clim', u'solar_clim', u'sfc_net_sw_clr_mon']

In [17]:
another_data_dir.variables['time']

{u'delta_t': [u'0000-00-01 00:00:00'],
 u'file_name': [u'/home/joymm/github/data/CERES/CERES_EBAF-Surface_Ed2.8_Subset_200003-201311.nc',
  u'/home/joymm/github/data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_200003-201404.nc',
  u'/home/joymm/github/data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_200003-201410.nc'],
 u'long_name': [u'time']}

Most files are climatologies, and they don't contain the dimension **time**.

Now that we have created two **Treant** directories, we can automatically access them using a **Bundle**.

In [18]:
data_store = dtr.discover('../data')

In [20]:
data_store

<Bundle([<Treant: 'AMSU'>, <Treant: 'CERES'>])>

In [19]:
data_store.variables

<- View ->
  ../data/AMSU/Treant.02975624-62e3-40da-b718-85fe49052eda.json
  ../data/AMSU/UWMSU_L3_absolute_merged.nc
  ../data/AMSU/uat4_tb_v03r03_anom_chtls_197812_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_anom_chtlt_197812_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_anom_chtmt_197812_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_anom_chtts_198701_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_avrg_chtls_197812_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_avrg_chtlt_197812_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_avrg_chtmt_197812_201406.nc3.nc
  ../data/AMSU/uat4_tb_v03r03_avrg_chtts_198701_201406.nc3.nc
  ../data/AMSU/xinyao.nc
<- ---- ->
<- View ->
  ../data/CERES/CERES_EBAF-Surface_Ed2.8_Subset_200003-201311.nc
  ../data/CERES/CERES_EBAF-Surface_Ed2.8_Subset_CLIM01-CLIM12.nc
  ../data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_200003-201404.nc
  ../data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_200003-201410.nc
  ../data/CERES/CERES_EBAF-TOA_Ed2.8_Subset_CLIM01-CLIM12.nc
  ../data/CERES/Pacific_SS

NcVariables:<[set([u'target_factor_values', u'BTemp', u'ttt', u'brightness_temperature_anomaly', u'satellites_used', u'climatology_time_bounds', u'tmt', u'longitude_bounds', u'time_bounds', u'msu_amsu_offsets', u'lon', u'offset_values', u'latitude', u'satnum', u'latitude_bounds', u'bnds', u'brightness_temperature', u'tb_factor_values', u'climatology_time', u'lat', u'tls', u'brightness_temperature_climatology', u'tlt', u'longitude', u'time']), set([u'toa_lw_clr_mon', u'lon', u'sfc_net_tot_clr_mon', u'toa_sw_clr_mon', u'sc2', u'toa_sw_all_mon', u'toa_sw_all_clim', u'sfc_net_sw_clr_clim', u'toa_lw_all_clim', u'sfc_net_lw_clr_clim', u'sfc_net_lw_all_clim', u'toa_net_all_clim', u'climatology_bounds', u'sfc_net_lw_all_mon', u'toa_net_all_mon', u'lat', u'sc1', u'toa_lw_all_mon', u'sfc_net_tot_all_clim', u'ctime', u'sfc_net_tot_clr_clim', u'sfc_net_lw_clr_mon', u'sfc_net_tot_all_mon', u'solar_mon', u'sfc_net_sw_all_mon', u'toa_net_clr_mon', u'time', u'sfc_net_sw_all_clim', u'solar_clim', u'sfc

This **Bundle** contains information about all variables in the two directories.

The nice thing is that a lot of the "infrastructure" code is already contained within
**datreant**, and we can focus on building the functionality that we need for our
purposes.