# Exploring Datasets in the DataCube Database

In [1]:
import json
import yaml

from pygments import highlight
from pygments.lexers import SqlLexer, YamlLexer
from pygments.formatters import HtmlFormatter
from IPython.core.display import HTML

def display_yaml(yaml_in):
    """Show a YAML document in the notebook with Pygments syntax highlighting.

    ``yaml_in`` may already be a YAML string; any other object is first
    serialised with ``yaml.dump``. Two HTML outputs are displayed in order:
    a ``<style>`` block with the Pygments CSS, then the highlighted document.
    """
    text = yaml_in if isinstance(yaml_in, str) else yaml.dump(yaml_in)

    # Stylesheet scoped to the '.highlight' selector.
    css = HtmlFormatter().get_style_defs('.highlight')
    style_block = """
        <style>
        {pygments_css}
        </style>
        """.format(pygments_css=css)
    display(HTML(style_block))

    # The highlighted YAML itself.
    markup = highlight(text, YamlLexer(), HtmlFormatter())
    display(HTML(data=markup))

In [2]:
from datacube import Datacube
from datacube.model import Range

In [3]:
# Create the Datacube instance that every query in this notebook goes through.
dc = Datacube(app='explore-yaml')

# Landsat 5 PQ Scene

This is stored in GeoTIFF format.

In [4]:
# Search the index for 'ls5_pq_scene' datasets in the given time range.
datasets = dc.find_datasets(product='ls5_pq_scene', time=('2011-08', '2011-09'))

# Keep the first match and show its raw metadata document.
dataset = datasets[0]
display_yaml(dataset.metadata_doc)

In [5]:
# Alternatively, look a dataset up directly by its id.
# Note: this rebinds `dataset`, so the cells below describe this dataset.
dataset = dc.index.datasets.get('273ffe77-b46f-4973-9773-85995db0e584')

Rather than accessing the `metadata_doc` structure directly, it's preferable to use the fields in `dataset.metadata`, which have been parsed from the document for you. Different products can have different fields.

(note that you can use the `<tab>` key in ipython after `.metadata` to complete fields)

In [6]:
# Landsat path/row ranges. A scene always covers exactly one path/row,
# so each Range has identical begin and end values.
dataset.metadata.sat_path, dataset.metadata.sat_row

(Range(begin=Decimal('94'), end=Decimal('94')),
 Range(begin=Decimal('77'), end=Decimal('77')))

In [7]:
# Time range (begin, end) recorded for the dataset.
dataset.metadata.time

Range(begin=datetime.datetime(2011, 8, 16, 0, 0, 49), end=datetime.datetime(2011, 8, 16, 0, 1, 16))

In [8]:
# The time Range unpacks like a tuple into its begin and end values.
begin, end = dataset.metadata.time

begin

datetime.datetime(2011, 8, 16, 0, 0, 49)

In [9]:
# The dataset's label field.
dataset.metadata.label

'LS5_TM_PQ_P55_GAPQ01-002_094_077_20110816'

In [10]:
# The platform recorded in the dataset's metadata.
dataset.metadata.platform

'LANDSAT_5'

In [11]:
# List the available metadata field names alphabetically, one per line.
print(*sorted(dataset.metadata.fields), sep='\n')

creation_dt
format
grid_spatial
gsi
id
instrument
label
lat
lon
measurements
orbit
platform
product_type
sat_path
sat_row
sources
time


### Locations (URIs)
There can be multiple locations for a single dataset (e.g. stored on tape, S3, local files...)

In [12]:
# All location URIs recorded for this dataset.
dataset.uris

['file:///g/data/rs0/scenes/pq-scenes-tmp/ls5/2011/08/output/pqa/LS5_TM_PQ_P55_GAPQ01-002_094_077_20110816/ga-metadata.yaml']

`local_path` is a shorthand for choosing the most recent local `file` location as a [pathlib](https://docs.python.org/3/library/pathlib.html) Path:

In [13]:
# The dataset's local file location as a pathlib Path.
dataset.local_path

PosixPath('/g/data/rs0/scenes/pq-scenes-tmp/ls5/2011/08/output/pqa/LS5_TM_PQ_P55_GAPQ01-002_094_077_20110816/ga-metadata.yaml')

All paths given in the metadata are relative to these location URIs.

So to calculate the path to our image file, for the latest on-disk location:

In [14]:
# Path of the 'pqa' measurement as stored in the metadata.
band_offset = dataset.metadata.measurements['pqa']['path']
print(band_offset)

# Resolve it against the directory that holds the dataset's local file.
dataset.local_path.parent.joinpath(band_offset)

product/LS5_TM_PQ_P55_GAPQ01-002_094_077_20110816.tif


PosixPath('/g/data/rs0/scenes/pq-scenes-tmp/ls5/2011/08/output/pqa/LS5_TM_PQ_P55_GAPQ01-002_094_077_20110816/product/LS5_TM_PQ_P55_GAPQ01-002_094_077_20110816.tif')

# Landsat 5 PQ Albers

In [15]:
# Search the index for 'ls5_pq_albers' datasets in the given time range.
datasets = dc.find_datasets(product='ls5_pq_albers', time=('2010-10-26', '2010-10-27'))

# Keep the first match and show its raw metadata document.
dataset = datasets[0]
display_yaml(dataset.metadata_doc)

Active locations:

In [16]:
# Location URIs currently recorded for this dataset.
dataset.uris

['file:///g/data/rs0/datacube/002/LS5_TM_PQ/6_-20/LS5_TM_PQ_3577_6_-20_2010_v1501876157.nc']

Notice the date component is just `2010`: it's a stack for the whole year.

Archived locations:

In [17]:
# Ask the index for this dataset's archived locations, looked up by dataset id.
dc.index.datasets.get_archived_locations(dataset.id)

[]

There are no results: Dave cleaned up archived/non-stacked locations from disk. Thanks Dave!

But when this dataset was first stacked, we could see the unstacked (**archived**) and stacked (**active**) locations within the index (and on disk).

If we do a search for this **stacked** location, we should be able to see which datasets it contains.

In [18]:
# The stacked file whose member datasets we want to list.
stacked_uri = 'file:///g/data/rs0/datacube/002/LS5_TM_PQ/6_-20/LS5_TM_PQ_3577_6_-20_2010_v1501876157.nc'
datasets = dc.index.datasets.get_datasets_for_location(stacked_uri)

# One "<id>: <begin time>" string per dataset found at that location.
['{md.id}: {md.time.begin}'.format(md=ds.metadata) for ds in datasets]

['016d8006-43b2-4a67-b558-ce602c9edb1c: 2010-02-12 00:38:12.500000',
 '0df89249-b759-418d-9184-e82307320133: 2010-12-13 00:37:08',
 '2dc092e2-3459-43e1-9824-7ea4d2543874: 2010-11-02 00:43:43',
 '356ffc9d-9971-4ad4-8d88-3de171121aa5: 2010-06-04 00:37:59',
 '37624e89-74f0-4d55-a7cb-45d2730f982f: 2010-01-18 00:44:41.500000',
 '3b6d80e9-f196-484b-89fc-69ee3a4f7074: 2010-04-08 00:44:24',
 '3bf65f23-79fa-4ebe-8226-6539708b97d4: 2010-11-18 00:43:17',
 '3cf772f4-ce39-421d-b7d0-e92d942e5e01: 2010-04-08 00:44:47.500000',
 '477661bc-8e98-4905-bd5c-1323d9ec7c50: 2010-05-03 00:38:31',
 '4a0fe2c4-c076-4e38-a328-85445deaa6a6: 2010-06-20 00:37:55.500000',
 '4b631164-bd12-43c1-a268-44fadf47b244: 2010-06-04 00:38:23',
 '58279f4e-3d84-496e-ba0e-95c05cdbf4d0: 2010-04-24 00:44:20',
 '5a2a9ec8-b852-4893-a4ff-4e327c1de8c2: 2010-03-23 00:44:26',
 '5a3cdb90-b545-4027-b40c-831cc02b619b: 2010-01-11 00:38:05',
 '5b94c301-2ce4-4733-8d96-6fd13bc45b80: 2010-05-19 00:38:04.500000',
 '5c4108e7-70b3-44e0-9ca4-93af28736

# BoM Rainfall Grids



In [19]:
# Search the index for 'bom_rainfall_grids' datasets in the given time range.
datasets = dc.find_datasets(product='bom_rainfall_grids', time=('2010-10-26', '2010-10-27'))

# Keep the first match for the cells below.
dataset = datasets[0]

Raw document

In [20]:
# Render the dataset's raw metadata document as highlighted YAML.
display_yaml(dataset.metadata_doc)

Parsed fields

In [21]:
# Print one "name: value" line per parsed metadata field, with the name
# right-aligned in a 20-character column.
for field_name, field_value in dataset.metadata.fields.items():
    print('{:>20}: {}'.format(field_name, field_value))

                 lat: Range(begin=-44.525, end=-9.975)
                 lon: Range(begin=111.975, end=156.275)
                time: Range(begin=datetime.datetime(2010, 10, 26, 0, 0), end=datetime.datetime(2010, 10, 26, 0, 0))
              format: NETCDF
            platform: BoM
          instrument: rain gauge
        product_type: rainfall
                  id: 9fd6ab35-4e85-4b83-abac-13c9b02b6791
             sources: {}
         creation_dt: 2010-09-04T09:38:57
        grid_spatial: {'geo_ref_points': {'ll': {'x': 111.975, 'y': -44.525}, 'lr': {'x': 156.275, 'y': -44.525}, 'ul': {'x': 111.975, 'y': -9.975}, 'ur': {'x': 156.275, 'y': -9.975}}, 'spatial_reference': 'EPSG:4326'}
        measurements: {'rainfall': {'path': '/g/data/rr5/agcd/0_05/rainfall/daily/2010/rr.2010102620101026.grid.nc', 'layer': 'rain_day'}}


In [22]:
# Close the Datacube instance now that the notebook is finished with it.
dc.close()