# How to Read and Explore HDF5 Files with Python
This tutorial uses snow cover fraction and snow albedo data from...

Start by importing libraries

In [1]:
import h5py
import matplotlib.pyplot as plt
from osgeo import gdal
import glob
import numpy as np

import rioxarray
from rioxarray._options import EXPORT_GRID_MAPPING, get_option
from rioxarray.crs import crs_from_user_input
from rioxarray.exceptions import(
    DimensionError,
    DimensionMissingCoordinateError,
    InvalidDimensionOrder,
    MissingCRS,
    MissingSpatialDimensionError,
    NoDataInBounds,
    OneDimensionalRaster,
    RioXarrayError,
    TooManyDimensions,
)

import rasterio
from rasterio.control import GroundControlPoint
from rasterio.crs import CRS

from pyproj import Transformer
import earthpy as et
from affine import Affine

* [Method 1 - h5py](#h5py)
* [Combine all files with glob](#glob)
* [Method 2 - GDAL](#gdal)
* [Method 3 - Rasterio](#rasterio)
* [Method 4 - rioxarray](#rioxarray)
* [Add a CRS](#crs)

## Method 1 - h5py <a class="anchor" id="h5py"></a>

In [3]:
sierra2014_albedo = h5py.File('../../data/albedo_data/SierraAlbedo2014.h5', 'r')
sierra2014_snow_frac = h5py.File('../../data/snow_fraction_data/Sierra2014.h5', 'r')

Call the file

In [4]:
sierra2014_albedo

<HDF5 file "SierraAlbedo2014.h5" (mode r)>

In [28]:
sierra2014_snow_frac

<HDF5 file "Sierra2014.h5" (mode r)>

Get the type of file

In [29]:
type(sierra2014_albedo)

h5py._hl.files.File

In [30]:
list(sierra2014_albedo)

['Grid']

In [31]:
list(sierra2014_snow_frac)

['Grid']

In [32]:
# Display contents of file using "keys" 
list(sierra2014_albedo.keys())

['Grid']

In [33]:
list(sierra2014_snow_frac.keys())

['Grid']

In [34]:
# Assign a variable to contain the Grid object
dset_sierra2014_albedo = sierra2014_albedo['Grid']

In [35]:
# Figure out what fields are contained in the Grid object
dset_sierra2014_albedo.keys()

<KeysViewHDF5 ['MODIS_GRID_500m']>

In [41]:
dset_sierra2014_snow_frac = sierra2014_snow_frac['Grid']
dset_sierra2014_snow_frac.keys()

<KeysViewHDF5 ['MODIS_GRID_500m']>

In [49]:
# What metadata is available in "Grid"? Use keys again, this time applied to the attributes, to see
for att in dset_sierra2014_albedo.attrs.keys():
    # name of attribute
    print (att)
    # value of attribute
    print (dset_sierra2014_albedo.attrs[att])

angleunits
degrees
aspect
normal
falseeasting
[0.]
falsenorthing
[-4000000.]
geoid
[6.37813700e+06 8.18191908e-02]
maplatlimit
[-90.  90.]
maplonlimit
[-255.   15.]
mapparallels
[34.  40.5]
mapprojection
eqaconicstd
nparallels
[2.]
origin
[   0. -120.    0.]
scalefactor
[1.]
trimlat
[-90.  90.]
trimlon
[-135.  135.]


In [50]:
for att in dset_sierra2014_snow_frac.attrs.keys():
    # name of attribute
    print (att)
    # value of attribute
    print (dset_sierra2014_snow_frac.attrs[att])

angleunits
b'degrees'
aspect
b'normal'
falseeasting
[0.]
falsenorthing
[-4000000.]
geoid
[6.37813700e+06 8.18191908e-02]
maplatlimit
[-90.  90.]
maplonlimit
[-255.   15.]
mapparallels
[34.  40.5]
mapprojection
b'eqaconicstd'
nparallels
[2.]
origin
[   0. -120.    0.]
scalefactor
[1.]
trimlat
[-90.  90.]
trimlon
[-135.  135.]


In [56]:
# Store all the different datasets in individual objects so they can be manipulated separately
# (in this case there's only one so the loop isn't super necessary, but included for completeness)
for ds in dset_sierra2014_albedo['MODIS_GRID_500m'].keys():      
        print (ds)
        ds_albedo_data = dset_sierra2014_albedo['MODIS_GRID_500m'][ds] # returns HDF5 dataset object

albedo


In [58]:
ds_albedo_data.attrs.keys()

<KeysViewHDF5 ['divisor']>

In [59]:
for ds in dset_sierra2014_snow_frac['MODIS_GRID_500m'].keys():      
        print (ds)
        ds_snow_frac_data = dset_sierra2014_snow_frac['MODIS_GRID_500m'][ds] # returns HDF5 dataset object

dust
grain_size
raw_snow_fraction
snow_fraction


In [60]:
ds_snow_frac_data.attrs.keys()

<KeysViewHDF5 ['divisor']>

In [63]:
ds_albedo_data

<HDF5 dataset "albedo": shape (365, 1334, 1841), type "<u2">

In [62]:
ds_snow_frac_data

<HDF5 dataset "snow_fraction": shape (365, 1334, 1841), type "|u1">

In [5]:
# xxx
with h5py.File('../../data/snow_fraction_data/Sierra2019.h5', 'r') as sierra2019:
    ls = list(sierra2019.keys())
    print('List of datasets in this file: \n', ls)
    data = sierra2019.get('dataset1')
    dataset1 = np.array(data)
    print('Shape of dataset1: \n', dataset1.shape)

List of datasets in this file: 
 ['Grid']
Shape of dataset1: 
 ()


## Combine all files with glob <a class="anchor" id="glob"></a>

In [8]:
all_snow_fraction = glob.glob('data/snow_fraction_data/*.h5')
all_snow_fraction

['data/snow_fraction_data/Sierra2015.h5',
 'data/snow_fraction_data/Sierra2005.h5',
 'data/snow_fraction_data/Sierra2011.h5',
 'data/snow_fraction_data/Sierra2001.h5',
 'data/snow_fraction_data/Sierra2010.h5',
 'data/snow_fraction_data/Sierra2014.h5',
 'data/snow_fraction_data/Sierra2004.h5',
 'data/snow_fraction_data/Sierra2013.h5',
 'data/snow_fraction_data/Sierra2003.h5',
 'data/snow_fraction_data/Sierra2017.h5',
 'data/snow_fraction_data/Sierra2007.h5',
 'data/snow_fraction_data/Sierra2016.h5',
 'data/snow_fraction_data/Sierra2006.h5',
 'data/snow_fraction_data/Sierra2012.h5',
 'data/snow_fraction_data/Sierra2002.h5',
 'data/snow_fraction_data/Sierra2019.h5',
 'data/snow_fraction_data/Sierra2009.h5',
 'data/snow_fraction_data/Sierra2018.h5',
 'data/snow_fraction_data/Sierra2008.h5']

## Method 2 - GDAL <a class="anchor" id="gdal"></a>

In [7]:
sierra2015_albedo = gdal.Open("../../data/albedo_data/SierraAlbedo2015.h5", gdal.GA_ReadOnly)

In [14]:
#sierra2015_albedo.GetSubDatasets()

Read the data as an array

In [11]:
sierra2015_albedo_array = sierra2015_albedo.ReadAsArray()

In [12]:
type(sierra2015_albedo_array)

numpy.ndarray

In [13]:
sierra2015_albedo_array[:10]

array([[[65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        ...,
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535]],

       [[65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        ...,
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535]],

       [[65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        ...,
        [65535, 65535, 65535, ..., 65535, 65535, 65535],
        [65535, 65535, 65535, ..., 65535, 655

Get the shape of the array

In [15]:
sierra2015_albedo_array.shape

(365, 1334, 1841)

Get the value from the 300th day and the 1000th pixel in the x direction and the 1500th pixel in the y direction

In [20]:
sierra2015_albedo_array[300][1000][1500]

65535

In [8]:
sierra2018 = gdal.Open('../../data/snow_fraction_data/Sierra2018.h5')
print(sierra2018.GetMetadata())

{'Grid_angleunits': 'degrees', 'Grid_aspect': 'normal', 'Grid_falseeasting': '0 ', 'Grid_falsenorthing': '-4000000 ', 'Grid_geoid': '6378137 0.0818191908426215 ', 'Grid_maplatlimit': '-90 90 ', 'Grid_maplonlimit': '-255 15 ', 'Grid_mapparallels': '34 40.5 ', 'Grid_mapprojection': 'eqaconicstd', 'Grid_MODIS_GRID_500m_dust_divisor': '10 ', 'Grid_MODIS_GRID_500m_grain_size_divisor': '1 ', 'Grid_MODIS_GRID_500m_raw_snow_fraction_divisor': '100 ', 'Grid_MODIS_GRID_500m_ReferencingMatrix': '0 500 -285750 -500 0 500250 ', 'Grid_MODIS_GRID_500m_snow_fraction_divisor': '100 ', 'Grid_nparallels': '2 ', 'Grid_origin': '0 -120 0 ', 'Grid_scalefactor': '1 ', 'Grid_trimlat': '-90 90 ', 'Grid_trimlon': '-135 135 ', 'ISOdates': '2017274 2017275 2017276 2017277 2017278 2017279 2017280 2017281 2017282 2017283 2017284 2017285 2017286 2017287 2017288 2017289 2017290 2017291 2017292 2017293 2017294 2017295 2017296 2017297 2017298 2017299 2017300 2017301 2017302 2017303 2017304 2017305 2017306 2017307 20173

In [103]:
metadata = sierra2018.GetMetadata()
metadata
## this looks like a dictionary. Try to index using dictionary notation

{'Grid_angleunits': 'degrees',
 'Grid_aspect': 'normal',
 'Grid_falseeasting': '0 ',
 'Grid_falsenorthing': '-4000000 ',
 'Grid_geoid': '6378137 0.0818191908426215 ',
 'Grid_maplatlimit': '-90 90 ',
 'Grid_maplonlimit': '-255 15 ',
 'Grid_mapparallels': '34 40.5 ',
 'Grid_mapprojection': 'eqaconicstd',
 'Grid_MODIS_GRID_500m_dust_divisor': '10 ',
 'Grid_MODIS_GRID_500m_grain_size_divisor': '1 ',
 'Grid_MODIS_GRID_500m_raw_snow_fraction_divisor': '100 ',
 'Grid_MODIS_GRID_500m_ReferencingMatrix': '0 500 -285750 -500 0 500250 ',
 'Grid_MODIS_GRID_500m_snow_fraction_divisor': '100 ',
 'Grid_nparallels': '2 ',
 'Grid_origin': '0 -120 0 ',
 'Grid_scalefactor': '1 ',
 'Grid_trimlat': '-90 90 ',
 'Grid_trimlon': '-135 135 ',
 'ISOdates': '2017274 2017275 2017276 2017277 2017278 2017279 2017280 2017281 2017282 2017283 2017284 2017285 2017286 2017287 2017288 2017289 2017290 2017291 2017292 2017293 2017294 2017295 2017296 2017297 2017298 2017299 2017300 2017301 2017302 2017303 2017304 2017305 20

In [104]:
metadata['Grid_angleunits']

'degrees'

In [106]:
ref_matrix = metadata['Grid_MODIS_GRID_500m_ReferencingMatrix']
ref_matrix
# use string functions to separate the individual components of the ref matrix

'0 500 -285750 -500 0 500250 '

In [107]:
# https://gis.stackexchange.com/questions/345691/using-python-gdal-to-reproject-an-hdf
sierra2018.GetSubDatasets()

[('HDF5:"data/snow_fraction_data/Sierra2018.h5"://Grid/MODIS_GRID_500m/dust',
  '[365x1334x1841] //Grid/MODIS_GRID_500m/dust (16-bit unsigned integer)'),
 ('HDF5:"data/snow_fraction_data/Sierra2018.h5"://Grid/MODIS_GRID_500m/grain_size',
  '[365x1334x1841] //Grid/MODIS_GRID_500m/grain_size (16-bit unsigned integer)'),
 ('HDF5:"data/snow_fraction_data/Sierra2018.h5"://Grid/MODIS_GRID_500m/raw_snow_fraction',
  '[365x1334x1841] //Grid/MODIS_GRID_500m/raw_snow_fraction (8-bit unsigned character)'),
 ('HDF5:"data/snow_fraction_data/Sierra2018.h5"://Grid/MODIS_GRID_500m/snow_fraction',
  '[365x1334x1841] //Grid/MODIS_GRID_500m/snow_fraction (8-bit unsigned character)')]

In [73]:
sierra2018_band = gdal.Open(sierra2018.GetSubDatasets()[0][0], gdal.GA_ReadOnly)
sierra2018_arr = sierra2018_band.ReadAsArray()  #now numpy array

In [74]:
sierra2018_arr.shape

(365, 1334, 1841)

## Method 3 - Rasterio <a class="anchor" id="rasterio"></a>

In [10]:
sierra2019 = rasterio.open('../../data/snow_fraction_data/Sierra2019.h5')

In [76]:
sierra2019

<open DatasetReader name='data/snow_fraction_data/Sierra2019.h5' mode='r'>

In [77]:
sierra2019.subdatasets

['HDF5:data/snow_fraction_data/Sierra2019.h5://Grid/MODIS_GRID_500m/dust',
 'HDF5:data/snow_fraction_data/Sierra2019.h5://Grid/MODIS_GRID_500m/grain_size',
 'HDF5:data/snow_fraction_data/Sierra2019.h5://Grid/MODIS_GRID_500m/raw_snow_fraction',
 'HDF5:data/snow_fraction_data/Sierra2019.h5://Grid/MODIS_GRID_500m/snow_fraction']

In [109]:
metadata_rasterio = sierra2019.tags()
metadata_rasterio

{'Grid_angleunits': 'degrees',
 'Grid_aspect': 'normal',
 'Grid_falseeasting': '0 ',
 'Grid_falsenorthing': '-4000000 ',
 'Grid_geoid': '6378137 0.0818191908426215 ',
 'Grid_maplatlimit': '-90 90 ',
 'Grid_maplonlimit': '-255 15 ',
 'Grid_mapparallels': '34 40.5 ',
 'Grid_mapprojection': 'eqaconicstd',
 'Grid_MODIS_GRID_500m_dust_divisor': '10 ',
 'Grid_MODIS_GRID_500m_grain_size_divisor': '1 ',
 'Grid_MODIS_GRID_500m_raw_snow_fraction_divisor': '100 ',
 'Grid_MODIS_GRID_500m_ReferencingMatrix': '0 500 -285750 -500 0 500250 ',
 'Grid_MODIS_GRID_500m_snow_fraction_divisor': '100 ',
 'Grid_nparallels': '2 ',
 'Grid_origin': '0 -120 0 ',
 'Grid_scalefactor': '1 ',
 'Grid_trimlat': '-90 90 ',
 'Grid_trimlon': '-135 135 ',
 'ISOdates': '2018274 2018275 2018276 2018277 2018278 2018279 2018280 2018281 2018282 2018283 2018284 2018285 2018286 2018287 2018288 2018289 2018290 2018291 2018292 2018293 2018294 2018295 2018296 2018297 2018298 2018299 2018300 2018301 2018302 2018303 2018304 2018305 20

In [115]:
grid_angleunits = metadata_rasterio['Grid_angleunits']
grid_angleunits

'degrees'

In [124]:
map_projection = metadata_rasterio['Grid_mapprojection']
map_projection

'eqaconicstd'

In [113]:
type(sierra2019)

rasterio.io.DatasetReader

In [70]:
sierra2019.shape
# that's weird

(512, 512)

## Method 4 - rioxarry <a class="anchor" id="rioxarray"></a>

In [118]:
sierra2016 = rioxarray.open_rasterio('.data/snow_fraction_data/Sierra2016.h5')
sierra2016

  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


In [119]:
type(sierra2016)

xarray.core.dataset.Dataset

## Add a CRS <a class="anchor" id="crs"></a>

In [79]:
sierra2017 = rioxarray.open_rasterio('data/snow_fraction_data/Sierra2018.h5',
                                    variable=["0"],
                                    parse_coordinates=False,)

In [84]:
# This shows that the file does not currently have a crs
print("The CRS of this data is:", sierra2017.rio.crs)

The CRS of this data is: None


In [85]:
# https://www.earthdatascience.org/courses/use-data-open-source-python/intro-raster-data-python/fundamentals-raster-data/raster-metadata-in-python/
a_crs = sierra2017.rio.crs
# assign crs to object
sierra2017 = sierra2017.rio.set_crs(a_crs, inplace=True)

CRSError: Invalid projection: : (Internal Proj Error: proj_create: unrecognized format / unknown name)

In [86]:
print(list(et.epsg.keys())[:10])

['29188', '26733', '24600', '32189', '4899', '29189', '26734', '7402', '26951', '29190']


In [87]:
proj4 = et.epsg['3310']
print(proj4)

+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs


In [88]:
sierra2017 = sierra2017.rio.set_crs(proj4, inplace=True)
print("The CRS of this data is:", sierra2017.rio.crs)

The CRS of this data is: EPSG:3310


In [89]:
sierra2017.rio.crs

CRS.from_epsg(3310)

'+proj=aea' is the projection, in this case aea = Albers Equal Area

units=m means that the units are meters

In [90]:
type(sierra2017)

xarray.core.dataset.Dataset

In [None]:
transformer = Transformer.from_crs("EPSG:4326", crs_str, always_xy=True)
west, north = transformer.transform(rds.WESTBOUNDINGCOORDINATE, rds.NORTHBOUNDINGCOORDINATE)