# HDF5 warm up

**Primary Source:** *Python and HDF5* by Andrew Collette, O'Reilly 2013.

<a href="https://www.amazon.com/Python-HDF5-Collette/dp/1449367836/ref=tmm_pap_swatch_0?_encoding=UTF8&qid=&sr="><img src="./img/h5py.jpg"></a>

*Our first task is to store temperature, wind, and rain measurements from two weather stations (1, 2).*

### Packages, functions and constants

In [1]:
import pandas
import h5py
import time
import numpy as np
import os
import sys
import time
import timeit
from datetime import datetime


# Prefix prepended, by plain string concatenation, to every data file name opened below.
# Leave it empty to resolve the files relative to the current working directory;
# otherwise it must end with a path separator because nothing is inserted between
# the prefix and the file name.
path_file='' #insert the path of this notebook

### Data

In [14]:
# Four synthetic series of 100 normally distributed samples each (scale 5):
# the odd-numbered arrays are centred on 150, the even-numbered ones on pi.
# The generator is consumed left to right, so the draws happen in the order
# arr_1, arr_2, arr_3, arr_4.
arr_1, arr_2, arr_3, arr_4 = (
    np.random.normal(loc=center, scale=5, size=100)
    for center in (150, np.pi, 150, np.pi)
)

    

### Create the hdf5 file

In [20]:
# Create (or overwrite) the file inside a context manager so that the handle is
# closed even if one of the assignments below raises.
with h5py.File(path_file + "toy_experiment.hdf5", 'w') as f:

    # Assigning an array to a "group/dataset" path creates the dataset and,
    # automatically, the enclosing group ('series_1', 'series_2').
    f["series_1/energy"] = arr_1
    f["series_1/angle"] = arr_2
    f["series_2/energy"] = arr_3
    f["series_2/angle"] = arr_4

    # Group-level attributes: metadata describing each series as a whole.
    # 'series_1' is stamped with today's date, 'series_2' with a fixed one.
    f["series_1"].attrs["date"] = datetime.now().strftime("%Y%m%d")
    f["series_1"].attrs["instrument"] = 'XY'
    f["series_1"].attrs["operator"] = 'Bruce'
    f["series_2"].attrs["date"] = '20241014'
    f["series_2"].attrs["instrument"] = 'XY'
    f["series_2"].attrs["operator"] = 'Tony'

    # Dataset-level attributes: the unit label attached to each stored array.
    f["series_1/energy"].attrs["units"] = "keV"
    f["series_1/angle"].attrs["units"] = "rad"
    f["series_2/energy"].attrs["units"] = "keV"
    f["series_2/angle"].attrs["units"] = "rad"

### Read the hdf5 file we have created

In [21]:
# Dump the content of the file written above: root keys, then for every group
# its attributes and, for every dataset in it, the first ten values and the
# dataset attributes.
with h5py.File(path_file + "toy_experiment.hdf5", "r") as f:
    # Root-level object names (keys); in general these can be groups or datasets.
    print("Keys: %s" % f.keys())
    for k in f.keys():
        print(f"{k} => {f[k]}")

    # NOTE: this walk calls .keys() on every root-level object, i.e. it assumes
    # they are all groups, which holds for the file created above.
    for g in f.keys():
        print("*************************")
        print(g)
        for key, value in f[g].attrs.items():
            print("%s: %s" % (key, value))
        for k in f[g].keys():
            print(k)
            # Look the dataset up once and reuse the handle for every print.
            dataset = f[g][k]
            print(dataset)
            print(dataset[0:10])
            print(f"{k} => {dataset.attrs.keys()}")
            for key, value in dataset.attrs.items():
                print("%s: %s" % (key, value))
# No explicit f.close() here: leaving the ``with`` block already closed the file.

Keys: <KeysViewHDF5 ['series_1', 'series_2']>
series_1 => <HDF5 group "/series_1" (2 members)>
series_2 => <HDF5 group "/series_2" (2 members)>
*************************
series_1
date: 20241015
instrument: XY
operator: Bruce
angle
<HDF5 dataset "angle": shape (100,), type "<f8">
[-2.91429808  2.90022855  5.79232449  6.30683751  8.6742743   3.37271079
  0.14884002 -9.03059733 11.8887852   4.47843321]
angle => <KeysViewHDF5 ['units']>
units: rad
energy
<HDF5 dataset "energy": shape (100,), type "<f8">
[136.83069729 159.11721028 154.91997737 145.16674403 151.11199007
 148.65033647 156.64818339 146.26886367 144.81526596 151.6653548 ]
energy => <KeysViewHDF5 ['units']>
units: keV
*************************
series_2
date: 20241014
instrument: XY
operator: Tony
angle
<HDF5 dataset "angle": shape (100,), type "<f8">
[-11.66671189  -0.94286824   4.56473055   1.24805919   0.59447055
  -0.79889074  -4.21701615   5.73761091  11.95227928   1.21827822]
angle => <KeysViewHDF5 ['units']>
units: rad
ene

### Read an HDF5 file we don't know

Let's define an ad hoc function that indirectly retrieves the type of an object and, at each step, tries to list all of its attributes.

In [22]:
def query(f):
    """Recursively print the structure and the attributes of an HDF5 object.

    The object is treated as a dataset when it exposes a ``dtype`` attribute
    and as a group otherwise. For a dataset, the object and its attributes
    are printed. For a group, its attributes and the list of its member
    names are printed, then every member is queried in turn.

    Parameters
    ----------
    f : object
        Dataset-like object (provides ``dtype`` and ``attrs``) or group-like
        object (provides ``attrs``, ``keys()`` and item access by name).

    Returns
    -------
    str
        ``'d'`` if *f* was handled as a dataset, ``'g'`` if it was handled
        as a group.
    """
    # Only a *missing* ``dtype`` means "group". Unlike a bare ``except``,
    # ``hasattr`` lets every other error (and KeyboardInterrupt) propagate
    # instead of silently relabelling the object as a group.
    if hasattr(f, "dtype"):
        print("*************************")
        print(f)
        print(' is a dataset')
        print('Attributes :')
        for key, value in f.attrs.items():
            print("%s: %s" % (key, value))
        return 'd'

    print(' is a group')
    print('Attributes :')
    for key, value in f.attrs.items():
        print("%s: %s" % (key, value))

    # Names of the direct members of this group.
    l_g = list(f.keys())
    print('tree')
    print(l_g)
    print("*************************")

    for g in l_g:
        # Remind the reader which parent this member belongs to before descending.
        print('it comes from')
        print(f)
        print('__________________')
        print(g)
        query(f[g])
    return 'g'


We will read an nxs (NeXus) file, which is nothing other than an HDF5 file with a defined structure. This is not the smartest way to read an nxs file (as we will see), but take it as an exercise.

In [23]:
# Time the complete recursive walk of the file; the elapsed seconds end up in t1.
start1 = time.perf_counter()
with h5py.File(path_file + "2024_03_06_0006.nxs", "r") as f:
    # Root-level object names (keys); each one may be a group or a dataset,
    # so the decision is left to query().
    for k in f.keys():
        i = f[k]
        print(str(k))
        print(str(i))
        query(i)
# No explicit f.close() here: leaving the ``with`` block already closed the file.
end1 = time.perf_counter()
t1 = end1 - start1

entry
<HDF5 group "/entry" (12 members)>
 is a group
Attributes :
NX_class: NXentry
tree
['data', 'definition', 'duration', 'end_time', 'entry_identifier', 'instrument', 'number_of_scans', 'sample', 'start_time', 'technique', 'title ', 'user']
*************************
it comes from
<HDF5 group "/entry" (12 members)>
__________________
data
 is a group
Attributes :
NX_class: NXdata
axes: energy
signal: processed_data_sum
tree
['energy', 'incoming_beam', 'incoming_beam_sum', 'mode', 'processed_data', 'processed_data_sum', 'reference_current', 'reference_current_sum', 'sample_current', 'sample_current_sum']
*************************
it comes from
<HDF5 group "/entry/data" (10 members)>
__________________
energy
*************************
<HDF5 dataset "energy": shape (201,), type "<f8">
 is a dataset
Attributes :
description: Photon energy selected
target: /entry/instrument/monochromator/energy
units: eV
it comes from
<HDF5 group "/entry/data" (10 members)>
__________________
incoming_bea

Function to navigate across the file

In [4]:
def printname(name):
    # Callback handed to ``visit``: echo the name it receives.
    print(name)


# Opened without a context manager, so the handle is still open after this cell.
f = h5py.File(path_file + "2024_03_06_0006.nxs", "r")

# For every root-level member, print the class it maps to rather than the object.
for name in f:
    print(f.get(name, getclass=True))

# Walk the whole file first, then repeat the walk starting from 'entry/data'.
f.visit(printname)
print("************")
f["entry/data"].visit(printname)

<class 'h5py._hl.group.Group'>
entry
entry/data
entry/data/energy
entry/data/incoming_beam
entry/data/incoming_beam_sum
entry/data/mode
entry/data/processed_data
entry/data/processed_data_sum
entry/data/reference_current
entry/data/reference_current_sum
entry/data/sample_current
entry/data/sample_current_sum
entry/definition
entry/duration
entry/end_time
entry/entry_identifier
entry/instrument
entry/instrument/exit_slit
entry/instrument/exit_slit/gap
entry/instrument/incoming_beam_picoammeter
entry/instrument/incoming_beam_picoammeter/data
entry/instrument/incoming_beam_picoammeter/description
entry/instrument/incoming_beam_picoammeter/picoammeter_current_range
entry/instrument/monochromator
entry/instrument/monochromator/energy
entry/instrument/monochromator/grating
entry/instrument/monochromator/grating/mirror
entry/instrument/monochromator/grating/period
entry/instrument/monochromator/integration_time
entry/instrument/prefocusing_mirror
entry/instrument/prefocusing_mirror/descriptio