In [7]:
import os

import h5py
import numpy as np
# Root directory of the repository; all files in this notebook are read from
# and written to '<repository_path>results/'.
# The location can be overridden with the INFLUENZA_FITNESS_REPO environment
# variable so that the notebook also runs on other machines. When the variable
# is unset or empty, the original local path is used.
repository_path = os.environ.get('INFLUENZA_FITNESS_REPO') or (
    'C:/Users/julia/Documents/Resources/InfluenzaFitnessLandscape/'
    'NewApproachFromMarch2021/InfluenzaFitnessInference/'
)
# Later cells build file names by plain string concatenation
# (repository_path + 'results/...'), so the path must end with a separator.
if not repository_path.endswith(('/', '\\')):
    repository_path += '/'

In [4]:
# Two demo arrays of random floats with different shapes; they are the payload
# written to the HDF5 file in the next section.
matrix1 = np.random.random((1000, 1000))
matrix2 = np.random.random((10000, 100))

## create and write to hdf5 file:

In [10]:
# Store both matrices as top-level datasets of a freshly written file
# (the file is opened in mode 'w').
test_data_file = repository_path + 'results/test_data.h5'
with h5py.File(test_data_file, 'w') as hdf:
    for name, values in (('dataset1', matrix1), ('dataset2', matrix2)):
        hdf.create_dataset(name, data=values)

## read hdf5 file

In [11]:
# Re-open the file read-only, list its top-level entries and copy 'dataset1'
# into an in-memory NumPy array.
test_data_file = repository_path + 'results/test_data.h5'
with h5py.File(test_data_file, 'r') as hdf:
    ls = [key for key in hdf.keys()]
    print('List of datasets in this file: \n', ls)

    data = hdf.get('dataset1')    # h5py dataset object belonging to the open file
    dataset1 = np.array(data)     # copy of the values as a NumPy array
    print('Shape of dataset1: \n', dataset1.shape)

List of datasets in this file: 
 ['dataset1', 'dataset2']
Shape of dataset1: 
 (1000, 1000)


## create groups in hdf5 file

In [13]:
# Four independent 1000 x 1000 random matrices, drawn in the order
# matrix1, matrix2, matrix3, matrix4.
matrix1, matrix2, matrix3, matrix4 = (
    np.random.random(size=(1000, 1000)) for _ in range(4)
)

In [16]:
# Distribute the four matrices over a small group hierarchy:
#   /Group1            -> dataset1, dataset4
#   /Group2/SubGroup1  -> dataset3
#   /Group2/SubGroup2  -> dataset2
groups_file = repository_path + 'results/test_groups.h5'
with h5py.File(groups_file, 'w') as hdf:
    G1 = hdf.create_group('Group1')
    for name, values in (('dataset1', matrix1), ('dataset4', matrix4)):
        G1.create_dataset(name, data=values)

    # 'Group2' is never created explicitly; it comes into existence as the
    # parent in the path given for the sub-groups.
    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset('dataset3', data=matrix3)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset('dataset2', data=matrix2)
    

## read groups in hdf5 file

In [26]:
# Walk down the hierarchy root -> Group2 -> SubGroup1 and load 'dataset3'.
groups_file = repository_path + 'results/test_groups.h5'
with h5py.File(groups_file, 'r') as hdf:
    # (name, object) pairs directly below the root
    base_items = [item for item in hdf.items()]
    print('Items in the base directory:', base_items)

    # members of Group2
    G2 = hdf.get('Group2')
    G2_items = [item for item in G2.items()]
    print('Items in Group2:', G2_items)

    # descend one more level and copy the dataset into a NumPy array
    G21 = G2.get('SubGroup1')
    data3 = G21.get('dataset3')
    dataset3 = np.array(data3)
    print('Shape of dataset3:', dataset3.shape)

Items in the base directory: [('Group1', <HDF5 group "/Group1" (2 members)>), ('Group2', <HDF5 group "/Group2" (2 members)>)]
Items in Group2: [('SubGroup1', <HDF5 group "/Group2/SubGroup1" (1 members)>), ('SubGroup2', <HDF5 group "/Group2/SubGroup2" (1 members)>)]
Shape of dataset3: (1000, 1000)


## compress data in hdf5 file

In [28]:
# Same group layout as in test_groups.h5, but every dataset is written with
# gzip compression at level 9.
gzip_level9 = dict(compression='gzip', compression_opts=9)
compressed_file = repository_path + 'results/test_groups_compressed.h5'
with h5py.File(compressed_file, 'w') as hdf:
    G1 = hdf.create_group('Group1')
    G1.create_dataset('dataset1', data=matrix1, **gzip_level9)
    G1.create_dataset('dataset4', data=matrix4, **gzip_level9)

    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset('dataset3', data=matrix3, **gzip_level9)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset('dataset2', data=matrix2, **gzip_level9)

# Observed result: the file only shrinks from 31 to 29 M; there is not much to
# compress here (the matrices hold random numbers).

In [56]:
# Reading takes no compression-related arguments: compressed data is accessed
# in exactly the same way as non-compressed data.
compressed_file = repository_path + 'results/test_groups_compressed.h5'
with h5py.File(compressed_file, 'r') as hdf:
    base_items = [item for item in hdf.items()]
    print('Base items: ', base_items)

    G1 = hdf.get('Group1')
    print([key for key in G1.keys()])

    # copy 'dataset1' of Group1 into a NumPy array
    data1 = G1.get('dataset1')
    dataset1 = np.array(data1)

Base items:  [('Group1', <HDF5 group "/Group1" (2 members)>), ('Group2', <HDF5 group "/Group2" (2 members)>)]
['dataset1', 'dataset4']


In [57]:
# dataset1 is the NumPy copy made with np.array(...) inside the with-block of
# the previous cell, so it can still be used after that file has been closed.
print(dataset1.shape)

(1000, 1000)


## hdf5 attributes

In [30]:
# Fresh random matrices (1000 x 1000 and 10000 x 100) for the attribute example.
matrix1, matrix2 = (
    np.random.random(size=shape) for shape in ((1000, 1000), (10000, 100))
)

In [36]:
# Create the hdf5 file, store both matrices and attach metadata (attributes)
# to 'dataset1'.
# The file is opened with a context manager, like in the other cells of this
# notebook, so it is closed even if one of the statements below raises.
with h5py.File(repository_path + 'results/test_attributes.h5', 'w') as hdf:
    # create the datasets
    dataset1 = hdf.create_dataset('dataset1', data=matrix1)
    dataset2 = hdf.create_dataset('dataset2', data=matrix2)

    # set attributes (only on dataset1): one string and one list of integers
    dataset1.attrs['CLASS'] = 'DATA MATRIX'
    dataset1.attrs['VERSION'] = [1, 2]

In [46]:
# Read the hdf5 file and inspect the attributes stored on 'dataset1'.
# `data` (the h5py dataset with type class 'h5py._hl.dataset.Dataset') can only
# be accessed while the hdf5 file is open for reading, so everything that
# touches it lives inside the with-block. The context manager also guarantees
# that the file is closed if any statement raises.
with h5py.File(repository_path + 'results/test_attributes.h5', 'r') as hdf:
    ls = list(hdf.keys())
    print('List of datasets in this file: \n', ls)

    data = hdf.get('dataset1')
    dataset1 = np.array(data)  # in-memory copy, stays usable after closing
    print('Shape of dataset1: \n', dataset1.shape)

    # read the attributes: names and values are listed in matching order
    k = list(data.attrs.keys())
    v = list(data.attrs.values())
    print(k[0])
    print(v[0])
    # attributes can also be looked up by name
    print(data.attrs[k[0]])
    print(data.attrs[k[1]])
    print(type(data))

List of datasets in this file: 
 ['dataset1', 'dataset2']
Shape of dataset1: 
 (1000, 1000)
CLASS
DATA MATRIX
DATA MATRIX
[1 2]
<class 'h5py._hl.dataset.Dataset'>


## create hdf5 files with pandas

In [59]:
import pandas as pd

In [60]:
# Open the pandas HDF5 store in append mode ('a', the default of HDFStore):
# the file is created if it does not exist yet, otherwise it is opened for
# reading and writing.
pandas_store_file = repository_path + 'results/hdf5_pandas.h5'
hdf = pd.HDFStore(pandas_store_file, mode='a')

In [76]:
# Small frame with two string columns and one integer column, written to the
# open store under the key 'DF1' in 'table' format.
data1 = {
    'x param': list('ABCD'),
    'y param': ['off', 'on', 'off', 'off'],
    'z param': [1, 2, 3, 4],
}
df1 = pd.DataFrame(data1, columns=list(data1))
hdf.put('DF1', df1, format='table')

In [78]:
# Second example frame (city, rank and two score columns), written under the
# key 'DF2Key'; data_columns=True is passed in addition to the 'table' format.
data = {
    'city': [
        'Tripoli', 'Sydney', 'Tripoli',
        'Rome', 'Rome', 'Tripoli',
        'Rome', 'Sydney', 'Sydney',
    ],
    'rank': ['1st', '2nd', '1st', '2nd', '1st', '2nd', '1st', '2nd', '1st'],
    'score1': [44, 48, 39, 41, 38, 44, 34, 54, 61],
    'score2': [67, 63, 55, 70, 64, 77, 45, 66, 72],
}
df2 = pd.DataFrame(data, columns=list(data))

hdf.put('DF2Key', df2, format='table', data_columns=True)

In [79]:
# Close the store that was opened for writing; both frames have been put into
# it by the two preceding cells.
hdf.close()

## read hdf5 file with pandas

In [80]:
# Open the hdf5 file written above again, this time read-only (mode='r').
hdf = pd.HDFStore(repository_path + 'results/hdf5_pandas.h5', mode='r')

In [83]:
# List the keys of all objects in the store; each key is shown with a
# leading '/'.
hdf.keys()

['/DF1', '/DF2Key']

In [84]:
# Load the object stored under the key '/DF1' back into memory.
df1 = hdf.get('/DF1')

In [85]:
# Check what kind of object the store returned (a pandas DataFrame).
type(df1)

pandas.core.frame.DataFrame

In [86]:
# Preview the first rows; they match the frame that was stored under 'DF1'.
df1.head()

Unnamed: 0,x param,y param,z param
0,A,off,1
1,B,on,2
2,C,off,3
3,D,off,4


In [87]:
# Close the read-only store again; df1 was already loaded and stays available.
hdf.close()