# h5 to webdataset converter
#### This notebook converts HDF5 (`.h5`) files into WebDataset (`.tar`) shards for convenient use in PyTorch data loaders.
---

In [81]:
import webdataset as wds
import h5py
import os
import numpy as np
import itertools

# --- Configuration ----------------------------------------------------------
# True: stop at the shortest column, so every sample has a value for every
# column. False: pad shorter columns and leave the missing entries out of the
# affected samples.
OMIT_INCOMPLETE_ROWS = True
INPUT_FILEPATH = "example_data/sub-01_VC.h5"
OUTPUT_DIR = "shards"


def _extension_for(item):
    """Return the sample-key extension under which *item* is stored."""
    # Text goes under ".txt". bytes are included because h5py returns string
    # data as bytes and webdataset writes bytes payloads verbatim, so they
    # must not be labelled as pickles.
    # NOTE(review): assumes any bytes value is UTF-8 text - confirm for your data.
    if isinstance(item, (str, bytes)):
        return '.txt'
    # Everything else (numpy arrays, numpy scalars, ...) is pickled.
    return '.pyd'


os.makedirs(OUTPUT_DIR, exist_ok=True)
pattern = os.path.join(OUTPUT_DIR, "ds_%06d.tar")

# Keep the HDF5 file open only for the duration of the conversion; the
# context manager closes it even if writing fails part-way.
with h5py.File(INPUT_FILEPATH, 'r') as ds:
    # Each top-level key is treated as one column of the output samples.
    column_names = list(ds.keys())
    data_columns = [ds[name] for name in column_names]

    # Walk the columns in lockstep, lazily, so the whole file is never held
    # in memory at once.
    if OMIT_INCOMPLETE_ROWS:
        rows = zip(*data_columns)
    else:
        rows = itertools.zip_longest(*data_columns, fillvalue=None)

    with wds.ShardWriter(pattern, maxsize=int(1e9), maxcount=int(100000)) as sink:
        for i, row in enumerate(rows):
            sample = {"__key__": "%09d" % i}

            for column_name, item in zip(column_names, row):
                # None only appears as zip_longest padding: the column has
                # no value for this row, so it is left out of the sample.
                if item is None:
                    continue
                sample[column_name + _extension_for(item)] = item

            sink.write(sample)

# writing shards/ds_000000.tar 0 0.0 GB 0
