In [1]:
import h5py
import torch
import random
from pathlib import Path
from typing import List
from torch.utils.data import DataLoader, Dataset

In [2]:
# Generate some random data.
# Dedicated, seeded generators make the written file identical on every
# "Restart & Run All" without modifying the global `random`/`torch` RNG state.
DATA_SEED = 0
shape_rng = random.Random(DATA_SEED)                  # draws the row count N of each tensor
value_rng = torch.Generator().manual_seed(DATA_SEED)  # draws the tensor values
random_tensors = {
    f'{i}': torch.rand((shape_rng.randint(10, 30), 10), generator=value_rng)
    for i in range(20)
}

# Save those data to some preferred file format:
# one HDF5 dataset per tensor, keyed by the tensor's string index.
with h5py.File('random.h5', 'w') as hf:
    for idx, random_ten in random_tensors.items():
        hf.create_dataset(idx, data=random_ten.numpy())

In [3]:
# Showing content of an h5 file
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so all dependencies are visible in one place.
import nexusformat.nexus as nx

# Load the file written by the previous cell and print its tree of datasets
f = nx.nxload('random.h5')
print(f.tree)
# Close the loaded file again once the tree has been printed
f.close()
# random.h5 contains tensors of shape Nx10 where N is variable (10 to 30 rows)

root:NXroot
  0 = float32(12x10)
  1 = float32(17x10)
  2 = float32(21x10)
  3 = float32(27x10)
  4 = float32(11x10)
  5 = float32(10x10)
  6 = float32(17x10)
  7 = float32(30x10)
  8 = float32(22x10)
  9 = float32(30x10)
  10 = float32(10x10)
  11 = float32(25x10)
  12 = float32(11x10)
  13 = float32(28x10)
  14 = float32(27x10)
  15 = float32(10x10)
  16 = float32(27x10)
  17 = float32(27x10)
  18 = float32(22x10)
  19 = float32(18x10)


In [4]:
# Define a dataset tailored to the data that should be used
class FancyDataset(Dataset):
    """Map-style dataset over the top-level datasets of an HDF5 file.

    Item ``idx`` is the 2-D array stored under the ``idx``-th key of the file,
    reduced to a 1-D tensor by :meth:`fancy_func`.

    The HDF5 handle is opened lazily (first access to ``data``) instead of in
    ``__init__`` and is dropped when the object is pickled, so every process
    that uses the dataset opens its own handle. The handle can be released
    explicitly with :meth:`close` or by using the dataset as a context manager.
    """

    def __init__(self, h5_path: Path):
        """Read the dataset keys of ``h5_path``; the data itself is read on demand."""
        self.h5_path = h5_path
        # Opened on demand by the `data` property
        self._h5_file = None

        # use as "index map": integer position -> dataset key.
        # The keys are read once and the file is closed again right away.
        with h5py.File(h5_path, 'r') as hf:
            self.ids_list = list(hf.keys())

        # some additional stuff
        self.softmax = torch.nn.Softmax(dim=-1)

    @property
    def data(self):
        """The read-only HDF5 file handle, opened on first use."""
        if getattr(self, '_h5_file', None) is None:
            self._h5_file = h5py.File(self.h5_path, 'r')
        return self._h5_file

    def close(self):
        """Close the HDF5 handle if it is open; a later access reopens it."""
        handle = getattr(self, '_h5_file', None)
        if handle is not None:
            handle.close()
        self._h5_file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        # Best effort only: a finalizer must never raise
        try:
            self.close()
        except Exception:
            pass

    def __getstate__(self):
        # Never ship an open file handle to another process; it is reopened lazily there
        state = self.__dict__.copy()
        state['_h5_file'] = None
        return state

    # return the number of elements in the dataset
    def __len__(self):
        return len(self.ids_list)

    # return item at specific index
    def __getitem__(self, idx: int):
        identifier = self.ids_list[idx]
        # [:, :] reads the whole 2-D dataset into memory as a numpy array
        idx_element = torch.from_numpy(self.data[identifier][:, :])
        return self.fancy_func(idx_element)

    # some function that does something
    def fancy_func(self, input: torch.Tensor):
        """Softmax over the last axis, then average over the first axis."""
        return self.softmax(input).mean(dim=0)

# Other types:
#   IterableDataset, TensorDataset, ConcatDataset, ...

In [5]:
# Wrap FancyDataset around the data:
# build the dataset on top of the HDF5 file 'random.h5'
fancy = FancyDataset(Path('random.h5'))
# Bare expression so the notebook displays the object's repr as the cell output
fancy

<__main__.FancyDataset at 0x7f934b90fd30>

In [6]:
# Defines some collate function that is useful
def collate(data: List[torch.Tensor]) -> torch.Tensor:
    """Combine the per-sample tensors of one batch into a single tensor.

    Args:
        data: The tensors of one batch, one per sample, all of the same shape.

    Returns:
        A tensor with a new leading axis that indexes the samples.
    """
    batch = torch.stack(data)  # the stacking axis defaults to 0
    return batch

# In this case, the default_collate function is also able to do this

In [7]:
# Create dataloader
# Seed torch's global RNG first so that the shuffled batch order is repeatable
torch.manual_seed(42)
# Batches of 5 samples in shuffled order, assembled by the custom collate function
dataloader = DataLoader(fancy, batch_size=5, shuffle=True, collate_fn=collate)
# Alternative that relies on the default collate function instead:
# dataloader = DataLoader(fancy, batch_size=5, shuffle=True)

# DataLoader with default Sampler = index sampler with integral indices
# Custom Samplers = possible to use non-integral indices/keys
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f934b6c6760>

In [8]:
# Use actual dataloader: run one full pass and keep every batch it yields
data = list(dataloader)
data

[tensor([[0.0932, 0.1033, 0.1071, 0.0990, 0.1023, 0.0992, 0.0981, 0.1008, 0.0999,
          0.0971],
         [0.0829, 0.0938, 0.1125, 0.0990, 0.1056, 0.1138, 0.1011, 0.1069, 0.1020,
          0.0826],
         [0.0908, 0.0974, 0.0997, 0.0955, 0.0976, 0.0993, 0.0925, 0.1101, 0.1112,
          0.1059],
         [0.0987, 0.1015, 0.0978, 0.1036, 0.0992, 0.1046, 0.0967, 0.0951, 0.1007,
          0.1022],
         [0.0972, 0.1101, 0.0926, 0.0970, 0.1006, 0.1001, 0.0986, 0.0950, 0.0971,
          0.1119]]),
 tensor([[0.0999, 0.0923, 0.0871, 0.1128, 0.1139, 0.0957, 0.1002, 0.1093, 0.0803,
          0.1085],
         [0.1015, 0.0959, 0.0941, 0.0987, 0.1070, 0.1058, 0.1002, 0.0978, 0.1004,
          0.0987],
         [0.0960, 0.0978, 0.0994, 0.1063, 0.1015, 0.0920, 0.0971, 0.0933, 0.1075,
          0.1091],
         [0.1015, 0.0963, 0.0941, 0.0924, 0.0978, 0.1026, 0.1058, 0.1055, 0.1005,
          0.1034],
         [0.1057, 0.0996, 0.0968, 0.1034, 0.0953, 0.0988, 0.1087, 0.0864, 0.0902,
       

In [9]:
# Recreate output of actual dataloader with manual use of collate and batch forming.
# The index batches below only apply to the first full iteration over the dataloader.
indices = [[8, 14, 17, 19, 1], [15, 18, 9, 3, 11], [4, 0, 10, 7, 13], [16, 12, 6, 2, 5]]


# Helper functions
def batch_fancy(batch):
    """Get the items of one batch (a list of indices) from the dataset `fancy`."""
    return [fancy[x] for x in batch]


def fancy_indices(ind):
    """Get the items of multiple batches (a list of index lists) from `fancy`."""
    return [batch_fancy(single_batch) for single_batch in ind]


def collate_fancy(fan_list):
    """Apply `collate` to each batch of items, giving one result per batch."""
    return [collate(fancy_batches) for fancy_batches in fan_list]


handcraft = collate_fancy(fancy_indices(indices))
handcraft

[tensor([[0.0932, 0.1033, 0.1071, 0.0990, 0.1023, 0.0992, 0.0981, 0.1008, 0.0999,
          0.0971],
         [0.0829, 0.0938, 0.1125, 0.0990, 0.1056, 0.1138, 0.1011, 0.1069, 0.1020,
          0.0826],
         [0.0908, 0.0974, 0.0997, 0.0955, 0.0976, 0.0993, 0.0925, 0.1101, 0.1112,
          0.1059],
         [0.0987, 0.1015, 0.0978, 0.1036, 0.0992, 0.1046, 0.0967, 0.0951, 0.1007,
          0.1022],
         [0.0972, 0.1101, 0.0926, 0.0970, 0.1006, 0.1001, 0.0986, 0.0950, 0.0971,
          0.1119]]),
 tensor([[0.0999, 0.0923, 0.0871, 0.1128, 0.1139, 0.0957, 0.1002, 0.1093, 0.0803,
          0.1085],
         [0.1015, 0.0959, 0.0941, 0.0987, 0.1070, 0.1058, 0.1002, 0.0978, 0.1004,
          0.0987],
         [0.0960, 0.0978, 0.0994, 0.1063, 0.1015, 0.0920, 0.0971, 0.0933, 0.1075,
          0.1091],
         [0.1015, 0.0963, 0.0941, 0.0924, 0.0978, 0.1026, 0.1058, 0.1055, 0.1005,
          0.1034],
         [0.1057, 0.0996, 0.0968, 0.1034, 0.0953, 0.0988, 0.1087, 0.0864, 0.0902,
       

In [10]:
# Check if manual dataloader equals automatic creation of dataloader.
# zip() stops at the shorter input, so first make sure that no batch would be
# silently left out of the comparison.
assert len(data) == len(handcraft), (
    f'batch count differs: dataloader={len(data)}, handcraft={len(handcraft)}'
)
for idx, (dataloader_sample, handcraft_sample) in enumerate(zip(data, handcraft)):
    print(f'batches #{idx} identical: {torch.allclose(dataloader_sample, handcraft_sample)}')

batches #0 identical: True
batches #1 identical: True
batches #2 identical: True
batches #3 identical: True
