In [3]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import h5py
import tifffile as tiff
from typing import Literal

sys.path.append(str(Path("..").resolve()))
from src import *

  from .autonotebook import tqdm as notebook_tqdm


# Convert the Dataset to HDF5
The authors of CNNT require the dataset to be structured in a specific way.
Given the [`Denoising.zip`](https://zenodo.org/records/4624364#.YF4lBa9Kgal) dataset, we need to process it in the following way.

- Convert each TIFF pair (raw ↔ ground-truth `_decon`) into a top-level HDF5 group containing two datasets named exactly `noisy_im` and `clean_im` (each of shape `(T, H, W)`, dtype `float32`).
- Put all those groups into one (or a few) `.h5` files and pass the path(s) via `--h5files`. The authors' loader then looks up `hfile[key + "/noisy_im"]` and `hfile[key + "/clean_im"]` for each group `key`.

In [4]:
def dataset2hdf5(dir: Path, subdirs: list[Literal["Training", "Test"]] = ["Training"], gt_suffix="_decon", demo=False):
    """Convert the raw/ground-truth TIFF pairs of one dataset into HDF5 files.

    Expecting the following hierarchy:

    <DATASET NAME>
        ├── Test
        │   ├── `GT`
        │   └── `Raw`
        ├── Training
        │   ├── `GT`
        │   └── `Raw`

    For every entry of ``subdirs`` one file ``<DATASET NAME>_<subdir>.h5`` is
    written to the current working directory. Each ``*.tif`` found in ``Raw``
    becomes a top-level group (named after the file stem) holding two
    float32, LZF-compressed datasets: ``noisy_im`` (the raw image) and
    ``clean_im`` (the matching ground truth).

    The ground-truth file name is derived from the raw stem by replacing
    ``5%`` -> ``80%``, ``C1-`` -> ``C2-`` and ``raw`` -> ``gt``, then appending
    ``gt_suffix`` and ``.tif``.

    The output is first written to ``<name>.h5.partial`` and only renamed to
    its final name once every pair was stored. A failed or interrupted run
    therefore never leaves a truncated ``.h5`` behind that later calls would
    mistake for a finished conversion.

    Args:
        dir: Dataset root directory; its last path component is used as the
            dataset name in the output file name.
        subdirs: Sub-directories of ``dir`` to convert. The default list is
            never mutated.
        gt_suffix: Suffix appended to the derived ground-truth stem.
        demo: If true, stop after the first TIFF pair of each sub-directory.
            NOTE: the demo file uses the same name as the full file, so it
            must be deleted before a full conversion, otherwise that
            sub-directory is skipped.

    Raises:
        FileNotFoundError: If the ground-truth TIFF for a raw image is missing.
    """
    name = dir.parts[-1]
    # For each subdir (Training, Test)
    for subdir in subdirs:
        raw_dir = dir / subdir / "Raw"
        gt_dir = dir / subdir / "GT"
        target = Path(f"{name}_{raw_dir.parts[-2]}.h5")
        if target.exists():
            # Already converted; delete the file to force a rebuild.
            continue

        # Write to a sibling temp file so that `target` only ever exists complete.
        partial = target.with_name(target.name + ".partial")
        try:
            with h5py.File(str(partial), "w") as h5f:
                # For each TIF file
                for raw in tqdm(sorted(raw_dir.glob("*.tif"))):
                    gt_stem = raw.stem.replace("5%", "80%").replace("C1-", "C2-").replace("raw", "gt")
                    gt = gt_dir / f"{gt_stem}{gt_suffix}.tif"
                    # Fail before touching the HDF5 file, with a message naming both files.
                    if not gt.is_file():
                        raise FileNotFoundError(f"Ground truth for {raw} not found: {gt}")
                    group = h5f.create_group(raw.stem)
                    for ds_name, tif_path in (("noisy_im", raw), ("clean_im", gt)):
                        group.create_dataset(
                            ds_name,
                            data=tiff.imread(tif_path).astype(np.float32),
                            compression="lzf",
                        )
                    if demo:
                        break
        except BaseException:
            # Also covers KeyboardInterrupt: drop the incomplete file, then re-raise.
            partial.unlink(missing_ok=True)
            raise
        # Publish the finished file under its final name.
        partial.replace(target)

In [5]:
# Demo run: convert only the first TIFF pair of the first dataset to
# sanity-check the pipeline before launching the full conversion.
dataset_dir = Path("dataset/Denoising/")
# Keep directories only: a stray file (README, .DS_Store, ...) is not a
# dataset and would otherwise be converted into an empty .h5 file.
dataset_subdirs = sorted(p for p in dataset_dir.glob("*") if p.is_dir())
for subdir in tqdm(dataset_subdirs):
    dataset2hdf5(subdir, demo=True)
    break

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/20 [01:47<?, ?it/s]
  0%|          | 0/14 [01:47<?, ?it/s]
