## Benchmarking LBM Datasets

In [17]:
import os
import sys
import glob
import json
import time
from datetime import datetime
from dateutil import tz
from pathlib import Path
import h5py

import matplotlib.pyplot as plt
import numpy as np
import h5py
import dask.array as da
import tifffile
import cv2
from icecream import ic
from humansize import naturalsize
%matplotlib inline

ModuleNotFoundError: No module named 'humansize'

In [None]:
import sys
from humanize import naturalsize

# List every NumPy array currently bound in the notebook namespace, smallest
# first, with a human-readable size.
# - NumPy is imported as `np` in this notebook, so the type check must use
#   `np.ndarray`; the bare name `numpy` is never bound and raises NameError.
# - The namespace is copied with list(...) before filtering so that names bound
#   while the report runs cannot invalidate the iteration.
array_sizes = sorted(
    (value.nbytes, name)
    for name, value in list(globals().items())
    if isinstance(value, np.ndarray)
)
for size, name in array_sizes:
    print("{:>30}: {:>8}".format(name, naturalsize(size)))

In [5]:
# Toggle icecream (`ic`) debug output for the cells that follow.
# Output is switched on here; to silence it, swap which of the two calls
# below is commented out.
#ic.disable()
ic.enable()

### Data

`ScanImage` acquisition software allows users to specify frames to be split between multiple `.tif` files.
This is annoying because the metadata and physical state of the data become out-of-sync. One file may contain only 1/10th of the frames, but the metadata will indicate the total frames in the dataset. 

To demonstrate this, let's take a few multi-file datasets and pull only one of the files to analyze. 

In [7]:
# Directory that holds the multi-file acquisition.
path_input_file = "/data2/fpo/lbm/3mm_5mm"

# Gather the .tif files that sit directly in that directory, in sorted order.
# (`recursive=True` has no effect here because the pattern contains no `**`.)
files = glob.glob(f"{path_input_file}/*.tif", recursive=True)
files.sort()

# Note how each file name carries its own metadata, i.e. .9mmx.9mm vs 3mm x 5mm.
# Display only the base names so the listing stays readable.
[Path(tif_path).name for tif_path in files]

['mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00001.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00002.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00003.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00004.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00005.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00006.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00007.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00008.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00009.tif',
 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00010.tif']

### Extract metadata without loading the image into memory

In [None]:
#%% Read
# Print a summary of the ScanImage metadata stored in each file, without
# loading any pixel data into memory.
#
# The names assigned inside the loop (`file`, `metadata`, `num_planes`,
# `r_arr`, ...) stay bound to the LAST file once the loop ends.
for file in files:

    print(f"--- Processing: {file}")

    # The file handle is only needed while the metadata is parsed.
    with open(file, 'rb') as fh:
        metadata = tifffile.read_scanimage_metadata(fh)
    static_metadata = metadata[0]
    frame_metadata = metadata[1]['RoiGroups']['imagingRoiGroup']['rois']

    # NOTE(review): a group with a single ROI appears to be stored as one dict
    # rather than a one-element list -- confirm against a single-ROI file.
    # Normalising to a list keeps the comprehension below from iterating over
    # the dict's keys.
    if isinstance(frame_metadata, dict):
        frame_metadata = [frame_metadata]
    rois = [x['scanfields'] for x in frame_metadata]

    # Only the first ROI's scanfield is reported.
    centerXY = rois[0]['centerXY']
    print(f"CenterXY: {(centerXY[0], centerXY[1])}")
    sizeXY = rois[0]['sizeXY']
    print(f"sizeXY: {(sizeXY[0], sizeXY[1])}")
    pixel_resolution_xy = rois[0]['pixelResolutionXY']

    # "Alleged": this count comes from the metadata, not from counting the
    # pages physically present in this file.
    num_frames = static_metadata['SI.hStackManager.framesPerSlice']
    print(f"Alleged Frames: {num_frames}")

    # One plane per active channel. A lone active channel may be stored as a
    # bare number (presumably -- verify), which has no len().
    channels_active = static_metadata['SI.hChannels.channelsActive']
    try:
        num_planes = len(channels_active)
    except TypeError:
        num_planes = 1
    print(f"Planes: {num_planes}")

    frame_rate = static_metadata['SI.hRoiManager.scanVolumeRate']  # scanVolumeRate/scanFrameRate are the same now, but may not always be
    print(f"Volume rate: {frame_rate} Hz")

    # ScanImage reports objective resolution in microns per degree of scan
    # angle, so the label is um/deg (the previous "deg/um" was inverted).
    objective_resolution = static_metadata['SI.objectiveResolution']  # 157.5
    print(f"Objective resolution: {objective_resolution} um/deg")

    # Explore:
    lines_per_frame = static_metadata['SI.hRoiManager.linesPerFrame']
    print(f'{lines_per_frame} lines/frame')
    px_per_line = static_metadata['SI.hRoiManager.pixelsPerLine']
    print(f'{px_per_line} pixels/line')

    # Lazy view of the image data: shape and nbytes are known without reading
    # pixels. The zarr store is deliberately left open because `r_arr` is
    # computed in a later cell.
    r_arr = da.from_zarr(tifffile.imread(file, aszarr=True))
    print(f"Alleged filesize: {r_arr.nbytes / 10**9} GB")
    print(f"Alleged array dims: {r_arr.shape}")
    print(" ")
    

The cell below will fail because the stored metadata accounts for all 10 files, but only 1 file is in the data path. 

Because `tifffile` derives the expected array shape from the ScanImage metadata (the recorded number of frames), it tries to iterate through more frames than are actually in the file.

In [None]:
# Force the lazy dask array built in the metadata cell to actually load its
# data. The accompanying text expects this to fail, so an IndexError is
# reported with a message instead of being allowed to stop the notebook.
try:
    print(r_arr.shape)
    array_loaded = r_arr.compute()
except IndexError:
    print("Index error! Oops!")

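Instead, let's read the file directly with `tifffile.imread`. When the shape implied by the "ScanImage" metadata does not fit the pages actually in the file, it warns (see the output below) and returns the raw page stack, which we then reshape by hand. 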

In [9]:
# Read the pages of the last file inspected above as one stack.
data = tifffile.imread(file)

# Split axis 0 into (shape[0] // num_planes, num_planes), then exchange axes
# 1 and 3 so the plane axis ends up last.
data = data.reshape(
    data.shape[0] // num_planes,
    num_planes,
    data.shape[1],
    data.shape[2],
).swapaxes(1, 3)

<tifffile.TiffFile 'mh89_hemisphere…_00001_00010.tif'> asarray failed to reshape (25320, 5104, 145) to (8440, 30, 5104, 145)


In [14]:
# Build the output .h5 path next to the source .tif, with the first three
# array dimensions encoded in the file name.
file = Path(file)

# Drop the two trailing "_<counter>" fields of the stem (e.g. "_00001_00010").
# Splitting on "_" does not depend on how many digits the counters have,
# unlike slicing a fixed 16 characters off the end of the name.
base_name = file.stem.rsplit('_', 2)[0]
new_name = f"{base_name}_{data.shape[0]}_{data.shape[1]}_{data.shape[2]}"

# Append the extension by hand. Path.with_suffix() treats everything after
# the last "." as the old suffix, so a name containing a dot (for example
# ".9mmx.9mm") would be silently truncated.
new_file = file.parent / f"{new_name}.h5"
new_file

PosixPath('/data2/fpo/lbm/3mm_5mm/mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_844_145_5104.h5')

(844, 145, 5104, 30)

In [None]:
# Write .tif as h5, with metadata as attributes, and report how long it took.
start = time.time()  # the timer must start before the write it measures

with h5py.File(new_file, 'w') as f:
    # The image stack goes into the "mov" dataset.
    f.create_dataset("mov", data=data)
    # The metadata is stored as the string form of the parsed object.
    f.attrs.create("metadata", str(metadata))

end = time.time()
duration = f"{end-start:.2f}"
print(f"{duration} seconds")