LBM Dataset Exploration

In [6]:
import glob
from pathlib import Path
from pprint import pprint

import dask.array as da
import h5py
import tifffile
from icecream import ic
%matplotlib inline

In [7]:
# Toggle icecream (`ic`) debug output for this notebook.
# To silence it, call ic.disable() instead of ic.enable():
# ic.disable()
ic.enable()

def extract_metadata(filename):
    """Summarise the ScanImage metadata of a single .tif file.

    The header is parsed with ``tifffile.read_scanimage_metadata`` and the
    pixel data is wrapped in a dask array built from tifffile's zarr store
    (``imread(..., aszarr=True)``).

    Parameters
    ----------
    filename : str or path-like
        Path of the .tif file to inspect.

    Returns
    -------
    dict
        ``fname`` and ``arr`` (the dask array), the ``centerXY`` / ``sizeXY`` /
        ``pixelResolutionXY`` values of the first ROI's scanfields, a handful
        of ``SI.*`` acquisition settings, and the size (in units of 10**9
        bytes) and shape of ``arr``.

    Notes
    -----
    Only the first entry of the imaging ROI group is reported.
    Assumes that entry's ``scanfields`` is a single mapping -- TODO confirm
    for ROIs that define several scanfields.
    """
    with open(filename, 'rb') as tif_handle:
        parsed = tifffile.read_scanimage_metadata(tif_handle)

    # Index instead of unpacking so the length of the returned tuple is irrelevant.
    si_settings = parsed[0]
    roi_entries = parsed[1]['RoiGroups']['imagingRoiGroup']['rois']
    scanfields = [roi['scanfields'] for roi in roi_entries]

    image_stack = da.from_zarr(tifffile.imread(filename, aszarr=True))
    first_field = scanfields[0]

    # Built key by key, in the same order as the reported summary.
    summary = {'fname': filename, 'arr': image_stack}
    summary['center_xy'] = first_field['centerXY']
    summary['size_xy'] = first_field['sizeXY']
    summary['pixel_resolution_xy'] = first_field['pixelResolutionXY']
    summary['num_frames'] = si_settings['SI.hStackManager.framesPerSlice']
    summary['num_planes'] = len(si_settings['SI.hChannels.channelsActive'])
    summary['frame_rate'] = si_settings['SI.hRoiManager.scanVolumeRate']
    summary['objective_resolution'] = si_settings['SI.objectiveResolution']
    summary['lines_per_frame'] = si_settings['SI.hRoiManager.linesPerFrame']
    summary['px_per_line'] = si_settings['SI.hRoiManager.pixelsPerLine']
    summary['file_size_gb'] = image_stack.nbytes / 1_000_000_000
    summary['array_dims'] = image_stack.shape
    return summary
    
def get_slice_coordinates(arr_shape, square_size=(142, 142)):
    """Return the corner coordinates of a window centred in a 2D array.

    Parameters
    ----------
    arr_shape : tuple of int
        ``(rows, cols)`` of the array to crop.
    square_size : tuple of int, optional
        ``(height, width)`` of the window. Defaults to ``(142, 142)``.

    Returns
    -------
    tuple
        ``((start_row, start_col), (end_row, end_col))``, intended for
        ``arr[start_row:end_row, start_col:end_col]``.

    Notes
    -----
    If the window is larger than the array along an axis, the coordinates for
    that axis are clamped to ``[0, size]`` so the whole axis is selected.
    Without the clamp the start index would be negative, and using it in a
    slice would count from the end of the axis and return the wrong region.
    """
    rows, cols = arr_shape
    height, width = square_size[0], square_size[1]

    # Centre the window, never starting before index 0 ...
    start_row = max((rows - height) // 2, 0)
    start_col = max((cols - width) // 2, 0)
    # ... and never ending past the last row/column.
    end_row = min(start_row + height, rows)
    end_col = min(start_col + width, cols)

    return (start_row, start_col), (end_row, end_col)


### ScanImage Metadata can lead to erroneous data loaded in memory 

`ScanImage` acquisition software allows users to split the frames of a recording across multiple `.tif` files.
This is annoying because the metadata and the physical state of the data become out of sync: one file may contain only 1/10th of the frames, but its metadata will still report the total number of frames in the dataset.

To demonstrate this, let's take a few multi-file datasets and pull only one of the files to analyze.

In [8]:
# Directory holding one .tif from each of several multi-file datasets.
sandbox_filepath = "/data2/fpo/lbm/sandbox/"

# Non-recursive listing of the .tif files directly inside the directory.
# (`recursive=True` was dropped: it only affects patterns containing `**`.)
sandbox_files = sorted(glob.glob(str(Path(sandbox_filepath) / '*.tif')))

# Only an empty listing means "no files"; a single match is a valid result.
if not sandbox_files:
    print(f"No files found in {sandbox_filepath}")
else:
    print([Path(x).name for x in sandbox_files])

['MH70_0p9mm_FOV_50_550um_depth_som_stim_199mW_3min_M1_00001_00001.tif', 'MH70_2mm_FOV_50_550um_depth_som_stim_199mW_3min_M1_00001_00001.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00001.tif']


## Extract metadata without loading the image into memory

In [5]:
# Fail with an explicit message instead of an IndexError when nothing was found.
if not sandbox_files:
    raise FileNotFoundError(f"No .tif files found in {sandbox_filepath}")

# Parse each file exactly once; the first file's summary is kept as
# `sandbox_metadata` rather than being extracted a second time.
sandbox_metadata = None
for i, filename in enumerate(sandbox_files):
    file_metadata = extract_metadata(filename)
    if i == 0:
        sandbox_metadata = file_metadata
    print(f'File {i+1}')
    pprint(file_metadata)
    print(' ')

NameError: name 'sandbox_files' is not defined

## Create a sample dataset

We need all of the files that were saved with ScanImage in the directory, for the reasons described above.
We can extract metadata from a single file and use it to slice the whole dataset.

In [4]:
# We should have 10 files, with 844 frames each, totaling 8440 frames as shown in the metadata for this file
path_sample_files = "/data2/fpo/lbm/3mm_5mm/"

# Non-recursive listing of the .tif files directly inside the directory.
# (`recursive=True` was dropped: it only affects patterns containing `**`.)
sample_files = sorted(glob.glob(str(Path(path_sample_files) / '*.tif')))

# Stop here with a clear error when the listing is empty; continuing would
# only fail on `sample_files[0]` below. A single file is a valid result.
if not sample_files:
    raise FileNotFoundError(f"No files found in {path_sample_files}")
print([Path(x).name for x in sample_files])

# Metadata of the first file only.
sample_metadata = extract_metadata(sample_files[0])

['mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00001.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00002.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00003.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00004.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00005.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00006.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00007.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00008.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00009.tif', 'mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_00001_00010.tif']


In [7]:
# Read the pixel data of the first sample file.
# NOTE(review): unlike extract_metadata, this call does not pass `aszarr=True`,
# so presumably the whole stack is loaded into memory -- confirm it fits in RAM.
data = tifffile.imread(sample_files[0])

<tifffile.TiffFile 'mh89_hemisphere…_00001_00001.tif'> asarray failed to reshape (25320, 5104, 145) to (8440, 30, 5104, 145)


In [8]:
# Crop the same centred window (default size of get_slice_coordinates)
# out of every entry along the first axis.
n_rows, n_cols = data.shape[1], data.shape[2]
start, end = get_slice_coordinates((n_rows, n_cols))
row_window = slice(start[0], end[0])
col_window = slice(start[1], end[1])
my_arr = data[:, row_window, col_window]
print(my_arr.shape)

(25320, 142, 142)


In [9]:
# Write .tif as h5, with metadata as attributes
file = Path(sample_files[0])
new_name = file.name[:-16] + f"_{my_arr.shape[0]}_{my_arr.shape[1]}_{my_arr.shape[2]}"
new_file = file.parent / Path(new_name).with_suffix('.h5')
new_file

PosixPath('/data2/fpo/lbm/3mm_5mm/mh89_hemisphere_FOV_50_550um_depth_250mW_dual_stimuli_30min_25320_142_142.h5')

In [10]:
# Store the cropped array as dataset "mov" and attach the metadata dict,
# converted to its string form, as the file-level attribute "metadata".
with h5py.File(new_file, 'w') as h5_out:
    h5_out.create_dataset("mov", data=my_arr)
    h5_out.attrs.create(name="metadata", data=str(sample_metadata))

## Create sample dataset from processed .mat files

The LBM pipeline doesn't work non-sequentially, meaning registration won't work unless the required variables `volume_size`, `volume_rate`, and `mean_image` have been extracted and stored as datasets in the .mat file. Here, we start from the .mat file.

In [11]:
# Sort the matches so that `matfile[0]` is the same file on every run;
# the order returned by glob is not guaranteed.
matfile = sorted(Path('/data2/fpo/lbm/3mm_5mm/TMP/').glob('*.mat'))

# Fail with a readable message (not an IndexError) when nothing matched.
assert matfile, "No .mat files found in /data2/fpo/lbm/3mm_5mm/TMP/"
assert matfile[0].is_file()

In [12]:
# Open the first .mat file through h5py, explicitly read-only so the file on
# disk cannot be modified regardless of the installed h5py's default mode.
# NOTE(review): assumes the .mat file is HDF5-based -- confirm.
# The handle stays open for interactive inspection; call fh.close() when done.
fh = h5py.File(matfile[0], 'r')

In [13]:
# Sort the matches so that `h5path[0]` is the same file on every run.
h5path = sorted(Path('/data2/fpo/lbm/sandbox/').glob('*.h5'))
assert h5path, "No .h5 files found in /data2/fpo/lbm/sandbox/"

# Open read-only and index the 'data' dataset directly, so `data` is only ever
# bound to the selected 2-D slice and never to the dataset handle, which
# becomes unusable once the file is closed.
with h5py.File(h5path[0], 'r') as f:
    data = f['data'][1, 1, :, :]

In [20]:
# Total number of elements in the 2-D slice selected above.
data.size

740080