# Dataprep

### Objective

Crawls through the `raw_data` directory and converts the diffusion and FLAIR images into data arrays.

### Prerequisites

All diffusion and FLAIR images should be registered to each other and stored in the NIfTI file format.

### Data organisation

- All b0 diffusion should be named "patientid_hX_DWIb0.nii.gz" where "hX" corresponds to time delay and can be "h0" or "h1" (to stratify on delay)
- All b1000 diffusion should be named "patientid_hX_DWIb1000.nii.gz" where "hX" corresponds to time delay and can be "h0" or "h1" (to stratify on delay)
- All corresponding FLAIR sequences should be named: "patientid_hX_qX_FLAIR.nii.gz" where "qX" corresponds to quality and can be "q0" or "q1" or "q2" (to stratify on quality)
- Optionally, you can add a weighted mask "patientid_hX_MASK.nii.gz" with values 0 (background), 1 (brain mask) or 2 (stroke region) that will be used as training weights. If you don't provide it, a crude stroke segmentation with ADC < 600 will be used as the weighting map.

## Load modules

In [None]:
import os, glob, h5py
import numpy as np
from skimage.morphology import dilation, opening
from modules.niftitools import twoniftis2array, flairnifti2array, masknifti2array

## Crawl through files

In [None]:
# Build the list of usable exams from the files found in raw_data.
# An exam is kept only when a b0 volume has both a matching b1000 volume and a
# matching FLAIR volume for the same patient id and time point; the MASK file
# is optional and recorded as None when absent.
# glob returns paths in arbitrary order, so sort them to make the patient
# index order reproducible from one run (or machine) to the next.
dwifiles_precheck = sorted(glob.glob(os.path.join("raw_data", "*_DWIb0.nii.gz")))
patnames, timepoints, qualities, b0files, b1000files, flairfiles, maskfiles = [], [], [], [], [], [], []
num_patients = 0
for dwifile in dwifiles_precheck:
    # Split from the right: only the last two "_" separate the time point and
    # the sequence suffix, so patient ids that contain "_" are parsed correctly.
    name, timepoint, _ = os.path.basename(dwifile).rsplit("_", 2)
    timepoint = int(timepoint.replace("h",""))
    prefix = os.path.join("raw_data", name+"_h"+str(timepoint))
    matchesb1000 = glob.glob(prefix+"_DWIb1000.nii.gz")
    # Sorted so that the same FLAIR file is picked every time when several
    # quality variants exist for one exam.
    matchesFlair = sorted(glob.glob(prefix+"_q*_FLAIR.nii.gz"))
    if not (matchesFlair and matchesb1000):
        # Incomplete exam (missing b1000 or FLAIR): skip it.
        continue
    # FLAIR file name is "<patientid>_hX_qX_FLAIR.nii.gz": quality is the
    # second field counted from the right.
    quality = os.path.basename(matchesFlair[0]).rsplit("_", 2)[1]
    patnames.append(name)
    timepoints.append(timepoint)
    qualities.append(int(quality.replace("q","")))
    b0files.append(dwifile)
    b1000files.append(matchesb1000[0])
    flairfiles.append(matchesFlair[0])
    matchesMask = glob.glob(prefix+"_MASK.nii.gz")
    maskfiles.append(matchesMask[0] if matchesMask else None)
    num_patients += 1

## Create data arrays

In [None]:
# Size of the slice axis (4th dimension) of every output array; it is also
# passed to the NIfTI loading helpers for each volume.
z_slices = 25
# Directory that receives metadata.hdf5 and the data_*.dat memmap files.
outputdir = "data"

In [None]:
# Write the patient names and the array shapes to metadata.hdf5.
# The .dat files are raw memmaps with no header, so a reader needs these
# shapes to reopen them.
# Create the output directory first: opening a file for writing does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs(outputdir, exist_ok=True)
with h5py.File(os.path.join(outputdir,"metadata.hdf5"), "w") as metadata:
    # Names are stored as fixed-width byte strings (dtype "S").
    metadata.create_dataset("patientnames", data=np.array(patnames, dtype="S"))
    # Shapes below are the ones used when creating data_x / data_y / data_mask / data_meta.
    metadata.create_dataset("shape_x", data=(num_patients,256,256,z_slices,3))
    metadata.create_dataset("shape_y", data=(num_patients,256,256,z_slices,1))
    metadata.create_dataset("shape_mask", data=(num_patients,256,256,z_slices,1))
    metadata.create_dataset("shape_meta", data=(num_patients,2))

In [None]:
# Disk-backed output arrays, created (or overwritten) in outputdir:
#   data_x    - diffusion input, 3 channels per voxel
#   data_y    - FLAIR target, 1 channel per voxel
#   data_mask - uint8 weighting mask, 1 channel per voxel
#   data_meta - per patient: [FLAIR quality, time point]
fx = np.memmap(os.path.join(outputdir,"data_x.dat"), dtype="float32", mode="w+",
               shape=(num_patients,256,256,z_slices,3))
fy = np.memmap(os.path.join(outputdir,"data_y.dat"), dtype="float32", mode="w+",
               shape=(num_patients,256,256,z_slices,1))
fmask = np.memmap(os.path.join(outputdir,"data_mask.dat"), dtype="uint8", mode="w+",
               shape=(num_patients,256,256,z_slices,1))
fmeta = np.memmap(os.path.join(outputdir,"data_meta.dat"), dtype="float32", mode="w+",
               shape=(num_patients,2))

if num_patients > 0:
    print("Imported following patients:", end=" ")

for i in range(num_patients):
    if i>0:
        print(", ",end="")
    fmeta[i,0] = qualities[i]
    fmeta[i,1] = timepoints[i]
    Xdata, mask, _ = twoniftis2array(b0files[i], b1000files[i],z_slices)
    # Move the first axis to the end so the channels come last, as in fx.
    Xdata = Xdata.transpose(1,2,3,0)
    fx[i] = Xdata
    if maskfiles[i] is not None:
        # A user-provided weighting mask takes precedence.
        fmask[i] = masknifti2array(maskfiles[i],z_slices)[...,np.newaxis]
    else:
        # Fallback: crude stroke segmentation = voxels inside the mask whose
        # channel 2 is below 600 (presumably the ADC map - TODO confirm against
        # twoniftis2array), cleaned by an opening then grown by three dilations.
        crudemask = dilation(dilation(dilation(opening(np.logical_and(mask, Xdata[...,2]<600)))))
        # Adding the mask itself yields 2 where both are set (stroke within the
        # mask) and 1 where only one is set; assumes `mask` is binary.
        crudemask = crudemask.astype("uint8") + mask.astype("uint8")
        fmask[i] = crudemask[...,np.newaxis]
    fy[i] = flairnifti2array(flairfiles[i],mask,z_slices)[...,np.newaxis]
    # Report the patient just processed (not the stale `name` loop variable
    # left over from the file crawl, which is the same for every iteration).
    print(patnames[i], end="")

# Dropping the memmap objects flushes pending changes to disk.
del fx, fy, fmask, fmeta