### Creating Image Datasets

- Single Slice RGB with Windowing
- Multi Slice RGB with Windowing (Adjacent Slices - s-1, s, s+1)

In [1]:
from fastai.vision.all import *
from fastai.medical.imaging import *

In [2]:
# Let pandas display up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

In [3]:
# Root folder of the dataset. Cells below read `train.csv` and `train/` from it
# and write `sample_pids/`, `full_raw_512/` and `metadata/` underneath it.
# NOTE(review): the leading "/../" looks like a placeholder — point this at the real data root.
datapath = Path("/../rsna_data/")

In [4]:
# One-off: draw 100 random study folders and pickle a 20/80 valid/train split of their ids.
create_samples = False
if create_samples:
    sample_studies = np.random.choice((datapath/'train').ls(), 100, replace=False)
    # Kept so the random stream is consumed exactly as before; the draw above is already without replacement.
    sample_studies = np.random.permutation(sample_studies)
    valid_pids = [o.stem for o in sample_studies[:20]]
    train_pids = [o.stem for o in sample_studies[20:]]

    # Validate the split BEFORE anything is written, so a bad split never reaches disk.
    assert set(train_pids).isdisjoint(valid_pids)

    sample_pids_dir = datapath/'sample_pids'
    sample_pids_dir.mkdir(exist_ok=True)
    pd.to_pickle(valid_pids, sample_pids_dir/'valid_pids.pkl')
    pd.to_pickle(train_pids, sample_pids_dir/'train_pids.pkl')

In [5]:
# Training table; later cells read its 'StudyInstanceUID' and 'pe_present_on_image' columns.
train_df = pd.read_csv(datapath/'train.csv')

In [10]:
# Toggle: restrict later cells to the previously pickled sample of studies.
do_sample = False
if do_sample:
    sample_pids_dir = datapath/'sample_pids'
    valid_pids = pd.read_pickle(sample_pids_dir/'valid_pids.pkl')
    train_pids = pd.read_pickle(sample_pids_dir/'train_pids.pkl')

    trn_df = train_df[train_df['StudyInstanceUID'].isin(train_pids)]
    val_df = train_df[train_df['StudyInstanceUID'].isin(valid_pids)]

    # An expression nested inside `if` is never echoed by the notebook, so print
    # the per-split mean of 'pe_present_on_image' explicitly.
    print(trn_df['pe_present_on_image'].mean(), val_df['pe_present_on_image'].mean())

### Save image and metadata (RGB windows: lung, PE, mediastinal)

In [11]:
# RGB windows
# Each tuple is unpacked positionally into `dcm.windowed(*w)` by the readers below.
# NOTE(review): fastai's signature is `windowed(w, l)`, so these are presumably
# (width, level) pairs in Hounsfield units — confirm against the fastai docs.
lung_window = (1500, -600)
pe_window = (700, 100)
mediastinal_window = (400, 40)
# Order of this tuple fixes the channel order of the stacked image.
windows = (lung_window, pe_window, mediastinal_window)

In [12]:
def read_dcm_img(dcm, windows=windows):
    "Window one slice once per entry of `windows` and stack the results along a new first axis"
    channels = []
    for win in windows:
        channels.append(dcm.windowed(*win))
    return torch.stack(channels)

In [13]:
def save_img(t:Tensor, fn):
    "Scale `t` by 255, cast to bytes and save it to `fn` ('CMYK' when it has 4 channels, 'RGB' otherwise)"
    pixels = (t*255).byte()
    pil_mode = 'CMYK' if pixels.shape[0]==4 else 'RGB'
    # PIL wants channels last, the tensor is channels first
    hwc = pixels.permute(1,2,0).numpy()
    Image.fromarray(hwc, mode=pil_mode).save(fn, quality=90)

In [16]:
if do_sample:
    # Reload the pickled split so this cell can be run on its own.
    sample_pids_dir = datapath/'sample_pids'
    valid_pids = pd.read_pickle(sample_pids_dir/'valid_pids.pkl')
    train_pids = pd.read_pickle(sample_pids_dir/'train_pids.pkl')

    # An expression nested inside `if` is never echoed by the notebook, so print the count.
    print(len(train_pids+valid_pids))

In [17]:
# Output folders: one for the converted JPEG slices, one for per-study metadata CSVs.
# `exist_ok=True` replaces the separate exists() check (no gap between check and create).
save_dir = datapath/'full_raw_512'
save_dir.mkdir(exist_ok=True)
metadata_dir = datapath/'metadata'
metadata_dir.mkdir(exist_ok=True)

In [18]:
def save_imgs_and_metadata(study_dirname, size=None, save_metadf=False):
    """Convert every DICOM slice of one study to a JPEG and optionally save its metadata.

    The study's metadata table is sorted by the `ImagePositionPatient2` column,
    each slice is read with `read_dcm_img`, and the result is written to
    `save_dir/<study name>/<row index>_<dicom stem>.jpg`.

    study_dirname: `Path` of the study folder containing the DICOM files.
    size: when truthy, passed as `size=` to `F.interpolate` (bilinear) before saving.
    save_metadf: when true, also write the sorted table to `metadata_dir/<study name>.csv`.

    An exception whose message contains "GDCM" is reported with `print` and the
    study is skipped; any other exception propagates to the caller.
    """
    try:
        # get metadata, ordered by ImagePositionPatient2
        dcmfiles = get_dicom_files(study_dirname)
        dcm_metadf = (pd.DataFrame.from_dicoms(dcmfiles, window=pe_window)
                                  .sort_values(['ImagePositionPatient2'])
                                  .reset_index(drop=True))
        study_fnames = dcm_metadf['fname'].values

        # get ordered imgs
        dcm_ds = [Path(o).dcmread() for o in study_fnames]
        imgs = torch.stack([read_dcm_img(o) for o in dcm_ds])

        # resize
        if size:
            imgs = F.interpolate(imgs, size=size, mode='bilinear', align_corners=False)

        # save imgs in order (the filename prefix is the row index of the sorted table, not zero padded)
        outdir = save_dir/study_dirname.name
        outdir.mkdir(exist_ok=True)
        for i, (fn, t) in enumerate(zip(study_fnames, imgs)):
            save_img(t, outdir/f"{i}_{Path(fn).stem}.jpg")

        # save metadata
        if save_metadf:
            dcm_metadf.to_csv(metadata_dir/f"{study_dirname.name}.csv", index=False)

    except Exception as e:
        # install GDCM offline in inference kernel
        if "GDCM" in str(e): print(f"Possible GDCM issue skipping: {study_dirname}")
        # bare raise re-raises the active exception unchanged
        else: raise

In [22]:
# Studies to process: the pickled sample when `do_sample` is set, otherwise every study in train.csv.
study_ids = train_pids+valid_pids if do_sample else train_df['StudyInstanceUID'].unique()
study_dirnames = [datapath/'train'/o for o in study_ids]

In [23]:
# Number of study folders queued for conversion.
len(study_dirnames)

7279

In [24]:
# Trial run on the first study only, at original size, also writing its metadata CSV.
save_imgs_and_metadata(study_dirnames[0], size=None, save_metadf=True)

In [25]:
# Shape of the first file listed in the first study's output folder.
PILImage.create((save_dir/study_dirnames[0].name).ls()[0]).shape

(512, 512)

In [None]:
# Convert every study at original size; metadata CSVs are not written here (save_metadf=False).
for study_dirname in progress_bar(study_dirnames):
    save_imgs_and_metadata(study_dirname, size=None, save_metadf=False)

### Save Metadata

In [28]:
def save_metadata(study_dirname):
    """Write the DICOM metadata table of one study to `metadata_dir/<study name>.csv`.

    The table is built with `pd.DataFrame.from_dicoms` (using `pe_window`),
    sorted by the `ImagePositionPatient2` column and re-indexed before saving.
    No images are written.

    study_dirname: `Path` of the study folder containing the DICOM files.

    An exception whose message contains "GDCM" is reported with `print` and the
    study is skipped; any other exception propagates to the caller.
    """
    try:
        # get metadata
        dcmfiles = get_dicom_files(study_dirname)
        dcm_metadf = (pd.DataFrame.from_dicoms(dcmfiles, window=pe_window)
                                  .sort_values(['ImagePositionPatient2'])
                                  .reset_index(drop=True))

        dcm_metadf.to_csv(metadata_dir/f"{study_dirname.name}.csv", index=False)

    except Exception as e:
        # install GDCM offline in inference kernel
        if "GDCM" in str(e): print(f"Possible GDCM issue skipping: {study_dirname}")
        # bare raise re-raises the active exception unchanged
        else: raise

In [None]:
# Write one metadata CSV per study.
for study_dirname in progress_bar(study_dirnames):
    save_metadata(study_dirname)

### Multi Slice RGB 

In [23]:
# Three consecutive entries used as example input for `read_dcm_imgs` below.
# NOTE(review): `pdcms` is not defined in any visible cell — presumably a list of
# DICOM datasets for one study from an earlier session; define it before re-running.
dcms = pdcms[100:103]

def read_dcm_imgs(dcms, windows=windows, nchan=3, nslices=3):
    "Window every slice in `dcms` and stack all channels; short inputs are padded at the front with the first slice's channels"
    assert len(windows) == nchan
    expected = nslices*nchan
    chans = []
    for dcm in dcms:
        for w in windows:
            chans.append(dcm.windowed(*w))
    t = torch.stack(chans)
    if t.size(0) != expected:
        # number of whole slices missing; a non-positive count adds no padding
        n_missing = int((expected - t.size(0))/nchan)
        t = torch.cat([t[:nchan]]*n_missing + [t])
    assert t.size(0) == expected
    return t