In [62]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
import os, sys, time, copy

import cv2
from PIL import Image
import matplotlib.pyplot as plt

In [63]:
def rle_encode(mask):
    pixels = mask.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    changes = np.where(pixels[1:] != pixels[:-1])[0]
    starts = changes[::2]
    lengths = changes[1::2] - starts
    return list(zip(starts, lengths))


def rle_decode(rle, shape):
    mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, length in rle:
        mask[start:start + length] = 1
    return mask.reshape(shape)

In [64]:
def get_crop_box(mask):
    rows = np.any(mask, axis=1)
    cols = np.any(mask, axis=0)

    y1, y2 = np.where(rows)[0][[0, -1]]
    x1, x2 = np.where(cols)[0][[0, -1]]

    return x1, x2, y1, y2

In [65]:
segmentations = pd.read_csv('./data/train_segmentations.csv')
segmentations

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8
0,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
1,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
2,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
3,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
4,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...
711596,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42870, 4), (43125, 8), (43379, 12), (43633, ..."
711597,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42871, 2), (43124, 9), (43379, 11), (43633, ..."
711598,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43125, 7), (43379, 11), (43633, 15), (43888,..."
711599,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43891, 3), (44146, 3), (44400, 4), (44654, 6..."


In [77]:
segmentations['folder'] = segmentations.image_path.apply(lambda x: x.split('/')[-2])
segmentations

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder
0,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
1,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
2,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
3,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
4,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
...,...,...,...,...,...,...,...,...,...,...
711596,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42870, 4), (43125, 8), (43379, 12), (43633, ...",1.2.826.0.1.3680043.23957
711597,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42871, 2), (43124, 9), (43379, 11), (43633, ...",1.2.826.0.1.3680043.23957
711598,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43125, 7), (43379, 11), (43633, 15), (43888,...",1.2.826.0.1.3680043.23957
711599,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43891, 3), (44146, 3), (44400, 4), (44654, 6...",1.2.826.0.1.3680043.23957


In [78]:
OUTPUT_FOLDER = './data/train_data_XYZcrops/'

DAT = {col: [] for col in ['folder', 'idxs'] + ["C", 'sz'] + ['x1', 'x2', 'y1', 'y2']}

for gri, grd in tqdm(segmentations.groupby('folder')):
    os.makedirs(f"{OUTPUT_FOLDER}/{gri}/", exist_ok=1)
    
    mask_volume = []
    for i, row in grd.iterrows():
        mask = []
        for c in range(1,9):
            rle = eval(row[f"rle_c{c}"])
            mask.append(rle_decode(rle, (256, 256)))
        mask = np.stack(mask)
        mask_volume.append(mask)
    mask_volume = np.stack(mask_volume)
    
    volume = np.stack([np.load(file) for file in grd.image_path.values])
    
    szs = mask_volume.sum(-1).sum(-1)
    szs = szs / szs.max(0)
    
    for c in range(7):
        #zs = np.where(mask_volume[:, c].max(-1).max(-1))[0]
        idxs = np.where(szs[:, c])[0]
        
        if np.unique(idxs).shape[0] < 2:
            continue
        
        #vol = vol.astype(np.float32) / 255
        
        DAT['folder'].append(gri)
        DAT['idxs'].append(idxs.tolist())
        DAT['C'].append(c+1)
        DAT[f'sz'].append(szs[idxs, c].tolist())
        
        mask_maxed = mask_volume[idxs].max(0).max(0)
        try:
            x1, x2, y1, y2 = get_crop_box(mask_maxed)
            x1_orig, x2_orig, y1_orig, y2_orig = x1, x2, y1, y2
            x1, x2, y1, y2 = x1/mask_maxed.shape[1], x2/mask_maxed.shape[1], y1/mask_maxed.shape[0], y2/mask_maxed.shape[0]
            x1, x2, y1, y2 = int(x1*volume.shape[2]), int(x2*volume.shape[2]), int(y1*volume.shape[1]), int(y2*volume.shape[1])
        except:
            x1, x2, y1, y2 = 0, vol.shape[1], 0, vol.shape[0]
        
        vol = volume[idxs][:, y1:y2, x1:x2]
        np.save(f"{OUTPUT_FOLDER}/{gri}/C{c}_image.npy", vol)
        
        mask_c = mask_volume[idxs][:, c]
        mask_other = np.delete(mask_volume[idxs], c, axis=1).max(1)
        mask = np.stack([mask_c, mask_other], -1)[:, y1_orig:y2_orig, x1_orig:x2_orig]
        
        np.save(f"{OUTPUT_FOLDER}/{gri}/C{c}_mask.npy", mask)
        
        DAT['x1'].append(x1)
        DAT['x2'].append(x2)
        DAT['y1'].append(y1)
        DAT['y2'].append(y2)
    
    #break

  szs = szs / szs.max(0)
100%|██████████| 2019/2019 [36:53<00:00,  1.10s/it] 


In [79]:
train_data = pd.DataFrame(DAT)
train_data

Unnamed: 0,folder,idxs,C,sz,x1,x2,y1,y2
0,1.2.826.0.1.3680043.10001,"[49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 6...",1,"[0.00042408821034775233, 0.01993214588634436, ...",116,440,70,250
1,1.2.826.0.1.3680043.10001,"[54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 6...",2,"[0.0047256497768443165, 0.006563402467839328, ...",116,440,70,284
2,1.2.826.0.1.3680043.10001,"[101, 102, 103, 104, 105, 106, 107, 108, 109, ...",3,"[0.00281214848143982, 0.04190101237345332, 0.0...",162,392,82,284
3,1.2.826.0.1.3680043.10001,"[119, 120, 121, 122, 123, 124, 125, 126, 127, ...",4,"[0.004678040726472207, 0.02476609796367639, 0....",138,384,96,304
4,1.2.826.0.1.3680043.10001,"[125, 126, 127, 128, 129, 130, 131, 132, 140, ...",5,"[0.05639193671855065, 0.08420515437611635, 0.1...",128,384,96,356
...,...,...,...,...,...,...,...,...
14125,1.2.826.0.1.3680043.9997,"[84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 9...",3,"[0.01648505169041632, 0.06761665269628388, 0.1...",140,362,48,244
14126,1.2.826.0.1.3680043.9997,"[107, 108, 109, 110, 111, 112, 113, 114, 115, ...",4,"[0.004285354171918326, 0.05092009074867658, 0....",132,362,54,248
14127,1.2.826.0.1.3680043.9997,"[127, 130, 131, 132, 133, 134, 135, 136, 137, ...",5,"[0.000687442713107241, 0.0009165902841429881, ...",130,362,58,326
14128,1.2.826.0.1.3680043.9997,"[146, 150, 151, 152, 153, 154, 155, 156, 157, ...",6,"[0.03007518796992481, 0.12515158864904197, 0.0...",98,386,68,354


In [80]:
train_data.to_csv('./data/train_data_XYZcrops.csv', index=False)

In [274]:
train_data.to_csv('./data/train_data_zcrop.csv', index=False)

In [126]:
vol.shape

(96, 512, 512)

In [119]:
szs[:, 0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00150344,
       0.01567869, 0.02899485, 0.03801546, 0.05841924, 0.08569588,
       0.11318729, 0.1544244 , 0.19093643, 0.22959622, 0.27878007,
       0.34407216, 0.41000859, 0.47530069, 0.56980241, 0.63767182,
       0.71563574, 0.82517182, 0.89347079, 0.9540378 , 0.99420103,
       1.        , 0.9899055 , 0.95425258, 0.87414089, 0.80047

In [121]:
szs[start:end, :7]

array([[0.        , 0.        , 0.        , 0.        , 0.51923077,
        0.21039354, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.47897436,
        0.26639758, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.39794872,
        0.37209889, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.29230769,
        0.46241171, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.22512821,
        0.54339051, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.18435897,
        0.61629667, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.15333333,
        0.67305752, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.12948718,
        0.75151362, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.10205128,
        0.81079717, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.08538462,
        0.87865792, 0. 

In [54]:
szs = mask_volume.sum(-1).sum(-1)
szs

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0, 4578],
       [   0,    0,    0, ...,    0,    0, 4587],
       [   0,    0,    0, ...,    0,    0, 4581]], dtype=uint64)

In [58]:
szs.max(0)

array([4656, 3729, 3505, 3685, 3900, 3964, 4810, 5498], dtype=uint64)

In [93]:
np.where(szs[:, 0])[0]

array([49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
       66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
       83, 84, 85, 86])

In [59]:
szs / szs.max(0)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.83266642],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.83430338],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.83321208]])

In [80]:
np.arange(100)[2::3][:32] + 3

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53,
       56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98])

In [15]:
grd

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder
175867,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175868,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175869,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175870,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175871,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
...,...,...,...,...,...,...,...,...,...,...
176130,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23403, 14), (23655, 22), (23910, 24), (24165...",1.2.826.0.1.3680043.10001
176131,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23406, 12), (23658, 19), (23911, 24), (24166...",1.2.826.0.1.3680043.10001
176132,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23661, 15), (23913, 20), (24167, 25), (24422...",1.2.826.0.1.3680043.10001
176133,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23662, 8), (23914, 18), (24168, 22), (24422,...",1.2.826.0.1.3680043.10001
