In [151]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
import os, sys, time, copy

from sklearn.model_selection import KFold

import cv2
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
def rle_encode(mask):
    pixels = mask.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    changes = np.where(pixels[1:] != pixels[:-1])[0]
    starts = changes[::2]
    lengths = changes[1::2] - starts
    return list(zip(starts, lengths))


def rle_decode(rle, shape):
    mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, length in rle:
        mask[start:start + length] = 1
    return mask.reshape(shape)

In [3]:
def get_crop_box(mask):
    rows = np.any(mask, axis=1)
    cols = np.any(mask, axis=0)

    y1, y2 = np.where(rows)[0][[0, -1]]
    x1, x2 = np.where(cols)[0][[0, -1]]

    return x1, x2, y1, y2

In [7]:
segmentations = pd.read_csv('./data/train_segmentations.csv')
segmentations

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8
0,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
1,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
2,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
3,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
4,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...
711596,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42870, 4), (43125, 8), (43379, 12), (43633, ..."
711597,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42871, 2), (43124, 9), (43379, 11), (43633, ..."
711598,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43125, 7), (43379, 11), (43633, 15), (43888,..."
711599,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43891, 3), (44146, 3), (44400, 4), (44654, 6..."


In [8]:
segmentations['folder'] = segmentations.image_path.apply(lambda x: x.split('/')[-2])
segmentations

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder
0,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
1,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
2,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
3,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
4,./data/train_images_npy/1.2.826.0.1.3680043.33...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.3306
...,...,...,...,...,...,...,...,...,...,...
711596,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42870, 4), (43125, 8), (43379, 12), (43633, ...",1.2.826.0.1.3680043.23957
711597,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(42871, 2), (43124, 9), (43379, 11), (43633, ...",1.2.826.0.1.3680043.23957
711598,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43125, 7), (43379, 11), (43633, 15), (43888,...",1.2.826.0.1.3680043.23957
711599,./data/train_images_npy/1.2.826.0.1.3680043.23...,[],[],[],[],[],[],[],"[(43891, 3), (44146, 3), (44400, 4), (44654, 6...",1.2.826.0.1.3680043.23957


In [290]:
train_bbox = pd.read_csv('./data/train_bounding_boxes.csv')
train_bbox

Unnamed: 0,StudyInstanceUID,x,y,width,height,slice_number
0,1.2.826.0.1.3680043.10051,219.27715,216.71419,17.30440,20.38517,133
1,1.2.826.0.1.3680043.10051,221.56460,216.71419,17.87844,25.24362,134
2,1.2.826.0.1.3680043.10051,216.82151,221.62546,27.00959,26.37454,135
3,1.2.826.0.1.3680043.10051,214.49455,215.48637,27.92726,37.51363,136
4,1.2.826.0.1.3680043.10051,214.00000,215.48637,27.00000,43.51363,137
...,...,...,...,...,...,...
7212,1.2.826.0.1.3680043.9940,297.23186,115.53983,85.18228,66.52623,140
7213,1.2.826.0.1.3680043.9940,298.00000,117.00000,86.00000,61.00000,141
7214,1.2.826.0.1.3680043.9940,298.00000,119.00000,87.00000,58.00000,142
7215,1.2.826.0.1.3680043.9940,299.00000,120.00000,89.00000,56.00000,143


In [291]:
train_bbox.StudyInstanceUID.unique().shape

(235,)

In [131]:
train = pd.read_csv('./data/train.csv')
train

Unnamed: 0,StudyInstanceUID,patient_overall,C1,C2,C3,C4,C5,C6,C7
0,1.2.826.0.1.3680043.6200,1,1,1,0,0,0,0,0
1,1.2.826.0.1.3680043.27262,1,0,1,0,0,0,0,0
2,1.2.826.0.1.3680043.21561,1,0,1,0,0,0,0,0
3,1.2.826.0.1.3680043.12351,0,0,0,0,0,0,0,0
4,1.2.826.0.1.3680043.1363,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
2014,1.2.826.0.1.3680043.21684,1,0,1,0,0,0,1,1
2015,1.2.826.0.1.3680043.4786,1,0,0,0,0,0,0,1
2016,1.2.826.0.1.3680043.14341,0,0,0,0,0,0,0,0
2017,1.2.826.0.1.3680043.12053,0,0,0,0,0,0,0,0


In [155]:
healthy_patients = train[train.patient_overall==0].StudyInstanceUID.values
fractured_patients = np.unique(train_bbox.StudyInstanceUID.values)

healthy_folds = [*KFold(n_splits=4).split(healthy_patients)]
fractured_folds = [*KFold(n_splits=4).split(fractured_patients)]

healthy_patients.shape, fractured_patients.shape

((1058,), (235,))

In [161]:
train_bbox

Unnamed: 0,StudyInstanceUID,x,y,width,height,slice_number
0,1.2.826.0.1.3680043.10051,219.27715,216.71419,17.30440,20.38517,133
1,1.2.826.0.1.3680043.10051,221.56460,216.71419,17.87844,25.24362,134
2,1.2.826.0.1.3680043.10051,216.82151,221.62546,27.00959,26.37454,135
3,1.2.826.0.1.3680043.10051,214.49455,215.48637,27.92726,37.51363,136
4,1.2.826.0.1.3680043.10051,214.00000,215.48637,27.00000,43.51363,137
...,...,...,...,...,...,...
7212,1.2.826.0.1.3680043.9940,297.23186,115.53983,85.18228,66.52623,140
7213,1.2.826.0.1.3680043.9940,298.00000,117.00000,86.00000,61.00000,141
7214,1.2.826.0.1.3680043.9940,298.00000,119.00000,87.00000,58.00000,142
7215,1.2.826.0.1.3680043.9940,299.00000,120.00000,89.00000,56.00000,143


In [256]:
def crop_mask_with_quantile(mask, quantile=0.99):
    y_indices, x_indices = np.where(mask == 1)

    if len(x_indices) == 0 or len(y_indices) == 0:
        return None, None, None, None

    x1 = np.quantile(x_indices, 1 - quantile)
    x2 = np.quantile(x_indices, quantile)
    y1 = np.quantile(y_indices, 1 - quantile)
    y2 = np.quantile(y_indices, quantile)
    
    x1, x2, y1, y2 = int(x1)/255, int(x2)/255, int(y1)/255, int(y2)/255
    
    return x1, y1, x2, y2

In [281]:
datas = []

for F in range(4):
    DAT = []
    
    patients = fractured_patients#[fractured_folds[F][1]]
    
    for gri in tqdm(patients):
    
        grd = train_bbox[train_bbox.StudyInstanceUID==gri]

        segm_rows = segmentations[segmentations.folder==gri].reset_index(drop=1)
        segm_rows['fractured'] = 0
        segm_rows.loc[grd.slice_number.values, 'fractured'] = 1
        segm_rows['n_pixels'] = 0
        
        segm_rows['image_crop_x1'] = 0
        segm_rows['image_crop_y1'] = 256
        segm_rows['image_crop_x2'] = 0
        segm_rows['image_crop_y2'] = 256

        filter_rows = []
        for i, row in segm_rows.iterrows():
            if not np.any([len(eval(x)) for x in row.values[1:8]]):
                continue
            
            rles = [eval(x) for x in row.values[1:8]]
            rle = rles[0] + rles[1] + rles[2] + rles[3] + rles[4] + rles[5] + rles[6]
            n_pixels = np.sum([l for x, l in rle])
            
            row['n_pixels'] = n_pixels
            
            mask = rle_decode(rle, (256, 256))
            x1, y1, x2, y2 = crop_mask_with_quantile(mask)
            
            row['image_crop_x1'] = x1
            row['image_crop_y1'] = y1
            row['image_crop_x2'] = x2
            row['image_crop_y2'] = y2
            
            filter_rows.append(row)
        
        maxed_rle = sum([sum([eval(x) for x in row.values[1:8]], []) for row in filter_rows], [])
        maxed_mask = rle_decode(maxed_rle, (256, 256))
        x1, y1, x2, y2 = crop_mask_with_quantile(maxed_mask)
        
        filter_rows = pd.DataFrame(filter_rows)
        filter_rows['patient_crop_x1'] = x1
        filter_rows['patient_crop_y1'] = y1
        filter_rows['patient_crop_x2'] = x2
        filter_rows['patient_crop_y2'] = y2
        
        DAT.append(filter_rows)
        #break
    
    patients = healthy_patients[healthy_folds[F][1]]
    
    for gri in tqdm(patients):

        segm_rows = segmentations[segmentations.folder==gri].reset_index(drop=1)
        segm_rows['fractured'] = 0
        segm_rows['n_pixels'] = 0
        
        segm_rows['image_crop_x1'] = 0
        segm_rows['image_crop_y1'] = 256
        segm_rows['image_crop_x2'] = 0
        segm_rows['image_crop_y2'] = 256

        filter_rows = []
        for i, row in segm_rows.iterrows():
            if not np.any([len(eval(x)) for x in row.values[1:8]]):
                continue
                
            rles = [eval(x) for x in row.values[1:8]]
            rle = rles[0] + rles[1] + rles[2] + rles[3] + rles[4] + rles[5] + rles[6]
            n_pixels = np.sum([l for x, l in rle])

            row['n_pixels'] = n_pixels
            
            mask = rle_decode(rle, (256, 256))
            x1, y1, x2, y2 = crop_mask_with_quantile(mask)
            
            row['image_crop_x1'] = x1
            row['image_crop_y1'] = y1
            row['image_crop_x2'] = x2
            row['image_crop_y2'] = y2
            
            if np.any([len(eval(x)) for x in row.values[1:8]]):
                filter_rows.append(row)

        maxed_rle = sum([sum([eval(x) for x in row.values[1:8]], []) for row in filter_rows], [])
        maxed_mask = rle_decode(maxed_rle, (256, 256))
        x1, y1, x2, y2 = crop_mask_with_quantile(maxed_mask)
        
        filter_rows = pd.DataFrame(filter_rows)
        filter_rows['patient_crop_x1'] = x1
        filter_rows['patient_crop_y1'] = y1
        filter_rows['patient_crop_x2'] = x2
        filter_rows['patient_crop_y2'] = y2
        
        DAT.append(filter_rows)
        #break
        
    data = pd.concat(DAT).reset_index(drop=1)
    
    datas.append(data)
    
    #break
    
data

100%|██████████| 235/235 [01:33<00:00,  2.53it/s]
100%|██████████| 265/265 [02:12<00:00,  1.99it/s]
100%|██████████| 235/235 [01:33<00:00,  2.50it/s]
100%|██████████| 265/265 [02:14<00:00,  1.97it/s]
100%|██████████| 235/235 [01:35<00:00,  2.47it/s]
100%|██████████| 264/264 [02:16<00:00,  1.94it/s]
100%|██████████| 235/235 [01:35<00:00,  2.46it/s]
100%|██████████| 264/264 [02:16<00:00,  1.94it/s]


Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder,fractured,n_pixels,image_crop_x1,image_crop_y1,image_crop_x2,image_crop_y2,patient_crop_x1,patient_crop_y1,patient_crop_x2,patient_crop_y2
0,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(15199, 3), (15451, 7), (15705, 7), (15960, 4)]",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,21,0.345098,0.231373,0.380392,0.243137,0.172549,0.168627,0.749020,0.788235
1,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14942, 5), (15195, 9), (15448, 12), (15702, ...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,59,0.329412,0.227451,0.388235,0.254902,0.172549,0.168627,0.749020,0.788235
2,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14683, 7), (14936, 13), (15189, 17), (15443,...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,114,0.313725,0.223529,0.392157,0.262745,0.172549,0.168627,0.749020,0.788235
3,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14427, 7), (14680, 12), (14933, 17), (15187,...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,164,0.305882,0.219608,0.470588,0.266667,0.172549,0.168627,0.749020,0.788235
4,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14173, 7), (14425, 14), (14677, 18), (14931,...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,230,0.301961,0.215686,0.486275,0.270588,0.172549,0.168627,0.749020,0.788235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108676,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(13705, 13), (13956, 23), (14209, 30), (14463...","[(18086, 4), (18337, 12), (18589, 19), (18842,...",1.2.826.0.1.3680043.12053,0,439,0.474510,0.207843,0.643137,0.266667,0.188235,0.094118,0.792157,0.619608
108677,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(13710, 5), (13958, 19), (14210, 27), (14464,...","[(17827, 8), (18078, 15), (18329, 22), (18583,...",1.2.826.0.1.3680043.12053,0,329,0.478431,0.207843,0.627451,0.258824,0.188235,0.094118,0.792157,0.619608
108678,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(13960, 13), (14212, 21), (14466, 24), (14720...","[(17570, 8), (17824, 12), (18072, 21), (18324,...",1.2.826.0.1.3680043.12053,0,226,0.482353,0.211765,0.600000,0.254902,0.188235,0.094118,0.792157,0.619608
108679,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(14216, 10), (14468, 16), (14722, 18), (14976...","[(17055, 8), (17309, 12), (17561, 18), (17813,...",1.2.826.0.1.3680043.12053,0,135,0.482353,0.215686,0.572549,0.250980,0.188235,0.094118,0.792157,0.619608


In [283]:
for F, data in enumerate(datas):
    data.to_csv(f"./data/train_stage1_F{F}.csv", index=False)

In [292]:
data

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder,fractured,n_pixels,image_crop_x1,image_crop_y1,image_crop_x2,image_crop_y2,patient_crop_x1,patient_crop_y1,patient_crop_x2,patient_crop_y2
0,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(15199, 3), (15451, 7), (15705, 7), (15960, 4)]",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,21,0.345098,0.231373,0.380392,0.243137,0.172549,0.168627,0.749020,0.788235
1,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14942, 5), (15195, 9), (15448, 12), (15702, ...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,59,0.329412,0.227451,0.388235,0.254902,0.172549,0.168627,0.749020,0.788235
2,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14683, 7), (14936, 13), (15189, 17), (15443,...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,114,0.313725,0.223529,0.392157,0.262745,0.172549,0.168627,0.749020,0.788235
3,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14427, 7), (14680, 12), (14933, 17), (15187,...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,164,0.305882,0.219608,0.470588,0.266667,0.172549,0.168627,0.749020,0.788235
4,./data/train_images_npy/1.2.826.0.1.3680043.10...,"[(14173, 7), (14425, 14), (14677, 18), (14931,...",[],[],[],[],[],[],[],1.2.826.0.1.3680043.10051,0,230,0.301961,0.215686,0.486275,0.270588,0.172549,0.168627,0.749020,0.788235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108676,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(13705, 13), (13956, 23), (14209, 30), (14463...","[(18086, 4), (18337, 12), (18589, 19), (18842,...",1.2.826.0.1.3680043.12053,0,439,0.474510,0.207843,0.643137,0.266667,0.188235,0.094118,0.792157,0.619608
108677,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(13710, 5), (13958, 19), (14210, 27), (14464,...","[(17827, 8), (18078, 15), (18329, 22), (18583,...",1.2.826.0.1.3680043.12053,0,329,0.478431,0.207843,0.627451,0.258824,0.188235,0.094118,0.792157,0.619608
108678,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(13960, 13), (14212, 21), (14466, 24), (14720...","[(17570, 8), (17824, 12), (18072, 21), (18324,...",1.2.826.0.1.3680043.12053,0,226,0.482353,0.211765,0.600000,0.254902,0.188235,0.094118,0.792157,0.619608
108679,./data/train_images_npy/1.2.826.0.1.3680043.12...,[],[],[],[],[],[],"[(14216, 10), (14468, 16), (14722, 18), (14976...","[(17055, 8), (17309, 12), (17561, 18), (17813,...",1.2.826.0.1.3680043.12053,0,135,0.482353,0.215686,0.572549,0.250980,0.188235,0.094118,0.792157,0.619608


In [287]:
filt = data[data.fractured==1]
filt

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder,fractured,n_pixels,image_crop_x1,image_crop_y1,image_crop_x2,image_crop_y2,patient_crop_x1,patient_crop_y1,patient_crop_x2,patient_crop_y2
102,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],"[(9841, 5), (10093, 16), (10347, 21), (10602, ...","[(11371, 10), (11619, 23), (11874, 29), (12128...",[],[],[],[],1.2.826.0.1.3680043.10051,1,3556,0.254902,0.156863,0.639216,0.443137,0.172549,0.168627,0.749020,0.788235
103,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],"[(10094, 12), (10348, 18), (10604, 4), (10613,...","[(11364, 20), (11619, 27), (11873, 31), (12128...",[],[],[],[],1.2.826.0.1.3680043.10051,1,3637,0.250980,0.160784,0.643137,0.450980,0.172549,0.168627,0.749020,0.788235
104,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],"[(11364, 25), (11619, 28), (11874, 30), (12128...",[],[],[],[],1.2.826.0.1.3680043.10051,1,3634,0.250980,0.176471,0.647059,0.458824,0.172549,0.168627,0.749020,0.788235
105,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],"[(11118, 6), (11365, 23), (11620, 26), (11874,...",[],[],[],[],1.2.826.0.1.3680043.10051,1,3656,0.243137,0.176471,0.647059,0.466667,0.172549,0.168627,0.749020,0.788235
106,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],"[(11369, 16), (11623, 20), (11877, 23), (12132...",[],[],[],[],1.2.826.0.1.3680043.10051,1,3557,0.243137,0.176471,0.647059,0.478431,0.172549,0.168627,0.749020,0.788235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49153,./data/train_images_npy/1.2.826.0.1.3680043.99...,[],[],[],"[(14718, 5), (14725, 12), (14972, 23), (15227,...","[(21685, 5), (21936, 10), (22191, 12), (22448,...",[],[],[],1.2.826.0.1.3680043.9940,1,3835,0.317647,0.227451,0.745098,0.552941,0.250980,0.164706,0.792157,0.717647
49154,./data/train_images_npy/1.2.826.0.1.3680043.99...,[],[],[],"[(14719, 3), (14730, 6), (14973, 21), (15228, ...","[(21429, 4), (21681, 9), (21935, 12), (22190, ...",[],[],[],1.2.826.0.1.3680043.9940,1,3718,0.313725,0.231373,0.745098,0.552941,0.250980,0.164706,0.792157,0.717647
49155,./data/train_images_npy/1.2.826.0.1.3680043.99...,[],[],[],"[(14974, 19), (15228, 23), (15483, 25), (15738...","[(20916, 1), (21166, 10), (21421, 12), (21675,...",[],[],[],1.2.826.0.1.3680043.9940,1,3506,0.313725,0.231373,0.745098,0.556863,0.250980,0.164706,0.792157,0.717647
49156,./data/train_images_npy/1.2.826.0.1.3680043.99...,[],[],[],"[(14975, 18), (15229, 22), (15483, 25), (15738...","[(21166, 9), (21420, 12), (21675, 14), (21930,...",[],[],[],1.2.826.0.1.3680043.9940,1,3278,0.313725,0.231373,0.749020,0.552941,0.250980,0.164706,0.792157,0.717647


In [289]:
filt.image_path.values[0]

'./data/train_images_npy/1.2.826.0.1.3680043.10051/134.npy'

In [195]:
fold_data = pd.DataFrame({'patient': fractured_patients, 'fold': -1})

for F, fd in enumerate(fractured_folds):
    fold_data.loc[fd[1], 'fold'] = F

fold_data

Unnamed: 0,patient,fold
0,1.2.826.0.1.3680043.10051,0
1,1.2.826.0.1.3680043.10579,0
2,1.2.826.0.1.3680043.10678,0
3,1.2.826.0.1.3680043.10697,0
4,1.2.826.0.1.3680043.10732,0
...,...,...
230,1.2.826.0.1.3680043.8519,3
231,1.2.826.0.1.3680043.8693,3
232,1.2.826.0.1.3680043.9447,3
233,1.2.826.0.1.3680043.9926,3


In [196]:
fold_data.to_csv('./data/train_stage1_foldinfo.csv', index=False)

In [137]:
data.fractured.value_counts()

fractured
0    368734
1     21294
Name: count, dtype: int64

In [127]:
data.folder.unique().shape

(235,)

In [123]:
train[train.patient_overall==1]

Unnamed: 0,StudyInstanceUID,patient_overall,C1,C2,C3,C4,C5,C6,C7
0,1.2.826.0.1.3680043.6200,1,1,1,0,0,0,0,0
1,1.2.826.0.1.3680043.27262,1,0,1,0,0,0,0,0
2,1.2.826.0.1.3680043.21561,1,0,1,0,0,0,0,0
4,1.2.826.0.1.3680043.1363,1,0,0,0,0,1,0,0
5,1.2.826.0.1.3680043.4859,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
2007,1.2.826.0.1.3680043.17052,1,0,0,0,1,0,0,0
2009,1.2.826.0.1.3680043.14464,1,0,0,0,0,0,0,1
2014,1.2.826.0.1.3680043.21684,1,0,1,0,0,0,1,1
2015,1.2.826.0.1.3680043.4786,1,0,0,0,0,0,0,1


In [59]:
OUTPUT_FOLDER = './data/train_data_crops/'

DAT = {col: [] for col in ['folder', 'start', 'end'] + ["C", 'sz'] + ['x1', 'x2', 'y1', 'y2']}

for gri, grd in tqdm(segmentations.groupby('folder')):
    os.makedirs(f"{OUTPUT_FOLDER}/{gri}/", exist_ok=1)
    
    mask_volume = []
    for i, row in grd.iterrows():
        mask = []
        for c in range(1,9):
            rle = eval(row[f"rle_c{c}"])
            mask.append(rle_decode(rle, (256, 256)))
        mask = np.stack(mask)
        mask_volume.append(mask)
    mask_volume = np.stack(mask_volume)
    
    volume = np.stack([np.load(file) for file in grd.image_path.values])
    
    szs = mask_volume.sum(-1).sum(-1)
    szs = szs / szs.max(0)
    
    for c in range(7):
        #zs = np.where(mask_volume[:, c].max(-1).max(-1))[0]
        zs = np.where(szs[:, c])[0]
        start, end = zs.min(), zs.max()
        
        if start==end:
            continue
        
        #vol = vol.astype(np.float32) / 255
        
        DAT['folder'].append(gri)
        DAT['start'].append(start)
        DAT['end'].append(end)
        DAT['C'].append(c+1)
        DAT[f'sz'].append(szs[start:end, c].tolist())
        
        mask_maxed = mask_volume[start:end].max(0).max(0)
        try:
            x1, x2, y1, y2 = get_crop_box(mask_maxed)
            x1_orig, x2_orig, y1_orig, y2_orig = x1, x2, y1, y2
            x1, x2, y1, y2 = x1/mask_maxed.shape[1], x2/mask_maxed.shape[1], y1/mask_maxed.shape[0], y2/mask_maxed.shape[0]
            x1, x2, y1, y2 = int(x1*volume.shape[2]), int(x2*volume.shape[2]), int(y1*volume.shape[1]), int(y2*volume.shape[1])
        except:
            x1, x2, y1, y2 = 0, vol.shape[1], 0, vol.shape[0]
        
        vol = volume[start:end][:, y1:y2, x1:x2]
        np.save(f"{OUTPUT_FOLDER}/{gri}/C{c}_{start}_{end}_image.npy", vol)
        
        mask_c = mask_volume[start:end][:, c]
        mask_other = np.delete(mask_volume[start:end], c, axis=1).max(1)
        mask = np.stack([mask_c, mask_other], -1)[:, y1_orig:y2_orig, x1_orig:x2_orig]
        
        np.save(f"{OUTPUT_FOLDER}/{gri}/C{c}_{start}_{end}_mask.npy", mask)
        
        DAT['x1'].append(x1)
        DAT['x2'].append(x2)
        DAT['y1'].append(y1)
        DAT['y2'].append(y2)
    
    #break

  szs = szs / szs.max(0)
100%|██████████| 2019/2019 [36:17<00:00,  1.08s/it] 


In [60]:
train_data = pd.DataFrame(DAT)
train_data

Unnamed: 0,folder,start,end,C,sz,x1,x2,y1,y2
0,1.2.826.0.1.3680043.10001,49,86,1,"[0.0015034364261168386, 0.0156786941580756, 0....",116,440,70,248
1,1.2.826.0.1.3680043.10001,54,112,2,"[0.006167873424510592, 0.009385894341646554, 0...",116,440,70,284
2,1.2.826.0.1.3680043.10001,101,135,3,"[0.02368045649072753, 0.043937232524964336, 0....",162,392,80,284
3,1.2.826.0.1.3680043.10001,119,161,4,"[0.015739484396200813, 0.028222523744911805, 0...",140,384,94,304
4,1.2.826.0.1.3680043.10001,125,186,5,"[0.0035897435897435897, 0.04205128205128205, 0...",128,384,96,356
...,...,...,...,...,...,...,...,...,...
14125,1.2.826.0.1.3680043.9997,84,137,3,"[0.014208581983518044, 0.05569764137539074, 0....",132,362,48,306
14126,1.2.826.0.1.3680043.9997,107,138,4,"[0.01014455997971088, 0.048693887902612226, 0....",132,362,52,306
14127,1.2.826.0.1.3680043.9997,125,163,5,"[0.0025392428439519853, 0.017774699907663897, ...",130,362,60,326
14128,1.2.826.0.1.3680043.9997,124,195,6,"[0.0004909180166912126, 0.0066273932253313695,...",98,386,58,354


In [61]:
train_data.to_csv('./data/train_data_crops.csv', index=False)

In [274]:
train_data.to_csv('./data/train_data_zcrop.csv', index=False)

In [126]:
vol.shape

(96, 512, 512)

In [119]:
szs[:, 0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00150344,
       0.01567869, 0.02899485, 0.03801546, 0.05841924, 0.08569588,
       0.11318729, 0.1544244 , 0.19093643, 0.22959622, 0.27878007,
       0.34407216, 0.41000859, 0.47530069, 0.56980241, 0.63767182,
       0.71563574, 0.82517182, 0.89347079, 0.9540378 , 0.99420103,
       1.        , 0.9899055 , 0.95425258, 0.87414089, 0.80047

In [121]:
szs[start:end, :7]

array([[0.        , 0.        , 0.        , 0.        , 0.51923077,
        0.21039354, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.47897436,
        0.26639758, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.39794872,
        0.37209889, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.29230769,
        0.46241171, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.22512821,
        0.54339051, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.18435897,
        0.61629667, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.15333333,
        0.67305752, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.12948718,
        0.75151362, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.10205128,
        0.81079717, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.08538462,
        0.87865792, 0. 

In [54]:
szs = mask_volume.sum(-1).sum(-1)
szs

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0, 4578],
       [   0,    0,    0, ...,    0,    0, 4587],
       [   0,    0,    0, ...,    0,    0, 4581]], dtype=uint64)

In [58]:
szs.max(0)

array([4656, 3729, 3505, 3685, 3900, 3964, 4810, 5498], dtype=uint64)

In [93]:
np.where(szs[:, 0])[0]

array([49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
       66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
       83, 84, 85, 86])

In [59]:
szs / szs.max(0)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.83266642],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.83430338],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.83321208]])

In [80]:
np.arange(100)[2::3][:32] + 3

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53,
       56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98])

In [15]:
grd

Unnamed: 0,image_path,rle_c1,rle_c2,rle_c3,rle_c4,rle_c5,rle_c6,rle_c7,rle_c8,folder
175867,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175868,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175869,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175870,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
175871,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],[],1.2.826.0.1.3680043.10001
...,...,...,...,...,...,...,...,...,...,...
176130,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23403, 14), (23655, 22), (23910, 24), (24165...",1.2.826.0.1.3680043.10001
176131,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23406, 12), (23658, 19), (23911, 24), (24166...",1.2.826.0.1.3680043.10001
176132,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23661, 15), (23913, 20), (24167, 25), (24422...",1.2.826.0.1.3680043.10001
176133,./data/train_images_npy/1.2.826.0.1.3680043.10...,[],[],[],[],[],[],[],"[(23662, 8), (23914, 18), (24168, 22), (24422,...",1.2.826.0.1.3680043.10001
