血的教训:
1. 对mask做插值(rescale,resize),一定且只能在onehot编码上操作
2. 不能用skimage,一定要用sitk

In [None]:
import numpy as np
from glob import glob
from tqdm import tqdm
import h5py
import nrrd
import os
import pandas as pd
from dataset_split import remove_files
import SimpleITK as sitk
from skimage import transform
from collections import Counter
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical

# Reference crop size. covert_h5 widens the in-plane foreground window toward at
# least output_size[0] x output_size[1]; the third entry is only referenced by
# commented-out code there.
output_size =[128, 128, 64]

def resample_image3D(
    image3D,
    newspacing=(0.3, 0.3, 3),
    newsize=None,
    method='Linear',):
    """Resample a SimpleITK image onto a grid with the requested voxel spacing.

    The output keeps the origin and direction of ``image3D``.

    Args:
        image3D: SimpleITK image to resample.
        newspacing: Output spacing, one value per image axis.
        newsize: Output size in voxels, one value per image axis. When it is
            ``None`` or empty, the size is derived so that the physical extent
            (input size * |input spacing|) is preserved at the new spacing.
        method: Interpolation scheme, either ``'Linear'`` or ``'Nearest'``.

    Returns:
        The resampled SimpleITK image.

    Raises:
        ValueError: If ``method`` is neither ``'Linear'`` nor ``'Nearest'``.
    """
    # Reject unknown names up front: a typo such as 'nearest' must not silently
    # fall back to the filter's default interpolator, which would corrupt masks.
    if method not in ('Linear', 'Nearest'):
        raise ValueError(
            "method must be 'Linear' or 'Nearest', got {!r}".format(method))
    if method == 'Linear':
        interpolator = sitk.sitkLinear
    else:
        interpolator = sitk.sitkNearestNeighbor

    # Plain Python floats/ints keep the call independent of the caller's
    # container type (list, tuple or numpy array).
    spacing = [float(s) for s in newspacing]
    if newsize is None or len(newsize) == 0:
        physical_extent = np.array(image3D.GetSize()) * np.abs(image3D.GetSpacing())
        newsize = np.round(physical_extent / np.array(spacing))
    size = [int(n) for n in newsize]

    resample = sitk.ResampleImageFilter()
    resample.SetInterpolator(interpolator)
    resample.SetOutputDirection(image3D.GetDirection())
    resample.SetOutputOrigin(image3D.GetOrigin())
    resample.SetOutputSpacing(spacing)
    resample.SetSize(size)

    return resample.Execute(image3D)

def sitk_onehot_transform(image):
    """Return a one-hot encoded version of a SimpleITK label image.

    The voxel array of ``image`` is expanded with ``to_categorical`` and wrapped
    back into a SimpleITK image that carries the origin, direction and spacing
    of ``image``.
    """
    onehot_array = to_categorical(sitk.GetArrayFromImage(image))
    onehot_image = sitk.GetImageFromArray(onehot_array)
    # Copy the spatial metadata so the encoded image stays aligned with the
    # source image in physical space.
    onehot_image.SetSpacing(image.GetSpacing())
    onehot_image.SetDirection(image.GetDirection())
    onehot_image.SetOrigin(image.GetOrigin())
    return onehot_image

# Replace element values in an array
def array_replace(array,olds,news):
    """Replace every occurrence of ``olds[i]`` in ``array`` with ``news[i]``.

    Not applicable to one-hot encoded arrays.

    All matches are located against a snapshot of the input taken before any
    assignment, so overlapping mappings (e.g. swapping 1 and 2) are applied
    simultaneously, and values that are not listed in ``olds`` stay unchanged.

    Args:
        array: numpy array; it is modified in place.
        olds: list of old values.
        news: list of new values, in the same order as ``olds``.

    Returns:
        The same ``array`` object, after replacement.
    """
    # Compare against an untouched copy: an earlier replacement must never be
    # picked up again by a later (old, new) pair.
    snapshot = array.copy()
    for old, new in zip(olds, news):
        array[snapshot == old] = new
    return array

def covert_h5(glob_str, old_replaced, new_replaced):
    """Convert every labelled sample matched by ``glob_str`` into an HDF5 file.

    Note: bone is not kept as a class; it is merged into the background class.

    For each matched image file, the companion ``Segmentation.seg.nrrd`` and
    ``Segmentation-label.nrrd`` files are located by replacing ``old_replaced``
    in the image path. Image and one-hot label are resampled to a spacing of
    [0.3, 0.3, 3.0], the image is standardized to zero mean / unit std, the
    classes are reordered to (bg, bone, dura, SC) with bone folded into bg,
    and both volumes are cropped around the foreground. The cropped image and
    the cropped one-hot label are written to the path obtained by replacing
    ``old_replaced`` with ``new_replaced``. Two diagnostic PNGs
    (``slice_sample_origin.png`` and ``slice_sample.png``) are written next to
    the sample.

    Args:
        glob_str: Glob pattern matching the image files.
        old_replaced: Substring of the image path (the image file name) that is
            swapped for the companion/output file names.
        new_replaced: Name of the HDF5 output file.

    Returns:
        Tuple ``(error_samples, error_samples_origin)``. ``error_samples`` holds
        the names of the samples that were skipped; ``error_samples_origin`` is
        never filled by this function and is always returned empty.
    """
    listt = glob(glob_str)
    error_samples = []
    error_samples_origin = []
    newspacing = [0.3, 0.3, 3.0]
    target_name = ['bg', 'bone', 'dura', 'SC']  # target class order
    for item in tqdm(listt):
        # The parent directory name identifies the sample; os.path copes with
        # the separator of the current platform.
        sample_name = os.path.basename(os.path.dirname(item))
        print(sample_name,':')

        # read image
        image = sitk.ReadImage(item)
        seg = sitk.ReadImage(item.replace(old_replaced, 'Segmentation.seg.nrrd'))
        label = sitk.ReadImage(item.replace(old_replaced, 'Segmentation-label.nrrd'))
        label_onehot = sitk_onehot_transform(label)

        # Class order as produced by the manual annotation.
        label_name = [
            'bg',
            seg.GetMetaData('Segment0_Name'),
            seg.GetMetaData('Segment1_Name'),
            seg.GetMetaData('Segment2_Name'),
        ]

        # resample/rescale (by sitk); the label is resampled in one-hot form
        image = resample_image3D(image, newspacing, method='Linear')
        label_onehot = resample_image3D(label_onehot, newspacing, method='Nearest')

        # get arrays; the transpose makes the axis order match the sitk sizes
        image = sitk.GetArrayFromImage(image).transpose((2, 1, 0))
        label_onehot = np.round(sitk.GetArrayFromImage(label_onehot)).transpose((2, 1, 0, 3))
        label = np.argmax(label_onehot, axis=-1)

        # Check the shapes before plotting so that a mismatched sample is
        # recorded instead of risking an out-of-range slice index in the plot.
        if not image.shape == label_onehot.shape[:-1]:
            error_samples.append(sample_name)
            print("error sample(mismatch shape of image and label):",sample_name)
            continue

        foreground = np.nonzero(label)
        if foreground[0].size == 0:
            error_samples.append(sample_name)
            print("error sample(empty label):",sample_name)
            continue
        plot_slice_sample(image, label, foreground[2].max(),
                          item.replace(old_replaced, 'slice_sample_origin.png'))

        if not label_onehot.sum(axis=-1).max()==1:
            error_samples.append(sample_name)
            print("error sample(some pixels in seg are multi-category at the same time):",sample_name)
            continue

        # intensity standardization
        image = (image - np.mean(image)) / np.std(image)
        image = image.astype(np.float32)

        # Faulty cases: one or more of the expected classes is not annotated.
        if not label_onehot.shape[-1] == 4:
            error_samples.append(sample_name)
            print("error sample(no dura/bone/SC):",sample_name)
            continue
        # array_equal also copes with a value set whose length is not 2.
        if not np.array_equal(np.unique(label_onehot), [0, 1]):
            error_samples.append(sample_name)
            print("error sample label file error:",sample_name)
            continue

        # Reorder the one-hot channels to the target class order.
        if any(name not in label_name for name in target_name):
            error_samples.append(sample_name)
            print("error sample(unexpected segment names):",sample_name,label_name)
            continue
        idx = [label_name.index(name) for name in target_name]
        label_onehot = label_onehot[:, :, :, idx]

        # Merge bone into the background class.
        bg = label_onehot[:, :, :, [0, 1]].sum(axis=-1)[:, :, :, np.newaxis]
        label_onehot = np.concatenate((bg, label_onehot[:, :, :, 2:]), axis=-1)
        assert np.array_equal(np.unique(label_onehot), [0, 1]), "1: pixel class error"
        # Integer-coded labels are only used for cropping and plotting.
        label = np.argmax(label_onehot, axis=-1)

        # cut (random center cut)
        tempL = np.nonzero(label)
        if tempL[0].size == 0:
            # Nothing but bone was annotated, so there is no window to crop.
            error_samples.append(sample_name)
            print("error sample(no dura/SC voxels):",sample_name)
            continue
        minx, maxx = np.min(tempL[0]), np.max(tempL[0])
        miny, maxy = np.min(tempL[1]), np.max(tempL[1])
        minz, maxz = np.min(tempL[2]), np.max(tempL[2])
        w, h, d = label.shape
        px = max(output_size[0] - (maxx-minx+1), 0) // 2
        py = max(output_size[1] - (maxy-miny+1), 0) // 2
        # In-plane: random margin plus padding toward output_size; along z the
        # window stays tight around the foreground.
        minx = max(minx - np.random.randint(10, 20) - px, 0)
        maxx = min(maxx + np.random.randint(10, 20) + px, w-1)
        miny = max(miny - np.random.randint(10, 20) - py, 0)
        maxy = min(maxy + np.random.randint(10, 20) + py, h-1)
        window = (slice(minx, maxx+1), slice(miny, maxy+1), slice(minz, maxz+1))
        image = image[window]
        label = label[window]
        # The stored label must cover exactly the same voxels as the image.
        label_onehot = label_onehot[window]
        print("cut image.shape:",image.shape, "cut label.shape:",label.shape)
        plot_slice_sample(image, label, maxz-minz,
                          item.replace(old_replaced, 'slice_sample.png'))

        # save files; the context manager closes the file even if a write fails
        with h5py.File(item.replace(old_replaced, new_replaced), 'w') as f:
            f.create_dataset('image', data=image, compression="gzip")
            f.create_dataset('label', data=label_onehot, compression="gzip")
    print("total number of samples:", len(listt))
    return error_samples, error_samples_origin

def plot_slice_sample(image,label,d,fn):
    """Save and show a side-by-side plot of one slice of an image and its label.

    Args:
        image: Array indexed as ``image[:, :, d]``.
        label: Array indexed as ``label[:, :, d]``; its color range is fixed
            to 0.0-3.0.
        d: Index of the slice along the third axis.
        fn: Path of the image file to write.
    """
    fig = plt.figure()
    try:
        ax = fig.add_subplot(1, 2, 1)
        shown = ax.imshow(image[:, :, d].squeeze())
        ax.set_title('image')
        fig.colorbar(shown, ax=ax, orientation='horizontal')

        ax = fig.add_subplot(1, 2, 2)
        shown = ax.imshow(label[:, :, d].squeeze())
        shown.set_clim(0.0, 3.0)
        ax.set_title('label')
        fig.colorbar(shown, ax=ax, orientation='horizontal')

        fig.savefig(fn)
        plt.show()
    finally:
        # This runs once or twice per sample; without an explicit close the
        # figures pile up for the whole conversion run.
        plt.close(fig)
    
def covert_h5_unseg(glob_str, old_replaced, new_replaced):
    """Convert every unlabelled volume matched by ``glob_str`` into an HDF5 file.

    Note: format conversion for data without annotations.

    Each volume is resampled to a spacing of [0.3, 0.3, 3.0], standardized to
    zero mean / unit std, cast to float32 and written as dataset ``'image'``
    to the path obtained by replacing ``old_replaced`` with ``new_replaced``
    in the input path.

    Args:
        glob_str: Glob pattern matching the volume files.
        old_replaced: Substring of the input path (the volume file name) to
            replace.
        new_replaced: Name of the HDF5 output file.
    """
    listt = glob(glob_str)
    newspacing = [0.3, 0.3, 3.0]
    for item in tqdm(listt):
        # The parent directory name identifies the sample; os.path copes with
        # the separator of the current platform.
        sample_name = os.path.basename(os.path.dirname(item))
        print(sample_name,':')
        image = sitk.ReadImage(item)

        # resample; the transpose makes the axis order match the sitk sizes
        image = resample_image3D(image, newspacing, method='Linear')
        image = sitk.GetArrayFromImage(image).transpose((2, 1, 0))

        # intensity standardization
        image = (image - np.mean(image)) / np.std(image)
        image = image.astype(np.float32)
        print("image shape:",image.shape)

        # The context manager closes the file even if the write fails.
        with h5py.File(item.replace(old_replaced, new_replaced), 'w') as f:
            f.create_dataset('image', data=image, compression="gzip")
    print("total number of unseg-samples:", len(listt))

    
if __name__ == '__main__':
    # Labelled data (disabled; uncomment to regenerate):
    # print('seg dataset:')
    # ## delete the old files first
    # dataset_dir = '../../data/CTM_dataset/Segmented'
    # re = os.path.join(dataset_dir,'*/mri_norm2.h5')
    # remove_files(re=re)
    # ## then generate the new files
    # glob_str = '../../data/CTM_dataset/Segmented/*/CTM.nrrd'
    # error_samples,error_samples_origin = covert_h5(glob_str,'CTM.nrrd','mri_norm2.h5')

    # Unlabelled data
    print('unseg dataset:')
    dataset_dir = '../../data/CTM_dataset/unSegmented'

    # Delete the old files first ...
    re = os.path.join(dataset_dir, '*/mri_norm2.h5')
    remove_files(re=re)

    # ... then generate the new ones, once per source file name.
    for glob_str, volume_name in (
        ('../../data/CTM_dataset/unSegmented/*/CTM.nrrd', 'CTM.nrrd'),
        ('../../data/CTM_dataset/unSegmented/*/CT-vol.nrrd', 'CT-vol.nrrd'),
    ):
        covert_h5_unseg(glob_str, volume_name, 'mri_norm2.h5')


unseg dataset:


  0%|          | 0/212 [00:00<?, ?it/s]

b1261721 :
image shape: (515, 515, 71)


  0%|          | 1/212 [00:02<07:36,  2.16s/it]

1302098 :
image shape: (565, 565, 72)


  1%|          | 2/212 [00:04<07:45,  2.22s/it]

1336587-dingzi :
image shape: (555, 555, 71)


  1%|▏         | 3/212 [00:06<07:46,  2.23s/it]

b1532717 :
image shape: (468, 468, 82)


  2%|▏         | 4/212 [00:08<07:37,  2.20s/it]

b1232165 :
image shape: (799, 799, 92)


  2%|▏         | 5/212 [00:15<11:44,  3.41s/it]

1204336 :
image shape: (565, 565, 74)


  3%|▎         | 6/212 [00:17<10:36,  3.09s/it]

b1330821 :
image shape: (512, 512, 82)


  3%|▎         | 7/212 [00:23<13:43,  4.02s/it]

b1203609 :
image shape: (645, 645, 78)


  4%|▍         | 8/212 [00:27<13:20,  3.92s/it]

b1273842 :
image shape: (558, 558, 79)


  4%|▍         | 9/212 [00:29<11:52,  3.51s/it]

610558 :
image shape: (802, 802, 72)


  5%|▍         | 10/212 [00:34<12:40,  3.77s/it]

b1559027 :
image shape: (492, 492, 73)


  5%|▌         | 11/212 [00:36<10:58,  3.27s/it]

B1102588-dingzi :
image shape: (519, 519, 180)


  6%|▌         | 12/212 [00:42<13:26,  4.03s/it]

b1330032 :
image shape: (599, 599, 90)


  6%|▌         | 13/212 [00:45<12:39,  3.82s/it]

1240266 :
image shape: (625, 625, 105)


  7%|▋         | 14/212 [00:49<12:50,  3.89s/it]

1466425 :
image shape: (550, 550, 102)


  7%|▋         | 15/212 [00:52<11:37,  3.54s/it]

1460036-neck :
image shape: (500, 500, 63)


  8%|▊         | 16/212 [00:53<09:45,  2.99s/it]

b1366651 :
image shape: (534, 534, 79)


  8%|▊         | 17/212 [00:56<09:14,  2.84s/it]

1333768 :
image shape: (630, 630, 63)


  8%|▊         | 18/212 [00:58<08:43,  2.70s/it]

1171704-neck :
image shape: (500, 500, 61)


  9%|▉         | 19/212 [01:00<07:39,  2.38s/it]

1700637 :
image shape: (478, 478, 95)


  9%|▉         | 20/212 [01:02<07:18,  2.28s/it]

824280-dingzi :
image shape: (1015, 1015, 85)


 10%|▉         | 21/212 [01:11<13:11,  4.14s/it]

B664822 :
image shape: (724, 724, 107)


 10%|█         | 22/212 [01:17<15:12,  4.80s/it]

b1771895 :
image shape: (445, 445, 64)


 11%|█         | 23/212 [01:18<11:54,  3.78s/it]

1475355 :
image shape: (647, 647, 71)


 11%|█▏        | 24/212 [01:21<10:54,  3.48s/it]

1536553 :
image shape: (625, 625, 85)


 12%|█▏        | 25/212 [01:25<10:50,  3.48s/it]

b1738413 :
image shape: (599, 599, 82)


 12%|█▏        | 26/212 [01:27<10:09,  3.28s/it]

b1450355 :
image shape: (799, 799, 72)


 13%|█▎        | 27/212 [01:32<11:33,  3.75s/it]

b1227957 :
image shape: (799, 799, 71)


 13%|█▎        | 28/212 [01:37<12:16,  4.00s/it]

774851-2 :
image shape: (599, 599, 82)


 14%|█▎        | 29/212 [01:39<10:58,  3.60s/it]

b1532418 :
image shape: (620, 620, 79)


 14%|█▍        | 30/212 [01:43<10:27,  3.45s/it]

B682651 :
image shape: (546, 546, 85)


 15%|█▍        | 31/212 [01:45<09:43,  3.23s/it]

1330032 :
image shape: (599, 599, 90)


 15%|█▌        | 32/212 [01:48<09:30,  3.17s/it]

b1006763 :
image shape: (611, 611, 82)


 16%|█▌        | 33/212 [01:52<09:42,  3.25s/it]

b1460036-neck :
image shape: (500, 500, 63)


 16%|█▌        | 34/212 [01:54<08:20,  2.81s/it]

b1373427 :
image shape: (606, 606, 84)


 17%|█▋        | 35/212 [01:57<08:38,  2.93s/it]

B1685560-lumbar :
image shape: (625, 625, 151)


 17%|█▋        | 36/212 [02:03<11:52,  4.05s/it]

1467719 :
image shape: (637, 637, 80)


 17%|█▋        | 37/212 [02:06<10:54,  3.74s/it]

1228776 :
image shape: (799, 799, 81)


 18%|█▊        | 38/212 [02:11<12:00,  4.14s/it]

b1049094 :
image shape: (490, 490, 73)


 18%|█▊        | 39/212 [02:14<10:07,  3.51s/it]

b1022888 :
image shape: (678, 678, 85)


 19%|█▉        | 40/212 [02:18<10:41,  3.73s/it]

b182723 :
image shape: (587, 587, 75)


 19%|█▉        | 41/212 [02:20<09:45,  3.42s/it]

b1602447 :
image shape: (640, 640, 73)


 20%|█▉        | 42/212 [02:27<12:04,  4.26s/it]

1123189 :
image shape: (799, 799, 69)


 20%|██        | 43/212 [02:31<12:06,  4.30s/it]

b1658899 :
image shape: (799, 799, 68)


 21%|██        | 44/212 [02:35<12:05,  4.32s/it]

1775169 :
image shape: (483, 483, 60)


 21%|██        | 45/212 [02:37<09:36,  3.45s/it]

1396101 :
image shape: (799, 799, 77)


 22%|██▏       | 46/212 [02:42<10:44,  3.88s/it]

b1756747 :
image shape: (471, 471, 77)


 22%|██▏       | 47/212 [02:44<08:56,  3.25s/it]

1261721 :
image shape: (515, 515, 71)


 23%|██▎       | 48/212 [02:46<07:50,  2.87s/it]

b1425167 :
image shape: (799, 799, 85)


 23%|██▎       | 49/212 [02:51<10:10,  3.74s/it]

b1245231 :
image shape: (519, 519, 77)


 24%|██▎       | 50/212 [02:54<08:58,  3.33s/it]

1669821 :
image shape: (637, 637, 74)


 24%|██▍       | 51/212 [02:56<08:31,  3.17s/it]

b1673882 :
image shape: (628, 628, 85)


 25%|██▍       | 52/212 [03:00<08:49,  3.31s/it]

1393868 :
image shape: (802, 802, 72)


 25%|██▌       | 53/212 [03:04<09:35,  3.62s/it]

1232165 :
image shape: (799, 799, 92)


 25%|██▌       | 54/212 [03:10<11:05,  4.21s/it]

b1738145 :
image shape: (502, 502, 80)


 26%|██▌       | 55/212 [03:12<09:29,  3.63s/it]

1409022 :
image shape: (633, 633, 76)


 26%|██▋       | 56/212 [03:15<08:52,  3.41s/it]

1673882 :
image shape: (628, 628, 85)


 27%|██▋       | 57/212 [03:19<08:44,  3.38s/it]

b1632110 :
image shape: (666, 666, 79)


 27%|██▋       | 58/212 [03:22<08:58,  3.50s/it]

1771895 :
image shape: (445, 445, 64)


 28%|██▊       | 59/212 [03:24<07:13,  2.83s/it]

1273842 :
image shape: (558, 558, 79)


 28%|██▊       | 60/212 [03:26<06:46,  2.68s/it]

1602447 :
image shape: (640, 640, 73)


 29%|██▉       | 61/212 [03:32<09:08,  3.63s/it]

b1521755 :
image shape: (478, 478, 83)


 29%|██▉       | 62/212 [03:34<08:00,  3.20s/it]

1756747 :
image shape: (471, 471, 77)


 30%|██▉       | 63/212 [03:36<06:48,  2.74s/it]

1612086 :
image shape: (802, 802, 68)


 30%|███       | 64/212 [03:40<07:51,  3.19s/it]

1678746 :
image shape: (565, 565, 85)


 31%|███       | 65/212 [03:42<07:24,  3.02s/it]

1225089 :
image shape: (737, 737, 84)


 31%|███       | 66/212 [03:47<08:48,  3.62s/it]

1632110 :
image shape: (666, 666, 79)


 32%|███▏      | 67/212 [03:51<08:38,  3.58s/it]

b1336587-dingzi :
image shape: (555, 555, 71)


 32%|███▏      | 68/212 [03:53<07:45,  3.23s/it]

b1583550 :
image shape: (486, 486, 78)


 33%|███▎      | 69/212 [03:55<06:46,  2.84s/it]

b1148448 :
image shape: (565, 565, 75)


 33%|███▎      | 70/212 [03:58<06:30,  2.75s/it]

b1547418 :
image shape: (534, 534, 85)


 33%|███▎      | 71/212 [04:00<06:20,  2.70s/it]

b936932-dingzi :
image shape: (799, 799, 85)


 34%|███▍      | 72/212 [04:06<08:37,  3.69s/it]

b1623334 :
image shape: (512, 512, 68)


 34%|███▍      | 73/212 [04:11<09:21,  4.04s/it]

1597431 :
image shape: (647, 647, 99)


 35%|███▍      | 74/212 [04:15<09:23,  4.08s/it]

1671660 :
image shape: (799, 799, 85)


 35%|███▌      | 75/212 [04:21<10:07,  4.44s/it]

b1204183-dingzi :
image shape: (621, 621, 84)


 36%|███▌      | 76/212 [04:24<09:29,  4.18s/it]

b1486880 :
image shape: (604, 604, 64)


 36%|███▋      | 77/212 [04:27<08:18,  3.69s/it]

1373427 :
image shape: (606, 606, 84)


 37%|███▋      | 78/212 [04:30<07:45,  3.47s/it]

1521755 :
image shape: (478, 478, 83)


 37%|███▋      | 79/212 [04:32<06:43,  3.03s/it]

# 数据集划分&生成*.list文件

In [6]:
from dataset_split import dataset_split, make_dataset_list
# Labelled data (split into two datasets and generate their lists)
save_dir = '../../data/CTM_dataset'
dataset_dir = '../../data/CTM_dataset/Segmented'
list_train_validatioin,list_test = dataset_split(path=dataset_dir,save_dir=save_dir)
# Unlabelled data (no split needed, just generate the list)
dataset_dir = '../../data/CTM_dataset/unSegmented'
make_dataset_list(path=dataset_dir,save_dir=save_dir)

In [3]:
error_samples

['B1709234', '1409022-no SC', 'B1409022', 'B1334915-need revision', '1709234']