# 4. 识别出DSB3数据集中的肺结节

## 步骤
* 加载和处理数据集
* 使用训练好的U-net生成掩模
* 使用训练好的CNN降低假阳性率

In [None]:

### Settings to adjust before running ###

unetweightspath = "modelpths/unet9.pth"                   # path to the U-Net weights
truenoduleweightspath = "modelpths/truenodule-cnn.pth"    # path to the true-nodule CNN weights
INPUT_FOLDER = 'F:/Datasets/DSB3/stage1/'                 # DSB3 dataset location
datafolder = "F:/Datasets/DSB3-processed/"                # where processed data is written
# The output folder and its "images" / "masks" subfolders are created below.

import os

# Ask the CUDA caching allocator for expandable segments to limit memory
# fragmentation. The original author notes this may cost some performance
# because it causes more allocation/release operations.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

for subfolder in ('images', 'masks'):
    os.makedirs(datafolder + subfolder, exist_ok=True)

######################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import label
from skimage.measure import regionprops

import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    # set_device raises on a machine without CUDA, which would defeat the CPU
    # fallback above, so only select GPU 0 when a GPU actually exists.
    torch.cuda.set_device(0)
torch.backends.cudnn.benchmark = True

# os.listdir returns entries in arbitrary order. The position of a patient in
# this list is later used as the index in the saved file names, so sort the
# listing to keep that index stable across runs and machines.
patients = sorted(os.listdir(INPUT_FOLDER))


In [None]:
# Load the lung-nodule segmentation model (U-Net).
from NoduleUNet import UNet

unetmodel = UNet().to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded when running on the CPU.
unetmodel.load_state_dict(torch.load(unetweightspath, map_location=device))
unetmodel.eval()


In [None]:
# Classifier that separates true nodules from non-nodules.
from TrueNoduleClassifier import NoduleClassifier

# Create the model instance on the selected device.
Classifymodel = NoduleClassifier().to(device)

# Load the trained weights. map_location remaps the stored tensors onto
# `device`, so a checkpoint saved on a GPU can still be loaded on the CPU.
Classifymodel.load_state_dict(torch.load(truenoduleweightspath, map_location=device))
Classifymodel.eval()


In [None]:
# 定义一些方法

from ProcessCTData import load_scan, get_pixels_hu, Standardizeimage

def process_image_from_file(ppix):
    """Standardize each slice of a scan and stack the results with a channel axis.

    Args:
        ppix: Array of slices indexed along its first axis; every slice is
            passed to ``Standardizeimage``.

    Returns:
        A float32 array of shape ``(ppix.shape[0], 1, 512, 512)`` holding the
        standardized slices in channel 0.
    """
    slice_count = ppix.shape[0]
    standardized = np.ndarray((slice_count, 1, 512, 512), dtype=np.float32)
    for slice_idx, raw_slice in enumerate(ppix):
        standardized[slice_idx, 0] = Standardizeimage(raw_slice)
    return standardized

def predict_mask(images):
    """Predict a segmentation mask for every slice with the loaded U-Net.

    Slices are fed to ``unetmodel`` one at a time to keep the per-call GPU
    footprint small.

    Args:
        images: NumPy array whose first axis indexes slices; ``images[i:i+1]``
            is converted to a float tensor and passed to the model.

    Returns:
        A float32 array of shape ``(images.shape[0], 1, 512, 512)`` containing
        the model output for each slice.
    """
    num_test = images.shape[0]
    imgs_mask_test = np.ndarray([num_test, 1, 512, 512], dtype=np.float32)
    # Inference only: no_grad stops autograd from recording the forward pass,
    # so intermediate activations are not kept alive for a backward pass.
    with torch.no_grad():
        for i in range(num_test):
            batch = torch.from_numpy(images[i:i+1]).to(device).float()
            # Call the module itself (not .forward) so registered hooks run.
            imgs_mask_test[i] = unetmodel(batch).cpu().numpy()[0]
            torch.cuda.empty_cache()
    return imgs_mask_test

def get_nodule_index(imgs_mask_test):
    """Return the indices of slices whose predicted mask sums to more than 5.

    Args:
        imgs_mask_test: Sequence of per-slice masks; channel 0 of each entry
            is summed.

    Returns:
        List of integer slice indices, in ascending order.
    """
    mask_sum = []
    for maskslice in imgs_mask_test:
        mask_sum.append(np.sum(maskslice[0]))
    print('mask_sum: ', mask_sum)
    return [slice_idx for slice_idx, total in enumerate(mask_sum) if total > 5]

def get_true_indices(processed_pix, nodule_index):
    """Keep only the candidate slices that the classifier labels as class 1.

    Args:
        processed_pix: Array of preprocessed slices, indexed by slice number.
        nodule_index: Slice indices flagged as nodule candidates.

    Returns:
        List of the entries of ``nodule_index`` whose predicted class (argmax
        over the classifier output along dim 1) equals 1. Empty when
        ``nodule_index`` is empty.
    """
    # Nothing to classify: avoid sending an empty, wrongly shaped batch to the model.
    if len(nodule_index) == 0:
        return []

    nodule_imgs = np.array([processed_pix[ind] for ind in nodule_index], np.float32)
    print('nodule_imgs shape: ', nodule_imgs.shape)
    # Inference only: no_grad keeps autograd from retaining the graph for the
    # whole candidate batch.
    with torch.no_grad():
        outputs = Classifymodel(torch.from_numpy(nodule_imgs).to(device).float())
    del nodule_imgs
    torch.cuda.empty_cache()
    print('outputs: ', outputs)
    _, predictions = torch.max(outputs, dim=1)
    del outputs
    torch.cuda.empty_cache()
    print('predictions: ', predictions)
    # Move the predictions to host memory once instead of comparing device
    # tensor elements one by one.
    predicted_classes = predictions.cpu().numpy()
    true_indices = [ind for ind, cls in zip(nodule_index, predicted_classes) if cls == 1]
    return true_indices


def get_largest_nodule_properties(mask):
    """Measure the largest connected component of a thresholded mask.

    The mask is binarised at 0.5 (values strictly above 0.5 are foreground),
    split into connected components with ``label``, and the component with
    the most pixels is measured with ``regionprops``. The input array is not
    modified.

    Args:
        mask: NumPy array of mask values for one slice.

    Returns:
        Tuple ``(area, eccentricity, equivalent_diameter, major_axis_length,
        solidity)`` of the largest component; callers store the last value
        under the name "spiculation". When the mask has no foreground pixel
        the result is ``(0, nan, 0.0, 0.0, nan)``, because eccentricity and
        solidity are undefined for an empty region.
    """
    # Work on a fresh binary array so the caller's mask is left untouched.
    binary = (mask > 0.5).astype(np.int8)
    labeled_array, nf = label(binary)

    if nf == 0:
        # No component to measure; indexing the empty regionprops list would
        # raise IndexError.
        return 0, float("nan"), 0.0, 0.0, float("nan")

    if nf > 1:
        # Component labels run from 1 to nf; keep the first one with the
        # largest pixel count.
        areas_in_slice = [np.sum(labeled_array == lab) for lab in range(1, nf + 1)]
        largest_label = areas_in_slice.index(max(areas_in_slice)) + 1
        binary = (labeled_array == largest_label).astype(np.int8)

    largest = regionprops(binary)[0]
    area = largest.area
    eccentricity = largest.eccentricity
    diam = largest.equivalent_diameter
    diammajor = largest.major_axis_length
    spiculation = largest.solidity
    return area, eccentricity, diam, diammajor, spiculation


def process_patient(i):
    """Run the full detection pipeline for the patient at position ``i``.

    Loads the scan, standardizes it, predicts masks, keeps the slices the
    classifier accepts, and saves the kept slices and their masks as
    ``nodule_images_XXXX.npy`` / ``nodule_masks_XXXX.npy`` under ``datafolder``.

    Args:
        i: Index into the module-level ``patients`` list.

    Returns:
        Three parallel lists with one entry per kept slice: the patient name,
        the sum of the predicted mask, and the slice index. All three are
        empty (and nothing is saved) when no slice survives.
    """
    print("Processing patient #", i)
    patient_id = patients[i]
    patient_pix = get_pixels_hu(load_scan(INPUT_FOLDER + patient_id))
    processed_pix = process_image_from_file(patient_pix)
    # Rewrites every entry equal to zero as 0, which also turns -0.0 into +0.0.
    processed_pix[processed_pix==-0] = 0
    mask = predict_mask(processed_pix)

    nodule_index = get_nodule_index(mask)
    print("Nodule index: ", nodule_index)
    if not nodule_index:
        return [], [], []

    true_inds = get_true_indices(processed_pix, nodule_index)
    print("True indices: ", true_inds)
    if not true_inds:
        return [], [], []

    # Stack the accepted slices and their masks, then persist them per patient.
    nodule_images = np.array([patient_pix[ind] for ind in true_inds])
    nodule_masks = np.array([mask[ind] for ind in true_inds])

    file_tag = f"{i:04d}" + ".npy"
    np.save(datafolder + "images/" + "nodule_images_" + file_tag, nodule_images)
    np.save(datafolder + "masks/" + "nodule_masks_" + file_tag, nodule_masks)

    sample = [patient_id for _ in true_inds]
    area = [np.sum(mask[ind]) for ind in true_inds]
    nodule_indices = list(true_inds)
    return sample, area, nodule_indices


In [None]:
# Show how many entries the patients list contains.
print(len(patients))

In [None]:
# Start processing.

from tqdm.contrib.concurrent import thread_map

MAX_WORKERS = 28  # maximum number of worker threads

sample, area, nodule_indices = [], [], []

# Process all patients with a thread pool; each result is the
# (sample, area, nodule_indices) triple returned by process_patient.
results = thread_map(process_patient, range(len(patients)), max_workers=MAX_WORKERS)

for patient_names, mask_areas, slice_indices in results:
    sample.extend(patient_names)
    area.extend(mask_areas)
    nodule_indices.extend(slice_indices)
# Recorded run time: 267min 11.4s

In [None]:
# Re-read the saved per-patient arrays.

LoadPath = 'F:/Datasets/DSB3-processed/'

# os.listdir returns names in arbitrary order. Both listings are sorted so
# that nodule_images_XXXX.npy and nodule_masks_XXXX.npy pair up by their
# zero-padded index and the files are visited in ascending index order.

# Processed images
image_npy_paths = [
    os.path.join(os.getcwd(), f"{LoadPath}images", x)
    for x in sorted(os.listdir(f"{LoadPath}images"))
]

# Masks
mask_npy_paths = [
    os.path.join(os.getcwd(), f"{LoadPath}masks", x)
    for x in sorted(os.listdir(f"{LoadPath}masks"))
]

print("CT scans images: " + str(len(image_npy_paths)))
print("CT scans masks: " + str(len(mask_npy_paths)))


In [None]:
# Load every saved image array together with the mask array at the same
# position in mask_npy_paths.

nodule_images = []
nodule_masks = []

for file_idx, image_path in enumerate(image_npy_paths):
    nodule_images.append(np.load(image_path))
    nodule_masks.append(np.load(mask_npy_paths[file_idx]))


In [None]:
# Merge the per-patient arrays into one array each by stacking along the first axis.
nodule_images, nodule_masks = np.vstack(nodule_images), np.vstack(nodule_masks)

for caption, stacked in (
    ("Nodule images shape: ", nodule_images),
    ("Nodule masks shape: ", nodule_masks),
):
    print(caption, stacked.shape)


In [None]:
# nodule_images is not 4-dimensional after stacking, so insert a new axis at
# position 1. The ndim check makes the cell safe to re-run: without it every
# extra execution would add yet another axis.
if nodule_images.ndim == 3:
    nodule_images = nodule_images[:, np.newaxis, :, :]

print("Nodule images shape: ", nodule_images.shape)

In [None]:
# Save the merged image and mask arrays next to the per-patient folders.

for file_name, merged in (
    ("DSBNoduleImages.npy", nodule_images),
    ("DSBNoduleMasks.npy", nodule_masks),
):
    np.save(datafolder + file_name, merged)


In [None]:
# Preview a few images (top row) with their masks (bottom row).

fig, ax = plt.subplots(2, 4, figsize=(20, 10))

preview_indices = (10, 22, 46, 60)
for column, nodule_idx in enumerate(preview_indices):
    image_axis = ax[0, column]
    image_axis.axis('off')
    image_axis.imshow(nodule_images[nodule_idx][0], cmap='gray')
    image_axis.set_title("Nodule Image")

    mask_axis = ax[1, column]
    mask_axis.axis('off')
    mask_axis.imshow(nodule_masks[nodule_idx][0], cmap='gray')
    mask_axis.set_title("Nodule Mask")

plt.show()


In [None]:
# Build the per-slice feature table.

meannodule_HU = []
nodule_count = []
largest_area_list = []
eccentricity_list = []
diam_list = []
diammajor_list = []
spiculation_list = []

from tqdm.notebook import tqdm

for i in tqdm(range(nodule_masks.shape[0])):
    print("Processing nodule #", i)
    # `mask` is a view, so the thresholding below also binarises nodule_masks.
    mask = nodule_masks[i, 0]
    mask[mask > 0.5] = 1
    mask[mask < 0.5] = 0
    # Mean image value over the masked pixels.
    meannodule_HU.append(np.sum(nodule_images[i, 0] * mask) / np.sum(mask))
    labeled_array, features = label(mask)
    nodule_count.append(features)
    # Unpack into `largest_area`, not `area`: `area` is the per-slice list
    # collected during processing and is written to the "Area" column below.
    # Reusing that name here replaced the list with the last slice's scalar.
    largest_area, eccentricity, diam, diammajor, spiculation = get_largest_nodule_properties(mask)
    largest_area_list.append(largest_area)
    eccentricity_list.append(eccentricity)
    diam_list.append(diam)
    diammajor_list.append(diammajor)
    spiculation_list.append(spiculation)
table = pd.DataFrame({"Patient": sample, "NoduleIndex": nodule_indices, "Area": area, 
                      "MeanHU": meannodule_HU, "LargestNoduleArea": largest_area_list,
                      "Eccentricity": eccentricity_list, "Diameter": diam_list, 
                      "DiameterMajor": diammajor_list, "Spiculation": spiculation_list})

table.to_csv(datafolder+"DSBNoduleFeatures.csv")
