In [None]:
########################################
# 测试用的程序
########################################

# Locations of the nodule images, nodule masks and nodule feature table
# that were exported by DetectNodules.ipynb.
DATAFOLDER = 'F:\\Datasets\\DSB3-processed\\'
imageslist = [DATAFOLDER + 'DSBNoduleImages.npy']
maskslist = [DATAFOLDER + 'DSBNoduleMasks.npy']
tablelist = [DATAFOLDER + 'DSBNoduleFeatures.csv']

########################################

import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# 尝试使用可扩展的内存段来避免CUDA内存碎片化
# 请注意，这可能会导致性能下降，因为这会导致更多的内存分配和释放操作

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, roc_curve

# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Pin the process to GPU 0. This call must be guarded: on a machine
    # without a usable CUDA runtime it raises instead of falling back to CPU.
    torch.cuda.set_device(0)
# Allow cuDNN to benchmark convolution algorithms; the flag is harmless on CPU.
torch.backends.cudnn.benchmark = True


In [None]:
# 获取每个图像的最大结节，对处理过的图像进行64x64的裁剪，并用良恶性标签进行标记。
from ProcessCTData import processimagenomask, crop_nodule, largestnodulearea, largestnodulecoordinates

# Load every feature table listed in `tablelist` and stack them into a single
# frame. `ignore_index=True` renumbers the rows 0..n-1 directly, so no stale
# per-file index is carried along as an extra "index" column.
table = pd.concat([pd.read_csv(path) for path in tablelist], ignore_index=True)

print("Top 10 of DSBNoduleFeatures.csv :\n", table[:10])

# Build one 64x64 crop and one malignancy label per patient that has at least
# one row in the nodule feature table.

# Cancer label of every patient for which a crop is stored (filled in the loop).
malignantlabel = []
# Patient-level labels: the two DSB stage-1 CSV files stacked on top of each other.
malignancytable = pd.concat([pd.read_csv("DSB/stage1_labels.csv"), 
                             pd.read_csv("DSB/stage1_solution.csv")])
patients = malignancytable["id"].values
# Next free slot in `nodulecrops`; also the number of crops stored so far.
index = 0
noduleexists = []
# Uninitialised buffer with one 1x64x64 slot per labelled patient; it is
# trimmed to the `index` filled slots after the loop.
nodulecrops = np.ndarray([len(patients), 1, 64, 64])
# Row label (in the per-file feature table) of the nodule chosen for each patient.
indicies = []

for i in range(len(imageslist)):
    print("loading file", imageslist[i])
    noduleimages = np.load(imageslist[i])
    nodulemasks = np.load(maskslist[i])
    tabletemp = pd.read_csv(tablelist[i])
    biggestnodulearea = []

    # One value per mask. `largestnodulearea` is a project helper; presumably it
    # returns the area of the largest nodule in mask j -- TODO confirm.
    for j in range(nodulemasks.shape[0]):
        biggestnodulearea.append(largestnodulearea(nodulemasks[j, 0], tabletemp, j))

    tabletemp["LargestNoduleArea"] = pd.Series(biggestnodulearea)

    for patient in tqdm(patients):
        print("Process patient ", patient)
        # Rows of this file's feature table that belong to the current patient.
        nodulearea = tabletemp[["LargestNoduleArea"]].loc[tabletemp["Patient"] == patient]

        if len(nodulearea) > 0:
            # First matching "cancer" entry of the patient, converted to bool.
            malignantlabel.append(malignancytable["cancer"].loc[malignancytable["id"] == patient].values[0].astype(bool))
            noduleexists.append(1)
            # Index label of the first row holding the patient's maximum area.
            # NOTE(review): the label is used below as a positional index into the
            # image/mask arrays, which assumes table rows and array entries are
            # aligned one-to-one -- confirm against DetectNodules.ipynb.
            indx = nodulearea.loc[nodulearea["LargestNoduleArea"] == max(nodulearea["LargestNoduleArea"])].index[0]
            indicies.append(indx)
            # Project helpers; presumably this crops the processed image around the
            # largest nodule found in the mask -- TODO confirm.
            nodcrop = crop_nodule(largestnodulecoordinates(nodulemasks[indx, 0]), processimagenomask(noduleimages[indx, 0]))

            # Crops with fewer than 64*64 pixels are copied into the top-left corner
            # of a zero-filled 64x64 slot; any other crop is assigned directly.
            if nodcrop.shape[0] * nodcrop.shape[1] < 64 ** 2:
                nodulecrops[index, 0] = np.zeros([64, 64])
                nodulecrops[index, 0][0:nodcrop.shape[0], 0:nodcrop.shape[1]] = nodcrop
            else:
                nodulecrops[index, 0] = nodcrop

            index += 1

# Drop the unused (still uninitialised) slots at the end of the buffer.
nodulecrops = nodulecrops[:index]
# nodulecrops = nodulecrops.reshape(nodulecrops.shape[0], 64, 64, 1)
# features = table.iloc[indicies]
# features["label"] = malignantlabel
# Fraction of the collected labels that are positive (malignant).
positive_count = sum(1 for flag in malignantlabel if flag)
TFratio = positive_count / len(malignantlabel)
print("Percent labels True: ", TFratio)

# Freeze the label list into a boolean array for the later tensor conversion.
malignantlabel = np.asarray(malignantlabel, dtype=bool)

print("Number of nodules: ", nodulecrops.shape[0])


In [None]:
# Show a few of the cropped nodules together with their malignancy label.

preview_indices = (10, 22, 46, 60)
fig, ax = plt.subplots(1, 4, figsize=(20, 4))

for panel, sample in zip(ax, preview_indices):
    panel.imshow(nodulecrops[sample, 0])
    panel.axis("off")
    panel.set_title("Malignant: " + str(malignantlabel[sample]))
plt.show()


In [None]:
# 定义数据集类
from sklearn.model_selection import train_test_split

class NoduleDataset(Dataset):
    """Map-style dataset that pairs each nodule image with its label.

    Args:
        images: Indexable collection of images; its length is the dataset size.
        labels: Indexable collection of labels, addressed with the same
            indices as ``images``.
    """

    def __init__(self, images, labels):
        self.images, self.labels = images, labels

    def __len__(self):
        """Return the number of stored images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        return self.images[idx], self.labels[idx]

# Convert crops and labels to float tensors. Both conversions are written so
# that re-running this cell is harmless: `as_tensor` accepts an ndarray as well
# as an already-converted tensor, and the labels are one-hot encoded only while
# they are still the boolean NumPy array produced by the loading cell.
nodulecrops = torch.as_tensor(nodulecrops).float()
if not torch.is_tensor(malignantlabel):
    malignantlabel = F.one_hot(torch.from_numpy(malignantlabel).long(), 2).float()

# Fixed seed so the 70/30 split (and every metric derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    nodulecrops, malignantlabel, test_size=0.3, random_state=SPLIT_SEED
)

# Create data loaders for training and testing.
# Only the training loader shuffles; the test loader keeps a stable order.
train_dataset = NoduleDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataset = NoduleDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)


In [None]:
# 实例化模型

from NoduleCancerClassifier import CNNModel

# File the training loop writes the model weights to.
modelpath = "modelpths/NoduleCancerClassifier.pth"

# Build the classifier and move it onto the selected device.
model = CNNModel(2, 32)
model = model.to(device)

# Loss function and optimizer used by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# Train the model

history = []
epochs = 200

# Per-epoch metric curves; they are read again by the reporting cell.
train_loss_history = []
train_acc_history = []
test_loss_history = []
test_acc_history = []

# Lowest training loss seen so far. A checkpoint is written whenever the
# training loss of an epoch improves on it.
best_train_loss = np.inf

# Create the checkpoint directory up front; torch.save does not create
# missing parent directories and would fail on the first improvement.
checkpoint_dir = os.path.dirname(modelpath)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)


def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    Args:
        loader: DataLoader yielding ``(inputs, one-hot labels)`` batches.
        training: When True the model is put in train mode and the optimizer
            is stepped on every batch; when False the model is put in eval
            mode and gradients are disabled.

    Returns:
        Tuple of the loss averaged over batches and the fraction of samples
        whose arg-max prediction equals the class stored in label column 1.
    """
    model.train(training)
    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            # Follow the globally selected device instead of forcing CUDA.
            inputs = inputs.to(device)
            labels = labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                # No requires_grad_ override here: if the loss is detached from
                # the parameters, backward() must fail loudly rather than let
                # the optimizer step on missing gradients.
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            _, predicted_labels = torch.max(outputs, 1)
            # Column 1 of the one-hot label is 1.0 exactly for class 1.
            correct += (predicted_labels == labels[:, 1]).sum().item()
            seen += predicted_labels.size(0)
    return loss_sum / len(loader), correct / seen


for epoch in tqdm(range(epochs)):
    train_loss, train_accuracy = _run_epoch(train_loader, training=True)
    train_loss_history.append(train_loss)
    train_acc_history.append(train_accuracy)

    test_loss, test_accuracy = _run_epoch(test_loader, training=False)
    test_loss_history.append(test_loss)
    test_acc_history.append(test_accuracy)

    if train_loss < best_train_loss:
        print(f'Epoch {epoch + 1}/{epochs}')
        print(f'Train_loss decrease from {best_train_loss:.4f} to {train_loss:.4f}. Saving model...')
        best_train_loss = train_loss
        torch.save(model.state_dict(), modelpath)

# Single record holding all four curves of this training run.
history.append({
    'train_loss': train_loss_history,
    'train_acc': train_acc_history,
    'test_loss': test_loss_history,
    'test_acc': test_acc_history
})


In [None]:
# Evaluate the model on the held-out test split.
# The metrics are computed batch by batch from `test_loader`, so no training
# sample leaks into them and the data never has to fit on the device at once.

predicted = []
malignantlabeltest = []
loss_sum = 0.0
correct = 0
seen = 0

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        batch_size = labels.size(0)
        # The criterion averages over the batch; weight by batch size so the
        # final value is a per-sample mean even when the last batch is smaller.
        loss_sum += criterion(outputs, labels).item() * batch_size
        _, predicted_labels = torch.max(outputs, 1)
        correct += (predicted_labels == labels[:, 1]).sum().item()
        seen += batch_size
        # Rank samples by the class-1 probability. The raw class-1 output alone
        # is not a valid score because it ignores the class-0 output.
        predicted.append(torch.softmax(outputs, dim=1)[:, 1].cpu().numpy())
        malignantlabeltest.append(labels[:, 1].cpu().numpy())

val_loss = loss_sum / seen
val_accuracy = correct / seen

predicted = np.concatenate(predicted, axis=0)
malignantlabeltest = np.concatenate(malignantlabeltest, axis=0)

roc = roc_curve(malignantlabeltest, predicted)
auc = roc_auc_score(malignantlabeltest, predicted)


In [None]:
# Report the evaluation results
run_metrics = history[0]
for caption, key, best, best_index in (
    ("Lowest train_loss of", "train_loss", min, np.argmin),
    ("Lowest test_loss of", "test_loss", min, np.argmin),
    ("Highest train_acc of", "train_acc", max, np.argmax),
    ("Highest test_acc of", "test_acc", max, np.argmax),
):
    curve = run_metrics[key]
    # argmin/argmax are 0-based positions; add 1 so the reported epoch matches
    # the "Epoch k/N" numbering printed by the training loop.
    print(caption, best(curve), "at epoch", int(best_index(curve)) + 1)
print("AUC:", auc)


def _plot_curves(curves, ylabel, title):
    """Draw one line per ``(label, values)`` pair against the 1-based epoch."""
    plt.figure(figsize=(6, 6))
    for label, values in curves:
        plt.plot(range(1, len(values) + 1), values, label=label)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()


# Training and test loss curves
_plot_curves(
    [("Train Loss", train_loss_history), ("Test Loss", test_loss_history)],
    "Loss",
    "Loss Curve",
)

# Training and test accuracy curves
_plot_curves(
    [("Train Accuracy", train_acc_history), ("Test Accuracy", test_acc_history)],
    "Accuracy",
    "Accuracy Curve",
)

# ROC curve; `roc` holds (false positive rates, true positive rates, thresholds)
# and the dashed diagonal marks a random classifier.
plt.figure(figsize=(6, 6))
plt.plot(roc[0], roc[1])
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve (AUC={auc:.4f})")
plt.show()

