# 5.肺结节癌性预测

* 在每个病人样本中找到最大的结节并在其上进行64x64的裁剪
* 通过标签数据标记为癌性或非癌性，并创建相同数量的随机标签
* 使用分层K折（K-fold）划分数据以进行交叉验证
* 使用CNN进行癌性预测，交叉验证与消融实验

In [None]:
#EDIT HERE##############################

# Locations of the nodule images, nodule masks and feature table
# produced by DetectNodules.ipynb. Each list may hold several files.

DATAFOLDER = 'F:\\Datasets\\DSB3-processed\\'
imageslist = [DATAFOLDER + 'DSBNoduleImages.npy']
maskslist = [DATAFOLDER + 'DSBNoduleMasks.npy']
tablelist = [DATAFOLDER + 'DSBNoduleFeatures.csv']

########################################

import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Try expandable memory segments to avoid CUDA memory fragmentation.
# Note that this may reduce performance, because it leads to more memory
# allocation and release operations.
# The variable is set before the `import torch` statement that follows.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve

# Select the compute device: CUDA when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Only touch the CUDA runtime when a GPU is actually present;
    # torch.cuda.set_device raises on a machine without CUDA, which would
    # defeat the CPU fallback chosen above.
    torch.cuda.set_device(0)
# Let cuDNN auto-tune convolution algorithms (harmless when running on CPU).
torch.backends.cudnn.benchmark = True


### 获取图像中的肺结节切片

In [None]:
# For every labelled patient, pick the largest detected nodule, take a 64x64
# crop of the processed image around it and record the benign/malignant label.
from ProcessCTData import processimagenomask, crop_nodule, largestnodulearea, largestnodulecoordinates

# Merge all feature tables into one frame. Within this cell the merged table
# is only used for the preview print below.
table = pd.read_csv(tablelist[0])
if len(tablelist) > 1:
    for file in tablelist[1:]:
        temptable = pd.read_csv(file)
        table = pd.concat([table, temptable])
table = table.reset_index()

print("Top 10 of DSBNoduleFeatures.csv :\n", table[:10])

malignantlabel = []
# Cancer labels: stage-1 labels concatenated with the stage-1 solution file.
malignancytable = pd.concat([pd.read_csv("DSB/stage1_labels.csv"), 
                             pd.read_csv("DSB/stage1_solution.csv")])
patients = malignancytable["id"].values
# Number of crops written into `nodulecrops` so far.
index = 0
noduleexists = []
# Uninitialised buffer sized for one crop per patient; it is trimmed to
# `index` rows after the loop, so unused rows are never read.
nodulecrops = np.ndarray([len(patients), 1, 64, 64])
# Row index (in the per-file table) of the nodule selected for each patient.
indicies = []

for i in range(len(imageslist)):
    print("loading file", imageslist[i])
    noduleimages = np.load(imageslist[i])
    nodulemasks = np.load(maskslist[i])
    tabletemp = pd.read_csv(tablelist[i])
    biggestnodulearea = []

    # One value per mask. `largestnodulearea` is a ProcessCTData helper;
    # presumably it returns the area of the biggest nodule in the mask
    # (TODO confirm against ProcessCTData).
    for j in range(nodulemasks.shape[0]):
        biggestnodulearea.append(largestnodulearea(nodulemasks[j, 0], tabletemp, j))

    tabletemp["LargestNoduleArea"] = pd.Series(biggestnodulearea)

    for patient in tqdm(patients):
        print("Process patient ", patient)
        # Rows of this file's table that belong to the current patient.
        nodulearea = tabletemp[["LargestNoduleArea"]].loc[tabletemp["Patient"] == patient]

        if len(nodulearea) > 0:
            malignantlabel.append(malignancytable["cancer"].loc[malignancytable["id"] == patient].values[0].astype(bool))
            noduleexists.append(1)
            # First table row holding the patient's maximum area. The same
            # value is used below to index the image and mask arrays, which
            # assumes table rows and array entries are aligned (TODO confirm).
            indx = nodulearea.loc[nodulearea["LargestNoduleArea"] == max(nodulearea["LargestNoduleArea"])].index[0]
            indicies.append(indx)
            nodcrop = crop_nodule(largestnodulecoordinates(nodulemasks[indx, 0]), processimagenomask(noduleimages[indx, 0]))

            # NOTE(review): the test compares total area, not each dimension.
            # This presumably relies on crop_nodule never returning more than
            # 64 pixels along either axis - confirm against ProcessCTData.
            if nodcrop.shape[0] * nodcrop.shape[1] < 64 ** 2:
                # Smaller crop: zero-fill and place it in the top-left corner.
                nodulecrops[index, 0] = np.zeros([64, 64])
                nodulecrops[index, 0][0:nodcrop.shape[0], 0:nodcrop.shape[1]] = nodcrop
            else:
                nodulecrops[index, 0] = nodcrop

            index += 1

# Keep only the rows that were actually filled.
nodulecrops = nodulecrops[:index]
# nodulecrops = nodulecrops.reshape(nodulecrops.shape[0], 64, 64, 1)
# features = table.iloc[indicies]
# features["label"] = malignantlabel
# Fraction of collected labels that are True.
TFratio = len([a for a in malignantlabel if a == True]) / len(malignantlabel)
print("Percent labels True: ", TFratio)

# Control labels: random values drawn with the same positive rate as the
# true labels, one per collected crop.
randomlabel = np.random.choice([0, 1], size=(len(malignantlabel),), p=[(1 - TFratio), TFratio])
malignantlabel = np.array(malignantlabel, dtype=bool)
randomlabel = np.array(randomlabel, dtype=bool)


In [None]:
# Show a few example nodule crops, each titled with its malignancy label.

sample_ids = (10, 22, 46, 60)
fig, ax = plt.subplots(1, 4, figsize=(20, 4))

for panel, sample in zip(ax, sample_ids):
    panel.imshow(nodulecrops[sample, 0])
    panel.axis("off")
    panel.set_title("Malignant: " + str(malignantlabel[sample]))
plt.show()


### 定义基础CNN和各参数，进行训练与验证

In [None]:
# Define the CNN model

class CNNModel(nn.Module):
    """Baseline CNN: two 3x3 convolutions, one max-pool and two linear layers.

    The first linear layer is sized for a 30x30 feature map, which is what a
    64x64 single-channel input produces (64 -> 62 -> 60 -> pooled to 30).

    Args:
        num_classes: Number of output classes.
        width: Channel count of the first convolution; later layers use
            multiples of it.

    NOTE(review): ``forward`` returns softmax probabilities. The training
    cells in this notebook pass that output to ``nn.CrossEntropyLoss``, which
    applies log-softmax itself - confirm this is intended.
    """

    def __init__(self, num_classes, width):
        super().__init__()
        self.width = width
        self.conv1 = nn.Conv2d(1, width, kernel_size=3)
        self.conv2 = nn.Conv2d(width, 2 * width, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout1 = nn.Dropout(0.50)
        self.fc1 = nn.Linear(2 * width * 30 * 30, 4 * width)
        self.dropout2 = nn.Dropout(0.50)
        self.fc2 = nn.Linear(4 * width, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-class probabilities for a batch of 1x64x64 inputs."""
        flat_features = 2 * self.width * 30 * 30
        h = F.relu(self.conv1(x))           # 64 -> 62
        h = F.relu(self.conv2(h))           # 62 -> 60
        h = self.pool1(h)                   # 60 -> 30
        h = self.dropout1(h)
        h = h.view(-1, flat_features)
        h = self.dropout2(F.relu(self.fc1(h)))
        return self.softmax(self.fc2(h))

# Define the dataset
class CustomDataset(Dataset):
    """Pair an indexable collection of samples with equally indexed labels.

    Args:
        data: Indexable, sized collection of samples.
        labels: Indexable collection of labels, addressed with the same index.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The size comes from the samples alone; labels are not consulted.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target


In [None]:
# Hyperparameters, output location and result accumulators
modelpath = 'cancerpredpths/'
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True)
n_kfold = 1
num_classes = 2
width = 32
epochs = 200
batch_size = 512
cvscores = []
cvscoresrandom = []
history = []
historyrandom = []
aucscores = []
aucscoresrandom = []
predicted = []
malignantlabeltest = []
predictedrandom = []
randomlabeltest = []

# Convert the data to PyTorch tensors (labels become one-hot float vectors).
# Each conversion is guarded so that re-running this cell after the arrays
# have already been converted does not fail.
if isinstance(nodulecrops, np.ndarray):
    nodulecrops = torch.from_numpy(nodulecrops).float()
if isinstance(malignantlabel, np.ndarray):
    malignantlabel = F.one_hot(torch.from_numpy(malignantlabel).long(), 2).float()
if isinstance(randomlabel, np.ndarray):
    randomlabel = F.one_hot(torch.from_numpy(randomlabel).long(), 2).float()


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled. Batches are moved to the module-level ``device``.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            predicted_labels = outputs.argmax(dim=1)
            # Column 1 of the one-hot labels is the positive-class indicator.
            correct += (predicted_labels == labels[:, 1]).sum().item()
            seen += predicted_labels.size(0)
    return loss_sum / len(loader), correct / seen


# The same k-fold procedure is run twice: once with the true labels and once
# with the random control labels. Each entry names the output sub-folder tag,
# the label tensor and the accumulators that receive the results.
for tag, labelset, scores, histories, aucs, preds, truths in (
    ('normal', malignantlabel, cvscores, history, aucscores, predicted, malignantlabeltest),
    ('random', randomlabel, cvscoresrandom, historyrandom, aucscoresrandom, predictedrandom, randomlabeltest),
):
    n_kfold = 1
    for train, test in tqdm(kfold.split(nodulecrops, labelset[:, 1]), total=n_splits):
        folddir = modelpath + f'k-fold-{tag}-{n_kfold}/'
        os.makedirs(folddir, exist_ok=True)
        print(f'Fold {n_kfold}/{n_splits}')

        # Fresh model, loss and optimizer for every fold.
        # NOTE(review): CNNModel.forward already applies softmax, while
        # nn.CrossEntropyLoss expects unnormalised logits - confirm intended.
        model = CNNModel(num_classes, width).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=1e-5)

        train_loader = DataLoader(CustomDataset(nodulecrops[train], labelset[train]),
                                  batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(CustomDataset(nodulecrops[test], labelset[test]),
                                batch_size=batch_size, shuffle=False)

        train_loss_history = []
        train_acc_history = []
        val_loss_history = []
        val_acc_history = []

        # 'model_best.pth' is selected on the lowest *training* loss.
        best_train_loss = np.inf

        for epoch in tqdm(range(epochs)):
            train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, optimizer)
            val_loss, val_accuracy = _run_epoch(model, val_loader, criterion)
            train_loss_history.append(train_loss)
            train_acc_history.append(train_accuracy)
            val_loss_history.append(val_loss)
            val_acc_history.append(val_accuracy)

            if train_loss < best_train_loss:
                print(f'Epoch {epoch + 1}/{epochs}')
                print(f'Train_loss decrease from {best_train_loss:.4f} to {train_loss:.4f}. Saving model...')
                best_train_loss = train_loss
                torch.save(model.state_dict(), folddir + 'model_best.pth')

        histories.append({
            'train_loss': train_loss_history,
            'train_acc': train_acc_history,
            'val_loss': val_loss_history,
            'val_acc': val_acc_history
        })

        # Evaluate the final-epoch model on the whole held-out fold at once.
        model.eval()
        with torch.no_grad():
            outputs = model(nodulecrops[test].to(device))
            fold_truth = labelset[test][:, 1]
            fold_scores = outputs[:, 1].cpu().numpy()
            predicted_labels = outputs.argmax(dim=1).cpu()
            val_loss = criterion(outputs, labelset[test].to(device)).item()
            val_accuracy = (predicted_labels == fold_truth).sum().item() / len(fold_truth)
            scores.append([val_loss, val_accuracy])
            preds.append(fold_scores)
            truths.append(fold_truth.numpy())
            aucs.append(roc_auc_score(fold_truth.numpy(), fold_scores))

        # Save the final model
        torch.save(model.state_dict(), folddir + 'model_final.pth')
        n_kfold += 1

# Pool the held-out predictions of all folds into one ROC curve per label set.
predicted = np.concatenate(predicted, axis=0)
malignantlabeltest = np.concatenate(malignantlabeltest, axis=0)
roc = roc_curve(malignantlabeltest, predicted)

predictedrandom = np.concatenate(predictedrandom, axis=0)
randomlabeltest = np.concatenate(randomlabeltest, axis=0)
rocrandom = roc_curve(randomlabeltest, predictedrandom)


In [None]:
# Evaluate the models with several summary metrics

print("Mean loss across all CV sets with true labels:", np.mean([fold[0] for fold in cvscores]))
print("Mean loss across all CV sets with random labels:", np.mean([fold[0] for fold in cvscoresrandom]))
print("Mean accuracy across all CV sets with true labels:", np.mean([fold[1] for fold in cvscores]))
print("Mean accuracy across all CV sets with random labels:", np.mean([fold[1] for fold in cvscoresrandom]))

# Per-epoch validation loss averaged over the folds, computed once and reused.
fold_ids = range(n_splits)
valloss = np.mean([history[k]['val_loss'] for k in fold_ids], axis=0)
randvalloss = np.mean([historyrandom[k]['val_loss'] for k in fold_ids], axis=0)

print("Lowest val_loss of", min(valloss), "at epoch", np.where(valloss == min(valloss))[0], "with true labels")
print("Lowest val_loss of", min(randvalloss), "at epoch", np.where(randvalloss == min(randvalloss))[0], "with random labels")
print("Average AUC across CV sets with true labels:", np.mean(aucscores))
print("Average AUC across CV sets with random labels:", np.mean(aucscoresrandom))

# Remaining per-epoch curves, again averaged over the folds.
acc = np.mean([history[k]['train_acc'] for k in fold_ids], axis=0)
valacc = np.mean([history[k]['val_acc'] for k in fold_ids], axis=0)
loss = np.mean([history[k]['train_loss'] for k in fold_ids], axis=0)
randacc = np.mean([historyrandom[k]['train_acc'] for k in fold_ids], axis=0)
randvalacc = np.mean([historyrandom[k]['val_acc'] for k in fold_ids], axis=0)
randloss = np.mean([historyrandom[k]['train_loss'] for k in fold_ids], axis=0)

curve_names = ['train truelabel', 'validation truelabel', 'train randomlabel', 'val randomlabel']

# Loss curves
for curve in (loss, valloss, randloss, randvalloss):
    plt.plot(curve)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(curve_names, loc='best')
plt.ylim([0.45, 0.7])
plt.show()

# Accuracy curves
for curve in (acc, valacc, randacc, randvalacc):
    plt.plot(curve)
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(curve_names, loc='best')
plt.ylim([0.7, 0.78])
plt.show()

# ROC curves (roc_curve returns false-positive rate first, then true-positive rate)
for fpr_tpr in (roc, rocrandom):
    plt.plot(fpr_tpr[0], fpr_tpr[1])
plt.title('ROC')
plt.ylabel('TPrate')
plt.xlabel('FPrate')
plt.legend(['true label', 'random label'])
plt.show()

### 添加一层卷积层

In [None]:
# Define the CNN model

class CNNModel(nn.Module):
    """Baseline CNN extended with a third, size-preserving convolution.

    The first linear layer is sized for a 30x30 feature map, which is what a
    64x64 single-channel input produces (64 -> 62 -> 60 -> pooled to 30; the
    third convolution uses padding=1 and keeps 30x30).

    Args:
        num_classes: Number of output classes.
        width: Channel count of the first convolution; later layers use
            multiples of it.

    NOTE(review): ``forward`` returns softmax probabilities. The training
    cells in this notebook pass that output to ``nn.CrossEntropyLoss``, which
    applies log-softmax itself - confirm this is intended.
    """

    def __init__(self, num_classes, width):
        super().__init__()
        self.width = width
        self.conv1 = nn.Conv2d(1, width, kernel_size=3)
        self.conv2 = nn.Conv2d(width, 2 * width, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Extra convolution layer added for this experiment.
        self.conv3 = nn.Conv2d(2 * width, 2 * width, kernel_size=3, padding=1)
        self.dropout1 = nn.Dropout(0.50)
        self.fc1 = nn.Linear(2 * width * 30 * 30, 4 * width)
        self.dropout2 = nn.Dropout(0.50)
        self.fc2 = nn.Linear(4 * width, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-class probabilities for a batch of 1x64x64 inputs."""
        flat_features = 2 * self.width * 30 * 30
        h = F.relu(self.conv1(x))           # 64 -> 62
        h = F.relu(self.conv2(h))           # 62 -> 60
        h = self.pool1(h)                   # 60 -> 30
        h = F.relu(self.conv3(h))           # 30 -> 30
        h = self.dropout1(h)
        h = h.view(-1, flat_features)
        h = self.dropout2(F.relu(self.fc1(h)))
        return self.softmax(self.fc2(h))

# Define the dataset
class CustomDataset(Dataset):
    """Pair an indexable collection of samples with equally indexed labels.

    Args:
        data: Indexable, sized collection of samples.
        labels: Indexable collection of labels, addressed with the same index.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The size comes from the samples alone; labels are not consulted.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target


In [None]:
# Hyperparameters, output location and result accumulators
modelpath = 'cancerpredpths1/'
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True)
n_kfold = 1
num_classes = 2
width = 32
epochs = 200
batch_size = 512
cvscores = []
cvscoresrandom = []
history = []
historyrandom = []
aucscores = []
aucscoresrandom = []
predicted = []
malignantlabeltest = []
predictedrandom = []
randomlabeltest = []

# nodulecrops, malignantlabel and randomlabel are expected to already be
# PyTorch tensors here (the conversion is done in the baseline training cell).


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled. Batches are moved to the module-level ``device``.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            predicted_labels = outputs.argmax(dim=1)
            # Column 1 of the one-hot labels is the positive-class indicator.
            correct += (predicted_labels == labels[:, 1]).sum().item()
            seen += predicted_labels.size(0)
    return loss_sum / len(loader), correct / seen


# The same k-fold procedure is run twice: once with the true labels and once
# with the random control labels. Each entry names the output sub-folder tag,
# the label tensor and the accumulators that receive the results.
for tag, labelset, scores, histories, aucs, preds, truths in (
    ('normal', malignantlabel, cvscores, history, aucscores, predicted, malignantlabeltest),
    ('random', randomlabel, cvscoresrandom, historyrandom, aucscoresrandom, predictedrandom, randomlabeltest),
):
    n_kfold = 1
    for train, test in tqdm(kfold.split(nodulecrops, labelset[:, 1]), total=n_splits):
        folddir = modelpath + f'k-fold-{tag}-{n_kfold}/'
        os.makedirs(folddir, exist_ok=True)
        print(f'Fold {n_kfold}/{n_splits}')

        # Fresh model, loss and optimizer for every fold.
        # NOTE(review): CNNModel.forward already applies softmax, while
        # nn.CrossEntropyLoss expects unnormalised logits - confirm intended.
        model = CNNModel(num_classes, width).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=1e-5)

        train_loader = DataLoader(CustomDataset(nodulecrops[train], labelset[train]),
                                  batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(CustomDataset(nodulecrops[test], labelset[test]),
                                batch_size=batch_size, shuffle=False)

        train_loss_history = []
        train_acc_history = []
        val_loss_history = []
        val_acc_history = []

        # 'model_best.pth' is selected on the lowest *training* loss.
        best_train_loss = np.inf

        for epoch in tqdm(range(epochs)):
            train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, optimizer)
            val_loss, val_accuracy = _run_epoch(model, val_loader, criterion)
            train_loss_history.append(train_loss)
            train_acc_history.append(train_accuracy)
            val_loss_history.append(val_loss)
            val_acc_history.append(val_accuracy)

            if train_loss < best_train_loss:
                print(f'Epoch {epoch + 1}/{epochs}')
                print(f'Train_loss decrease from {best_train_loss:.4f} to {train_loss:.4f}. Saving model...')
                best_train_loss = train_loss
                torch.save(model.state_dict(), folddir + 'model_best.pth')

        histories.append({
            'train_loss': train_loss_history,
            'train_acc': train_acc_history,
            'val_loss': val_loss_history,
            'val_acc': val_acc_history
        })

        # Evaluate the final-epoch model on the whole held-out fold at once.
        model.eval()
        with torch.no_grad():
            outputs = model(nodulecrops[test].to(device))
            fold_truth = labelset[test][:, 1]
            fold_scores = outputs[:, 1].cpu().numpy()
            predicted_labels = outputs.argmax(dim=1).cpu()
            val_loss = criterion(outputs, labelset[test].to(device)).item()
            val_accuracy = (predicted_labels == fold_truth).sum().item() / len(fold_truth)
            scores.append([val_loss, val_accuracy])
            preds.append(fold_scores)
            truths.append(fold_truth.numpy())
            aucs.append(roc_auc_score(fold_truth.numpy(), fold_scores))

        # Save the final model
        torch.save(model.state_dict(), folddir + 'model_final.pth')
        n_kfold += 1

# Pool the held-out predictions of all folds into one ROC curve per label set.
predicted = np.concatenate(predicted, axis=0)
malignantlabeltest = np.concatenate(malignantlabeltest, axis=0)
roc = roc_curve(malignantlabeltest, predicted)

predictedrandom = np.concatenate(predictedrandom, axis=0)
randomlabeltest = np.concatenate(randomlabeltest, axis=0)
rocrandom = roc_curve(randomlabeltest, predictedrandom)


In [None]:
# Evaluate the models with several summary metrics

print("Mean loss across all CV sets with true labels:", np.mean([fold[0] for fold in cvscores]))
print("Mean loss across all CV sets with random labels:", np.mean([fold[0] for fold in cvscoresrandom]))
print("Mean accuracy across all CV sets with true labels:", np.mean([fold[1] for fold in cvscores]))
print("Mean accuracy across all CV sets with random labels:", np.mean([fold[1] for fold in cvscoresrandom]))

# Per-epoch validation loss averaged over the folds, computed once and reused.
fold_ids = range(n_splits)
valloss = np.mean([history[k]['val_loss'] for k in fold_ids], axis=0)
randvalloss = np.mean([historyrandom[k]['val_loss'] for k in fold_ids], axis=0)

print("Lowest val_loss of", min(valloss), "at epoch", np.where(valloss == min(valloss))[0], "with true labels")
print("Lowest val_loss of", min(randvalloss), "at epoch", np.where(randvalloss == min(randvalloss))[0], "with random labels")
print("Average AUC across CV sets with true labels:", np.mean(aucscores))
print("Average AUC across CV sets with random labels:", np.mean(aucscoresrandom))

# Remaining per-epoch curves, again averaged over the folds.
acc = np.mean([history[k]['train_acc'] for k in fold_ids], axis=0)
valacc = np.mean([history[k]['val_acc'] for k in fold_ids], axis=0)
loss = np.mean([history[k]['train_loss'] for k in fold_ids], axis=0)
randacc = np.mean([historyrandom[k]['train_acc'] for k in fold_ids], axis=0)
randvalacc = np.mean([historyrandom[k]['val_acc'] for k in fold_ids], axis=0)
randloss = np.mean([historyrandom[k]['train_loss'] for k in fold_ids], axis=0)

curve_names = ['train truelabel', 'validation truelabel', 'train randomlabel', 'val randomlabel']

# Loss curves
for curve in (loss, valloss, randloss, randvalloss):
    plt.plot(curve)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(curve_names, loc='best')
plt.ylim([0.45, 0.7])
plt.show()

# Accuracy curves
for curve in (acc, valacc, randacc, randvalacc):
    plt.plot(curve)
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(curve_names, loc='best')
plt.ylim([0.7, 0.78])
plt.show()

# ROC curves (roc_curve returns false-positive rate first, then true-positive rate)
for fpr_tpr in (roc, rocrandom):
    plt.plot(fpr_tpr[0], fpr_tpr[1])
plt.title('ROC')
plt.ylabel('TPrate')
plt.xlabel('FPrate')
plt.legend(['true label', 'random label'])
plt.show()

### 再将第一个dropout层修改为0.75

In [None]:
# Define the CNN model

class CNNModel(nn.Module):
    """Three-convolution CNN with the first dropout rate raised to 0.75.

    The first linear layer is sized for a 30x30 feature map, which is what a
    64x64 single-channel input produces (64 -> 62 -> 60 -> pooled to 30; the
    third convolution uses padding=1 and keeps 30x30).

    Args:
        num_classes: Number of output classes.
        width: Channel count of the first convolution; later layers use
            multiples of it.

    NOTE(review): ``forward`` returns softmax probabilities. The training
    cells in this notebook pass that output to ``nn.CrossEntropyLoss``, which
    applies log-softmax itself - confirm this is intended.
    """

    def __init__(self, num_classes, width):
        super().__init__()
        self.width = width
        self.conv1 = nn.Conv2d(1, width, kernel_size=3)
        self.conv2 = nn.Conv2d(width, 2 * width, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Extra convolution layer.
        self.conv3 = nn.Conv2d(2 * width, 2 * width, kernel_size=3, padding=1)
        # Dropout changed to 0.75 for this experiment.
        self.dropout1 = nn.Dropout(0.75)
        self.fc1 = nn.Linear(2 * width * 30 * 30, 4 * width)
        self.dropout2 = nn.Dropout(0.50)
        self.fc2 = nn.Linear(4 * width, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-class probabilities for a batch of 1x64x64 inputs."""
        flat_features = 2 * self.width * 30 * 30
        h = F.relu(self.conv1(x))           # 64 -> 62
        h = F.relu(self.conv2(h))           # 62 -> 60
        h = self.pool1(h)                   # 60 -> 30
        h = F.relu(self.conv3(h))           # 30 -> 30
        h = self.dropout1(h)
        h = h.view(-1, flat_features)
        h = self.dropout2(F.relu(self.fc1(h)))
        return self.softmax(self.fc2(h))

# Define the dataset
class CustomDataset(Dataset):
    """Pair an indexable collection of samples with equally indexed labels.

    Args:
        data: Indexable, sized collection of samples.
        labels: Indexable collection of labels, addressed with the same index.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The size comes from the samples alone; labels are not consulted.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target


In [None]:
# Experiment parameters
modelpath = 'cancerpredpths2/'   # root directory for this run's checkpoints
num_classes = 2
width = 32
epochs = 200
batch_size = 512

# Cross-validation: one shuffled stratified splitter, used by both the
# true-label and the random-label loops below.
n_splits = 5
n_kfold = 1                      # fold counter used in printed messages and checkpoint paths
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True)

# Result accumulators; the loops below append one entry per fold.
cvscores, cvscoresrandom = [], []
history, historyrandom = [], []
aucscores, aucscoresrandom = [], []
predicted, predictedrandom = [], []
malignantlabeltest, randomlabeltest = [], []

# Tensor conversion, kept for reference only. The loops below call .cuda() on
# slices of these arrays, so they are presumably already converted in an
# earlier cell - confirm before re-enabling.
# nodulecrops = torch.from_numpy(nodulecrops).float()
# malignantlabel = F.one_hot(torch.from_numpy(malignantlabel).long(), 2).float()
# randomlabel = F.one_hot(torch.from_numpy(randomlabel).long(), 2).float()

# Perform k-fold cross-validation with the true malignancy labels.
# Folds are stratified on column 1 of malignantlabel (presumably the
# "malignant" column of a one-hot encoding - confirm against the cell that
# builds the labels).
for train, test in tqdm(kfold.split(nodulecrops, malignantlabel[:,1]), total=n_splits):
    os.makedirs(modelpath + f'k-fold-normal-{n_kfold}/', exist_ok=True)
    print(f'Fold {n_kfold}/{n_splits}')
    # Create a fresh model for this fold
    model = CNNModel(num_classes, width).cuda()

    # Define the loss function and optimizer
    # NOTE(review): CNNModel.forward ends with Softmax, while
    # nn.CrossEntropyLoss is documented to take unnormalized logits - confirm
    # that feeding it probabilities is intended.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    # Create data loaders for training and validation.
    # Training batches are reshuffled every epoch; validation order is fixed.
    train_dataset = CustomDataset(nodulecrops[train], malignantlabel[train])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = CustomDataset(nodulecrops[test], malignantlabel[test])
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Per-epoch metric histories for this fold
    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    # Lowest mean *training* loss seen so far in this fold; it decides when
    # model_best.pth is overwritten (validation loss is not used for this).
    best_train_loss = np.inf

    for epoch in tqdm(range(epochs)):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_correct = 0
        total = 0
        for inputs, labels in train_loader:
            inputs = inputs.cuda()
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # NOTE(review): looks redundant - the loss is computed from the
            # model's parameters and should already require grad; confirm.
            loss.requires_grad_(True)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # Predicted class = argmax over dim 1; it is compared with
            # column 1 of the labels.
            _, predicted_labels = torch.max(outputs, 1)
            train_correct += (predicted_labels == labels[:,1]).sum().item()
            total += predicted_labels.size(0)
        # Mean of the per-batch losses (not weighted by batch size)
        train_loss = train_loss / len(train_loader)
        train_accuracy = train_correct / total
        train_loss_history.append(train_loss)
        train_acc_history.append(train_accuracy)

        # ---- validation pass (no gradients, eval mode) ----
        model.eval()
        val_loss = 0.0
        val_correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.cuda()
                labels = labels.cuda()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted_labels = torch.max(outputs, 1)
                val_correct += (predicted_labels == labels[:,1]).sum().item()
                total += predicted_labels.size(0)
        # Mean of the per-batch losses (not weighted by batch size)
        val_loss = val_loss / len(val_loader)
        val_accuracy = val_correct / total
        val_loss_history.append(val_loss)
        val_acc_history.append(val_accuracy)

        # print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')
        # Checkpoint whenever the epoch's mean training loss improves.
        if train_loss < best_train_loss:
            print(f'Epoch {epoch + 1}/{epochs}')
            print(f'Train_loss decrease from {best_train_loss:.4f} to {train_loss:.4f}. Saving model...')
            best_train_loss = train_loss
            torch.save(model.state_dict(), modelpath + f'k-fold-normal-{n_kfold}/' + 'model_best.pth')
    # Keep this fold's per-epoch curves for the summary cell
    history.append({
        'train_loss': train_loss_history,
        'train_acc': train_acc_history,
        'val_loss': val_loss_history,
        'val_acc': val_acc_history
    })

    # Evaluate the model on the held-out fold.
    # This uses the in-memory weights from the last epoch (model_best.pth is
    # not reloaded) and pushes the whole test fold through in one forward pass.
    model.eval()
    with torch.no_grad():
        outputs = model(nodulecrops[test].cuda())
        _, predicted_labels = torch.max(outputs, 1)
        val_loss = criterion(outputs, malignantlabel[test].cuda()).item()
        val_accuracy = (predicted_labels.cpu() == malignantlabel[test][:,1]).sum().item() / len(malignantlabel[test][:,1])
        cvscores.append([val_loss, val_accuracy])
        # Column 1 of the output serves as the score for ROC / AUC
        predicted.append(outputs[:, 1].cpu().numpy())
        malignantlabeltest.append(malignantlabel[test][:,1])
        aucscores.append(roc_auc_score(malignantlabel[test][:,1].cpu().numpy(), outputs[:, 1].cpu().numpy()))

    # Save the final (last-epoch) model
    torch.save(model.state_dict(), modelpath + f'k-fold-normal-{n_kfold}/' + 'model_final.pth')
    n_kfold += 1

# Pool the held-out scores and labels of all folds into one ROC curve
predicted = np.concatenate(predicted, axis=0)
malignantlabeltest = np.concatenate(malignantlabeltest, axis=0)
roc = roc_curve(malignantlabeltest, predicted)

# Reset the fold counter for the second (random-label) run
n_kfold = 1

# Perform k-fold cross-validation for random labels.
# Same procedure as the true-label run, with randomlabel in place of
# malignantlabel and checkpoints written under k-fold-random-*/.
for train, test in tqdm(kfold.split(nodulecrops, randomlabel[:,1]), total=n_splits):
    os.makedirs(modelpath + f'k-fold-random-{n_kfold}/', exist_ok=True)
    print(f'Fold {n_kfold}/{n_splits}')
    # Create a fresh model for this fold
    model = CNNModel(num_classes, width).cuda()

    # Define the loss function and optimizer
    # NOTE(review): CNNModel.forward ends with Softmax, while
    # nn.CrossEntropyLoss is documented to take unnormalized logits - confirm
    # that feeding it probabilities is intended.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    # Create data loaders for training and validation.
    # Training batches are reshuffled every epoch; validation order is fixed.
    train_dataset = CustomDataset(nodulecrops[train], randomlabel[train])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = CustomDataset(nodulecrops[test], randomlabel[test])
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Per-epoch metric histories for this fold
    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    # Lowest mean *training* loss seen so far in this fold; it decides when
    # model_best.pth is overwritten (validation loss is not used for this).
    best_train_loss = np.inf
    
    for epoch in tqdm(range(epochs)):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_correct = 0
        total = 0
        for inputs, labels in train_loader:
            inputs = inputs.cuda()
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # NOTE(review): looks redundant - the loss is computed from the
            # model's parameters and should already require grad; confirm.
            loss.requires_grad_(True)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # Predicted class = argmax over dim 1; it is compared with
            # column 1 of the labels.
            _, predicted_labels = torch.max(outputs, 1)
            train_correct += (predicted_labels == labels[:,1]).sum().item()
            total += predicted_labels.size(0)
        # Mean of the per-batch losses (not weighted by batch size)
        train_loss = train_loss / len(train_loader)
        train_accuracy = train_correct / total
        train_loss_history.append(train_loss)
        train_acc_history.append(train_accuracy)

        # ---- validation pass (no gradients, eval mode) ----
        model.eval()
        val_loss = 0.0
        val_correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.cuda()
                labels = labels.cuda()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted_labels = torch.max(outputs, 1)
                val_correct += (predicted_labels == labels[:,1]).sum().item()
                total += predicted_labels.size(0)
        # Mean of the per-batch losses (not weighted by batch size)
        val_loss = val_loss / len(val_loader)
        val_accuracy = val_correct / total
        val_loss_history.append(val_loss)
        val_acc_history.append(val_accuracy)

        # print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')
        # Checkpoint whenever the epoch's mean training loss improves.
        if train_loss < best_train_loss:
            print(f'Epoch {epoch + 1}/{epochs}')
            print(f'Train_loss decrease from {best_train_loss:.4f} to {train_loss:.4f}. Saving model...')
            best_train_loss = train_loss
            torch.save(model.state_dict(), modelpath + f'k-fold-random-{n_kfold}/' + 'model_best.pth')

    # Keep this fold's per-epoch curves for the summary cell
    historyrandom.append({
        'train_loss': train_loss_history,
        'train_acc': train_acc_history,
        'val_loss': val_loss_history,
        'val_acc': val_acc_history
    })

    # Evaluate the model on the held-out fold.
    # This uses the in-memory weights from the last epoch (model_best.pth is
    # not reloaded) and pushes the whole test fold through in one forward pass.
    model.eval()
    with torch.no_grad():
        outputs = model(nodulecrops[test].cuda())
        _, predicted_labels = torch.max(outputs, 1)
        val_loss = criterion(outputs, randomlabel[test].cuda()).item()
        val_accuracy = (predicted_labels.cpu() == randomlabel[test][:,1]).sum().item() / len(randomlabel[test][:,1])
        cvscoresrandom.append([val_loss, val_accuracy])
        # Column 1 of the output serves as the score for ROC / AUC
        predictedrandom.append(outputs[:, 1].cpu().numpy())
        randomlabeltest.append(randomlabel[test][:,1])
        aucscoresrandom.append(roc_auc_score(randomlabel[test][:,1].cpu().numpy(), outputs[:, 1].cpu().numpy()))
    
    # Save the final (last-epoch) model
    torch.save(model.state_dict(), modelpath + f'k-fold-random-{n_kfold}/' + 'model_final.pth')
    n_kfold += 1

# Pool the held-out scores and labels of all folds into one ROC curve
predictedrandom = np.concatenate(predictedrandom, axis=0)
randomlabeltest = np.concatenate(randomlabeltest, axis=0)
rocrandom = roc_curve(randomlabeltest, predictedrandom)


In [None]:
def _fold_mean(runs, key):
    """Average the per-epoch series `key` over the first n_splits fold records."""
    return np.mean([runs[i][key] for i in range(n_splits)], axis=0)


# Final held-out loss / accuracy of each fold, averaged over the folds.
# Each cvscores entry is [val_loss, val_accuracy].
print("Mean loss across all CV sets with true labels:", np.mean([fold_scores[0] for fold_scores in cvscores]))
print("Mean loss across all CV sets with random labels:", np.mean([fold_scores[0] for fold_scores in cvscoresrandom]))
print("Mean accuracy across all CV sets with true labels:", np.mean([fold_scores[1] for fold_scores in cvscores]))
print("Mean accuracy across all CV sets with random labels:", np.mean([fold_scores[1] for fold_scores in cvscoresrandom]))

# Lowest fold-averaged validation loss and the (0-based) epoch index/indices
# at which it occurs. The averaged curve is computed once and reused.
valloss = _fold_mean(history, 'val_loss')
_lowest = min(valloss)
print("Lowest val_loss of", _lowest, "at epoch", np.where(valloss == _lowest)[0], "with true labels")

randvalloss = _fold_mean(historyrandom, 'val_loss')
_lowest = min(randvalloss)
print("Lowest val_loss of", _lowest, "at epoch", np.where(randvalloss == _lowest)[0],"with random labels")

print("Average AUC across CV sets with true labels:",np.mean(aucscores))
print("Average AUC across CV sets with random labels:",np.mean(aucscoresrandom))

# Remaining fold-averaged curves used by the plots that follow.
acc = _fold_mean(history, 'train_acc')
valacc = _fold_mean(history, 'val_acc')
loss = _fold_mean(history, 'train_loss')
randacc = _fold_mean(historyrandom, 'train_acc')
randvalacc = _fold_mean(historyrandom, 'val_acc')
randloss = _fold_mean(historyrandom, 'train_loss')

# Legend entries shared by the loss and accuracy figures; the order matches
# the order in which the curves are drawn.
_curve_names = ['train truelabel', 'validation truelabel', 'train randomlabel', 'val randomlabel']

# Loss per epoch: true-label run vs. random-label run.
for _curve in (loss, valloss, randloss, randvalloss):
    plt.plot(_curve)
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(_curve_names, loc='best')
plt.ylim([0.45,0.7])  # fixed window; anything outside it is clipped from view
plt.show()

# Accuracy per epoch, same four curves in the same order.
for _curve in (acc, valacc, randacc, randvalacc):
    plt.plot(_curve)
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(_curve_names, loc='best')
plt.ylim([0.7,0.78])  # fixed window; anything outside it is clipped from view
plt.show()

# ROC curves: the first two entries of each roc_curve result are plotted
# as x (FPrate) and y (TPrate).
for _roc_result in (roc, rocrandom):
    plt.plot(_roc_result[0], _roc_result[1])
plt.title('ROC')
plt.xlabel('FPrate')
plt.ylabel('TPrate')
plt.legend(['true label', 'random label'])
plt.show()