In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from skimage.transform import resize
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

Load the label data for each preprocessed mask from the CSV file.

In [3]:
# Read the metadata table (one row per slice: image/mask names and labels)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="Path to 'meta_info.csv' file")
data.head(n=5)

Unnamed: 0,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean
0,1,0,0,LIDC-IDRI-0001/0001_NI000_slice000,LIDC-IDRI-0001/0001_MA000_slice000,5,True,False
1,1,0,1,LIDC-IDRI-0001/0001_NI000_slice001,LIDC-IDRI-0001/0001_MA000_slice001,5,True,False
2,1,0,2,LIDC-IDRI-0001/0001_NI000_slice002,LIDC-IDRI-0001/0001_MA000_slice002,5,True,False
3,1,0,3,LIDC-IDRI-0001/0001_NI000_slice003,LIDC-IDRI-0001/0001_MA000_slice003,5,True,False
4,1,0,4,LIDC-IDRI-0001/0001_NI000_slice004,LIDC-IDRI-0001/0001_MA000_slice004,5,True,False


Preprocessing the csv file to make it suitable for classification

In [4]:
# Keep only the rows whose slice contains a nodule, i.e. those that are
# not flagged as clean.
data = data.loc[data['is_clean'] != True]

In [5]:
# Quantifying the cancer/benign outcome classes:
#   'Ambiguous' -> -1, 'True' -> 1, 'False' -> 0
#
# `assign` builds a new DataFrame instead of writing a column into the
# filtered frame, which avoids pandas' chained-assignment warning.
# `astype(str)` makes the lookup work whether the column was parsed as strings
# or as booleans (a bool column would otherwise map entirely to NaN).
# NOTE: values not present in the mapping become NaN, so re-running this cell
# on already-encoded data wipes the column; re-run from the load cell instead.
data = data.assign(
    is_cancer=data['is_cancer'].astype(str).map({'Ambiguous': -1, 'True': 1, 'False': 0})
)

In [6]:
# Map every mask filename to its malignancy score.
mask_malignancy_dict = {
    mask_name: score
    for mask_name, score in zip(data['mask_image'], data['malignancy'])
}

In [7]:
# Unpack the mapping into two parallel lists (same order), then display the
# final filename as a quick sanity check.
filenames = [*mask_malignancy_dict]
labels = [*mask_malignancy_dict.values()]
filenames[-1]

'LIDC-IDRI-1012/1012_MA000_slice002'

In [8]:
# Range of the malignancy scores, shown as (highest, lowest).
(max(labels), min(labels))

(5, 1)

In [9]:
# Every entry has the form "<folder>/<file>"; collect the two components
# into parallel lists.
folders = [name.split('/')[0] for name in filenames]
files = [name.split('/')[1] for name in filenames]


In [10]:
X = []
Y = []
path = "Path to mask folder only"

# Load each mask and record its label in the same iteration. When a file is
# missing, BOTH the image and its label are skipped, so X[i] always belongs
# to Y[i] and the two lists keep the same length.
skipped = 0
for fol, fil, label in zip(folders, files, labels):
    # `path` is concatenated directly with the folder name (as before), so it
    # is expected to end with a path separator; the separator between folder
    # and file is chosen by the OS instead of a hard-coded backslash.
    mask_file = os.path.join(path + fol, fil + ".npy")
    try:
        mask = np.load(mask_file)
    except FileNotFoundError:
        skipped += 1
        continue
    X.append(mask)
    Y.append(label)

# Report skipped files instead of dropping them silently.
if skipped:
    print(f"Skipped {skipped} mask file(s) that were not found under {path!r}")

In [12]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Count the distinct malignancy classes present in the training labels.
num_classes = np.unique(y_train).size

X_train : 6400
X_test : 1600
y_train : 6400
y_test : 1600
5


In [14]:
# Pre-allocate the network inputs with shape (n_samples, 512, 512, 1).
# float32 is requested explicitly: np.zeros defaults to float64, which doubles
# the memory footprint of these large arrays without benefit, since float32 is
# Keras' default float type.
train_x = np.zeros((len(X_train), 512, 512, 1), dtype=np.float32)
test_x = np.zeros((len(X_test), 512, 512, 1), dtype=np.float32)

In [15]:
# Turn the label lists into column vectors of shape (n_samples, 1).
train_y = np.array(y_train).reshape(len(y_train), 1)
test_y = np.array(y_test).reshape(len(y_test), 1)

Copy the loaded mask arrays into the pre-allocated input tensors

In [16]:
# Copy every training mask into its slot of train_x, reshaping it to
# (512, 512, 1) so it carries an explicit channel axis.
for idx in tqdm(range(len(X_train)), total=len(X_train)):
    train_x[idx] = np.reshape(X_train[idx], (512, 512, 1))

100%|████████████████████████████████████████████████████████████████████████████| 6400/6400 [00:01<00:00, 3325.41it/s]


In [17]:
# Copy every test mask into its slot of test_x, reshaping it to
# (512, 512, 1) so it carries an explicit channel axis.
for idx in tqdm(range(len(X_test)), total=len(X_test)):
    test_x[idx] = np.reshape(X_test[idx], (512, 512, 1))

100%|████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:00<00:00, 3487.76it/s]


In [22]:
# Subtract 1 from the labels to shift the range to [0, 4], due to the use of
# categorical_crossentropy with one-hot encoded targets.
#
# The shifted arrays are rebuilt from y_train / y_test instead of decrementing
# train_y / test_y in place, so re-running this cell cannot shift the labels a
# second time.
train_y = np.array(y_train).reshape(len(y_train), 1) - 1
test_y = np.array(y_test).reshape(len(y_test), 1) - 1

# Guard against labels below 1 in the source data, which would become
# negative class indices after the shift.
assert train_y.min() >= 0 and test_y.min() >= 0, "malignancy labels must be >= 1"
train_y.shape

(6400, 1)

In [26]:
# One-hot encode the zero-based labels into 5-column indicator matrices.
train_y_encoded, test_y_encoded = (
    to_categorical(label_column, num_classes=5)
    for label_column in (train_y, test_y)
)

Defining callbacks

In [30]:
# Training callbacks (none of them sets `monitor`, so each uses Keras' default):
#   - reduce_lr:      multiply the learning rate by 0.1 after 2 epochs without improvement
#   - early_stopping: stop training after 5 epochs without improvement
#   - checkpoint:     write the best model seen so far to 'help.h5'
#   - tensorboard:    write TensorBoard logs to the 'logs' directory
reduce_lr = ReduceLROnPlateau(verbose=1, patience=2, factor=0.1)
early_stopping = EarlyStopping(verbose=1, patience=5)
checkpoint = ModelCheckpoint(filepath='help.h5', verbose=1, save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='logs')

# The list order is kept as checkpoint, early stopping, LR schedule, logging.
callbacks = [checkpoint, early_stopping, reduce_lr, tensorboard]

In [28]:
def create_classification_model(input_shape, num_classes):
    """Build the (uncompiled) convolutional classifier.

    The network has three convolutional stages with 32, 64 and 128 filters.
    Each stage applies two 3x3 'same'-padded ReLU convolutions, each followed
    by batch normalization, and ends with 2x2 max pooling. The feature maps are
    then flattened and passed through two batch-normalized ReLU dense layers
    (256 and 128 units) and a softmax output layer.

    Args:
        input_shape: Shape of a single input sample, passed to `tf.keras.Input`.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A `tf.keras.Model` mapping the input to class probabilities. The model
        is not compiled here.
    """
    inputs = tf.keras.Input(shape=input_shape)

    # Convolutional feature extractor: one iteration per stage.
    features = inputs
    for n_filters in (32, 64, 128):
        for _ in range(2):
            features = layers.Conv2D(n_filters, 3, activation='relu', padding='same')(features)
            features = layers.BatchNormalization()(features)
        features = layers.MaxPooling2D(pool_size=(2, 2))(features)

    features = layers.Flatten()(features)

    # Fully connected head.
    for n_units in (256, 128):
        features = layers.Dense(n_units, activation='relu')(features)
        features = layers.BatchNormalization()(features)

    outputs = layers.Dense(num_classes, activation='softmax')(features)

    return tf.keras.Model(inputs=inputs, outputs=outputs)


input_shape = (512, 512, 1)
num_classes = 5

model = create_classification_model(input_shape, num_classes)

# A Keras model must be compiled before fit()/evaluate() can be called, and no
# other cell in this notebook compiles it.
#   - loss: categorical_crossentropy, matching the one-hot encoded labels
#   - metrics: exactly one metric, so that model.evaluate() returns the
#     (loss, accuracy) pair that the evaluation cell unpacks
#   - optimizer: Adam with default settings -- TODO confirm this matches the
#     configuration originally used
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# `val_x` / `val_y_encoded` are not defined anywhere in this notebook, so the
# validation set is taken from the training arrays instead: Keras holds out
# the last 20% of the samples (before shuffling) and reports val_* metrics on
# them. The test set stays untouched for the final evaluation.
model.fit(
    train_x,
    train_y_encoded,
    batch_size=8,
    epochs=30,
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
# Score the model on the held-out test set and report both values.
evaluation = model.evaluate(test_x, test_y_encoded)
test_loss, test_acc = evaluation
for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):
    print(caption, value)