## The application of deep learning in lung cancerous lesion detection (example notebook)
**Contributors: Ngoc M. Vu, Phuong T.M. Chu, Tram P.B. Ha**

This notebook trains an InceptionV3 model to classify chest CT scans as either lung cancer or non-cancer (pneumonia only).

Details about the data and model configurations can be found in the project repository: https://github.com/NgocVuMinh/Lung-Cancer-Pneumonia-Classification

Overview of the classification models:
<p float="left">
  <img src="https://github.com/NgocVuMinh/Lung-Cancer-Pneumonia-Classification/blob/main/overview1.png?raw=true" alt="Overview of the classification models" />
</p>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.pyplot import figure
from matplotlib import cm
%matplotlib inline
from skimage import io
from tqdm import tqdm
import random
import cv2
import os
import re
import tensorflow as tf
import keras
from keras.utils import to_categorical
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, AveragePooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.applications import InceptionV3 # ResNet50, ResNet101, VGG16, VGG19, MobileNetV2, InceptionResNetV2, Xception
from keras.applications.densenet import DenseNet121
from keras.src.preprocessing.image import ImageDataGenerator
from keras.src.preprocessing import image
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
# Turn off eager execution so TensorFlow runs in TF1-style graph mode for
# every cell that follows.
tf.compat.v1.disable_eager_execution()

In [None]:
# Fixed seed so that repeated runs of the notebook are comparable.
SEED = 99
# Keras helper that seeds Python's `random`, NumPy and TensorFlow in one call.
keras.utils.set_random_seed(SEED)

#### Data processing

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# Root folder that holds the Training / Validation / Testing splits.
# Set the LC_DATA_DIR environment variable before starting the kernel to point
# the notebook at another location (e.g. a mounted Drive folder on Colab);
# when it is unset the original path is used.
DATA_DIR = os.environ.get('LC_DATA_DIR', '/home/ngoc/lc/sample_data')
val_dir = os.path.join(DATA_DIR, 'Validation')
train_dir = os.path.join(DATA_DIR, 'Training')
test_dir = os.path.join(DATA_DIR, 'Testing')

In [None]:
# Class names double as sub-folder names; the list position is the numeric label.
disease_types = ['Non-cancer', 'Cancer']


def build_file_index(root_dir, class_names=disease_types):
    """List every file of one data split together with its label.

    Parameters
    ----------
    root_dir : str
        Folder that contains one sub-folder per entry of `class_names`.
    class_names : list of str
        Sub-folder names; the position in the list becomes the numeric label.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns 'File' (path relative to `root_dir`),
        'DiseaseID' (integer label) and 'Disease Type' (class name).
    """
    records = [
        ['{}/{}'.format(class_name, file_name), class_id, class_name]
        for class_id, class_name in enumerate(class_names)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the array order downstream) reproducible.
        for file_name in sorted(os.listdir(os.path.join(root_dir, class_name)))
    ]
    return pd.DataFrame(records, columns=['File', 'DiseaseID', 'Disease Type'])


train = build_file_index(train_dir)
val = build_file_index(val_dir)
test = build_file_index(test_dir)

In [None]:
# Side length (pixels) every image is resized to before entering the network.
IMAGE_SIZE = 299


def load_images(frame, root_dir, image_size=None, desc=None):
    """Read and resize every image listed in `frame['File']`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'File' column with paths relative to `root_dir`.
    root_dir : str
        Folder the relative paths are resolved against.
    image_size : int, optional
        Target height and width; defaults to the notebook-level IMAGE_SIZE.
    desc : str, optional
        Label shown next to the progress bar.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(frame), image_size, image_size, 3). Rows whose file
        could not be read stay all-zero, and a warning listing them is printed.
    """
    if image_size is None:
        image_size = IMAGE_SIZE
    file_names = frame['File'].values
    images = np.zeros((len(file_names), image_size, image_size, 3))
    unreadable = []
    # Wrapping the array (not the enumerate object) lets tqdm know the total.
    for i, file_name in enumerate(tqdm(file_names, desc=desc)):
        img = cv2.imread(os.path.join(root_dir, file_name))
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            unreadable.append(file_name)
            continue
        images[i] = cv2.resize(img, (image_size, image_size), interpolation=cv2.INTER_AREA)
    if unreadable:
        print(f'Warning: {len(unreadable)} file(s) under {root_dir} could not be read '
              f'and were left as all-zero images, e.g. {unreadable[:5]}')
    return images


X_train = load_images(train, train_dir, desc='train')
X_val = load_images(val, val_dir, desc='validation')
X_test = load_images(test, test_dir, desc='test')

In [None]:
# Normalize the data by scaling pixel values by 1/255.
# NOTE: the arrays are rebound under the same names, so re-running this cell
# without reloading the images divides by 255 a second time.
X_val = X_val / 255.
X_train = X_train / 255.
X_test = X_test / 255.
print('Train Shape: ', X_train.shape)
print('Validation Shape: ', X_val.shape)
print('Testing Shape: ', X_test.shape)

# One-hot encode the integer class labels (two classes).
Y_train = to_categorical(train['DiseaseID'].values, num_classes=2)
Y_val = to_categorical(val['DiseaseID'].values, num_classes=2)
Y_test = to_categorical(test['DiseaseID'].values, num_classes=2)

#### Model training

In [None]:
BATCH_SIZE = 16
EPOCHS = 120


def incep(fine_tune=94, image_size=None, dense_units=256, dropout_rate=0.3,
          learning_rate=1e-5, show_summary=True):
    """Build and compile an InceptionV3-based two-class classifier.

    Parameters
    ----------
    fine_tune : int
        Number of layers, counted from the end of the InceptionV3 base, that
        stay trainable. A value <= 0 freezes the whole base.
    image_size : int, optional
        Height and width of the (square, 3-channel) input. Defaults to the
        notebook-level IMAGE_SIZE, looked up when the function is called.
    dense_units : int
        Width of the hidden dense layer in the classification head.
    dropout_rate : float
        Dropout rate used before and after the hidden dense layer.
    learning_rate : float
        Learning rate of the Adam optimizer.
    show_summary : bool
        Print `model.summary()` before returning.

    Returns
    -------
    keras.Model
        Compiled model with a 2-unit softmax output.
    """
    if image_size is None:
        image_size = IMAGE_SIZE

    conv_base = InceptionV3(include_top=False, weights='imagenet',
                            input_shape=(image_size, image_size, 3))
    # Freeze all but the last `fine_tune` layers of the pretrained base.
    frozen_layers = conv_base.layers[:-fine_tune] if fine_tune > 0 else conv_base.layers
    for layer in frozen_layers:
        layer.trainable = False

    # Classification head on top of the convolutional features.
    x = tf.keras.layers.UpSampling2D(size=(4, 4))(conv_base.output)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    x = tf.keras.layers.Dense(dense_units, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    # NOTE(review): the layer name looks like a typo of 'activation'; it is kept
    # unchanged because previously saved models may refer to it by name.
    out_layer = tf.keras.layers.Dense(2, activation='softmax', name='actiation')(x)

    model = Model(inputs=conv_base.input, outputs=out_layer)
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, beta_1=0.9,
                                                beta_2=0.999, epsilon=1e-2)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=["accuracy"])
    if show_summary:
        model.summary()

    return model

In [None]:
# Random geometric augmentation, applied to the training batches only.
train_datagen = ImageDataGenerator(width_shift_range=0.2, height_shift_range=0.2,
                                   zoom_range=0.2, shear_range=0.15,
                                   horizontal_flip=True, vertical_flip=True,
                                   rotation_range=10, fill_mode="nearest")
# No `.fit()` calls on the generators: `fit` only computes dataset statistics
# for featurewise_center / featurewise_std_normalization / zca_whitening, and
# none of those options is enabled here, so it would just copy the arrays.
validation_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

# Keep the model from the epoch with the highest validation accuracy.
# `save_weights_only` is left at its default, so despite the file name the
# full model (not only the weights) is written.
checkpoint_callback = ModelCheckpoint(filepath='weights.h5',
                                      monitor='val_accuracy',
                                      mode='max',
                                      save_best_only=True)

In [None]:
# Build and compile the classifier with the default fine-tuning depth
# (the call also prints the model summary).
model = incep()

In [None]:
# Batches needed to cover every training image once per epoch. Rounding up
# counts the final partial batch and avoids a zero-step epoch when there are
# fewer than BATCH_SIZE training images.
steps_per_epoch = int(np.ceil(X_train.shape[0] / BATCH_SIZE))

# Train on augmented batches; validate on the untouched validation arrays.
hist = model.fit(train_datagen.flow(X_train, Y_train, batch_size=BATCH_SIZE),
                 steps_per_epoch=steps_per_epoch,
                 epochs=EPOCHS, verbose=2,
                 validation_data=(X_val, Y_val),
                 callbacks=[checkpoint_callback]
                 )

In [None]:
# Score the model in its current (last-epoch) state on the validation split.
val_scores = model.evaluate(X_val, Y_val)
final_loss, final_accuracy = val_scores
print(f'Final Loss: {final_loss}, Final Accuracy: {final_accuracy}')

In [None]:
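# Optional: uncomment to also save the model as it is after the last epoch.
# (The checkpoint callback already writes the best-validation model to 'weights.h5'.)
#model.save('incep.h5')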

In [None]:
def plot_history_curve(history, metric, title, legend_loc):
    """Plot the training and validation curves of one metric.

    Parameters
    ----------
    history : dict
        Keras history dictionary; must contain `metric` and `'val_' + metric`.
    metric : str
        Key of the training metric, e.g. 'accuracy' or 'loss'.
    title : str
        Used both as the figure title and as the y-axis label.
    legend_loc : str
        Matplotlib legend location.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(history[metric], color='#1338BE')
    ax.plot(history['val_' + metric], color='#B90E0A')
    ax.set_title(title, size=16)
    ax.set_ylabel(title, size=14)
    ax.set_xlabel('Epoch', size=14)
    ax.tick_params(axis='both', labelsize=12)
    ax.legend(['Train', 'Validation'], loc=legend_loc, fontsize=14)
    plt.show()


# Accuracy plot
plot_history_curve(hist.history, 'accuracy', 'Accuracy', 'lower right')

# Loss plot (same figure size as the accuracy plot)
plot_history_curve(hist.history, 'loss', 'Loss', 'upper right')

#### Evaluating on the testing set

In [None]:
# Softmax outputs, one row per test image and one column per class. Kept under
# its own name so the probabilities stay available after the labels are derived.
Y_prob = model.predict(X_test)
# Hard labels: index of the most probable class.
Y_pred = np.argmax(Y_prob, axis=1)
# Integer ground-truth labels recovered from the one-hot encoding.
Y_true = np.argmax(Y_test, axis=1)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
# `labels=[0, 1]` pins the layout to 2x2 even if one class never occurs.
# NOTE: `cm` rebinds the name imported by `from matplotlib import cm`.
cm = confusion_matrix(Y_true, Y_pred, labels=[0, 1])
print(cm)
print(classification_report(Y_true, Y_pred))

# Calculating metrics
# Class 1 ('Cancer') is the positive class, class 0 ('Non-cancer') the negative
# one, so row 0 holds (TN, FP) and row 1 holds (FN, TP).
tn, fp, fn, tp = cm.ravel()
total = cm.sum()
accuracy = (tp + tn) / total
print ('Accuracy : ', accuracy)
# Sensitivity: share of true cancer cases that were predicted as cancer.
sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )
# Specificity: share of true non-cancer cases that were predicted as non-cancer.
specificity = tn / (tn + fp)
print('Specificity : ', specificity)

In [None]:
# Calculating AUC
# roc_curve expects 1-D true labels and a continuous score for the positive
# class. `Y_pred` holds hard labels at this point, so the class-1 ('Cancer')
# probabilities are taken from a fresh prediction, and the one-hot `Y_test`
# is collapsed to integer labels.
Y_score = model.predict(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(np.argmax(Y_test, axis=1), Y_score)
auc_ = auc(fpr, tpr)

# Dashed diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label=f'InceptionV3 (AUC = {auc_:.3f})')
plt.xlabel('False positive rate', size=14)
plt.ylabel('True positive rate', size=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title('ROC curve', size=16)
plt.legend(loc='best', fontsize=14)
plt.show()