proposed models: mobilenet, mobilenetv2, mobilenetv3, nasnetmobile, efficientnetv2b0, efficientnetlite

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import seaborn as sns
import random
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn import metrics

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours, NearMiss
from imblearn.combine import SMOTEENN

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.applications import ResNet50V2, InceptionResNetV2, NASNetMobile, MobileNet
from tensorflow.keras.applications import EfficientNetV2B0, EfficientNetV2B1, EfficientNetV2B2, EfficientNetV2B3
from tensorflow.keras.applications import DenseNet121, DenseNet169, DenseNet201
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from keras.regularizers import l2
import keras_tuner as kt

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

# --- Reproducibility ---------------------------------------------------------
# A single seed shared by TensorFlow, NumPy and the stdlib `random` module.
seed = 42
for _seed_rng in (tf.random.set_seed, np.random.seed, random.seed):
    _seed_rng(seed)

# --- Problem definition ------------------------------------------------------
# Class folder names; the same strings are used as labels when the image
# tables are built.
categories = [
    "adenocarcinoma",
    "large_cell_carcinoma",
    "normal",
    "squamous_cell_carcinoma",
]
num_classes = 4

# --- Tooling -----------------------------------------------------------------
# Keras callback that writes TensorBoard event files under ./logs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")
# Do not truncate long cell contents (e.g. file paths) when showing DataFrames.
pd.set_option("display.max_colwidth", None)

In [None]:
# Build one (filepath, category) table per dataset split.
#
# Expected layout on disk:
#     lung_classification_dataset/<split>/<category>/<image file>
# with <split> in {train, valid, test} and <category> taken from `categories`.


def _list_image_paths(directory, class_names):
    """Return parallel lists of file paths and labels found under `directory`.

    Every entry of ``<directory>/<class_name>`` is recorded with
    ``class_name`` as its label.  Entries are visited in sorted order because
    ``os.listdir`` returns names in arbitrary order, which would otherwise
    make the seeded ``random.shuffle`` applied to the loaded datasets
    irreproducible across machines.

    Raises whatever ``os.listdir`` raises (e.g. ``FileNotFoundError``) when a
    class folder is missing.
    """
    paths, labels = [], []
    for class_name in class_names:
        class_dir = os.path.join(directory, class_name)
        for file_name in sorted(os.listdir(class_dir)):
            paths.append(os.path.join(class_dir, file_name))
            labels.append(class_name)
    return paths, labels


# TRAIN data
# os.path.join instead of a hard-coded backslash keeps the path valid on
# non-Windows systems; on Windows the resulting string is unchanged.
directory_train = os.path.join("lung_classification_dataset", "train")

adenocarcinoma_dir = directory_train + "/" + categories[0]
large_cell_carcinoma_dir = directory_train + "/" + categories[1]
normal_dir = directory_train + "/" + categories[2]
squamos_cell_carcinoma_dir = directory_train + "/" + categories[3]

path_list_train, category_list_train = _list_image_paths(directory_train, categories)
path_series_train = pd.Series(path_list_train, name="filepath")
category_series_train = pd.Series(category_list_train, name="category")
image_paths_train_df = pd.DataFrame(path_series_train).join(category_series_train)

# VAL data
directory_val = os.path.join("lung_classification_dataset", "valid")

# NOTE: the per-category *_dir names are rebound for every split, so after
# this cell they point at the TEST folders.
adenocarcinoma_dir = directory_val + "/" + categories[0]
large_cell_carcinoma_dir = directory_val + "/" + categories[1]
normal_dir = directory_val + "/" + categories[2]
squamos_cell_carcinoma_dir = directory_val + "/" + categories[3]

path_list_val, category_list_val = _list_image_paths(directory_val, categories)
path_series_val = pd.Series(path_list_val, name="filepath")
category_series_val = pd.Series(category_list_val, name="category")
image_paths_val_df = pd.DataFrame(path_series_val).join(category_series_val)

# TEST data
directory_test = os.path.join("lung_classification_dataset", "test")

adenocarcinoma_dir = directory_test + "/" + categories[0]
large_cell_carcinoma_dir = directory_test + "/" + categories[1]
normal_dir = directory_test + "/" + categories[2]
squamos_cell_carcinoma_dir = directory_test + "/" + categories[3]

path_list_test, category_list_test = _list_image_paths(directory_test, categories)
path_series_test = pd.Series(path_list_test, name="filepath")
category_series_test = pd.Series(category_list_test, name="category")
image_paths_test_df = pd.DataFrame(path_series_test).join(category_series_test)

# Image resizing

In [None]:
# Survey the original image dimensions of every split, broken down by category.

# Flag handed to cv2.imread when the images are loaded for training.
# NOTE: despite the name, these are OpenCV imread flag values:
#   0 == cv2.IMREAD_GRAYSCALE (single channel), 1 == cv2.IMREAD_COLOR (BGR).
# With the value 0 the images are therefore read as grayscale.
is_grayscale = 0


def _count_image_sizes(directory, class_names):
    """Count images per "<height> x <width>" size for each class folder.

    Returns a dict mapping class name -> {size string: number of images}.

    Raises:
        ValueError: if cv2.imread cannot decode a file (it returns None in
            that case), so the offending path is reported instead of failing
            later with an opaque attribute error.
    """
    sizes_by_class = {}
    for class_name in class_names:
        class_dir = os.path.join(directory, class_name)
        size_counts = Counter()
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            image = cv2.imread(file_path)
            if image is None:
                raise ValueError(f"Could not read image file: {file_path}")
            # Only the spatial dimensions matter here; the channel count is ignored.
            height, width = image.shape[:2]
            size_counts[f"{height} x {width}"] += 1
        # Plain dict so pandas builds one column per class from it.
        sizes_by_class[class_name] = dict(size_counts)
    return sizes_by_class


# TRAIN data
size_data_train = _count_image_sizes(directory_train, categories)
size_data_train_df = pd.DataFrame(size_data_train)

# VAL data
size_data_val = _count_image_sizes(directory_val, categories)
size_data_val_df = pd.DataFrame(size_data_val)

# TEST data
size_data_test = _count_image_sizes(directory_test, categories)
size_data_test_df = pd.DataFrame(size_data_test)

In [None]:
# Load every listed image, resize it to a square, and shuffle each split.


def _load_resized_images(paths_df, size, imread_flag):
    """Load the images listed in `paths_df` and resize them to `size` x `size`.

    Args:
        paths_df: DataFrame with a "filepath" and a "category" column.
        size: edge length in pixels of the square output images.
        imread_flag: flag forwarded to cv2.imread (0 reads grayscale).

    Returns:
        A list of mutable ``[image, label]`` pairs in table order.  Lists are
        used (not tuples) because later preprocessing overwrites slot 0.

    Raises:
        ValueError: if cv2.imread cannot decode a file (it returns None).
    """
    samples = []
    for filepath, label in zip(paths_df["filepath"], paths_df["category"]):
        image = cv2.imread(filepath, imread_flag)
        if image is None:
            raise ValueError(f"Could not read image file: {filepath}")
        samples.append([cv2.resize(image, (size, size)), label])
    return samples


desired_size = 256

# The shuffles run in the order train -> val -> test so the seeded `random`
# state is consumed in a fixed sequence.

# TRAIN data
dataset_train = _load_resized_images(image_paths_train_df, desired_size, is_grayscale)
random.shuffle(dataset_train)

# VAL data
dataset_val = _load_resized_images(image_paths_val_df, desired_size, is_grayscale)
random.shuffle(dataset_val)

# TEST data
dataset_test = _load_resized_images(image_paths_test_df, desired_size, is_grayscale)
random.shuffle(dataset_test)

# Histogram Equalization

In [None]:
# Equalize the intensity histogram of every image, split by split
# (train, then val, then test).  Each dataset entry is a mutable
# [image, label] pair; only the image slot (index 0) is overwritten.
# cv2.equalizeHist expects 8-bit single-channel input.
for split_samples in (dataset_train, dataset_val, dataset_test):
    for sample in split_samples:
        sample[0] = cv2.equalizeHist(sample[0])

# Image smoothing

In [None]:
# Smooth every image with OpenCV's bilateral filter.
# The three values below are passed positionally as the d, sigmaColor and
# sigmaSpace arguments of cv2.bilateralFilter.
diameter = 3
sigma_color = 25
sigma_space = 25

# Same treatment for all three splits, processed in train -> val -> test
# order; the filtered image replaces slot 0 of each [image, label] pair.
for split_samples in (dataset_train, dataset_val, dataset_test):
    for sample in split_samples:
        sample[0] = cv2.bilateralFilter(
            sample[0], diameter, sigma_color, sigma_space
        )

# Data normalization

In [None]:
# Rescale pixel intensities by dividing by 255.0 (min-max scaling for 8-bit
# images: 0..255 is mapped onto 0.0..1.0).  The division yields floating-point
# arrays, which replace slot 0 of each [image, label] pair.
# Splits are handled in train -> val -> test order.
for split_samples in (dataset_train, dataset_val, dataset_test):
    for data in split_samples:
        img = data[0] / 255.0
        data[0] = img

# Morphological operations

In [None]:
# Morphological operations: Otsu threshold -> invert -> erosion (5x5 kernel).
#
# The previous version of this cell thresholded the stale module-level `img`
# on every iteration instead of the current sample, and that image had
# already been scaled to floating point, which cv2.threshold rejects when
# THRESH_OTSU is requested.  Each sample's own image is now processed.
#
# NOTE(review): the original never wrote its result back into the datasets,
# so the datasets are deliberately left untouched here as well; the masks
# are collected in eroded_train / eroded_val / eroded_test instead.  Confirm
# whether they are meant to replace or complement the model inputs.


def _eroded_otsu_mask(image, kernel):
    """Return the eroded, inverted Otsu mask of a single-channel image.

    Args:
        image: 2-D array; either uint8, or floating point scaled to 0..1
            (assumed from the divide-by-255 normalization step, so it is
            mapped back to 0..255 before thresholding).
        kernel: structuring element passed to cv2.erode.
    """
    if image.dtype != np.uint8:
        # Otsu thresholding needs 8-bit input.
        image = np.clip(np.rint(image * 255.0), 0, 255).astype(np.uint8)
    binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    inverted = cv2.bitwise_not(binary)
    return cv2.erode(inverted, kernel, iterations=1)


# The structuring element does not depend on the image, so build it once.
kernel = np.ones((5, 5), np.uint8)

# TRAIN data
eroded_train = [_eroded_otsu_mask(data[0], kernel) for data in dataset_train]

# VAL data
eroded_val = [_eroded_otsu_mask(data[0], kernel) for data in dataset_val]

# TEST data
eroded_test = [_eroded_otsu_mask(data[0], kernel) for data in dataset_test]

# Data splitting

In [None]:
# Separate every split into an image array (x_*) and a label array (y_*).
# Each dataset entry is an [image, label] pair.
x_train = np.array([sample[0] for sample in dataset_train])
y_train = np.array([sample[1] for sample in dataset_train])

x_val = np.array([sample[0] for sample in dataset_val])
y_val = np.array([sample[1] for sample in dataset_val])

x_test = np.array([sample[0] for sample in dataset_test])
y_test = np.array([sample[1] for sample in dataset_test])

# Map the category strings to integer ids.  The encoder is fitted on the
# training labels only and then reused for the other two splits.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# One-hot encode the validation and test labels.
# NOTE(review): y_train is left as integer ids here; presumably it is encoded
# after the imbalance handling that follows -- confirm.
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Integer test labels recovered from the one-hot matrix.
y_test_labels = np.argmax(y_test, axis=1)

# Print the id -> category legend, one "<id> = <name>" line per class.
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_id} = {class_name}")

# Handling data imbalance

In [None]:
# use data augmentation (Albumentations library)