# Pneumonia classification from Chest X-Ray data with EfficientNet

## Hardware Check (GPU vs CPU)

In [1]:
import tensorflow as tf

# Check if a GPU is available
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print("TensorFlow is using the GPU:")
    for gpu in gpus:
        print(f"- {gpu}")
else:
    print("TensorFlow is using the CPU.")

TensorFlow is using the GPU:
- PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [2]:
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
import kagglehub
import os
import random
import shutil
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import save_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_hub as hub # tensorflow_hub is giving some dependency clashes when running in Conda
import numpy as np
import os

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\caomi\.cache\kagglehub\datasets\paultimothymooney\chest-xray-pneumonia\versions\2


After downloading the Kaggle dataset, I moved the file to within the project repo for easier data handling

In [4]:
# base_path = "D:\\Minh Nguyen\\TME_6015\\Assignment_2\\chest_xray"   # Home PC
# base_path = "C:\\Minh Nguyen\\TME_6015\\Assignment_2\\chest_xray"   # Laptop
# base_path = "C:\mnguyen\TME_6015\Assignment_2\chest_xray"           # Work PC

# Find path to the dataset folder
current_dir = os.getcwd()
base_path = os.path.join(current_dir,'chest_xray')

train_dir = os.path.join(base_path,'train')
val_dir = os.path.join(base_path,'val')
test_dir = os.path.join(base_path,'test')


## Ensure train-validation split

In [5]:
def count_data(train_dir, val_dir, test_dir):
    num_data_ = {}
    for dir_ in [train_dir, val_dir, test_dir]:
        for category_ in ["PNEUMONIA","NORMAL"]:
            key_ = os.path.basename(dir_)+'_'+category_
            value_ = len(os.listdir(os.path.join(dir_, category_)))
            num_data_[key_] = value_
    return num_data_

num_data = count_data(train_dir, val_dir, test_dir)

print("The number of data in the corresponding categories:")
print(num_data)

The number of data in the corresponding categories:
{'train_PNEUMONIA': 3106, 'train_NORMAL': 1079, 'val_PNEUMONIA': 777, 'val_NORMAL': 270, 'test_PNEUMONIA': 390, 'test_NORMAL': 234}


It seem that the ratio of training samples to testing samples is acceptable. However, given that there are only 8 validation sample for each category, I will ensure that the train test split is 

In [6]:
def adjust_train_val_split(train_dir, val_dir, desired_split=0.8, seed=42):
    """
    Adjust the train-validation split for the dataset.
    
    Parameters:
    - train_dir (str): Path to the training directory.
    - val_dir (str): Path to the validation directory.
    - desired_split (float): Desired ratio of train to total data (e.g., 0.8 for 80% train, 20% validation).
    - seed (int): Random seed for reproducibility.
    """
    # Set seed for reproducibility
    random.seed(seed)
    
    for category in ["PNEUMONIA","NORMAL"]:
        train_category_dir = os.path.join(train_dir, category)
        val_category_dir = os.path.join(val_dir, category)
        
        # Ensure the validation category directory exists
        os.makedirs(val_category_dir, exist_ok=True)
        
        # Get lists of images in train and val directories for this category
        train_images = os.listdir(train_category_dir)
        val_images = os.listdir(val_category_dir)
        
        # Total number of images for this category
        total_images = len(train_images) + len(val_images)
        
        # Calculate desired number of train and validation images
        desired_train_count = int(total_images * desired_split)
        desired_val_count = total_images - desired_train_count
        
        # Adjust train set if necessary
        if len(train_images) > desired_train_count:
            # Move excess images from train to val
            move_count = len(train_images) - desired_train_count
            images_to_move = random.sample(train_images, move_count)
            for img in images_to_move:
                image_to_move_path = os.path.join(train_category_dir, img)
                shutil.move(image_to_move_path, val_category_dir)
        
        elif len(train_images) < desired_train_count:
            # Move images from val to train to increase training set
            move_count = desired_train_count - len(train_images)
            images_to_move = random.sample(val_images, move_count)
            for img in images_to_move:
                image_to_move_path = os.path.join(val_category_dir, img)
                shutil.move(image_to_move_path, train_category_dir)
        
        print(f"{category}: Adjusted to {desired_train_count} train and {desired_val_count} val images.")


DESIRED_SPLIT = 0.8
adjust_train_val_split(train_dir, val_dir, desired_split=DESIRED_SPLIT)

PNEUMONIA: Adjusted to 3106 train and 777 val images.
NORMAL: Adjusted to 1079 train and 270 val images.


In [7]:
count_data(train_dir, val_dir, test_dir)

{'train_PNEUMONIA': 3106,
 'train_NORMAL': 1079,
 'val_PNEUMONIA': 777,
 'val_NORMAL': 270,
 'test_PNEUMONIA': 390,
 'test_NORMAL': 234}

## Extract Data

In the following code block, I will extract the images from the directories of the train, validation, and test set for model training in terms of a generator

In [8]:
IMAGE_SIZE = (224,224)              # input image dimensions for an EfficientNetB0 model
BATCH_SIZE = 32

train_generator = ImageDataGenerator(
    rescale=1/255,
    rotation_range=20,
    # width_shift_range=0.1,
    # height_shift_range=0.1,
    # shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True
)

train_set_raw = train_generator.flow_from_directory(
    train_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode="binary"
)

Found 4185 images belonging to 2 classes.


In [9]:
validation_generator = ImageDataGenerator(rescale=1/255)
valid_set_raw = validation_generator.flow_from_directory(
    val_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode="binary"
)

Found 1047 images belonging to 2 classes.


In [10]:
test_generator = ImageDataGenerator(rescale=1/255)
test_set_raw = test_generator.flow_from_directory(
    test_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode="binary"
)

Found 624 images belonging to 2 classes.


In [None]:
type(test_set_raw)

## EfficientNetB0-based model

### Base Model Construction

In [21]:
import tensorflow as tf
# from keras.models import load_model

# Path to your SavedModel directory
# model_dir = os.path.join(current_dir,"efficientnetv2-keras-efficientnetv2_b0_imagenet-v3")      # .h5 model
# model_dir = os.path.join(current_dir,"efficientnet-tensorflow1-b0-classification-v1")           # .pb model
model_dir = os.path.join(current_dir,"efficientnet-v2-tensorflow2-imagenet1k-b0-classification-v2") 

print(model_dir)

# Load the SavedModel with the appropriate tag
loaded_model = tf.saved_model.load(model_dir)

type(loaded_model)


d:\Minh Nguyen\TME_6015\Assignment_2\efficientnet-v2-tensorflow2-imagenet1k-b0-classification-v2


tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject

In [31]:
feature_extractor = loaded_model.signatures['serving_default']


print("Inputs:", feature_extractor.inputs)
print("Outputs:", feature_extractor.outputs)

feature_extractor = tf.keras.Model(inputs=feature_extractor.inputs, outputs=feature_extractor.outputs[0])


Inputs: [<tf.Tensor 'input_1:0' shape=(None, None, None, 3) dtype=float32>, <tf.Tensor 'unknown:0' shape=() dtype=resource>, <tf.Tensor 'unknown_0:0' shape=() dtype=resource>, <tf.Tensor 'unknown_1:0' shape=() dtype=resource>, <tf.Tensor 'unknown_2:0' shape=() dtype=resource>, <tf.Tensor 'unknown_3:0' shape=() dtype=resource>, <tf.Tensor 'unknown_4:0' shape=() dtype=resource>, <tf.Tensor 'unknown_5:0' shape=() dtype=resource>, <tf.Tensor 'unknown_6:0' shape=() dtype=resource>, <tf.Tensor 'unknown_7:0' shape=() dtype=resource>, <tf.Tensor 'unknown_8:0' shape=() dtype=resource>, <tf.Tensor 'unknown_9:0' shape=() dtype=resource>, <tf.Tensor 'unknown_10:0' shape=() dtype=resource>, <tf.Tensor 'unknown_11:0' shape=() dtype=resource>, <tf.Tensor 'unknown_12:0' shape=() dtype=resource>, <tf.Tensor 'unknown_13:0' shape=() dtype=resource>, <tf.Tensor 'unknown_14:0' shape=() dtype=resource>, <tf.Tensor 'unknown_15:0' shape=() dtype=resource>, <tf.Tensor 'unknown_16:0' shape=() dtype=resource>, <

ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: Tensor("input_1:0", shape=(None, None, None, 3), dtype=float32)

In [None]:
frozen = True
for variable in loaded_model.variables:
    print(variable, variable.trainable)

if frozen:
    print("The model is frozen.")
else:
    print("The model is not frozen.")

In [None]:
model = tf.keras.Sequential([
    # Input to this model will be images of dimension IMAGE_SIZE + (3,) extracted from train_dir
    layers.Input(shape = IMAGE_SIZE + (3,)),
    hub.KerasLayer("https://tfhub.dev/tensorflow/efficientnet/b0/feature-vector/1"),
    # layers.GlobalAveragePooling2D(),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(10, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(2, activation='softmax')
])

model.summary()