# Pneumonia classification from Chest X-Ray data with VGG16

In [103]:
import kagglehub
import os
import random
import shutil
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_hub as hub
import numpy as np

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\caomi\.cache\kagglehub\datasets\paultimothymooney\chest-xray-pneumonia\versions\2


After downloading the Kaggle dataset, I moved the file to within the project repo for easier data handling

In [51]:
base_path = "D:\\Minh Nguyen\\TME_6015\\Assignment_2\\chest_xray"
train_dir = os.path.join(base_path,'train')
val_dir = os.path.join(base_path,'val')
test_dir = os.path.join(base_path,'test')


## Ensure train-valid-test split

In [52]:
def count_data(train_dir, val_dir, test_dir):
    num_data_ = {}
    for dir_ in [train_dir, val_dir, test_dir]:
        for category_ in ["PNEUMONIA","NORMAL"]:
            key_ = os.path.basename(dir_)+'_'+category_
            value_ = len(os.listdir(os.path.join(dir_, category_)))
            num_data_[key_] = value_
    return num_data_

num_data = count_data(train_dir, val_dir, test_dir)

print("The number of data in the corresponding categories:")
print(num_data)

The number of data in the corresponding categories:
{'train_PNEUMONIA': 3875, 'train_NORMAL': 1341, 'val_PNEUMONIA': 8, 'val_NORMAL': 8, 'test_PNEUMONIA': 390, 'test_NORMAL': 234}


It seem that the ratio of training samples to testing samples is acceptable. However, given that there are only 8 validation sample for each category, I will ensure that the train test split is 

In [64]:
def adjust_train_val_split(train_dir, val_dir, desired_split=0.8, seed=42):
    """
    Adjust the train-validation split for the dataset.
    
    Parameters:
    - train_dir (str): Path to the training directory.
    - val_dir (str): Path to the validation directory.
    - desired_split (float): Desired ratio of train to total data (e.g., 0.8 for 80% train, 20% validation).
    - seed (int): Random seed for reproducibility.
    """
    # Set seed for reproducibility
    random.seed(seed)
    
    for category in ["PNEUMONIA","NORMAL"]:
        train_category_dir = os.path.join(train_dir, category)
        val_category_dir = os.path.join(val_dir, category)
        
        # Ensure the validation category directory exists
        os.makedirs(val_category_dir, exist_ok=True)
        
        # Get lists of images in train and val directories for this category
        train_images = os.listdir(train_category_dir)
        val_images = os.listdir(val_category_dir)
        
        # Total number of images for this category
        total_images = len(train_images) + len(val_images)
        
        # Calculate desired number of train and validation images
        desired_train_count = int(total_images * desired_split)
        desired_val_count = total_images - desired_train_count
        
        # Adjust train set if necessary
        if len(train_images) > desired_train_count:
            # Move excess images from train to val
            move_count = len(train_images) - desired_train_count
            images_to_move = random.sample(train_images, move_count)
            for img in images_to_move:
                image_to_move_path = os.path.join(train_category_dir, img)
                shutil.move(image_to_move_path, val_category_dir)
        
        elif len(train_images) < desired_train_count:
            # Move images from val to train to increase training set
            move_count = desired_train_count - len(train_images)
            images_to_move = random.sample(val_images, move_count)
            for img in images_to_move:
                image_to_move_path = os.path.join(val_category_dir, img)
                shutil.move(image_to_move_path, train_category_dir)
        
        print(f"{category}: Adjusted to {desired_train_count} train and {desired_val_count} val images.")


DESIRED_SPLIT = 0.8
adjust_train_val_split(train_dir, val_dir, desired_split=DESIRED_SPLIT)

PNEUMONIA: Adjusted to 3106 train and 777 val images.
NORMAL: Adjusted to 1079 train and 270 val images.


In [63]:
count_data(train_dir, val_dir, test_dir)

{'train_PNEUMONIA': 3106,
 'train_NORMAL': 1079,
 'val_PNEUMONIA': 777,
 'val_NORMAL': 270,
 'test_PNEUMONIA': 390,
 'test_NORMAL': 234}

## Extract Data

In the following code block, I will extract the images from the directories of the train, validation, and test set for model training in terms of a generator

In [98]:
IMAGE_SIZE = (224,224)
BATCH_SIZE = 32

train_generator = ImageDataGenerator(
    rescale=1/255,
    rotation_range=20,
    # width_shift_range=0.1,
    # height_shift_range=0.1,
    # shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True
)

train_set_raw = train_generator.flow_from_directory(
    train_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode="binary"
)

Found 4185 images belonging to 2 classes.


In [99]:
validation_generator = ImageDataGenerator(rescale=1/255)
valid_set_raw = validation_generator.flow_from_directory(
    val_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode="binary"
)

Found 1047 images belonging to 2 classes.


In [100]:
test_generator = ImageDataGenerator(rescale=1/255)
test_set_raw = test_generator.flow_from_directory(
    test_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode="binary"
)

Found 624 images belonging to 2 classes.


In [87]:
type(test_set_raw)

keras.src.legacy.preprocessing.image.DirectoryIterator

## EfficientNetB0-based Model Construction

In [None]:
model = tf.keras.Sequential([
    # Input to this model will be images of dimension IMAGE_SIZE + (3,) extracted from train_dir
    layers.Input(shape = IMAGE_SIZE + (3,)),
    layers.Lambda(hub.KerasLayer("https://tfhub.dev/tensorflow/efficientnet/b0/feature-vector/1")),
    layers.GlobalAveragePooling2D(),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(10, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(2, activation='softmax')
])



ValueError: Input 0 of layer "global_average_pooling2d" is incompatible with the layer: expected ndim=4, found ndim=2. Full shape received: (None, 1280)