# Overview

## Dataset

A public dataset from Kaggle was used, with minor changes to the naming patterns. The original can be found [here](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia).

The dataset is split into 3 folders:
- `test`
- `train`
- `val`

Each folder is further split into `normal` and `pneumonia`. There are a total of 5863 files stored as JPEGs.

# Setup

Import all required packages and set the base path for the datasets

In [1]:
import numpy as np
import pandas as pd
import os
import skimage
import seaborn as sns
import tensorflow as tf

# Root folder that contains the `train`, `test` and `val` image directories.
datasetsPath = "./datasets"

# Class names. A name's position in this list is the integer class id that
# `get_data` assigns (pneumonia -> 0, normal -> 1).
# NOTE(review): the directory iterators further down report the opposite
# mapping ({'normal': 0, 'pneumonia': 1}) - confirm which one is intended.
labels = ['pneumonia', 'normal']

# Edge length, in pixels, of the square images fed to the network.
image_size = 224

print("Path to dataset files:", datasetsPath)

Path to dataset files: ./datasets


In [3]:
def get_data(dir_relative_path):
    """Load every image file of one dataset split into an object array.

    For each name in the module-level ``labels`` list, the files found in
    ``<datasetsPath>/<dir_relative_path>/<label>`` are read, decoded as
    single-channel images, resized to ``image_size`` x ``image_size`` and
    paired with the label's position in ``labels``.

    Files that cannot be read or decoded are reported and skipped; they do
    not abort the load.

    Args:
        dir_relative_path: Sub-folder of ``datasetsPath`` to read
            (presumably one of ``train``, ``test`` or ``val``).

    Returns:
        ``np.ndarray`` of dtype ``object`` built from one
        ``[resized image tensor, class_number]`` entry per loaded file.
    """
    data = []
    # enumerate() yields the class id directly instead of searching the list
    # again with labels.index() for every label.
    for class_number, label in enumerate(labels):
        currentPath = os.path.join(datasetsPath, dir_relative_path, label)
        print("Working inside folder: ", currentPath)

        for image in os.listdir(currentPath):
            # macOS Finder metadata, not an image.
            if image.endswith('.DS_Store'):
                continue

            image_path = os.path.join(currentPath, image)
            try:
                image_file = tf.io.read_file(image_path)
                # expand_animations=False guarantees a 3-D (H, W, C) tensor;
                # with the default, an animated GIF would decode to 4-D.
                image_arr = tf.image.decode_image(image_file,
                                                  channels=1,
                                                  expand_animations=False)
                resized_arr = tf.image.resize(image_arr, [image_size, image_size])
            except Exception as ex:
                # Best effort: name the offending file and carry on.
                print(f"Skipping {image_path}: {ex}")
                continue

            data.append([resized_arr, class_number])

    return np.array(data, dtype="object")

# Data Augmentation
Import the necessary packages, then set up the augmentation and resizing of the images.

In [4]:
import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D,Dropout
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [5]:
# Training images: pixel values are scaled by 1/255 and each batch is randomly
# augmented on the fly.
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    rotation_range=20,            # randomly rotate the image up to 20 degrees
    width_shift_range=0.2,        # shift the images left/right
    height_shift_range=0.2,       # shift the images up/down TBD: needs more testing of values
    # NOTE(review): Keras documents shear_range as a shear angle in degrees,
    # so 0.1 is a very small shear rather than "10 degrees" - confirm the
    # intended value.
    shear_range=0.1,
    brightness_range=[0.8, 1.2],  # random brightness factor picked from this range
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode="nearest",
)

# Validation and test images are only rescaled, never augmented.
valid_datagen = ImageDataGenerator(rescale=1. / 255)
test_datagen = ImageDataGenerator(rescale=1. / 255)



In [6]:
# Directory iterators for the three splits. Paths are derived from
# `datasetsPath` so the dataset location is configured in exactly one place.
# Every iterator yields batches of 64 RGB images resized to
# image_size x image_size with one-hot ("categorical") labels.

# Training data is shuffled (reproducibly, via the seed).
train_generator = train_datagen.flow_from_directory(
    os.path.join(datasetsPath, "train"),
    batch_size=64,
    target_size=(image_size, image_size),
    class_mode="categorical",
    shuffle=True,
    seed=42,
    color_mode="rgb",
)

# Evaluation data keeps its on-disk order: shuffling does not change the
# metrics, and a fixed order keeps `.classes` / `.filenames` aligned with
# anything predicted from these iterators.
test_generator = test_datagen.flow_from_directory(
    os.path.join(datasetsPath, "test"),
    batch_size=64,
    target_size=(image_size, image_size),
    class_mode="categorical",
    shuffle=False,
    seed=42,
    color_mode="rgb",
)

valid_generator = valid_datagen.flow_from_directory(
    os.path.join(datasetsPath, "val"),
    batch_size=64,
    target_size=(image_size, image_size),
    class_mode="categorical",
    shuffle=False,
    seed=42,
    color_mode="rgb",
)

Found 5216 images belonging to 2 classes.
Found 624 images belonging to 2 classes.
Found 16 images belonging to 2 classes.


In [7]:
# Mapping "class name : index" as reported by the training iterator.
class_labels = train_generator.class_indices
print(class_labels)

# Flipped copy of the mapping, i.e. "index : class name".
class_name = {index: name for name, index in class_labels.items()}
print(class_name)

{'normal': 0, 'pneumonia': 1}
{0: 'normal', 1: 'pneumonia'}


In [8]:
# Convolutional base: VGG19 with ImageNet weights and without its own dense
# classifier (include_top=False).
base_model = VGG19(
    weights="imagenet",
    include_top=False,
    input_shape=(image_size, image_size, 3),
)

# Freeze every layer of the base so that only the new head below is trained.
for layer in base_model.layers:
    layer.trainable = False

# New classification head on top of the flattened base output.
# TODO: test and change around the neuron counts
flat = Flatten()(base_model.output)

class_1 = Dense(units=4608, activation="relu")(flat)
dropout = Dropout(rate=0.2)(class_1)

class_2 = Dense(units=1152, activation="relu")(dropout)
# Two softmax units, one per class.
output_layer = Dense(units=2, activation="softmax")(class_2)

model_01 = Model(inputs=base_model.inputs, outputs=output_layer)
model_01.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [9]:
# File the ModelCheckpoint callback writes the best model to.
checkpointPath = "./modelCheckpoints/model.keras"

# Stop once validation loss has not improved for `patience` epochs.
# restore_best_weights=True rolls the model back to its best epoch, so the
# model that is saved and evaluated afterwards is not the (worse) state of
# the last epoch that was run.
early_stop = EarlyStopping(monitor= "val_loss",
                   verbose= 1,
                   mode= "min",
                   patience= 4, # TODO: play around with patience
                   restore_best_weights=True)

# Keep the full model (not just weights) with the lowest validation loss,
# checked at the end of every epoch.
# NOTE(review): this same callback object is passed to every fit() call in
# the notebook, so all runs write to the same file - confirm that is intended.
checkpoint = ModelCheckpoint(checkpointPath,
                monitor= "val_loss",
                save_best_only=True,
                save_weights_only=False,
                save_freq= "epoch",
                mode= "auto")

# TODO: play around with the values
# Halve the learning rate when validation accuracy has not improved for
# `patience` epochs. The floor is kept well below the smallest starting rate
# used in this notebook (1e-4 for the fine-tuning runs): with a floor equal
# to the starting rate the callback could never lower it.
rate_reduction = ReduceLROnPlateau(monitor="val_accuracy",
                        patience= 3,
                        verbose= 1,
                        factor= 0.5, # new_lr = lr * factor once patience is exhausted
                        min_lr= 0.000001)

# Stochastic Gradient Descent
sgd = SGD(learning_rate= 0.001, # Higher value for training
        momentum= 0.5,
        nesterov=True)


# One-hot labels + softmax output -> categorical cross-entropy.
model_01.compile(loss="categorical_crossentropy", optimizer= sgd, metrics=["accuracy"])


In [10]:
# First training run: the VGG19 base is frozen, only the new head is trained.
# NOTE(review): 250 steps x 64 images is roughly three passes over the 5216
# training images per "epoch" - confirm this is intended.
history_01 = model_01.fit(
    x=train_generator,
    validation_data=valid_generator,
    epochs=15,
    steps_per_epoch=250,
    callbacks=[early_stop, checkpoint, rate_reduction],
)

Epoch 1/15


In [11]:
# Create the output folder if needed. exist_ok=True makes this safe to re-run
# and avoids the check-then-create race of isdir() + mkdir().
os.makedirs("modelWeights/", exist_ok=True)

# Persist the complete model (architecture + weights), replacing any earlier file.
model_01.save(filepath = "modelWeights/vgg19_model_01.keras", overwrite=True)

In [12]:
# Reload the weights from the file written by the save cell, then score the
# model on the validation and test iterators. Each result holds what
# `evaluate` returns for the compiled loss and metrics.
model_01.load_weights(filepath="modelWeights/vgg19_model_01.keras")

vgg_val_eval_01 = model_01.evaluate(x=valid_generator)
vgg_test_eval_01 = model_01.evaluate(x=test_generator)



In [None]:
# Index 0 is the loss, index 1 the accuracy metric the model was compiled with.
print(f"Validation Loss: {vgg_val_eval_01[0]}")
print(f"Validation Accuracy: {vgg_val_eval_01[1]}")
print(f"Test Loss: {vgg_test_eval_01[0]}")
print(f"Test Accuracy: {vgg_test_eval_01[1]}")

In [None]:
# TODO: research if It's better to copy-paste the previous or just use it like this
# Copy-pasting should allow for changing of variables
# model_02 is wired from the same base_model / output_layer objects that
# model_01 was built from.
model_02 = Model(base_model.inputs, output_layer)

model_02.load_weights("modelWeights/vgg19_model_01.keras")

# Partial fine-tuning: only these two block-5 convolutions of the base are
# made trainable; every other base layer is frozen.
fine_tune_layers = ('block5_conv3', 'block5_conv4')
for layer in base_model.layers:
    layer.trainable = layer.name in fine_tune_layers

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_02.summary()

In [None]:
# Stochastic Gradient Descent with Nesterov momentum.
sgd = SGD(
    learning_rate=0.0001,  # low value for fine tuning
    momentum=0.5,
    nesterov=True,
)

# Compiled after the trainable flags of the base layers were changed.
model_02.compile(
    optimizer=sgd,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Second training run: fine-tune model_02 with the same callbacks as before.
# NOTE(review): 200 steps x 64 images is well over one pass across the 5216
# training images per "epoch" - confirm this is intended.
history_02 = model_02.fit(
    x=train_generator,
    validation_data=valid_generator,
    epochs=10,
    steps_per_epoch=200,
    callbacks=[early_stop, checkpoint, rate_reduction],
)

In [None]:
# Create the output folder if needed. exist_ok=True makes this safe to re-run
# and avoids the check-then-create race of isdir() + mkdir().
os.makedirs("modelWeights/", exist_ok=True)
# Persist the complete model (architecture + weights), replacing any earlier file.
model_02.save(filepath = "modelWeights/vgg19_model_02.keras", overwrite=True)

In [None]:
# Reload the weights from the file written by the save cell, then score the
# model on the validation and test iterators.
model_02.load_weights("modelWeights/vgg19_model_02.keras")

vgg_val_eval_02 = model_02.evaluate(valid_generator)
vgg_test_eval_02 = model_02.evaluate(test_generator)

# Index 0 is the loss, index 1 the accuracy metric the model was compiled with.
print(f"Validation Loss: {vgg_val_eval_02[0]}")
print(f"Validation Accuracy: {vgg_val_eval_02[1]}")
print(f"Test Loss: {vgg_test_eval_02[0]}")
print(f"Test Accuracy: {vgg_test_eval_02[1]}")

In [None]:
# Fine tuning the entire network (Second time)

# A fresh VGG19 base without the dense top. No layer is frozen here, so the
# whole network takes part in training.
base_model = VGG19(include_top=False, input_shape=(image_size, image_size,3))

x = base_model.output
flat = Flatten()(x)

# Same head layout as the earlier models so the saved weights fit.
class_1 = Dense(4608, activation = 'relu')(flat)
dropout = Dropout(0.2)(class_1)
class_2 = Dense(1152, activation = 'relu')(dropout)
output = Dense(2, activation = 'softmax')(class_2)

model_03 = Model(base_model.inputs, output)
# Start from the weights produced by the partial fine-tuning run.
model_03.load_weights("modelWeights/vgg19_model_02.keras")

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_03.summary()

In [None]:
# Stochastic Gradient Descent with Nesterov momentum.
sgd = SGD(
    learning_rate=0.0001,  # Low initial value = more gradual updates
    momentum=0.8,  # TODO: play around with the momentum
    nesterov=True,
)
model_03.compile(
    optimizer=sgd,
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Third training run: fine-tune the whole network with the same callbacks.
# NOTE(review): 100 steps x 64 images is slightly more than one pass over the
# 5216 training images per "epoch" - confirm this is intended.
history_03 = model_03.fit(
    x=train_generator,
    validation_data=valid_generator,
    epochs=5,
    steps_per_epoch=100,
    callbacks=[early_stop, checkpoint, rate_reduction],
)




In [None]:
# Score the fully fine-tuned model on the validation and test iterators.
vgg_val_eval_03 = model_03.evaluate(valid_generator)
vgg_test_eval_03 = model_03.evaluate(test_generator)

# Index 0 is the loss, index 1 the accuracy metric the model was compiled with.
print(f"Validation Loss: {vgg_val_eval_03[0]}")
print(f"Validation Accuracy: {vgg_val_eval_03[1]}")
print(f"Test Loss: {vgg_test_eval_03[0]}")
print(f"Test Accuracy: {vgg_test_eval_03[1]}")

In [None]:
# Create the output folder if needed. exist_ok=True makes this safe to re-run
# and avoids the check-then-create race of isdir() + mkdir().
os.makedirs("modelWeights/", exist_ok=True)
# Persist the complete model (architecture + weights), replacing any earlier file.
model_03.save(filepath = "modelWeights/vgg_unfrozen.keras", overwrite=True)