# Overview

## Dataset

A public dataset from kaggle was used with minor altering in the naming patterns. The original can be found [here](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia)

The dataset is split into 3 folders:
- `test`
- `train`
- `val`

Each folder is futher split into `normal` and `pneumonia`. There a total of 5863 files stored as JPEGs.

# Setup

Import all required packages and set the base path for the datasets

In [53]:
import numpy as np
import pandas as pd
import os
import cv2
import skimage
import seaborn as sns

datasetsPath = "./datasets"

print("Path to dataset files:", datasetsPath)

labels = ['pneumonia', 'normal']
image_size = 224

Path to dataset files: ./datasets


In [54]:
def get_data (dir_relative_path):
    data = []
    for label in labels:
        currentPath = os.path.join(datasetsPath, dir_relative_path, label)
        print("Working inside folder: ", currentPath)
        class_number = labels.index(label)

        for image in os.listdir(currentPath):
            try:
                if image.endswith('.DS_Store'):
                    continue

                image_arr = cv2.imread(os.path.join(currentPath, image), cv2.IMREAD_GRAYSCALE)
                
                #print(os.path.join(currentPath, image))
                
                resized_arr = cv2.resize(image_arr, (image_size, image_size))
                
                data.append([resized_arr, class_number])
            except Exception as ex:
                print(ex)

    return np.array(data, dtype="object")

In [55]:
train = get_data('train/')
test = get_data('test/')
val = get_data("val")

Working inside folder:  ./datasets\train/pneumonia
Working inside folder:  ./datasets\train/normal
Working inside folder:  ./datasets\test/pneumonia
Working inside folder:  ./datasets\test/normal
Working inside folder:  ./datasets\val\pneumonia
Working inside folder:  ./datasets\val\normal


# Dat Augmentation
Import the necessary packages and resize the images

In [56]:
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D,Dropout
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [57]:
train_datagen = ImageDataGenerator(rescale = 1. / 255, 
                                    horizontal_flip=True,
                                    vertical_flip=False,
                                    rotation_range=20, # randomly rotate the image up to 20 degrees
                                    shear_range=0.1, # skew the sides of the images by up to 10 degrees 
                                    width_shift_range=0.2, # shift the images left/right
                                    height_shift_range=0.2, # shift the images up/down TBD: needs more testing of values
                                    brightness_range=[0.8, 1.2],
                                    fill_mode="nearest")

valid_datagen = ImageDataGenerator(rescale= 1. / 255)
test_datagen = ImageDataGenerator(rescale= 1. / 255)



In [58]:
train_generator = train_datagen.flow_from_directory("./datasets/train",
                                  batch_size = 64,
                                  target_size=(image_size, image_size),
                                  class_mode = "categorical",
                                  shuffle=True,
                                  seed = 42,
                                  color_mode= "rgb")

test_generator = test_datagen.flow_from_directory("./datasets/test",
                                  batch_size = 64,
                                  target_size=(image_size, image_size),
                                  class_mode = "categorical",
                                  shuffle=True,
                                  seed = 42,
                                  color_mode= "rgb")

valid_generator = valid_datagen.flow_from_directory("./datasets/val",
                                  batch_size = 64,
                                  target_size=(image_size, image_size),
                                  class_mode = "categorical",
                                  shuffle=True,
                                  seed = 42,
                                  color_mode= "rgb")

Found 5216 images belonging to 2 classes.
Found 624 images belonging to 2 classes.
Found 16 images belonging to 2 classes.


In [59]:
class_labels = train_generator.class_indices

print(class_labels)

# Flip them around so we have "index : value"
class_name = {}

for value, index in class_labels.items():
    class_name[index] = value

print(class_name)

{'normal': 0, 'pneumonia': 1}
{0: 'normal', 1: 'pneumonia'}


In [60]:
base_model = VGG19(input_shape= (image_size, image_size, 3),
      include_top = False,
      weights = "imagenet")

for layer in base_model.layers:
    layer.trainable = False

flat = Flatten()(base_model.output)

# TODO: test and change around the neuron counts

class_1 = Dense(4608, activation= "relu")(flat)
dropout = Dropout(0.2)(class_1)

class_2 = Dense(1152, activation = 'relu')(dropout)
output_layer = Dense(2, activation = 'softmax')(class_2)

model_01 = Model(base_model.inputs, output_layer)
model_01.summary()

In [61]:
checkpointPath = "./modelCheckpoints/model.keras"

early_stop = EarlyStopping(monitor= "val_loss",
                   verbose= 1,
                   mode= "min",
                   patience= 4) # TODO: play around with patience

checkpoint = ModelCheckpoint(checkpointPath,
                monitor= "val_loss",
                save_best_only=True,
                save_weights_only=False,
                save_freq= "epoch",
                mode= "auto")

# TODO: play around with the values
rate_reduction = ReduceLROnPlateau(monitor="val_accuracy",
                        patience= 3,
                        verbose= 1,
                        factor= 0.5, # "learning rate * factor" after patience runs out (? correct word ?)
                        min_lr= 0.0001)

# Stochatic Gradient Descent
sgd = SGD(learning_rate= 0.001, # Higher value for training
        momentum= 0.5,
        nesterov=True)


model_01.compile(loss="categorical_crossentropy", optimizer= sgd, metrics=["accuracy"])


In [62]:
history_01 = model_01.fit(train_generator, 
            steps_per_epoch=250,
            epochs=15, 
            callbacks=[early_stop, checkpoint, rate_reduction],
            validation_data=valid_generator)

  self._warn_if_super_not_called()


Epoch 1/15


Expected: ['keras_tensor_108']
Received: inputs=Tensor(shape=(None, 224, 224, 3))


[1m 82/250[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m7:07[0m 3s/step - accuracy: 0.7528 - loss: 0.5508



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 841ms/step - accuracy: 0.7880 - loss: 0.4712 - val_accuracy: 0.7500 - val_loss: 0.6181 - learning_rate: 0.0010
Epoch 2/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 843ms/step - accuracy: 0.8458 - loss: 0.3273 - val_accuracy: 0.7500 - val_loss: 0.4099 - learning_rate: 0.0010
Epoch 3/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 831ms/step - accuracy: 0.8766 - loss: 0.2792 - val_accuracy: 0.7500 - val_loss: 0.4287 - learning_rate: 0.0010
Epoch 4/15
[1m 82/250[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m7:07[0m 3s/step - accuracy: 0.8911 - loss: 0.2579
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 841ms/step - accuracy: 0.8907 - loss: 0.2564 - val_accuracy: 0.7500 - val_loss: 0.3901 - learning_rate: 0.0010
Epoch 5/15
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [63]:
if not os.path.isdir("modelWeights/"):
    os.mkdir("modelWeights/")

model_01.save(filepath = "modelWeights/vgg19_model_01.keras", overwrite=True)

In [64]:
model_01.load_weights("modelWeights/vgg19_model_01.keras")

vgg_val_eval_01 = model_01.evaluate(valid_generator)
vgg_test_eval_01 = model_01.evaluate(test_generator)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 669ms/step - accuracy: 0.6875 - loss: 0.4744
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - accuracy: 0.8330 - loss: 0.3938


In [65]:
print(f"Validation Loss: {vgg_val_eval_01[0]}")
print(f"Validation Accuarcy: {vgg_val_eval_01[1]}")
print(f"Test Loss: {vgg_test_eval_01[0]}")
print(f"Test Accuarcy: {vgg_test_eval_01[1]}")

Validation Loss: 0.4744485020637512
Validation Accuarcy: 0.6875
Test Loss: 0.38032519817352295
Test Accuarcy: 0.8445512652397156


In [66]:
# TODO: research if It's better to copy-paste the previous or just use it like this
# Copy-pasting should allow for changing of variables
model_02 = Model(base_model.inputs, output_layer)

model_02.load_weights("modelWeights/vgg19_model_01.keras")

set_trainable = False
for layer in base_model.layers:
    if layer.name in [ 'block5_conv3','block5_conv4']:
        set_trainable=True
    else:
        set_trainable = False
    layer.trainable = set_trainable
print(model_02.summary())

None


In [67]:
# Stochatic Gradient Descent
sgd = SGD(learning_rate= 0.0001, # low value for fine tuning
        momentum= 0.5,
        nesterov=True)

model_02.compile(loss="categorical_crossentropy", optimizer= sgd, metrics=["accuracy"])

In [68]:
history_02 = model_02.fit(train_generator, 
            steps_per_epoch=200,
            epochs=10, 
            callbacks=[early_stop, checkpoint, rate_reduction],
            validation_data=valid_generator)

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 1s/step - accuracy: 0.9108 - loss: 0.2104 - val_accuracy: 0.6875 - val_loss: 0.4884 - learning_rate: 1.0000e-04
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 1s/step - accuracy: 0.9180 - loss: 0.2055 - val_accuracy: 0.6875 - val_loss: 0.4884 - learning_rate: 1.0000e-04
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 1s/step - accuracy: 0.9222 - loss: 0.1950 - val_accuracy: 0.7500 - val_loss: 0.5155 - learning_rate: 1.0000e-04
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 1s/step - accuracy: 0.9191 - loss: 0.2002 - val_accuracy: 0.7500 - val_loss: 0.4982 - learning_rate: 1.0000e-04
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 1s/step - accuracy: 0.9251 - loss: 0.1892 - val_accuracy: 0.7500 - val_loss: 0.4918 - learning_rate: 1.0000e-04
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━

In [69]:
if not os.path.isdir('modelWeights/'):
    os.mkdir("modelWeights/")
model_02.save(filepath = "modelWeights/vgg19_model_02.keras", overwrite=True)

In [70]:
model_02.load_weights("modelWeights/vgg19_model_02.keras")

vgg_val_eval_02 = model_02.evaluate(valid_generator)
vgg_test_eval_02 = model_02.evaluate(test_generator)

print(f"Validation Loss: {vgg_val_eval_02[0]}")
print(f"Validation Accuarcy: {vgg_val_eval_02[1]}")
print(f"Test Loss: {vgg_test_eval_02[0]}")
print(f"Test Accuarcy: {vgg_test_eval_02[1]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 695ms/step - accuracy: 0.7500 - loss: 0.4842
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - accuracy: 0.8388 - loss: 0.3844
Validation Loss: 0.4841676354408264
Validation Accuarcy: 0.75
Test Loss: 0.39870065450668335
Test Accuarcy: 0.8317307829856873


In [71]:
# Fine tuning the entire network (Second time)

base_model = VGG19(include_top=False, input_shape=(image_size, image_size,3))

x = base_model.output
flat = Flatten()(x)

class_1 = Dense(4608, activation = 'relu')(flat)
dropout = Dropout(0.2)(class_1)
class_2 = Dense(1152, activation = 'relu')(dropout)
output = Dense(2, activation = 'softmax')(class_2)

model_03 = Model(base_model.inputs, output)
model_03.load_weights("modelWeights/vgg19_model_02.keras")

print(model_03.summary())

None


In [76]:
# Stochatic Gradient Descent
sgd = SGD(learning_rate= 0.0001, # Low initial value = more gradual updates
        momentum= 0.8, # TODO: play around with the momentum
        nesterov=True)
model_03.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=['accuracy'])

In [None]:
history_03 = model_03.fit(train_generator, 
            steps_per_epoch=100,
            epochs=5, 
            callbacks=[early_stop, checkpoint, rate_reduction],
            validation_data=valid_generator)




Epoch 1/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m626s[0m 3s/step - accuracy: 0.9515 - loss: 0.1320 - val_accuracy: 0.7500 - val_loss: 0.3855 - learning_rate: 1.0000e-04
Epoch 2/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m626s[0m 3s/step - accuracy: 0.9504 - loss: 0.1284 - val_accuracy: 0.7500 - val_loss: 0.4925 - learning_rate: 1.0000e-04
Epoch 3/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m631s[0m 3s/step - accuracy: 0.9583 - loss: 0.1124 - val_accuracy: 0.7500 - val_loss: 0.4689 - learning_rate: 1.0000e-04
Epoch 4/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m624s[0m 3s/step - accuracy: 0.9555 - loss: 0.1089 - val_accuracy: 0.8125 - val_loss: 0.3776 - learning_rate: 1.0000e-04
Epoch 5/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m617s[0m 3s/step - accuracy: 0.9574 - loss: 0.1041 - val_accuracy: 0.7500 - val_loss: 0.3865 - learning_rate: 1.0000e-04


In [80]:
vgg_val_eval_03 = model_03.evaluate(valid_generator)
vgg_test_eval_03 = model_03.evaluate(test_generator)

print(f"Validation Loss: {vgg_val_eval_03[0]}")
print(f"Validation Accuarcy: {vgg_val_eval_03[1]}")
print(f"Test Loss: {vgg_test_eval_03[0]}")
print(f"Test Accuarcy: {vgg_test_eval_03[1]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 692ms/step - accuracy: 0.7500 - loss: 0.3865
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - accuracy: 0.8789 - loss: 0.3850
Validation Loss: 0.3864728808403015
Validation Accuarcy: 0.75
Test Loss: 0.38052311539649963
Test Accuarcy: 0.875


In [79]:
if not os.path.isdir('modelWeights/'):
    os.mkdir("modelWeights/")
model_03.save(filepath = "modelWeights/vgg_unfrozen.keras", overwrite=True)