In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the PNEUMONIA test folder. The per-file print is left disabled (the listing
# is long); `pass` gives the inner loop a body so the cell no longer fails to parse.
for dirname, _, filenames in os.walk('../input/chest-xray-pneumonia/chest_xray/test/PNEUMONIA/'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing necessary libraries and hard-coded values
import pandas as pd
import numpy as np
import tensorflow as tf
import os

import keras
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import layers 
from tensorflow.keras import Model
from tensorflow.keras import Input

from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split


# Use a TPU when one can be resolved; otherwise fall back to the default
# distribution strategy so the notebook still runs on CPU/GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # TPUStrategy has moved out of the deprecated `experimental` namespace.
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report why the fallback was taken.
    print('TPU not available, falling back to the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)


# Hard-coded values
IMAGE_SIZE = [224,224]  # [height, width] every image is resized to
EPOCHS = 25
# Global batch size: 32 samples per replica
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [None]:
# Pool the image paths from the dataset's train/ and val/ folders, then re-split
# them 80/20 into our own training and validation sets.
filenames = tf.io.gfile.glob('../input/chest-xray-pneumonia/chest_xray/train/*/*')
filenames.extend(tf.io.gfile.glob('../input/chest-xray-pneumonia/chest_xray/val/*/*'))

# A fixed seed makes the split identical on every "Restart & Run All"; stratifying
# on the class (taken from the directory name in the path) keeps the
# NORMAL/PNEUMONIA ratio the same in both subsets.
SPLIT_SEED = 42
train_filenames, val_filenames = train_test_split(
    filenames,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=["PNEUMONIA" in path for path in filenames],
)

In [None]:
# Total number of image paths collected before the train/validation split
num_files = len(filenames)
print(num_files)

In [None]:
# Wrap each list of file paths in a tf.data.Dataset of string elements
train_list_ds, val_list_ds = (
    tf.data.Dataset.from_tensor_slices(paths)
    for paths in (train_filenames, val_filenames)
)

In [None]:
# Print the first six validation paths as a quick sanity check
for element in val_list_ds.take(6):
    print(element)

In [None]:
# Per-class tallies: the class is identified by the directory name inside each path
COUNT_NORMAL = sum(1 for path in train_filenames if "NORMAL" in path)
COUNT_PNEUMONIA = sum(1 for path in train_filenames if "PNEUMONIA" in path)

# Dataset sizes, read from the tf.data cardinality of each path dataset
TRAIN_IMG_COUNT, VAL_IMG_COUNT = (
    tf.data.experimental.cardinality(path_ds).numpy()
    for path_ds in (train_list_ds, val_list_ds)
)

print("Normal images count in training set: " + str(COUNT_NORMAL))
print("Pneumonia images count in training set: " + str(COUNT_PNEUMONIA))
print("Training images count: " + str(TRAIN_IMG_COUNT))
print("Validating images count: " + str(VAL_IMG_COUNT))

In [None]:
# Class names are the sub-directory names directly under train/.
# - os.path.basename replaces the str(bytes)[2:-1] slicing trick, which would
#   mangle any name containing non-ASCII or escaped characters.
# - Non-directory entries are skipped so stray files cannot be counted as classes.
# - Sorting makes the order deterministic across runs.
CLASS_NAMES = np.array(sorted(
    os.path.basename(os.path.normpath(path))
    for path in tf.io.gfile.glob("../input/chest-xray-pneumonia/chest_xray/train/*")
    if os.path.isdir(path)
))
CLASS_NAMES

In [None]:
# Each filename has to be mapped to an (image, label) pair. The label is a boolean:
# True for a pneumonia image, False for a normal one.
def get_label(file_path):
    """Return a boolean tensor that is True when the file sits in a PNEUMONIA directory.

    Args:
        file_path: string tensor holding the path of one image file.

    Returns:
        The result of comparing the parent directory name with "PNEUMONIA".
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    # The component just before the file name is the class directory
    class_dir = path_components[-2]
    return class_dir == "PNEUMONIA"

In [None]:
def decode_img(img):
    """Decode raw JPEG bytes into a float32 RGB image resized to IMAGE_SIZE.

    Args:
        img: the encoded JPEG contents, as returned by tf.io.read_file.

    Returns:
        The decoded image after dtype conversion and resizing to IMAGE_SIZE.
    """
    # Decode to a uint8 tensor with three (RGB) channels
    rgb_uint8 = tf.image.decode_jpeg(img, channels=3)
    # Converting integer pixels to float rescales them into the [0, 1] range
    rgb_float = tf.image.convert_image_dtype(rgb_uint8, tf.float32)
    # Bring every image to the resolution fixed in IMAGE_SIZE
    return tf.image.resize(rgb_float, IMAGE_SIZE)
    

In [None]:
def process_path(file_path):
    """Turn an image path into an (image, label) pair.

    Args:
        file_path: string tensor holding the path of one image file.

    Returns:
        Tuple of the image produced by decode_img and the label produced by get_label.
    """
    # Read the encoded file contents, then decode/resize them
    raw_bytes = tf.io.read_file(file_path)
    return decode_img(raw_bytes), get_label(file_path)

In [None]:
# Convert both path datasets into datasets of (image, label) pairs, letting
# tf.data choose the level of parallelism for the mapping.
train_ds, val_ds = (
    path_ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    for path_ds in (train_list_ds, val_list_ds)
)

In [None]:
import matplotlib.pyplot as plt

# Show the dataset spec, then the shape and label of a single example.
print(train_ds)
for image, label in train_ds.take(1):
    # Print the shape only; the original dumped the whole pixel array here.
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())

In [None]:
# Build the test pipeline: list the files, count them, decode them and batch them
test_list_ds = tf.data.Dataset.list_files('../input/chest-xray-pneumonia/chest_xray/test/*/*')
TEST_IMAGE_COUNT = tf.data.experimental.cardinality(test_list_ds).numpy()
test_ds = (
    test_list_ds
    .map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
)

TEST_IMAGE_COUNT

# Visualize the dataset

In [None]:
# Buffered prefetching lets batches be produced while the model is busy training
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    """Cache, shuffle, repeat, batch and prefetch a dataset.

    Args:
        ds: dataset to prepare.
        cache: falsy to skip caching; a string is passed to ds.cache() as its
            argument; any other truthy value calls ds.cache() with no argument.
        shuffle_buffer_size: buffer size handed to ds.shuffle().

    Returns:
        The dataset after shuffle -> repeat -> batch(BATCH_SIZE) -> prefetch.
    """
    if cache:
        ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)  # shuffle through a bounded buffer
        .repeat()                                    # repeat forever
        .batch(BATCH_SIZE)                           # group consecutive elements into batches
        .prefetch(buffer_size=tf.data.AUTOTUNE)      # fetch upcoming batches in the background
    )

In [None]:
# Run both splits through the training pipeline, then pull one training batch
train_ds, val_ds = (prepare_for_training(split_ds) for split_ds in (train_ds, val_ds))

train_iterator = iter(train_ds)
image_batch, label_batch = next(train_iterator)

In [None]:
def show_batch(image_batch, label_batch):
    """Draw the first 15 images of a batch on a 3x5 grid, titled with their class.

    Args:
        image_batch: indexable collection of images accepted by imshow.
        label_batch: indexable collection of labels; truthy means PNEUMONIA.
    """
    n_rows, n_cols = 3, 5
    f, ax = plt.subplots(n_rows, n_cols, figsize=(30,10))
    for n in range(n_rows * n_cols):
        row, col = divmod(n, n_cols)
        cell = ax[row, col]
        cell.imshow(image_batch[n], cmap = 'gray')
        cell.set_title("PNEUMONIA" if label_batch[n] else "NORMAL")
        cell.axis('off')

In [None]:
# Convert the sampled batch to NumPy arrays before plotting it
images_np = image_batch.numpy()
labels_np = label_batch.numpy()
show_batch(images_np, labels_np)

# Loading and Base Model Training

In [None]:
# VGG16

# Create the layers inside the strategy scope so their variables belong to the
# same distribution strategy the model is later compiled under.
with strategy.scope():
    base_model = tf.keras.applications.VGG16(
        # (height, width, channels); the original used IMAGE_SIZE[0] for both sides
        input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3),
        weights='imagenet',
        include_top=False)

    # Freeze the pretrained convolutional base; only the new head is trained
    for layer in base_model.layers:
        layer.trainable = False

    x = layers.Flatten()(base_model.output)
    x = layers.Dense(512, activation='relu')(x)
    # x = layers.Dropout(0.05)(x)
    # Binary task trained with binary_crossentropy: one sigmoid unit gives
    # P(PNEUMONIA). Softmax over a single unit always outputs 1.0, so the
    # original head could not learn anything.
    output = layers.Dense(1, activation='sigmoid')(x)

In [None]:
# Tie the VGG16 input to the new classification head and print the architecture
model = Model(base_model.input, output)
model.summary()

In [None]:
# base_model = tf.keras.applications.EfficientNetB5(
#     include_top=False,
#     weights='imagenet',
#     input_shape = (IMAGE_SIZE[0], IMAGE_SIZE[1], 3)
# )

# base_model.trainable = True

# inputs = Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
# x = base_model(inputs)
# x = layers.GlobalAveragePooling2D()(x)
# x = layers.Dropout(0.25)(x)
# output = layers.Dense(1, activation='sigmoid')(x)

# model = Model(inputs, output)

# Correct for data imbalance

In [None]:
# Natural logarithm of the PNEUMONIA / NORMAL ratio in the training split.
initial_bias = np.log([COUNT_PNEUMONIA/COUNT_NORMAL])
# initial_bias is a one-element array; index it so %f receives a scalar instead of
# relying on NumPy's deprecated implicit array-to-scalar conversion.
print("Pneumonia: %i , Normal: %i, Initial_bias: %f " %(COUNT_PNEUMONIA, COUNT_NORMAL, initial_bias[0]))

In [None]:
# Each class is weighted by total / (2 * class_count), so the rarer class counts more.
# e.g. with 4185 training images (1086 NORMAL, 3099 PNEUMONIA):
#   w0 = (4185/1086)/2 ≈ 1.93
#   w1 = (4185/3099)/2 ≈ 0.68
class_weight = {
    label: (1 / class_count) * (TRAIN_IMG_COUNT) / 2.0
    for label, class_count in enumerate((COUNT_NORMAL, COUNT_PNEUMONIA))
}
weight_for_0, weight_for_1 = class_weight[0], class_weight[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Precision is the number of true positives (TP) over the sum of TP and false positives (FP). It shows what fraction of the predicted positives are actually correct.

Recall is the number of TP over the sum of TP and false negatives (FN). It shows what fraction of the actual positives are correctly identified.

In [None]:
# Create the optimizer and metrics inside the strategy scope, alongside compile().
with strategy.scope():
    optAdam = tf.keras.optimizers.Adam(learning_rate=0.001)

    METRICS = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    # Pass the full METRICS list. The original compiled with ['accuracy'] only,
    # so precision/recall were never tracked and the final evaluate() call,
    # which unpacks loss/accuracy/precision/recall, could not succeed.
    model.compile(
        optimizer=optAdam,
        loss='binary_crossentropy',
        metrics=METRICS
    )

# Finetune the model

In [None]:
# Import from tensorflow.keras, matching the rest of the notebook's Keras imports.
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
filepath = 'best_model.h5'

# Stop training once val_loss has not improved for 5 consecutive epochs
early_stopping_cb = EarlyStopping(patience=5,
                                monitor='val_loss',
                                mode='min',
                                verbose=1)

# Keep only the weights with the lowest val_loss. The instance gets its own name:
# rebinding `ModelCheckpoint` to an instance shadowed the class and made this
# cell fail when run a second time.
checkpoint_cb = ModelCheckpoint(filepath,
                             monitor='val_loss',
                             mode='min',
                             save_best_only=True,
                             verbose=1)

# Halve the learning rate after 3 epochs without val_loss improvement (floor 2.5e-5)
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=3, verbose=1, factor=0.5, min_lr=2.5e-5)

callbacks_list = [checkpoint_cb, early_stopping_cb, learning_rate_reduction]

In [None]:
no_epochs = 35

# Both datasets repeat indefinitely, so the number of batches per epoch is given
# explicitly for training and for validation.
history = model.fit(
    train_ds,
    epochs=no_epochs,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    callbacks=callbacks_list,
    class_weight=class_weight,
)

In [None]:
# Accuracy and loss curves for the training run.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Take the x-axis from the number of epochs actually recorded. A hard-coded
# range(10) mismatches the history length whenever training runs for a
# different number of epochs (early stopping, no_epochs != 10).
epochs_range = range(len(acc))

plt.figure(figsize=(15, 15))
plt.subplot(2, 2, 1)
plt.plot(epochs_range, acc, label='accuracy')
plt.plot(epochs_range, val_acc, label='val_accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')


plt.subplot(2, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
# history = model.fit(
#     train_ds,
#     steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
#     epochs=100,
#     validation_data=val_ds,
#     validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
#     class_weight=class_weight,
#     callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler]
# )

In [None]:
# Load the checkpoint with the same Keras implementation (tensorflow.keras) that
# built and saved the model, instead of the standalone `keras` package.
from tensorflow.keras.models import load_model

loaded_model = load_model('./best_model.h5')

# Visualizing model performance

In [None]:
# One panel per metric that was actually recorded during training. The original
# allocated four panels but filled only two, leaving empty axes.
metric_names = [met for met in ['accuracy', 'precision', 'recall', 'loss']
                if met in history.history]

# squeeze=False keeps `ax` an array even when a single metric is plotted
fig, ax = plt.subplots(1, len(metric_names), figsize=(5 * len(metric_names), 3), squeeze=False)
ax = ax.ravel()

for i, met in enumerate(metric_names):
    ax[i].plot(history.history[met])
    ax[i].plot(history.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

# Predict and evaluate results

In [None]:
# Evaluate the best checkpoint on the test set. Results are read by metric name:
# unpacking a fixed four values raises ValueError whenever the model was compiled
# with a different number of metrics.
results = loaded_model.evaluate(test_ds, return_dict=True)
loss = results['loss']
acc = results.get('accuracy')
prec = results.get('precision')  # None if precision was not tracked at compile time
rec = results.get('recall')      # None if recall was not tracked at compile time

for metric_name, metric_value in results.items():
    print('{}: {:.4f}'.format(metric_name, metric_value))