<a href="https://colab.research.google.com/github/MS1997/X-ray-classification/blob/master/Xray.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
 # installing kaggle library to import the data directly into Colab notebook
 ! pip install -q kaggle

In [0]:
# Opens a file picker: select the kaggle.json file (Kaggle API key) from the local system.
# The following cell copies kaggle.json from the working directory, so it must be uploaded under that name.
from google.colab import files
uploaded = files.upload()

In [0]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Download the chest x-ray pneumonia data set from Kaggle (uses the API key configured above).
# The next cell expects the archive as chest-xray-pneumonia.zip in the working directory.
! kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

In [0]:
# unzipping the files
! unzip chest-xray-pneumonia.zip -d chest_xray

In [0]:
# List the working directory to confirm that the archive and the extracted chest_xray folder are present
! ls 

In [0]:
# importing required packages
import os
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
from keras.applications import ResNet50
from keras.applications.resnet50 import preprocess_input
from keras.models import Model
from keras.layers import Dropout, GlobalAveragePooling2D, Dense, BatchNormalization
from keras.callbacks import EarlyStopping, Callback
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator used in this notebook (Python, NumPy, TensorFlow)
# so that the reported test-set output can be reproduced.

SEED = 1
import os
import random as rn
import numpy as np
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here only
# affects processes launched afterwards, not the kernel that is already running.
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
rn.seed(SEED)

In [0]:
# Contents of the extracted data set: we see there are train, test & validation sets
os.listdir('chest_xray/chest_xray')

In [0]:
# Locations of the three data set splits inside the extracted archive.
# Each split folder is expected to hold one sub-folder per class (NORMAL and PNEUMONIA).
train_path, test_path, val_path = (
    'chest_xray/chest_xray/train',
    'chest_xray/chest_xray/test',
    'chest_xray/chest_xray/val',
)

In [0]:
# lets get a count of normal & pneumonia images in each of the set folders
def get_count(given_path):
  """Draw a bar chart with the number of files in the NORMAL and PNEUMONIA folders.

  Args:
    given_path: path of one data set split; it must contain the sub-folders
      `NORMAL` and `PNEUMONIA`.

  Returns:
    None. The chart is shown with `plt.show()`.

  Every directory entry is counted, so any non-image file in those folders
  is included in the totals.
  """
  count_dict = {
      'Normal': len(os.listdir(os.path.join(given_path, 'NORMAL'))),
      'Pneumonia': len(os.listdir(os.path.join(given_path, 'PNEUMONIA'))),
  }
  # x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
  sns.barplot(x=list(count_dict.keys()), y=list(count_dict.values()))
  plt.show()

In [0]:
# class balance of the training split
get_count(train_path)

In [0]:
# class balance of the test split
get_count(test_path)

In [0]:
# class balance of the validation split
get_count(val_path)

In [0]:
# selecting a random normal & pneumonia chest xray from the training set
# os.listdir returns the entries in arbitrary order, so the listing is sorted first;
# otherwise the seeded rn.choice would not select the same files on every run/machine
choice_n = rn.choice(sorted(os.listdir(train_path + '/NORMAL')))
choice_p = rn.choice(sorted(os.listdir(train_path + '/PNEUMONIA')))

In [0]:
# opening the two randomly chosen x-rays (one per class) with PIL
norm_image = Image.open(f'{train_path}/NORMAL/{choice_n}')
pne_image = Image.open(f'{train_path}/PNEUMONIA/{choice_p}')

In [0]:
# plotting the two images side by side, with a title on each panel so the classes can be told apart
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(norm_image,cmap="gray") # cmap="gray" required to keep the images black and white
ax1.set_title('Normal')
ax2.imshow(pne_image,cmap="gray")
ax2.set_title('Pneumonia')
plt.show()

In [0]:
# array shape of both sample images, e.g. (1909, 1906, 1) and (624, 1112, 1) in the original run
# we see that the images are of different sizes, so we need to resize these to make them same in size
for sample_image in (norm_image, pne_image):
    print(img_to_array(sample_image).shape)

In [0]:
# to delete directories
# import shutil
# shutil.rmtree('chest_xray/chest_xray/augmented')

In [0]:
# folder in which augmented images of the training set are saved
# exist_ok=True keeps the cell re-runnable (os.mkdir raises FileExistsError on the second run)
os.makedirs('chest_xray/chest_xray/augmented', exist_ok=True)

In [0]:
# Image pre processing for resnet50
# color_mode is kept as 'rgb' because pre trained models like resnet50 require the input to have 3 channels
# (grayscale images have only one channel)
# batch_size is set to 32: each step takes 32 images and gives back 32 randomly transformed images;
# in the next epoch different random transformations of the images are produced
# All 3 sets (including the augmented training images) go through the pre processing function of resnet50
save_here = 'chest_xray/chest_xray/augmented'

# train data generator (random shifts, shear, zoom and horizontal flips)
train_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input,height_shift_range=0.2,width_shift_range=0.2,shear_range = 0.2, horizontal_flip=True,zoom_range=0.2)

# Preview iterator: writes exactly ONE batch of augmented images to `save_here` for visual inspection.
# save_to_dir is deliberately kept off the training iterator below, because an iterator created with
# save_to_dir writes every batch it produces to disk, i.e. all batches of all epochs during training.
preview_batches = train_data_generator.flow_from_directory(directory= train_path,target_size= (150,150),color_mode='rgb',batch_size = 32,seed=101,save_to_dir=save_here,save_prefix="new",save_format="jpeg")
next(preview_batches)

# training iterator: same augmentation, nothing saved to disk
tr_batches = train_data_generator.flow_from_directory(directory= train_path,target_size= (150,150),color_mode='rgb',batch_size = 32,seed=101)

# validation data generator (pre processing only, no augmentation)
val_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)
val_batches = val_data_generator.flow_from_directory(directory= val_path,target_size= (150,150),color_mode='rgb',seed=101)

# test data generator (pre processing only); shuffle=False keeps the predictions aligned with test_batches.classes
test_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)
test_batches = test_data_generator.flow_from_directory(directory= test_path,target_size= (150,150),color_mode='rgb',shuffle=False)

In [0]:
# file names of the augmented images that were written to the augmented folder
list_aug = os.listdir('chest_xray/chest_xray/augmented')

In [0]:
# number of saved files: one randomly transformed copy of each of the 32 original
# training images in the batch that was drawn from the training generator
len(os.listdir('chest_xray/chest_xray/augmented'))

In [0]:
# load the first augmented image and check that it has the resized 3-channel shape
first_augmented = f'chest_xray/chest_xray/augmented/{list_aug[0]}'
img = load_img(first_augmented)
data = img_to_array(img)
data.shape #(150, 150, 3)

In [0]:
# the first 9 augmented images from the train generator, arranged on a 3x3 grid
fig = plt.figure(figsize=(15, 15))
for i in range(9):
  image_file = 'chest_xray/chest_xray/augmented/' + list_aug[i]
  img = load_img(image_file)
  data = img_to_array(img)
  # panel i+1 of the 3x3 grid
  fig.add_subplot(3, 3, i + 1)
  pixels = np.squeeze(data.astype('uint8'))
  plt.imshow(pixels, cmap='gray')
plt.show()
# note the color change due to pre processing

In [0]:
# Initial model: the resnet50 architecture with imagenet weights plus our own classification head.
# No layer is frozen in this cell, so all weights of the network are updated during training.
# include_top=False because we do not want the final pooling and fully connected layer of resnet50; we add our own layers.
# (The unused Sequential() placeholder and the `layers=` argument of the original cell were dropped:
#  the placeholder was overwritten below and `layers` is not an argument of ResNet50.)
base_model = ResNet50(weights= 'imagenet',include_top=False,input_shape=(150,150,3))
x = base_model.output # output feature maps of the resnet model
# additional layers
x = Dropout(0.5)(x)
x = GlobalAveragePooling2D()(x)
x = Dense(64)(x) # NOTE(review): no activation is given, so this layer is linear - confirm this is intended
x = Dense(2,activation='softmax')(x) # one output per class
model = Model(inputs=base_model.input, outputs=x)
# early stopping: stop once val_loss has not improved for 2 epochs
callback = EarlyStopping(monitor='val_loss',patience= 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# steps_per_epoch = number of batches in one pass over the training set, taken from the iterator
# instead of being hard-coded: ceil(num_samples/batch_size) = ceil((3875+1341)/32) = 163 for this data set
history_1 = model.fit_generator(tr_batches,validation_data=val_batches,epochs=10,steps_per_epoch=len(tr_batches),callbacks=[callback])

In [0]:
 # test-set evaluation of the first model; the trailing values were recorded in the original run
 model.evaluate_generator(test_batches) #loss, accuracy =  [0.48507723212242126, 0.8237179517745972]

In [0]:
# Second model: resnet50 with imagenet weights in which only the batch normalization layers are trained
# include_top=False because we do not want to include the final pooling and fully connected layer of the model, we will add our own
base_model = ResNet50(weights='imagenet',include_top=False,input_shape=(150,150,3))

# freezing all layers except the batch normalization layers
# NOTE(review): this relies on the backbone layers being instances of the BatchNormalization
# class imported at the top of the notebook - confirm that at least one layer stays trainable
for layer in base_model.layers:
    layer.trainable = isinstance(layer, BatchNormalization)

# additional layers on top of the resnet output
x = base_model.output
x = Dropout(0.5)(x)
x = GlobalAveragePooling2D()(x)
x = Dense(256)(x)
x = Dropout(0.2)(x)
x = Dense(2,activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=x)

# learning rate: with 0.001 the accuracy was lower (90%), 0.1 was too high
lr = 0.01
# optimizer
opt = keras.optimizers.Adam(learning_rate=lr)

#early stopping with custom call back
class EarlyStoppingByLossVal(Callback):
    """Stop training once the monitored value has been below a threshold often enough.

    At the end of every epoch the monitored value is read from `logs`. Each epoch in
    which it is below `value` increments a counter; when the counter reaches `patience`
    training is stopped. The counter is only reset at the start of training, so the
    epochs below the threshold do not have to be consecutive.

    Args:
        monitor: key of the quantity to read from the epoch-end `logs` dict.
            A one-element list/tuple is also accepted (the former default was
            `['val_loss']`, which could not be used as a dict key).
        patience: number of epochs below `value` required before stopping.
        value: threshold the monitored quantity has to fall below.
        verbose: if > 0, print a message when training is stopped.
    """

    def __init__(self, monitor='val_loss', patience=0, value=0.00001, verbose=0):
        # initialise the Callback base class itself (super(Callback, self) skipped it)
        super(EarlyStoppingByLossVal, self).__init__()
        if isinstance(monitor, (list, tuple)) and len(monitor) == 1:
            monitor = monitor[0]
        self.monitor = monitor
        self.value = value
        self.verbose = verbose
        self.patience = patience
        self.wait = 0

    def on_train_begin(self, logs=None):
        # the number of epochs in which the monitored value was below the required value
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # nothing to compare against; without this return `None < value` raises TypeError
            return
        if current < self.value:
            self.wait += 1
            if self.wait >= self.patience:
                if self.verbose > 0:
                    print("Epoch %05d: early stopping" % epoch)
                # stop regardless of verbosity (previously this only happened when verbose > 0)
                self.model.stop_training = True

model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)
# custom early stopping: stop when val_loss is less than the given value (0.1)
callback = EarlyStoppingByLossVal(
    monitor='val_loss',
    value=0.1,
    verbose=1,
    patience=2,
)

In [0]:
# steps_per_epoch is taken from the iterator (number of batches per pass over the training set) instead of being hard-coded to 163
history_2 = model.fit_generator(tr_batches,validation_data=val_batches,epochs=15,steps_per_epoch=len(tr_batches),callbacks=[callback])

In [0]:
# test-set evaluation of the second model; the trailing values were recorded in the original run
model.evaluate_generator(test_batches)# [0.027545811608433723, 0.9375]

In [0]:
# mounting Google Drive at /content/gdrive so that the model weights can be stored there
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# using model.save_weights() to save the weights of the current model to Drive in HDF5 format
model.save_weights("/content/gdrive/My Drive/model1.h5") 

In [0]:
# print the layers of the current model
model.summary()

In [0]:
# check that the weights file is present on the mounted drive
!ls /content/gdrive/'My Drive' # to see contents of the drive

In [0]:
# Load the previously saved weights. The file name now matches the one written by the
# save_weights cell of this notebook (model1.h5); the cell used to read model.h5, a file
# this notebook never creates.
# Before running this, the model defining block of code must be run so that `model` has the same architecture.
model.load_weights('/content/gdrive/My Drive/model1.h5')

In [0]:
# evaluate on the test set again, now with the weights loaded from the drive
model.evaluate_generator(test_batches)

In [0]:
# confusion matrix of the test-set predictions, drawn as an annotated heatmap
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix

# class probabilities per test image; the observed labels are read in the order of the iterator's index array
probabilities = model.predict_generator(generator=test_batches, workers=0)
true = test_batches.classes[test_batches.index_array]
preds = probabilities.argmax(axis=1)
cm = confusion_matrix(true, preds)

# font settings - adjust to fit
label_font = {'size':'10'}
title_font = {'size':'21'}
class_names = ['Normal', 'Pneumonia']

ax = plt.subplot()
# NOTE(review): sns.set changes the global plot style, so later figures are affected as well
sns.set(font_scale=3.0)
sns.heatmap(cm, annot=True, ax=ax, cmap="Blues", fmt="g", cbar=False)

ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('Observed labels', fontdict=label_font)
ax.set_title('Confusion Matrix', fontdict=title_font)

ax.tick_params(axis='both', which='major', labelsize=14)
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()

In [0]:
# accuracy = correctly classified images / all images, computed from the confusion matrix `cm`
# instead of hand-copied numbers (original run: (216+369)/624 = 0.9375)
cm.trace() / cm.sum()

Precision is defined as the number of true positives over the number of true positives plus the number of false positives.



In [0]:
# precision = TP / (TP + FP) with pneumonia (class 1) as the positive class;
# rows of `cm` are observed labels, columns are predicted labels
# (original run: 369/(369+18) = 0.9534883720930233)
cm[1, 1] / (cm[1, 1] + cm[0, 1])

Recall is defined as the number of true positives over the number of true positives plus the number of false negatives.

In [0]:
# recall = TP / (TP + FN) with pneumonia (class 1) as the positive class;
# rows of `cm` are observed labels, columns are predicted labels
# (original run: 369/(369+21) = 0.9461538461538461)
cm[1, 1] / (cm[1, 1] + cm[1, 0])

In [0]:
!pip install scikit-plot

In [0]:
# plotting the ROC curve from the observed labels and the predicted class probabilities
# plot_roc replaces plot_roc_curve, which scikit-plot has deprecated
import scikitplot as skplt
import matplotlib.pyplot as plt
skplt.metrics.plot_roc(true, probabilities)
plt.show()