In [None]:
!pip install nb_black -q
!wget https://raw.githubusercontent.com/mcarujo/pneumonia-detector/develop/helpers.py -q
!wget -O helpers_data.py https://raw.githubusercontent.com/mcarujo/pneumonia-detector/develop/data/helpers.py -q
!conda install -c plotly plotly-orca --yes --quiet
!pip install tensorflow==2.3.1 lungs-finder -q
!mkdir models
!mkdir models/cnn6_ann3_pow10_adamax
!mkdir 

In [None]:
# Load the nb_black extension installed by the setup cell; presumably it
# auto-formats executed cells with black — confirm against the nb_black docs.
%load_ext nb_black

# Libraries and imports

This section imports every library used in this notebook.

In [None]:
# List the working directory; presumably a quick check that the downloaded
# helper files and the models/ folder exist before importing them.
!ls

In [None]:
# NOTE(review): the setup cell already runs `pip install ... lungs-finder`;
# this unpinned reinstall looks redundant — confirm and remove if so.
!pip install lungs_finder

In [None]:
# Python's libs
%matplotlib inline
import glob
import os
import warnings
import lungs_finder as lf

# Tools libs
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap

# Machine Learning libs
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('XLA_GPU')))

# Helpers developed by me
from helpers import metrics, plot_training
from helpers_data import compose_dataset

warnings.filterwarnings("ignore")

# The Dataset

![](https://github.com/mcarujo/pneumonia-detector/raw/develop/images/images.png)
## Source

#### This dataset comes from Kaggle, where it is published at this [link](https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia).
## Information
The dataset is organized into 3 folders (train, test, val) and contains subfolders for each image category (Pneumonia/Normal). There are 5,863 X-Ray images (JPEG) and 2 categories (Pneumonia/Normal).

Chest X-ray images (anterior-posterior) were selected from retrospective cohorts of pediatric patients of one to five years old from Guangzhou Women and Children’s Medical Center, Guangzhou. All chest X-ray imaging was performed as part of patients’ routine clinical care.

For the analysis of chest x-ray images, all chest radiographs were initially screened for quality control by removing all low quality or unreadable scans. The diagnoses for the images were then graded by two expert physicians before being cleared for training the AI system. In order to account for any grading errors, the evaluation set was also checked by a third expert.

In [None]:
# Root folder of the chest X-ray dataset, relative to the working directory.
base_path = os.path.join("..", "input", "chest-xray-pneumonia", "chest_xray")

# Each split folder holds one sub-folder per class; unpack them as (normal, pneumonia).
path_train_normal, path_train_pneu = (
    os.path.join(base_path, "train", class_dir) for class_dir in ("NORMAL", "PNEUMONIA")
)
path_test_normal, path_test_pneu = (
    os.path.join(base_path, "test", class_dir) for class_dir in ("NORMAL", "PNEUMONIA")
)
path_val_normal, path_val_pneu = (
    os.path.join(base_path, "val", class_dir) for class_dir in ("NORMAL", "PNEUMONIA")
)

In [None]:
def creat_dataframe(base_path, kind, flag):
    """Build a metadata table with one row per entry found in ``base_path``.

    Parameters
    ----------
    base_path : str
        Directory whose entries are listed.
    kind : str
        Split name stored verbatim in the ``kind`` column
        (the notebook passes "train", "test" or "val").
    flag : int
        Class label stored verbatim in the ``flag`` column
        (the notebook passes 0 for normal and 1 for pneumonia).

    Returns
    -------
    pandas.DataFrame
        Columns ``image_name``, ``full_path``, ``kind`` and ``flag``.
        An empty directory yields an empty frame with those same columns.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``base_path`` cannot be listed.
    """
    # Sorted so the row order does not depend on the file system's listing order.
    image_names = sorted(os.listdir(base_path))
    # Building from a dict keeps the column names even when the folder is empty;
    # renaming the columns of an empty, column-less frame would raise ValueError.
    aux = pd.DataFrame(
        {
            "image_name": image_names,
            "full_path": [os.path.join(base_path, name) for name in image_names],
        },
        columns=["image_name", "full_path"],
    )
    aux["kind"] = kind
    aux["flag"] = flag
    return aux


# Folders of each split, ordered so that the position in the pair is the
# class flag: index 0 = normal, index 1 = pneumonia.
split_dirs = {
    "train": (path_train_normal, path_train_pneu),
    "test": (path_test_normal, path_test_pneu),
    "val": (path_val_normal, path_val_pneu),
}

# One (directory, split name, class flag) triple per dataset folder.
grid = [
    (folder, split, flag)
    for split, folders in split_dirs.items()
    for flag, folder in enumerate(folders)
]

# Stack the per-folder tables, keep a CSV copy of the metadata and preview it.
dataset = pd.concat(
    [creat_dataframe(folder, split, flag) for folder, split, flag in grid]
)
dataset.to_csv("data_image.csv", index=False)
dataset.head()

## Summary

To explain the labels and counts of the dataset (metadata):
- **image_name**: the file's name.
- **full_path**: the full path to the image.
- **kind**: which split the file belongs to: train, test, or val.
- **flag**: the class label; 0 means normal and 1 means pneumonia.


In [None]:
%%time
IMAGE_RESOLUTION = (250, 250, 1)
BORDER = 30
print('Image shape: {}'.format(IMAGE_RESOLUTION))


# Creating dataset
train_set = dataset[dataset.kind == "train"][["full_path", "flag"]]
test_set = dataset[dataset.kind == "test"][["full_path", "flag"]]
val_set = dataset[dataset.kind == "val"][["full_path", "flag"]]

# Creating X and y variables
X_train, y_train = compose_dataset(train_set, IMAGE_RESOLUTION, BORDER)
X_test, y_test = compose_dataset(test_set, IMAGE_RESOLUTION, BORDER)
X_val, y_val = compose_dataset(val_set, IMAGE_RESOLUTION, BORDER)

# Infortmations
print('Train data shape: {}, Labels shape: {}'.format(X_train.shape, y_train.shape))
print('Test data shape: {}, Labels shape: {}'.format(X_test.shape, y_test.shape))
print('Validation data shape: {}, Labels shape: {}'.format(X_val.shape, y_val.shape))

### Examples

In [None]:
# Show validation images with their labels as a visual sanity check.
GRID_ROWS, GRID_COLS = 4, 4
labels = ["Pneumonia" if y else "Normal" for y in y_val]
plt.figure(figsize=(20, 15))
# Only the first GRID_ROWS * GRID_COLS images are drawn: plt.subplot raises
# ValueError for a position beyond the grid, which would break the cell as
# soon as the validation split holds more images than the grid has slots.
for i, (x, label) in enumerate(zip(X_val[: GRID_ROWS * GRID_COLS], labels)):
    plt.subplot(GRID_ROWS, GRID_COLS, i + 1)
    plt.imshow(x.reshape(IMAGE_RESOLUTION), 'gray')
    plt.axis("on")
    plt.title("Label: {}".format(label))

Where is our focus on these images?
<br/>
<br/>
<br/>
    <img src="https://github.com/mcarujo/pneumonia-detector/raw/develop/images/region_of_interest.PNG" width="25%"/>
<br/>
<br/>
We must keep our CNN focused on the chest region of the image, which is the part that matters for deciding whether a person has pneumonia.

<center><h1 style='background-color:#99ccff;padding:10px;font-family:courier'>📗 Exploring the Data</h1></center>

In [None]:
# Class balance of the training split: one bar per class, count written above it.
plt.figure(figsize=(6, 4))

ax = sns.countplot(x='flag', data=train_set, palette="mako")

ax.set_xlabel("Class", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.set_ylim(0, 5000)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Normal', 'Pneumonia'], fontsize=11)

# Place each count slightly right of the bar's left edge and above its top.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(bar_height, (bar.get_x() + 0.30, bar_height + 300), fontsize=13)

plt.show()

In [None]:
# Share of each class in the training split.
plt.figure(figsize=(7, 5))

# value_counts() orders slices by frequency, so a hard-coded legend is only
# right while pneumonia happens to be the larger class. Pin the slice order
# to (1 = Pneumonia, 0 = Normal) so legend, colours and data always agree.
class_counts = train_set['flag'].value_counts().reindex([1, 0], fill_value=0)
class_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%', colors=['darkcyan', 'blue'], explode=[0, 0.05], textprops={"fontsize": 15})

plt.legend(labels=['Pneumonia', 'Normal'])
plt.show()

In [None]:
# Class balance of the test split: one bar per class, count written above it.
plt.figure(figsize=(6, 4))

ax = sns.countplot(x='flag', data=test_set, palette="mako")

plt.xlabel("Class", fontsize=12)
plt.ylabel("Count", fontsize=12)
# Scale the axis and the label offset to this split; the fixed 0-5000 range
# copied from the training plot squashes the bars of a smaller split.
class_counts = test_set['flag'].value_counts()
max_count = class_counts.max() if len(class_counts) else 1
plt.ylim(0, max_count * 1.2)
plt.xticks([0, 1], ['Normal', 'Pneumonia'], fontsize=11)

for p in ax.patches:
    ax.annotate(
        "{:.0f}".format(p.get_height()),
        (p.get_x() + 0.30, p.get_height() + max_count * 0.03),
        fontsize=13,
    )

plt.show()

In [None]:
# Share of each class in the test split.
plt.figure(figsize=(7, 5))

# value_counts() orders slices by frequency, so a hard-coded legend is only
# right while pneumonia happens to be the larger class. Pin the slice order
# to (1 = Pneumonia, 0 = Normal) so legend, colours and data always agree.
class_counts = test_set['flag'].value_counts().reindex([1, 0], fill_value=0)
class_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%', colors=['darkcyan', 'blue'], explode=[0, 0.05], textprops={"fontsize": 15})

plt.legend(labels=['Pneumonia', 'Normal'])
plt.show()

In [None]:
# Class balance of the validation split: one bar per class, count written above it.
plt.figure(figsize=(6, 4))

ax = sns.countplot(x='flag', data=val_set, palette="mako")

plt.xlabel("Class", fontsize=12)
plt.ylabel("Count", fontsize=12)
# Scale the axis and the label offset to this split; with the fixed 0-5000
# range copied from the training plot, the bars of a small split are invisible.
class_counts = val_set['flag'].value_counts()
max_count = class_counts.max() if len(class_counts) else 1
plt.ylim(0, max_count * 1.2)
plt.xticks([0, 1], ['Normal', 'Pneumonia'], fontsize=11)

for p in ax.patches:
    ax.annotate(
        "{:.0f}".format(p.get_height()),
        (p.get_x() + 0.30, p.get_height() + max_count * 0.03),
        fontsize=13,
    )

plt.show()

In [None]:
# Share of each class in the validation split.
plt.figure(figsize=(7, 5))

# value_counts() orders slices by frequency, and the order is arbitrary when
# both classes have the same count, so a hard-coded legend can label the
# wrong slice. Pin the slice order to (1 = Pneumonia, 0 = Normal) so legend,
# colours and data always agree.
class_counts = val_set['flag'].value_counts().reindex([1, 0], fill_value=0)
class_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%', colors=['darkcyan', 'blue'], explode=[0, 0.05], textprops={"fontsize": 15})

plt.legend(labels=['Pneumonia', 'Normal'])
plt.show()

### ImageDataGenerator
Generate batches of tensor image data with real-time data augmentation.

In [None]:
# Augmentation settings: random rotation, zoom and shifts; both flips stay disabled.
augmentation_options = {
    "rotation_range": 20,
    "zoom_range": 0.3,
    "width_shift_range": 0.3,
    "height_shift_range": 0.3,
    "horizontal_flip": False,
    "vertical_flip": False,
}
datagen = ImageDataGenerator(**augmentation_options)

# Let the generator see the training features.
# NOTE(review): per the Keras docs, fit() is only required when featurewise
# normalisation or ZCA whitening is enabled, and neither is set here — confirm
# whether this call is still needed.
datagen.fit(X_train)

# Model
We use a Convolutional Neural Network (CNN) that learns from our dataset to become a model specialized in predicting pneumonia.

The models are defined in the folder **models** and will be imported and executed.
At the moment there are 3 different models available:
- cnn5_ann5_fibonacci_adam
- cnn6_ann3_pow10_adamax
- cnn10_ann2_pow2_adam

In [None]:
# Imports

from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D
from tensorflow.keras.models import Sequential


def cnn6_ann3_pow10_adamax(IMAGE_RESOLUTION):
    """Build and compile the binary classifier: 6 conv stages + 3 dense layers.

    Each convolutional stage is a ReLU ``Conv2D`` with "same" padding followed
    by a ``MaxPooling2D``; the filter count grows by 10 per stage (10 to 60).
    The flattened features then pass through dense layers of 200, 100 and 50
    ReLU units, each followed by 20% dropout, and a single sigmoid output.

    Parameters
    ----------
    IMAGE_RESOLUTION : tuple
        Input shape given to the first convolution as ``input_shape``.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with binary cross-entropy loss, the "adamax" optimizer
        and accuracy as the tracked metric.
    """
    # (filters, kernel size, pooling window) of every convolutional stage, in order.
    conv_stages = (
        (10, (7, 7), (3, 3)),
        (20, (5, 5), (3, 3)),
        (30, (3, 3), (2, 2)),
        (40, (3, 3), (2, 2)),
        (50, (3, 3), (2, 2)),
        (60, (3, 3), (2, 2)),
    )
    dense_units = (200, 100, 50)

    model = Sequential()
    for stage_index, (n_filters, kernel, pool) in enumerate(conv_stages):
        conv_options = {
            "filters": n_filters,
            "kernel_size": kernel,
            "padding": "same",
            "activation": "relu",
        }
        # Only the very first layer declares the input shape.
        if stage_index == 0:
            conv_options["input_shape"] = IMAGE_RESOLUTION
        model.add(Conv2D(**conv_options))
        model.add(MaxPooling2D(pool_size=pool))

    model.add(Flatten())
    for units in dense_units:
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(0.2))

    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adamax", metrics=["accuracy"])
    return model


# Models
# Build the network for the configured input shape.
model = cnn6_ann3_pow10_adamax(IMAGE_RESOLUTION)

# Will be used to save model and plots
model_path = os.path.join("models", "cnn6_ann3_pow10_adamax")
# Create the folder here so saving the plot does not depend on a shell
# `mkdir` having been run (successfully) in another cell.
os.makedirs(model_path, exist_ok=True)

# Saving the model's plot
plot_model(
    model,
    to_file=os.path.join(model_path, "model_plot.png"),
    show_shapes=True,
    show_layer_names=False,
    dpi=100,
    rankdir="TB",
)

### Class Weight
Because the classes are imbalanced, we should compensate by defining the **class_weight**.

In [None]:
# Number of training images per class (flag 0 = normal, flag 1 = pneumonia).
flag_counts = train_set.flag.value_counts()
COUNT_NORMAL = flag_counts.loc[0]
COUNT_PNEUMONIA = flag_counts.loc[1]

# Each class is weighted by total / (2 * class count), so the rarer class
# gets the larger weight.
TRAIN_IMG_COUNT = COUNT_NORMAL + COUNT_PNEUMONIA
weight_for_0 = (1 / COUNT_NORMAL) * (TRAIN_IMG_COUNT) / 2.0
weight_for_1 = (1 / COUNT_PNEUMONIA) * (TRAIN_IMG_COUNT) / 2.0

# Keyed by class flag, rounded to three decimals.
class_weight = {
    label: round(weight, 3)
    for label, weight in enumerate((weight_for_0, weight_for_1))
}
class_weight

### Training
Here we are going to start the training process using the **model.fit**.

In [None]:
%%time
# EarlyStopping to stop our trainig process when is not nescessary keep training 
callback = EarlyStopping(monitor="loss", patience=3)

# Model fit return the historical metrics of it
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=5),
    validation_data=(X_test, y_test),
    epochs=100,
    verbose=1,
    callbacks=[callback],
    class_weight=class_weight
)

### Metrics

Generating the training metrics, such as accuracy, recall, precision, F1 score, and the confusion matrix.

In [None]:
# Per-epoch curves recorded by model.fit, for the training data and for the
# data passed as validation_data.
epoch_log = history.history

train_accuracy, val_accuracy = epoch_log['accuracy'], epoch_log['val_accuracy']
train_loss, val_loss = epoch_log['loss'], epoch_log['val_loss']

In [None]:
# Two stacked panels: accuracy on top, loss below, each with its train and
# validation curve per epoch.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))

# (axes, title, y-axis label, ((values, legend label), ...)) for each panel.
panels = (
    (
        ax[0],
        'Training Accuracy vs. Epochs',
        'Accuracy',
        ((train_accuracy, 'Train Accuracy'), (val_accuracy, 'Validation Accuracy')),
    ),
    (
        ax[1],
        'Training/Validation Loss vs. Epochs',
        'Loss',
        ((train_loss, 'Train Loss'), (val_loss, 'Validation Loss')),
    ),
)

for axis, title, y_label, curves in panels:
    axis.set_title(title)
    for values, legend_label in curves:
        axis.plot(values, 'o-', label=legend_label)
    axis.set_xlabel('Epochs')
    axis.set_ylabel(y_label)
    axis.legend(loc='best')

plt.tight_layout()
plt.show()

In [None]:
# Predicting the classes model
y_pred = model.predict(X_test, batch_size=4)

# Predicting the classes model
y_pred_class = y_pred.round()

# Plot training line
plot_training(history, model_path)

# Plot the metrics
metrics(
    y_test,
    y_pred.reshape(1, -1)[0],
    y_pred_class.reshape(1, -1)[0].astype(int),
    model_path,
)