# [NSIETE] Pneumonia

Dataset pochádza z [https://www.kaggle.com/tolgadincer/labeled-chest-xray-images](https://www.kaggle.com/tolgadincer/labeled-chest-xray-images). Obsahuje röntgenové snímky pľúc pacientov rozdelených do dvoch tried – pacienti, ktorí trpia pneumóniou (akútny zápal pľúc vírusového alebo bakteriálneho pôvodu), a pacienti so zdravými pľúcami, resp. pacienti netrpiaci pneumóniou.

In [None]:
!pip install tensorflow seedir numpy matplotlib tensorflow_addons wandb >> '/dev/null'

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
from zipfile import ZipFile
from PIL import Image
import seedir as sd
import wandb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import tensorflow_addons as tfa

### Spracovanie datasetu

In [None]:
# Directory that holds the downloaded archive and the extracted dataset.
data_path = '../data'
# File name of the compressed dataset inside data_path.
data_compressed_filename = 'archive.zip'
# Name of the folder the dataset is expected in after extraction.
data_extracted_foldername = 'chest_xray'

In [None]:
# Extract the dataset archive only when the extracted folder is missing, so
# re-running the cell does not unpack everything again.
if not os.path.exists(f"{data_path}/{data_extracted_foldername}"):
    print('Extracting...')
    # The handle is called `archive` so the built-in `zip` stays usable.
    with ZipFile(f'{data_path}/{data_compressed_filename}', 'r') as archive:
        archive.extractall(f'{data_path}/')

Pozrieme sa na štruktúru a počty dát.

In [None]:
# Print the dataset folder tree two levels deep (no REPL prompt in front of
# the call, so the cell is also valid as plain Python).
sd.seedir(f'{data_path}/{data_extracted_foldername}', style='spaces', indent=2, anystart='- ', depthlimit=2)

In [None]:
def list_class_dir(path):
    """Count the files of every class subfolder directly inside *path*.

    Args:
        path: Directory whose immediate subfolders each represent one class.

    Returns:
        A dict mapping each subfolder name to the number of entries it
        contains. Hidden entries (names starting with '.') are ignored both
        as classes and as counted files, and plain files lying next to the
        class folders are skipped. Keys are inserted in sorted order so the
        result does not depend on the order reported by the file system.
    """
    class_count = {}
    for subfolder in sorted(os.listdir(path)):
        subfolder_path = os.path.join(path, subfolder)
        # Only visible directories are classes; a stray file here would make
        # os.listdir() below fail.
        if subfolder.startswith('.') or not os.path.isdir(subfolder_path):
            continue
        class_count[subfolder] = sum(
            1
            for entry in os.listdir(subfolder_path)
            if not entry.startswith('.')
        )

    return class_count
    
# Count the entries of each class folder in the training split.
train = list_class_dir(f'{data_path}/{data_extracted_foldername}/train')

In [None]:
# Bar chart of the per-class counts: class names on x, counts as bar heights.
plt.bar(train.keys(), train.values(), color=['b', 'c'])

Vidíme, že naše triedy sú nevyvážené – počet snímok pacientov trpiacich pneumóniou viacnásobne prevyšuje počet snímok zdravých pacientov. Rozhodli sme sa teda vyrovnať počty snímok tým, že vygenerujeme nové snímky otočením snímok zdravých jedincov z trénovacieho datasetu o +5 a −5 stupňov.

In [None]:
# Filename prefixes carried by rotated copies ('<naming>-<original name>').
# Files starting with one of them are augmentation output and must not be
# rotated again. Kept under its own name so the built-in `list` is untouched.
ROTATED_PREFIXES = ('r5-', 'r-5-')


def rotateImages(rotationAmt, naming, images_path):
    """Save a rotated JPEG copy of every original image in a folder.

    Args:
        rotationAmt: Rotation angle in degrees, passed to ``Image.rotate``.
        naming: Prefix of the new file; the copy is stored as
            ``<naming>-<original file name>`` in the same folder.
        images_path: Folder that is read from and written to.
    """
    # os.listdir() returns a list built up front, so the copies written
    # below are not visited within the same call.
    for image in os.listdir(images_path):
        # Skip hidden entries and copies produced by an earlier rotation.
        if image.startswith('.') or image.startswith(ROTATED_PREFIXES):
            continue
        # The context manager closes the source file once the copy is saved.
        with Image.open(f'{images_path}/{image}') as img:
            rotated = img.rotate(rotationAmt)
            rotated.save(f'{images_path}/{naming}-{image}', 'JPEG')


# Augment the NORMAL training images only once: rotate them only when the
# folder holds no rotated copies yet.
images_path = f'{data_path}/{data_extracted_foldername}/train/NORMAL'
# Rotated copies are saved as 'r5-<name>' / 'r-5-<name>', so an explicit
# prefix test is enough; the pattern r'r5-*' also matched any file name that
# merely contained "r5".
res = any(f.startswith('r5-') for f in os.listdir(images_path))
if not res:
    rotateImages(5, 'r5', images_path)
    rotateImages(-5, 'r-5', images_path)


# Recount the training classes after the rotation step and plot them again.
train = list_class_dir(f'{data_path}/{data_extracted_foldername}/train')
plt.bar(train.keys(), train.values(), color=['b', 'c'])

Môžeme vidieť, že sa nám podarilo vyrovnať počet dát v jednotlivých triedach.

### Trénovanie modelu

In [None]:
# Authenticate this session with Weights & Biases before a run is started.
wandb.login()

In [None]:
# Start a W&B run in project 'pneumonia' under entity 'nn2021'; the returned
# handle is kept so the run can be closed with run.finish() at the end.
run = wandb.init(project='pneumonia', entity='nn2021')

Nastavíme konfiguráciu modelu - epochy, batch size, loss funkciu, optimizer ... 

In [None]:
# Selects the optimizer built below: 'adam' -> Adam, anything else -> RMSprop.
optimizer = 'adam'

# Hyperparameters and training objects shared by the cells below.
config = {
    'IMAGE_HEIGHT': 224,
    'IMAGE_WIDTH': 224,
    'CLASSES': 2,
    "EPOCHS": 3,
    "BATCH_SIZE": 256,
    "LEARNING_RATE": 0.0001,
    "EPSILON": 1e-07,
    # The model's output layer applies a sigmoid, so the loss receives
    # probabilities, not raw logits.
    'LOSS': tf.keras.losses.BinaryCrossentropy(from_logits=False),
    'DENSE': 256,
    'EARLYSTOPPING': True
}

if optimizer == 'adam':
    config['OPTIMIZER'] = keras.optimizers.Adam(
        learning_rate=config['LEARNING_RATE'],
        epsilon=config['EPSILON']
    )
else:
    config["RHO"] = 0.9
    config['MOMENTUM'] = 0.5
    config['OPTIMIZER'] = keras.optimizers.RMSprop(
        learning_rate=config['LEARNING_RATE'],
        rho=config['RHO'],
        momentum=config['MOMENTUM'],
        epsilon=config['EPSILON'],
    )

# Callbacks handed to model.fit(). The key is always defined because the
# training cell reads config["callbacks"].
config['callbacks'] = []
if config['EARLYSTOPPING']:
    # Recall is better when higher; the direction is given explicitly because
    # it cannot be derived reliably from the metric name.
    config['callbacks'].append(
        tf.keras.callbacks.EarlyStopping(monitor='val_Recall', mode='max')
    )

# NOTE(review): W&B logging (a WandbCallback in the list above and
# wandb.config.update(config)) is not enabled here. WandbCallback is not
# imported in this notebook, and config holds loss/optimizer objects rather
# than plain values - confirm how the run should be logged before enabling.


Načítame testovacie aj trénovacie dáta.

In [None]:
def load_data(filepath, datatype):
    """Build a labelled, batched image dataset from one split of the data.

    Args:
        filepath: Root folder of the extracted dataset.
        datatype: Name of the split subfolder to read, e.g. 'train' or 'test'.

    Returns:
        The dataset created by ``image_dataset_from_directory`` for
        ``<filepath>/<datatype>``: batches of ``(images, labels)`` with
        images resized to the configured height and width.
    """
    return tf.keras.preprocessing.image_dataset_from_directory(
        f'{filepath}/{datatype}',
        seed=123,
        image_size=(config['IMAGE_HEIGHT'], config['IMAGE_WIDTH']),
        # Batch here: the dataset is what defines the batch size in training.
        batch_size=config['BATCH_SIZE'],
        # Labels are needed for training and for the preview plot; 'binary'
        # gives 0/1 float labels inferred from the class subfolders, the form
        # binary cross-entropy works with. With None only images are yielded.
        label_mode='binary',
    )


train = load_data(f'{data_path}/{data_extracted_foldername}', 'train')
test = load_data(f'{data_path}/{data_extracted_foldername}', 'test')

In [None]:
# NOTE(review): `train` is the dataset object returned by load_data(), while
# Dataset.from_tensor_slices() is meant for tensors/arrays, so this call is
# expected to fail as written. `train_dataset` is not read anywhere else in
# the visible notebook - presumably a leftover; confirm and remove.
train_dataset = tf.data.Dataset.from_tensor_slices(train).shuffle(60000).batch(256)

In [None]:
# NOTE(review): `an_image` is not defined anywhere in the visible notebook,
# so this cell raises NameError as written. Presumably it was a PIL image
# opened in a cell that no longer exists - confirm and restore or remove.
image_sequence = an_image.getdata()
# Convert the pixel sequence into a NumPy array to inspect its shape.
image_array = np.array(image_sequence)

print(image_array.shape)

Pozrieme sa na ukážku obrázkových dát v našej sade:

In [None]:
# Preview up to nine images of one training batch, titled with their class.
# The class names come from the dataset object itself and have to be read
# here, before `train` is re-wrapped by cache()/prefetch() further below.
class_names = train.class_names

plt.figure(figsize=(10, 10))
for images, labels in train.take(1):
    # A batch may hold fewer than nine images.
    for i in range(min(9, images.shape[0])):
        plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        # A label can be a scalar or a one-element vector depending on the
        # dataset's label mode; reduce both to a plain int list index.
        label_index = int(np.ravel(labels[i].numpy())[0])
        plt.title(class_names[label_index])
        plt.axis("off")

Pre potreby rýchlejšieho I/O nastavíme buffery pre cachovanie.

In [None]:
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: cache, reshuffle with a buffer of 1000 elements, prefetch.
train = train.cache()
train = train.shuffle(1000)
train = train.prefetch(buffer_size=AUTOTUNE)

# Test pipeline: cache and prefetch only.
test = test.cache()
test = test.prefetch(buffer_size=AUTOTUNE)

Zadefinujeme model našej konvolučnej siete

In [None]:
# Convolutional network assembled layer by layer.
model = Sequential()

# Multiply the input pixel values by 1/255.
model.add(layers.experimental.preprocessing.Rescaling(
    1./255,
    input_shape=(config['IMAGE_HEIGHT'], config['IMAGE_WIDTH'], 3),
))

# First weight-normalised convolution (32 filters, no dilation) + pooling.
model.add(tfa.layers.WeightNormalization(
    layers.Conv2D(32, 3, padding='same', activation='relu', dilation_rate=(1, 1))
))
model.add(layers.MaxPooling2D())

# Second weight-normalised convolution (64 filters, dilation 2) + pooling.
model.add(tfa.layers.WeightNormalization(
    layers.Conv2D(64, 3, padding='same', activation='relu', dilation_rate=(2, 2))
))
model.add(layers.MaxPooling2D())

# Classifier head: flatten, one hidden dense layer, single sigmoid output.
model.add(layers.Flatten())
model.add(layers.Dense(config['DENSE'], activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))


In [None]:
# Metrics reported during training and validation; the names given here are
# the keys under which they are logged.
training_metrics = [
    'accuracy',
    keras.metrics.Precision(name='Precision'),
    keras.metrics.Recall(name='Recall'),
    keras.metrics.SpecificityAtSensitivity(0.5, name='SpecificityAtSensitivity'),
    keras.metrics.SensitivityAtSpecificity(0.5, name='SensitivityAtSpecificity'),
]

# Attach the configured optimizer and loss together with the metrics above.
model.compile(
    optimizer=config['OPTIMIZER'],
    loss=config['LOSS'],
    metrics=training_metrics,
)

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

A trénujeme. A plačeme.

In [None]:
# Train on the training dataset and validate on the test split each epoch.
# No batch_size is passed: `train` and `test` are already-batched tf.data
# datasets, and Keras documents that batch_size must not be given for them.
model.fit(
    train,
    epochs=config["EPOCHS"],
    validation_data=test,
    # The config may not define any callbacks; train without them then.
    callbacks=config.get("callbacks", []),
)

Nakoniec už len nahráme dáta do wandb

In [None]:
# Close the W&B run that was opened with wandb.init().
run.finish()