<a href="https://colab.research.google.com/github/Joonyoung-Song/Kaggle-Chest-X-Ray-Images-Pneumonia-/blob/main/CNN_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from keras.preprocessing import image
from keras.utils import np_utils
import os
from google.colab import drive
from pathlib import Path

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from tensorflow.keras.utils import plot_model, to_categorical

import seaborn as sns
import matplotlib.pyplot as plt

import cv2

In [None]:
# Mount Google Drive under /gdrive so the dataset paths used below resolve;
# force_remount=True asks Colab to remount even if a mount already exists.
drive.mount('/gdrive', force_remount=True)

In [None]:
# Root folder of the dataset on the mounted drive
data_dir = Path('/gdrive/MyDrive/Colab Notebooks/kaggle_pneumonia_classification')

# One sub-folder per split: training, validation and test
train_dir, val_dir, test_dir = (
    data_dir.joinpath(split) for split in ('train', 'val', 'test')
)

In [None]:
# Class sub-directories of the training split
normal_cases_dir = train_dir / 'NORMAL'
pneumonia_cases_dir = train_dir / 'PNEUMONIA'

# Lazily enumerate the JPEG files of each class
normal_cases = normal_cases_dir.glob('*.jpeg')
pneumonia_cases = pneumonia_cases_dir.glob('*.jpeg')

# (img_path, label) records: label 0 for normal, label 1 for pneumonia.
# Normal records are listed first, pneumonia records after them.
train_data = [(path, 0) for path in normal_cases] + [(path, 1) for path in pneumonia_cases]

# Tabulate the records, shuffle all rows and renumber the index from 0
train_data = (
    pd.DataFrame(train_data, columns=['image', 'label'], index=None)
    .sample(frac=1.)
    .reset_index(drop=True)
)

# Preview the first rows
train_data.head()

In [None]:
# Side length (pixels) every image is resized to
img_size=224

# Pre-allocate one (img_size, img_size, 3) slot per training image.
# float32 is used instead of NumPy's float64 default: it halves the memory
# footprint and matches the float32 arrays img_to_array produces by default.
X_train = np.zeros(shape=(len(train_data), img_size, img_size, 3), dtype=np.float32)

# Load images in dataframe order so row i of X_train belongs to row i of train_data
for idx, fname in enumerate(tqdm(train_data.image)):
    img = image.load_img(fname, target_size=(img_size, img_size))
    # The array already has the slot's shape, so no extra batch axis is needed
    img_array_train = image.img_to_array(img)
    X_train[idx] = img_array_train

In [None]:
# Class sub-directories of the validation split
normal_cases_dir = val_dir / 'NORMAL'
pneumonia_cases_dir = val_dir / 'PNEUMONIA'

# Lazily enumerate the JPEG files of each class
normal_cases = normal_cases_dir.glob('*.jpeg')
pneumonia_cases = pneumonia_cases_dir.glob('*.jpeg')

# (img_path, label) records: label 0 for normal, label 1 for pneumonia.
# Normal records are listed first, pneumonia records after them.
val_data = [(path, 0) for path in normal_cases] + [(path, 1) for path in pneumonia_cases]

# Tabulate the records, shuffle all rows and renumber the index from 0
val_data = (
    pd.DataFrame(val_data, columns=['image', 'label'], index=None)
    .sample(frac=1.)
    .reset_index(drop=True)
)

# Preview the first rows
val_data.head()

In [None]:
# Side length (pixels) every image is resized to
img_size=224

# Pre-allocate one (img_size, img_size, 3) slot per validation image.
# float32 is used instead of NumPy's float64 default: it halves the memory
# footprint and matches the float32 arrays img_to_array produces by default.
X_val = np.zeros(shape=(len(val_data), img_size, img_size, 3), dtype=np.float32)

# Load images in dataframe order so row i of X_val belongs to row i of val_data
for idx, fname in enumerate(tqdm(val_data.image)):
    img = image.load_img(fname, target_size=(img_size, img_size))
    # The array already has the slot's shape, so no extra batch axis is needed
    img_array_val = image.img_to_array(img)
    X_val[idx] = img_array_val

In [None]:
# Class sub-directories of the test split
normal_cases_dir = test_dir / 'NORMAL'
pneumonia_cases_dir = test_dir / 'PNEUMONIA'

# Lazily enumerate the JPEG files of each class
normal_cases = normal_cases_dir.glob('*.jpeg')
pneumonia_cases = pneumonia_cases_dir.glob('*.jpeg')

# (img_path, label) records: label 0 for normal, label 1 for pneumonia.
# Normal records are listed first, pneumonia records after them.
test_data = [(path, 0) for path in normal_cases] + [(path, 1) for path in pneumonia_cases]

# Tabulate the records, shuffle all rows and renumber the index from 0
test_data = (
    pd.DataFrame(test_data, columns=['image', 'label'], index=None)
    .sample(frac=1.)
    .reset_index(drop=True)
)

# Preview the first rows
test_data.head()

In [None]:
# Side length (pixels) every image is resized to
img_size=224

# Pre-allocate one (img_size, img_size, 3) slot per test image.
# float32 is used instead of NumPy's float64 default: it halves the memory
# footprint and matches the float32 arrays img_to_array produces by default.
X_test = np.zeros(shape=(len(test_data), img_size, img_size, 3), dtype=np.float32)

# Load images in dataframe order so row i of X_test belongs to row i of test_data
for idx, fname in enumerate(tqdm(test_data.image)):
    img = image.load_img(fname, target_size=(img_size, img_size))
    # The array already has the slot's shape, so no extra batch axis is needed
    img_array_test = image.img_to_array(img)
    X_test[idx] = img_array_test

# EDA

In [None]:
# Report the image-array shape and the label-column shape of every split
for caption, features, frame in (
    ('train set shape :', X_train, train_data),
    ('valid set shape :', X_val, val_data),
    ('test set shape :', X_test, test_data),
):
    print(caption, features.shape, frame['label'].shape)

In [None]:
# Bar chart of the number of training images per class.
# value_counts() orders its result by frequency, so each bar height is looked
# up by its label explicitly; the captions then stay correct regardless of
# which class happens to be the larger one.
label_counts = train_data['label'].value_counts()
plt.bar(["Pneumonia : 1", "Normal : 0"], [label_counts.get(1, 0), label_counts.get(0, 0)])

- 폐렴 이미지가 정상 이미지보다 약 3배 많음을 관찰할 수 있음

#### train image 관찰하기

In [None]:
# Look at a few training images

from skimage.io import imread

# Paths of the first five images of each class
pneumonia_samples = train_data.loc[train_data['label'] == 1, 'image'].head(5).tolist()
normal_samples = train_data.loc[train_data['label'] == 0, 'image'].head(5).tolist()

# Pneumonia paths first, normal paths second; the per-class lists are then dropped
samples = pneumonia_samples + normal_samples
del pneumonia_samples, normal_samples

# 2 x 5 grid: top row pneumonia, bottom row normal
f, ax = plt.subplots(2, 5, figsize=(30, 10))
for i in range(10):
    row, col = divmod(i, 5)
    panel = ax[row, col]
    img = imread(samples[i])
    panel.imshow(img, cmap='gray')
    panel.set_title("Pneumonia" if i < 5 else "Normal")
    panel.axis('off')
    panel.set_aspect('auto')
plt.show()

- 육안으로는 폐렴 유무를 구분하기 힘듦

#### valid image 관찰하기

In [None]:
# Look at a few validation images

from skimage.io import imread

# Select rows with val_data's OWN labels. Masking val_data with
# train_data['label'] would pick validation rows according to the labels of
# unrelated training rows, so the panels could show the wrong class.
pneumonia_samples = (val_data[val_data['label']==1]['image'].iloc[:5]).tolist()
normal_samples = (val_data[val_data['label']==0]['image'].iloc[:5]).tolist()

# Concat the data in a single list and del the above two list
samples = pneumonia_samples + normal_samples
del pneumonia_samples, normal_samples

# Plot the data: top row pneumonia, bottom row normal
f, ax = plt.subplots(2,5, figsize=(30,10))
for i in range(10):
    img = imread(samples[i])
    ax[i//5, i%5].imshow(img, cmap='gray')
    if i<5:
        ax[i//5, i%5].set_title("Pneumonia")
    else:
        ax[i//5, i%5].set_title("Normal")
    ax[i//5, i%5].axis('off')
    ax[i//5, i%5].set_aspect('auto')
plt.show()

이미지의 특징
- 모두 가운데로 정렬되어 있으므로 큰 변화를 주는 augmentation은 지양
- 이미지의 좌우상하가 명확함
- 이미지의 각도가 일정함
- 흑백 X-ray 이미지이므로 밝기도 어느 정도 일정함

=> 이러한 정보를 바탕으로 augmentation 시도

# Augmentation

In [None]:
# Training generator: scales pixel values by 1/255 and applies mild random
# geometric augmentation, intended to curb overfitting on the imbalanced dataset
datagen = ImageDataGenerator(
        rescale=1./255,
        # Random geometric transforms that are enabled
        rotation_range=30,       # rotate by a random angle within the 30-degree range
        zoom_range=0.2,          # random zoom, range 0.2
        width_shift_range=0.1,   # random horizontal shift (fraction of total width)
        height_shift_range=0.1,  # random vertical shift (fraction of total height)
        # Flips are switched off
        horizontal_flip=False,
        vertical_flip=False,
        # Dataset-wide / per-sample normalisation and whitening are switched off
        featurewise_center=False,
        samplewise_center=False,
        featurewise_std_normalization=False,
        samplewise_std_normalization=False,
        zca_whitening=False)

# Evaluation generator: same 1/255 scaling, no augmentation
test_datagen = ImageDataGenerator(rescale=1./255)

- 최대 ±30도 범위에서 랜덤하게 회전
- 최대 20% 범위에서 랜덤하게 확대/축소
- 최대 10% 범위에서 수평으로 랜덤하게 이동
- 최대 10% 범위에서 수직으로 랜덤하게 이동
- 좌우/상하 뒤집기는 사용하지 않음 (horizontal_flip=False, vertical_flip=False)


In [None]:
# fit() only computes dataset-wide statistics, which Keras needs solely when
# featurewise_center, featurewise_std_normalization or zca_whitening is on.
# The generator is configured with all three off, so the call is guarded to
# avoid an unnecessary full pass over X_train while staying correct if one of
# those options is enabled later.
if (datagen.featurewise_center
        or datagen.featurewise_std_normalization
        or datagen.zca_whitening):
    datagen.fit(X_train)

# Modeling

In [None]:
# Build the CNN
def conv_block(filters):
    """Return a Sequential stage of two separable convs, batch norm and max-pool.

    Args:
        filters: Number of filters used by both SeparableConv2D layers
            (kernel size 3, ReLU activation, 'same' padding).

    Returns:
        A ``tf.keras.Sequential`` holding the four layers in that order.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        *(
            layers.SeparableConv2D(filters, 3, activation='relu', padding='same')
            for _ in range(2)
        ),
        layers.BatchNormalization(),
        layers.MaxPool2D(),
    ])
def dense_block(units, dropout_rate):
    """Return a Sequential stage of Dense(ReLU), batch norm and dropout.

    Args:
        units: Width of the Dense layer.
        dropout_rate: Rate passed to the Dropout layer.

    Returns:
        A ``tf.keras.Sequential`` holding the three layers in that order.
    """
    layers = tf.keras.layers
    stage = [
        layers.Dense(units, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
    ]
    return tf.keras.Sequential(stage)
def build_model():
    """Build and compile the two-class CNN classifier.

    The network takes ``(img_size, img_size, 3)`` inputs (``img_size`` is read
    from module scope) and stacks two plain 3x3 convolutions with max-pooling,
    four ``conv_block`` stages (32, 64, 128, 256 filters; dropout 0.2 after the
    last two), three ``dense_block`` stages and a final 2-unit softmax layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(img_size, img_size, 3)),

        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),

        conv_block(32),
        conv_block(64),

        conv_block(128),
        tf.keras.layers.Dropout(0.2),

        conv_block(256),
        tf.keras.layers.Dropout(0.2),

        tf.keras.layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),

        tf.keras.layers.Dense(2, activation='softmax')
    ])
    # `metrics` is passed as a list, the form documented by Keras; a bare
    # string is not accepted by every Keras version.
    # NOTE: the 2-unit softmax output expects one-hot encoded targets.
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Class balancing

# Count the training images per label once (0 = normal, 1 = pneumonia)
label_counts = train_data['label'].value_counts()
n_normal = label_counts[0]
n_pneumonia = label_counts[1]
n_total = len(train_data)

# Log of the pneumonia-to-normal count ratio
initial_bias = np.log([n_pneumonia / n_normal])

# Inverse-frequency weights scaled by total/2, so the rarer class gets the
# larger weight
weight_for_0 = (1 / n_normal) * n_total / 2.0
weight_for_1 = (1 / n_pneumonia) * n_total / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
# Integer class labels of the training and validation dataframes as NumPy arrays
y_train, y_val = (frame["label"].values for frame in (train_data, val_data))

In [None]:
# Where the best weights are checkpointed
model_dir = data_dir / 'model'
checkpoint_path = model_dir / "cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Stop once val_loss has not improved by at least 0.001 for 10 epochs and
# roll the model back to its best weights
es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10,
                   verbose=1, mode='min', baseline=None, restore_best_weights=True)

# rlr = ReduceLROnPlateau(monitor='val_loss', patience = 8, verbose=1,factor=0.5, min_lr=0.000001)

# Save weights only, and only when the monitored quantity improves
chkpt = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True, save_weights_only=True)

clf = build_model()

# Training batches are augmented; validation batches are only rescaled
# (test_datagen). Feeding validation images through the augmenting generator
# would make val_loss depend on random distortions, and val_loss is what
# early stopping monitors.
hist = clf.fit(
    datagen.flow(X_train, to_categorical(y_train), batch_size=32),
    validation_data=test_datagen.flow(X_val, to_categorical(y_val), shuffle=False),
    epochs=50,
    class_weight=class_weight,
    callbacks=[es, chkpt
               #  ,rlr
               ],
)

In [None]:
# Per-epoch accuracy recorded by fit() for the training and validation data
train_acc, val_acc = (hist.history[key] for key in ('accuracy', 'val_accuracy'))

In [None]:
# Training vs. validation accuracy against the 1-based epoch number
epoch_numbers = range(1, len(train_acc) + 1)
for curve in (train_acc, val_acc):
    plt.plot(epoch_numbers, curve)
plt.legend(['train accuracy', 'validation accuracy'], loc=2)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Load the weights stored at checkpoint_path (the file the ModelCheckpoint
# callback was configured to write) into clf
clf.load_weights(checkpoint_path)

In [None]:
# Print the layer-by-layer summary of the model
clf.summary()

In [None]:
# Score the model on the test images, fed through the rescale-only generator
# with one-hot encoded labels
test_batches = test_datagen.flow(X_test, to_categorical(test_data['label'].values))
loss, acc = clf.evaluate(test_batches, verbose=2)

# Report the accuracy as a percentage
print('정확도: {:5.2f}%'.format(100*acc))