# 프로젝트 : 폐렴 진단기 성능개선

## 1. 실험환경 Set-up

In [12]:
import re    # 정규표현식 관련된 작업에 필요한 패키지
import os    # I/O 관련된 작업에 필요한 패키지 
import pandas as pd     # 데이터 전처리 관련된 작업에 필요한 패키지
import numpy as np      # 데이터 array 작업에 필요한 패키지
import tensorflow as tf  # 딥러닝 관련된 작업에 필요한 패키지
import matplotlib.pyplot as plt    # 데이터 시각화에 관련된 작업에 필요한 패키지
from sklearn.model_selection import train_test_split  # 데이터 전처리에 필요한 패키지
import random, math

In [13]:
# 데이터 로드할 때 빠르게 로드할 수 있도록하는 설정 변수
AUTOTUNE = tf.data.experimental.AUTOTUNE

# 데이터 ROOT 경로 변수
ROOT_PATH = os.path.join(os.getenv('HOME'), 'aiffel/EXPLORATION/11/')
TRAIN_PATH = ROOT_PATH + '/chest_xray/data/train/*/*'
VAL_PATH = ROOT_PATH + '/chest_xray/data/val/*/*'
TEST_PATH = ROOT_PATH + '/chest_xray/data/test/*/*'

# BATCH_SIZE 변수
BATCH_SIZE_1 = 32
BATCH_SIZE_2 = 32
BATCH_SIZE_3 = 32
BATCH_SIZE_4 = 32
BATCH_SIZE_5 = 32

# X-RAY 이미지 사이즈 변수
IMAGE_SIZE = [180, 180]

# EPOCH 크기 변수
EPOCHS_1 = 5
EPOCHS_2 = 5
EPOCHS_3 = 5
EPOCHS_4 = 5
EPOCHS_5 = 5

print(ROOT_PATH)

/aiffel/aiffel/EXPLORATION/11/


## 2. 데이터 준비하기

In [14]:
train_filenames = tf.io.gfile.glob(TRAIN_PATH)
test_filenames = tf.io.gfile.glob(TEST_PATH)
val_filenames = tf.io.gfile.glob(VAL_PATH)

print(len(train_filenames))
print(len(test_filenames))
print(len(val_filenames))

0
0
0


In [15]:
# train 데이터와 validation 데이터를 모두 filenames에 담습니다
filenames = tf.io.gfile.glob(TRAIN_PATH)
filenames.extend(tf.io.gfile.glob(VAL_PATH))

# 모아진 filenames를 8:2로 나눕니다
train_size = math.floor(len(filenames)*0.8)
random.seed(8)
random.shuffle(filenames)
train_filenames = filenames[:train_size]
val_filenames = filenames[train_size:]

print(len(train_filenames))
print(len(val_filenames))

0
0


In [16]:
COUNT_NORMAL = len([filename for filename in train_filenames if "NORMAL" in filename])
print("Normal images count in training set: " + str(COUNT_NORMAL))

COUNT_PNEUMONIA = len([filename for filename in train_filenames if "PNEUMONIA" in filename])
print("Pneumonia images count in training set: " + str(COUNT_PNEUMONIA))

Normal images count in training set: 0
Pneumonia images count in training set: 0


In [17]:
train_list_ds = tf.data.Dataset.from_tensor_slices(train_filenames)
val_list_ds = tf.data.Dataset.from_tensor_slices(val_filenames)

In [18]:
TRAIN_IMG_COUNT = tf.data.experimental.cardinality(train_list_ds).numpy()
print("Training images count: " + str(TRAIN_IMG_COUNT))

VAL_IMG_COUNT = tf.data.experimental.cardinality(val_list_ds).numpy()
print("Validating images count: " + str(VAL_IMG_COUNT))

Training images count: 0
Validating images count: 0


In [19]:
CLASS_NAMES = np.array([str(tf.strings.split(item, os.path.sep)[-1].numpy())[2:-1]
                        for item in tf.io.gfile.glob(str(ROOT_PATH + "/chest_xray/data/train/*"))])
print(CLASS_NAMES)

[]


In [20]:
def get_label(file_path):
    parts = tf.strings.split(file_path, os.path.sep)
    return parts[-2] == "PNEUMONIA"   # 폐렴이면 양성(True), 노말이면 음성(False)를 리턴하게 합니다.


In [21]:
def decode_img(img):
  # 이미지를 uint8 tensor로 바꾼다.
  img = tf.image.decode_jpeg(img, channels=3)
  # img를 범위 [0,1]의 float32 데이터 타입으로 바꾼다.
  img = tf.image.convert_image_dtype(img, tf.float32)
  # img의 이미지 사이즈를 IMAGE_SIZE에서 지정한 사이즈로 수정한다.
  return tf.image.resize(img, IMAGE_SIZE)

def process_path(file_path):
    label = get_label(file_path)
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label

In [23]:
train_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

ValueError: in user code:

    <ipython-input-21-f404d02b64be>:10 process_path  *
        label = get_label(file_path)
    <ipython-input-20-9c758d65f5c0>:2 get_label  *
        parts = tf.strings.split(file_path, os.path.sep)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/ragged/ragged_string_ops.py:511 string_split_v2
        input, dtype=dtypes.string, name="input")
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/ragged/ragged_tensor.py:2543 convert_to_tensor_or_ragged_tensor
        value=value, dtype=dtype, dtype_hint=preferred_dtype, name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1405 convert_to_tensor_v2_with_dispatch
        value, dtype=dtype, dtype_hint=dtype_hint, name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1415 convert_to_tensor_v2
        as_ref=False)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/profiler/trace.py:163 wrapped
        return func(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1509 convert_to_tensor
        (dtype.name, value.dtype.name, value))

    ValueError: Tensor conversion requested dtype string for Tensor with dtype float32: <tf.Tensor 'args_0:0' shape=() dtype=float32>


In [24]:
def prepare_for_training_1(ds, shuffle_buffer_size=1000):

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    ds = ds.repeat()

    ds = ds.batch(BATCH_SIZE_1)

    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

train_ds_1 = prepare_for_training_1(train_ds)
val_ds_1 = prepare_for_training_1(val_ds)

NameError: name 'train_ds' is not defined

In [None]:
#좌우

def augment_LR(image,label):
    image = tf.image.random_flip_left_right(image)  # 랜덤하게 좌우를 반전합니다.
    return image,label

def prepare_for_training_LR(ds, shuffle_buffer_size=1000):
    # augment 적용 부분이 배치처리 함수에 추가되었습니다.
    ds = ds.map(
            augment_LR,       # augment 함수 적용
            num_parallel_calls=2
        )

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    ds = ds.repeat()

    ds = ds.batch(BATCH_SIZE_2)

    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

train_ds_2 = prepare_for_training_LR(train_ds)
val_ds_2 = prepare_for_training_LR(val_ds)

In [None]:
def augment_C(image,label):
    image = tf.image.adjust_contrast(image, 0.5)  # 이미지의 대비를 조절
    return image,label

def prepare_for_training_C(ds, shuffle_buffer_size=1000):
    # augment 적용 부분이 배치처리 함수에 추가되었습니다.
    ds = ds.map(
            augment_C,       # augment 함수 적용
            num_parallel_calls=2
        )

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    ds = ds.repeat()

    ds = ds.batch(BATCH_SIZE_3)

    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

train_ds_3 = prepare_for_training_C(train_ds)
val_ds_3 = prepare_for_training_C(val_ds)

In [None]:
def prepare_for_training_4(ds, shuffle_buffer_size=1000):

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    ds = ds.repeat()

    ds = ds.batch(BATCH_SIZE_4)

    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

train_ds_4 = prepare_for_training_4(train_ds)
val_ds_4 = prepare_for_training_4(val_ds)


In [None]:

def prepare_for_training_5(ds, shuffle_buffer_size=1000):

    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    ds = ds.repeat()

    ds = ds.batch(BATCH_SIZE_5)

    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

train_ds_5 = prepare_for_training_5(train_ds)
val_ds_5 = prepare_for_training_5(val_ds)

In [None]:
for image, label in train_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())

In [None]:
test_list_ds = tf.data.Dataset.list_files(str(ROOT_PATH + '/chest_xray/data/test/*/*'))
TEST_IMAGE_COUNT = tf.data.experimental.cardinality(test_list_ds).numpy()
test_ds = test_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
test_ds_1 = test_ds.batch(BATCH_SIZE_1)
test_ds_2 = test_ds.batch(BATCH_SIZE_2)
test_ds_3 = test_ds.batch(BATCH_SIZE_3)
test_ds_4 = test_ds.batch(BATCH_SIZE_4)
test_ds_5 = test_ds.batch(BATCH_SIZE_5)

print(TEST_IMAGE_COUNT)

## 3. 데이터 시각화

- 베이스라인

In [None]:
image_batch_1, label_batch_1 = next(iter(train_ds_1))

def show_batch_1(image_batch, label_batch):
    plt.figure(figsize=(15,15))
    for n in range(16):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch_1[n])
        if label_batch_1[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")

show_batch_1(image_batch_1.numpy(), label_batch_1.numpy())

- 좌우반전

In [None]:
image_batch_2, label_batch_2 = next(iter(train_ds_2))

def show_batch_2(image_batch, label_batch):
    plt.figure(figsize=(15,15))
    for n in range(16):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch_2[n])
        if label_batch_2[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")

show_batch_2(image_batch_2.numpy(), label_batch_2.numpy())

- 이미지 대비

In [None]:
image_batch_3, label_batch_3 = next(iter(train_ds_3))

def show_batch_3(image_batch, label_batch):
    plt.figure(figsize=(15,15))
    for n in range(16):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch_3[n])
        if label_batch_3[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")

show_batch_3(image_batch_3.numpy(), label_batch_3.numpy())

- drop out 제거

In [None]:
image_batch_4, label_batch_4 = next(iter(train_ds_4))

def show_batch_4(image_batch, label_batch):
    plt.figure(figsize=(15,15))
    for n in range(16):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch_4[n])
        if label_batch_4[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")

show_batch_4(image_batch_4.numpy(), label_batch_4.numpy())

- regularization

In [None]:

image_batch_5, label_batch_5 = next(iter(train_ds_5))

def show_batch_5(image_batch, label_batch):
    plt.figure(figsize=(15,15))
    for n in range(16):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch_5[n])
        if label_batch_5[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")

show_batch_5(image_batch_5.numpy(), label_batch_5.numpy())

##  4. CNN 모델링

In [None]:
def conv_block(filters):
    block = tf.keras.Sequential([
        tf.keras.layers.SeparableConv2D(filters, 3, activation='relu', padding='same'),
        tf.keras.layers.SeparableConv2D(filters, 3, activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPool2D()
    ])
    
    return block

In [None]:
def dense_block(units, dropout_rate):
    block = tf.keras.Sequential([
        tf.keras.layers.Dense(units, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(dropout_rate)
    ])
    
    return block

In [None]:
def dense_block_d0(units):
    block = tf.keras.Sequential([
        tf.keras.layers.Dense(units, activation='relu'),
        tf.keras.layers.BatchNormalization()
    ])
    
    return block

- base line

In [None]:
def build_model_1():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),
        
        conv_block(32),
        conv_block(64),
        
        conv_block(128),
        tf.keras.layers.Dropout(0.2),
        
        conv_block(256),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    return model


- 좌우반전

In [None]:
def build_model_2():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),
        
        conv_block(32),
        conv_block(64),
        
        conv_block(128),
        tf.keras.layers.Dropout(0.2),
        
        conv_block(256),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    return model

- 
이미지 대비

In [None]:

def build_model_3():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),
        
        conv_block(32),
        conv_block(64),
        
        conv_block(128),
        tf.keras.layers.Dropout(0.2),
        
        conv_block(256),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    return model

- drop out 제거

In [None]:
def build_model_4():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),
        
        conv_block(32),
        conv_block(64),        
        conv_block(128),        
        conv_block(256),  # drop out 제거
        
        tf.keras.layers.Flatten(),
        dense_block_d0(512),
        dense_block_d0(128),
        dense_block_d0(64),  # drop out 제거된 dense_block 사용
        
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    return model


- regularization

In [None]:

def build_model_5():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),
        
        conv_block(32),
        conv_block(64),
        
        conv_block(128),
        tf.keras.layers.Dropout(0.2),
        
        conv_block(256),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        
        tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.01))
    ])
    
    return model

## 5. 데이터 imbalance 처리


In [None]:
weight_for_0 = (1 / COUNT_NORMAL)*(TRAIN_IMG_COUNT)/2.0 
weight_for_1 = (1 / COUNT_PNEUMONIA)*(TRAIN_IMG_COUNT)/2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for NORMAL: {:.2f}'.format(weight_for_0))
print('Weight for PNEUMONIA: {:.2f}'.format(weight_for_1))

## 6. 모델 훈련

- 베이스라인

In [None]:
with tf.device('/GPU:0'):
    model_1 = build_model_1()

    METRICS = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
    
    model_1.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=METRICS
    )

In [None]:
with tf.device('/GPU:0'):
    history_1 = model_1.fit(
        train_ds_1,
        steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE_1,
        epochs=EPOCHS_1,
        validation_data=val_ds_1,
        validation_steps=VAL_IMG_COUNT // BATCH_SIZE_1,
        class_weight=class_weight,
    )

## 7. 결과 확인과 시각화

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()

for i, met in enumerate(['precision', 'recall', 'accuracy', 'loss']):
    ax[i].plot(history_1.history[met])
    ax[i].plot(history_1.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

In [None]:
#base line
loss_1, acc_1, prec_1, rec_1 = model_1.evaluate(test_ds_1)

# 회고

- **마지막으로 한번 전체 실행을 돌렸다가 갑자기 오류가 발생했고, 마감시간 문제로 포기함**


- 비록 이미지 변환도 시켜보는등으로 신뢰도를 높이기위한 노력을 했지만 아쉽게도, 다른방식의 데이터표현방법에 대해서 학습과 시각화를 하지못한 점은 아쉽다,,


- 여전히 마감 직전에 돌리는 학습은 너무나도 버겁다.. 매번 하는생각이지만 미리해야겠다 (예정된 20짜리 epochs 를 5밖에 못돌려서 아쉽다) 


- 발상의 전환이라고 했는가 정상적인 이미지를 다양한 방법으로 처치하는 방법에서 모델훈련을 위한 여러가지 방법도 창의성이 중요하다는 것을 알게됐다.