In [1]:
#
import numpy as np 
import pandas as pd 
import random

# folder
import os
import glob

# image
from PIL import Image

# visu
import matplotlib.pyplot as plt
plt.rc('image', cmap='gray')

# sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#tensorflow
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
# Dataset split folders, then the class sub-folders found inside each split.
# Both lists are iterated in this order when the image files are globbed.
datasets = [
    "train",
    "test",
    "val",
]
categories = [
    "NORMAL",
    "PNEUMONIA",
]

In [5]:
#glob.glob('C:\\Users\\김주빈\\project\\Damion\\project\\Chest X-Ray Images (Pneumonia)\\chest_xray\\*')

In [7]:
%%time

# Collect the pixel dimensions of every image in every split/class folder so
# that a common, down-scaled input size can be derived from the averages.
widths = []
heights = []

for set_ in datasets:
    for cat in categories:
        # NOTE(review): machine-specific absolute dataset root; consider moving
        # it to a single configurable constant near the top of the notebook.
        filelist = glob.glob('C:\\Users\\Been\\chest_xray\\'+ set_ + '/' + cat + '/*.jpeg')
        for fname in filelist:
            # Open each file once (not once per dimension) and close it
            # deterministically so file handles are not left to the GC.
            with Image.open(fname) as img:
                width, height = img.size
            widths.append(width)
            heights.append(height)

images_size = pd.DataFrame({"widths": widths, "heights": heights})

print("Average image width: " + f'{images_size["widths"].mean():.2f}')
print("Average image height: " + f'{images_size["heights"].mean():.2f}')

Average image width: 1327.88
Average image height: 970.69
Wall time: 3.93 s


In [8]:
# Target image size: one tenth of the mean source dimensions, truncated to
# whole pixels by int().
shrink = 10
im_width, im_height = (
    int(images_size[col].mean() / shrink) for col in ("widths", "heights")
)
print(f"image width: {im_width}")
print(f"image height: {im_height}")

image width: 132
image height: 97


In [17]:
%%time

# Load every image as a single-channel (mode 'L') array resized to
# (im_width, im_height), together with its class label taken from the
# folder name it was found in.
data = []
target = []

for set_ in datasets:
    for cat in categories:
        # NOTE(review): machine-specific absolute dataset root; consider moving
        # it to a single configurable constant near the top of the notebook.
        filelist = glob.glob('C:\\Users\\Been\\chest_xray\\'+ set_ + '/' + cat + '/*.jpeg')
        # One label per file, in the same order as the images appended below.
        target.extend([cat] * len(filelist))
        for fname in filelist:
            # The context manager closes the underlying file as soon as the
            # pixels have been converted, instead of leaking one open handle
            # per image.
            with Image.open(fname) as img:
                data.append(np.array(img.convert('L').resize((im_width, im_height))))

# Stack the per-image 2-D arrays along a new leading (sample) axis.
data_array = np.stack(data, axis=0)
target = np.array(target)

Wall time: 2min 9s


In [18]:
# Sanity check: stacked image array shape, then the label vector.
print(data_array.shape, target, sep="\n")

(5856, 97, 132)
['NORMAL' 'NORMAL' 'NORMAL' ... 'PNEUMONIA' 'PNEUMONIA' 'PNEUMONIA']


In [19]:
# Class balance table: absolute count and rounded percentage per label.
# Built from a single value_counts() Series so the result does not depend on
# the column name pandas assigns to value_counts() output (0 before pandas
# 2.0, "count" afterwards), and without the deprecated DataFrame.applymap.
class_counts = pd.Series(target, name="target").value_counts()
class_balance = pd.DataFrame({
    "count": class_counts,
    "%": (class_counts * 100 / len(target)).round().astype(int),
})
class_balance

Unnamed: 0,count,%
PNEUMONIA,4273,73
NORMAL,1583,27


In [20]:
# Show a 3x3 grid of randomly chosen images, each titled with its label.
fig = plt.figure(figsize=(20,15))
# The gridspec is 4x4 but only its top-left 3x3 cells are populated below.
gs = fig.add_gridspec(4, 4)

for row in range(3):
    for col in range(3):
        # randrange(n) yields 0..n-1. randint(0, n) is inclusive of n and
        # could index one past the end of data_array (IndexError).
        num_image = random.randrange(data_array.shape[0])
        ax = fig.add_subplot(gs[row, col])
        ax.axis('off')
        ax.set_title(target[num_image])
        ax.imshow(data_array[num_image])

In [21]:
# Hold out 20% of the samples for testing; stratify on the labels so both
# splits keep the same class proportions, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    data_array,
    target,
    test_size=0.2,
    stratify=target,
    random_state=43,
)

In [24]:
# Sanity check: image array and label vector of the training split.
print(X_train.shape, y_train.shape, sep="\n")

(4684, 97, 132)
(4684,)


In [25]:
# Class proportions within the training split (should mirror the full
# dataset because the split was stratified). The former second statement
# called value_counts() on an empty DataFrame, which cannot produce these
# proportions and replaced this result as the cell's displayed value.
pd.Series(y_train).value_counts(normalize=True)

PNEUMONIA    0.729718
NORMAL       0.270282
dtype: float64

In [30]:
#정규화 (0에서 1까지 픽셀값 조정)
X_test_norm = np.round((X_test/255), 3).copy()
X_train_norm = np.round((X_train/255), 3).copy()

In [33]:
# Encoder used below to turn the string class labels into integer codes.
encoder = LabelEncoder()

In [35]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same fitted mapping to the test labels. Re-fitting on the test
# set would silently produce a different mapping if its set of classes ever
# differed from the training set.
y_train_cat = encoder.fit_transform(y_train)
y_test_cat = encoder.transform(y_test)

In [36]:
# Compare the first three raw labels with their encoded integer codes.
print(y_train[:3], y_train_cat[:3], sep="\n")

['PNEUMONIA' 'NORMAL' 'NORMAL']
[1 0 0]


In [37]:
# Add a trailing single-channel axis: (n, height, width) -> (n, height, width, 1).
# Uses im_height / im_width (the resize target used when loading the images)
# instead of repeating the literals 97 and 132, so the reshape stays correct
# if the target size changes.
X_train_norm = X_train_norm.reshape(-1, im_height, im_width, 1)
X_test_norm = X_test_norm.reshape(-1, im_height, im_width, 1)
X_train_norm.shape

(4684, 97, 132, 1)