In [None]:
# google drive내의 데이터를 학습하는 코드입니다. 
# 연동없이는 학습이 진행되지 않습니다. 
import sys
!{sys.executable} -m pip install keras pandas numpy image matplotlib scikit-learn
import warnings
warnings.filterwarnings('ignore')


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import os
# Sanity check: show what is inside the dataset folder on the mounted drive.
dataset_entries = os.listdir("/content/drive/My Drive/dataset")
print(dataset_entries)

In [None]:
# Run configuration shared by the cells below.
FAST_RUN = False                          # quick-experiment switch
IMAGE_WIDTH = IMAGE_HEIGHT = 128          # images are resized to a square
IMAGE_CHANNELS = 3                        # colour channels per image
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)  # passed as target_size to the generators

In [None]:
# Prepare the training data: one row per file, labelled from the file name.
trainDirName = "/content/drive/My Drive/dataset/train/"

# os.listdir returns entries in arbitrary order. Sorting fixes the row order
# of the DataFrame so that the seeded train/validation split made later is
# reproducible from run to run.
filenames = sorted(os.listdir(trainDirName))

# The label is the text before the first '.' in the file name:
# 'dog' -> 1, anything else -> 0.
categories = [
    1 if filename.split('.')[0] == 'dog' else 0
    for filename in filenames
]

df = pd.DataFrame({
    'filename' : filenames,
    'category' : categories
})


In [None]:
### Optional checks (uncomment to run) ###
# df.head()                                    # first rows of the dataframe
# df.tail()                                    # last rows of the dataframe
# df['category'].value_counts().plot.bar()     # class balance

### Display one randomly chosen training image ###
sample = random.choice(filenames)
sample_path = os.path.join(trainDirName, sample)
image = load_img(sample_path)
plt.imshow(image)

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Activation, BatchNormalization

# Convolutional feature extractor: three stages of
# Conv(3x3, relu) -> BatchNorm -> MaxPool(2x2) -> Dropout(0.25)
# with 32, 64 and 128 filters. Only the first convolution declares the input shape.
model_input_shape = (IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS)

model = Sequential()
for stage_index, n_filters in enumerate((32, 64, 128)):
    first_layer_options = {'input_shape': model_input_shape} if stage_index == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), activation='relu', **first_layer_options))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

# Classifier head: flatten, one hidden dense layer, then a 2-way softmax
# (two output units because there are two classes, cat and dog).
classifier_head = (
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(2, activation='softmax'),
)
for head_layer in classifier_head:
    model.add(head_layer)

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Callbacks
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Early stopping: to limit over-fitting, stop training once val_loss has not
# improved for 10 consecutive epochs. val_loss is the callback's default
# monitor; it is spelled out here for clarity.
earlystop = EarlyStopping(monitor='val_loss', patience=10)

# Learning-rate reduction: halve the learning rate when the monitored value
# has not improved for 2 epochs, never going below 1e-5.
# The monitor is 'val_loss' rather than 'val_acc' because the logged name of
# the accuracy metric differs between Keras releases ('val_acc' vs
# 'val_accuracy'). When the monitored key is missing the callback only emits
# a warning and never adjusts the rate, and this notebook suppresses warnings,
# so the failure would go unnoticed. 'val_loss' is logged by every release.
learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_loss',
                                            patience = 2,
                                            verbose = 1,
                                            factor = 0.5,
                                            min_lr = 0.00001)

# Callbacks handed to the training call.
callbacks = [earlystop, learning_rate_reduction]

In [None]:
# Turn the numeric labels into class-name strings
# (the generators below are used with class_mode='categorical' on this column).
label_names = {0: 'cat', 1: 'dog'}
df["category"] = df["category"].replace(label_names)

# Hold out 20% of the rows for validation (fixed seed), then renumber both
# parts from 0.
train_df, validate_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.20, random_state=42)
)

In [None]:
# Check the class distribution of each split (uncomment to plot)
# train_df['category'].value_counts().plot.bar()
# validate_df['category'].value_counts().plot.bar()

In [None]:
total_train = len(train_df)
total_validate = len(validate_df)
batch_size = 15

# Options common to the training and validation flows.
flow_options = dict(
    x_col='filename',
    y_col='category',
    target_size=IMAGE_SIZE,
    class_mode='categorical',
    batch_size=batch_size,
)

# Training images are rescaled to [0, 1] and augmented on the fly:
# small rotations, shear, zoom, horizontal flips and width/height shifts.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
train_generator = train_datagen.flow_from_dataframe(
    train_df, trainDirName, **flow_options
)

# Validation images are only rescaled, never augmented.
validation_datagen = ImageDataGenerator(rescale=1./255)
validation_generator = validation_datagen.flow_from_dataframe(
    validate_df, trainDirName, **flow_options
)

In [None]:
# Sample check: pick one training row and preview what the augmenting
# generator produces for it.
example_df = train_df.sample(n=1).reset_index(drop=True)
example_generator = train_datagen.flow_from_dataframe(
    example_df,
    trainDirName,
    x_col='filename',
    y_col='category',
    target_size=IMAGE_SIZE,
    class_mode='categorical'
)

# 5 x 3 grid; every panel shows the first image of a freshly drawn batch.
plt.figure(figsize=(12, 12))
for panel in range(1, 16):
    plt.subplot(5, 3, panel)
    augmented_images = next(example_generator)[0]
    plt.imshow(augmented_images[0])

plt.tight_layout()
plt.show()

In [None]:
# Train the model.
# FAST_RUN (set in the configuration cell) shortens training to 3 epochs for
# quick experiments; the normal run uses 50.
epochs = 3 if FAST_RUN else 50

# Round up so the final, partially filled validation batch is evaluated too
# (floor division would skip up to batch_size - 1 validation images per epoch).
validation_steps = (total_validate + batch_size - 1) // batch_size

# steps_per_epoch is not passed, so Keras uses its default for the generator.
history = model.fit_generator(
    train_generator,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=validation_steps,
    callbacks=callbacks
)

# Save the trained weights to the current working directory.
model.save_weights("model.h5")

In [None]:
# Visualize training: loss per epoch for the training and validation sets.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Early stopping can end training before `epochs` is reached, so the x axis is
# built from the number of epochs actually recorded, numbered from 1. The
# curves and the tick labels then refer to the same epoch numbers.
epoch_axis = np.arange(1, len(train_loss) + 1)

fig, (ax1) = plt.subplots(1, 1, figsize=(12, 12))
ax1.plot(epoch_axis, train_loss, color='b', label="Training loss")
ax1.plot(epoch_axis, val_loss, color='r', label="validation loss")
ax1.set_xticks(epoch_axis)
# Anchor the y axis at 0 and let matplotlib choose the ticks, so losses above
# 1.0 are still labelled.
ax1.set_ylim(bottom=0)
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.set_title("Training vs. validation loss")

# Accuracy curves can be added the same way from history.history; the key name
# of the accuracy metric depends on the Keras version, so check
# history.history.keys() first.

legend = ax1.legend(loc='best', shadow=True)
plt.tight_layout()
plt.show()

In [None]:
# Prepare the test data: one row per file in the test folder (no labels).
testDirName = "/content/drive/My Drive/dataset/test1/"
test_filenames = os.listdir(testDirName)
test_df = pd.DataFrame({'filename' : test_filenames})
nb_samples = len(test_df)

# Test images are only rescaled. shuffle=False keeps the batches in the same
# order as the rows of test_df, and class_mode=None yields images without labels.
test_flow_options = dict(
    x_col='filename',
    y_col=None,
    class_mode=None,
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
    shuffle=False,
)
test_gen = ImageDataGenerator(rescale=1./255)
test_generator = test_gen.flow_from_dataframe(
    test_df, testDirName, **test_flow_options
)

In [None]:
# Predict class probabilities for every test image.
# `steps` is a number of batches, so compute it with integer ceiling division
# rather than passing the float that np.ceil returns.
prediction_steps = (nb_samples + batch_size - 1) // batch_size
predict = model.predict_generator(test_generator, steps=prediction_steps)

# Keep the index of the most probable class for each image. This is not an
# evaluation: the test dataframe carries no ground-truth labels.
test_df['category'] = np.argmax(predict, axis=-1)

In [None]:
# Convert the predicted class indices to label names (int -> text).
index_to_label = {0: 'cat', 1: 'dog'}
predicted_indices = test_df['category']
print(predicted_indices)
test_df['category'] = predicted_indices.replace(index_to_label)

# Bar chart of how many test images were predicted as each class.
predicted_counts = test_df['category'].value_counts()
predicted_counts.plot.bar()