# 資料增補 (Data Augmentation)

In [3]:
# 載入套件
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## 以檔案目錄為基礎，建立訓練(Training)及驗證(Validation)資料集(Dataset)

In [4]:
import os
# Class labels are the sub-directory names under image_data (one folder per
# character). Only directories are kept and the names are sorted, so the list
# neither depends on the arbitrary order returned by os.listdir nor gets
# inflated by stray files when it is later used to size the output layer.
label_list = sorted(
    name for name in os.listdir('image_data')
    if os.path.isdir(os.path.join('image_data', name))
)
', '.join(label_list)

'丁, 七, 丈, 三, 上, 下, 不, 丐, 丑, 且, 丕, 世, 丘, 丙, 丞, 丟, 並, 丫, 中, 串, 丸, 丹, 主, 乃, 久, 么, 之, 乍, 乎, 乏, 乒, 乓, 乖, 乘, 乙, 九, 乞, 也, 乩, 乳, 乾, 亂, 了, 予, 事, 二, 于, 云, 互, 五, 井, 亙, 些, 亞, 亟, 亡, 交, 亥, 亦, 亨, 享, 京, 亭, 亮, 人, 什, 仁, 仃, 仄, 仆, 仇, 今, 介, 仍, 仔, 仕, 他, 仗, 付, 仙, 仞, 仟, 代, 令, 以, 仰, 仲, 仳, 件, 任, 份, 仿, 企, 伉, 伊, 伍, 伏, 伐, 休, 伕, 伙, 伯, 估, 伴, 伶, 伸, 伺, 似, 伽, 佃, 但, 佇, 位, 低, 住, 佐, 佑, 佔, 何, 佗, 余, 佛, 作, 佝, 佞, 你, 佣, 佩, 佬, 佯, 佰, 佳, 併, 佻, 佾, 使, 侃, 來, 侈, 例, 侍, 侏, 侖, 供, 依, 侮, 侯, 侵, 侶, 便, 係, 促, 俄, 俊, 俎, 俏, 俐, 俑, 俗, 俘, 俚, 保, 俞, 俟, 俠, 信, 修, 俯, 俱, 俳, 俸, 俺, 俾, 倀, 倆, 倉, 個, 倌, 倍, 倏, 們, 倒, 倔, 倖, 倘, 候, 倚, 借, 倡, 倣, 倥, 倦, 倨, 倩, 倪, 倫, 倭, 值, 偃, 假, 偉, 偌, 偎, 偏, 偕, 做, 停, 健, 側, 偵, 偶, 偷, 偺, 偽, 傀, 傅, 傍, 傑, 傖, 傘, 備, 傢, 催, 傭, 傯, 傲, 傳, 債, 傷, 傻, 傾, 僅, 像, 僑, 僕, 僖, 僚, 僥, 僧, 僭, 僮, 僱, 僵, 價, 僻, 儀, 儂, 億, 儈, 儉, 儐, 儒, 儔, 儘, 償, 儡, 優, 儲, 儷, 儼, 兀, 允, 元, 兄, 充, 兆, 兇, 先, 光, 克, 兌, 免, 兒, 兔, 兕, 兗, 兜, 兢, 入, 內, 全, 兩, 八, 公, 六, 兮, 共, 兵, 其, 具, 典, 兼, 冀, 冉, 冊, 再, 冑, 冒, 冕, 冗, 冠, 冢, 冤, 冥, 冬, 冰, 冶, 冷, 冽, 准, 凋, 凌, 凍, 凜, 凝, 几, 凡, 凰, 凱, 凳, 凶, 凸, 凹, 出, 函, 刀, 刁, 刃, 分, 切, 刈, 刊, 刎, 

## 定義資料增補(Data Augmentation)

In [8]:
# https://medium.com/@shihaoticking/實作資料強化-data-augmentation-實現圖片翻轉-平移-縮放-4b37d4400ffb
# Training-time augmentation: every image is randomly perturbed when it is drawn.
# rescale=1./255 is deliberately not passed here; the model's own Rescaling
# layer converts 0-255 pixel values to 0-1. horizontal_flip is also left off.
image_gen_train = ImageDataGenerator(
    zoom_range=0.2,             # random zoom, 20%
    rotation_range=10,          # random rotation, up to +/-10 degrees
    height_shift_range=0.15,    # random vertical shift, up to +/-15%
    width_shift_range=0.15,     # random horizontal shift, up to +/-15%
)

# Generator for validation images: no augmentation options are set.
image_gen_test = ImageDataGenerator()

In [15]:
# flow_from_directory: stream images from a directory tree in batches; the
# 'sparse' class mode yields one integer label per image.

image_size = (50, 50)        # image size, passed as target_size
batch_size = 100             # images per batch
directory = './image_data'

# Options shared by the training and validation iterators.
flow_options = dict(
    directory=directory,
    target_size=image_size,
    batch_size=batch_size,
    color_mode="grayscale",
    class_mode='sparse',
)

# Training dataset (uses the augmenting generator)
train_data_gen = image_gen_train.flow_from_directory(**flow_options)

# Validation dataset (uses the plain generator)
# NOTE(review): this reads the same directory as the training iterator and no
# validation split is configured, so validation sees the same files as
# training — confirm whether a held-out split was intended.
val_data_gen = image_gen_test.flow_from_directory(**flow_options)

Found 54107 images belonging to 1305 classes.
Found 54107 images belonging to 1305 classes.


## 建立模型

In [16]:
# Model input shape: the image size plus a trailing channel dimension of 1
# (the images are loaded with color_mode="grayscale").
image_size + (1,)

(50, 50, 1)

In [33]:
# 定義模型
from tensorflow.keras.layers import *

def make_model(input_shape, num_classes):
    """Build the (uncompiled) convolutional classifier.

    Pixel values are scaled by 1/255 inside the model. Two convolution
    stages follow, each made of two Conv2D layers, max pooling and dropout,
    and then a dense head ending in a softmax layer.

    Args:
        input_shape: Shape of a single input image, given to the first layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    return tf.keras.models.Sequential([
        # Scale raw pixel values by 1/255 as part of the model itself.
        layers.Rescaling(1.0 / 255, input_shape=input_shape),

        # Stage 1: two 5x5 convolutions with 32 filters
        layers.Conv2D(filters=32, kernel_size=(5, 5), activation='relu'),
        layers.Conv2D(filters=32, kernel_size=(5, 5), activation='relu'),
        layers.MaxPool2D(pool_size=(2, 2)),
        layers.Dropout(rate=0.2),

        # Stage 2: two 3x3 convolutions with 64 filters
        layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        layers.MaxPool2D(pool_size=(2, 2)),
        layers.Dropout(rate=0.2),

        # Classification head
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(rate=0.2),
        layers.Dense(num_classes, activation='softmax'),
    ])

# Build the model.
# The output layer is sized from the training iterator's class mapping so it
# always matches the label indices the iterator yields, instead of relying on
# a raw directory listing that may also count stray files.
model = make_model(
    input_shape=image_size + (1,),
    num_classes=len(train_data_gen.class_indices),
)
# Plot the model structure
# keras.utils.plot_model(model, show_shapes=True)

## 訓練模型

In [34]:
epochs = 15  # number of full passes over the training data

# Configure the loss function, the reported metric and the optimizer.
# The sparse loss matches the integer labels produced by class_mode='sparse'.
model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=keras.optimizers.Adam(),
)

history = model.fit(
    train_data_gen,                # training data iterator
    validation_data=val_data_gen,  # validation data iterator
    epochs=epochs,                 # go through the data `epochs` times
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### 訓練 50 epochs，驗證準確率可達 96%（上方的執行紀錄僅訓練 15 epochs）。

## 從目錄中任選一個檔案測試

In [35]:
# Load the index -> label lookup used to decode predictions: line i of
# labels.txt becomes the label for class index i.
with open('./labels.txt', 'r', encoding='utf8') as f:
    text = f.read()
# splitlines() avoids the spurious empty final label that split('\n')
# produces when the file ends with a newline.
labels = dict(enumerate(text.splitlines()))

In [36]:
# 任取一筆資料測試
import os, numpy as np

def predict(file_path):
    """Classify one image file and print the predicted label with its score.

    Relies on the module-level ``model``, ``image_size`` and ``labels``.

    Args:
        file_path: Path of the image file to load.

    Returns:
        None. The batch shape and the prediction are printed.
    """
    image = keras.preprocessing.image.load_img(
        file_path, color_mode="grayscale", target_size=image_size
    )
    # Convert the image to an array, then add a leading axis so the model
    # receives a batch containing this single image.
    batch = tf.expand_dims(keras.preprocessing.image.img_to_array(image), 0)
    print(batch.shape)

    scores = model.predict(batch)[0]  # scores for the only image in the batch
    best_index = np.argmax(scores)
    best_score = np.max(scores)
    print(f"預測={labels[best_index]}, 機率= {(100 * best_score):.2f}%")

In [37]:
# Build the path with os.path.join so the call also works where "\" is not a
# path separator.
predict(os.path.join("image_data", "博", "7.png"))

(1, 50, 50, 1)
預測=博, 機率= 89.53%


In [38]:
# Build the path with os.path.join so the call also works where "\" is not a
# path separator.
predict(os.path.join("image_data", "乙", "7.png"))

(1, 50, 50, 1)
預測=乙, 機率= 99.57%


In [39]:
# Build the path with os.path.join so the call also works where "\" is not a
# path separator.
predict(os.path.join("image_data", "丐", "7.png"))

(1, 50, 50, 1)
預測=丐, 機率= 89.76%


In [41]:
# Save the trained model to an .h5 file
model.save('./chinese_model_2.h5')

In [42]:
# Load the model back from the saved file (rebinds `model`)
model = tf.keras.models.load_model('./chinese_model_2.h5')