In [1]:
from PIL import ImageFont, ImageDraw, Image
from fontTools.ttLib import TTFont
from macrotoolchain import Data, Graph, plot 

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

import time
import json

In [2]:
class Glyph(object):
    # transform character to bitmap
    def __init__(self, fonts, size=64):
        # load fonts, size. We will use 2 fonts for all CJK characters, so keep 2 codepoint books.
        self.codepoints = [set()] * len(fonts)
        self.size = int(size * 0.8)
        self.size_img = size
        self.pad = (size - self.size) // 2
        self.fonts = [ImageFont.truetype(f, self.size) for f in fonts]
        # use a cache to reduce computation if duplicated characters encountered.
        self.cache = {}
        for cp, font in zip(self.codepoints, fonts):
            font = TTFont(font)
            # store codepoints in font cmap into self.codepoints
            for cmap in font['cmap'].tables:
                if not cmap.isUnicode():
                    continue
                for k in cmap.cmap:
                    cp.add(k)
    
    def draw(self, ch):
        if ch in self.cache:
            return self.cache[ch]
        # search among fonts, use the first found
        exist = False
        for i in range(len(self.codepoints)):
            if ord(ch) in self.codepoints[i]:
                font = self.fonts[i]
                exist = True
                break
        if not exist:
            return None

        img = Image.new('L', (self.size_img, self.size_img), 0)
        draw = ImageDraw.Draw(img)
        (width, baseline), (offset_x, offset_y) = font.font.getsize(ch)
        draw.text((self.pad - offset_x, self.pad - offset_y + 4), ch, font=font, fill=255, stroke_fill=255) 
        img_array = np.array(img.getdata(), dtype='float32').reshape((self.size_img, self.size_img)) / 255
        self.cache[ch] = img_array

        return img_array

In [3]:
glyphbook = Glyph(['data/fonts/HanaMinA.otf', 'data/fonts/HanaMinB.otf'])

In [4]:
code_chart = pd.read_csv('data/cangjie6.txt', delimiter='\t', header=None, names=['Char', 'Code'], 
                        keep_default_na=False)

In [5]:
def preprocess_chart(chart):
    glyphs = []
    codes = []
    for char, code in chart.values:
        glyph = glyphbook.draw(char)
        if glyph is not None:
            glyphs.append(glyph)
            codes.append(code)
    return np.expand_dims(np.array(glyphs), -1), np.array(codes)

In [6]:
VOCAB = 28
def tokenizer(code_table):
    # Cangjie code consists only of a-z, with maximum length of 5, minimum of 1
    # start with 0, a-z are 1-26, end and padding are 27
    tokens = np.expand_dims(np.zeros(code_table.shape, dtype='int64'), -1)
    code_index = list(map(lambda x: list(map(lambda y: ord(y) - 96, list(x))) + [27] * (5-len(x)), code_table))
    tokens = np.append(tokens, np.array(code_index), axis=-1)
    return tokens

In [9]:
glyphs, codes = preprocess_chart(code_chart)
del code_chart

In [10]:
train_glyphs, validation_glyphs, train_tokens, validation_tokens = train_test_split(
    glyphs, tokenizer(codes), test_size=0.1)
del glyphs, codes

In [44]:
class Res_CNN(tf.keras.Model):
    def __init__(self, feature_dim, kernel_size):
        super(Res_CNN, self).__init__()
        self.cnn1 = tf.keras.layers.Convolution2D(feature_dim, kernel_size, activation='relu')
        self.cnn2 = tf.keras.layers.Convolution2D(feature_dim, kernel_size, padding='same', activation='relu')
        self.cnn3 = tf.keras.layers.Convolution2D(feature_dim, kernel_size, padding='same')
        
    def call(self, x):
        x = self.cnn1(x)
        x_identity = tf.identity(x)
        x = self.cnn2(x)
        x = self.cnn3(x)
        x = tf.nn.relu(x + x_identity)
        return x

In [45]:
class CNN_Encoder(tf.keras.Model):
    # This is essentially a CNN layer, 
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.res_cnn1 = Res_CNN(8, (3, 3))
        self.pool1 = tf.keras.layers.MaxPool2D((2, 2))
        self.res_cnn2 = Res_CNN(32, (3, 3))
        self.pool2 = tf.keras.layers.MaxPool2D((2, 2))
        self.res_cnn3 = Res_CNN(128, (3, 3))
        self.pool3 = tf.keras.layers.MaxPool2D((2, 2))
        self.fc = tf.keras.layers.Dense(embedding_dim, activation='relu')

    def call(self, x):
        # x shape after cnn1 == (batch_size, 62, 62, 8)
        x = self.res_cnn1(x)
        # x shape after pool1 == (batch_size, 31, 31, 8)
        x = self.pool1(x)
        # x shape after cnn2 == (batch_size, 29, 29, 32)
        x = self.res_cnn2(x)
        # x shape after pool2 == (batch_size, 14, 14, 32)
        x = self.pool2(x)
        # x shape after cnn2 == (batch_size, 12, 12, 128)
        x = self.res_cnn3(x)
        # x shape after pool2 == (batch_size, 6, 6, 128)
        x = self.pool3(x)
        # reshape from (batch_size, 6, 6, 128) to (batch_size, 36, 128)
        x = tf.reshape(x, [x.shape[0], -1, x.shape[-1]])
        # x shape after fc == (batch_size, 36, embedding_dim)
        x = self.fc(x)
        return x

In [46]:
class Bahdanau_Attention(tf.keras.Model):
    def __init__(self, hidden_size):
        super(Bahdanau_Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(hidden_size)
        self.W2 = tf.keras.layers.Dense(hidden_size)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_Encoder output) shape == (batch_size, 81, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 81, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 81, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, embedding_dim)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [73]:
class First_Step_Model(tf.keras.Model):
    def __init__(self, embedding_dim, max_length, hidden_size, vocab_size):
        super(First_Step_Model, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, hidden_size)
        self.attention = Bahdanau_Attention(hidden_size)
        self.fc1 = tf.keras.layers.Dense(hidden_size, activation='relu')
        self.fc2 = tf.keras.layers.Dense(vocab_size)
        
    def call(self, feature, position):
        # y shape (batch_size, hidden_size)
        y = self.embedding(position)
        # x shape (batch_size, embedding_dim)
        x, w = self.attention(feature, y)
        # x shape (batch_size, hidden_size)
        x = self.fc1(x)
        # x shape (batch_size, vocab_size)
        x = self.fc2(x)
        return x, w

In [74]:
optimizer_step1 = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    loss_ = loss_object(real, pred)
    return tf.reduce_mean(loss_)

def accuracy_function(real, pred):
    pred_index = tf.math.argmax(pred, axis=-1)
    return tf.math.reduce_mean(tf.cast(pred_index == real, tf.float32))

In [None]:
cnn_model = CNN_Encoder(embedding_dim = 128)
first_step_model = First_Step_Model(embedding_dim = 128, max_length = train_tokens.shape[1]-1,
                              hidden_size = 128, vocab_size = VOCAB)

In [111]:
@tf.function
def first_step_train(glyph, target):
    loss = 0; accuracy = 0
    with tf.GradientTape() as tape:
        feature = cnn_model(glyph)
        for i in range(1, target.shape[1]):
            position = tf.convert_to_tensor(np.repeat(i-1, target.shape[0]), dtype='int64')
            prediction, weight = first_step_model(feature, position)
            loss += tf.reduce_mean(loss_object(target[:, i], prediction))
            accuracy += accuracy_function(target[:, i], prediction)
    trainable_variables = first_step_model.trainable_variables + cnn_model.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer_step1.apply_gradients(zip(gradients, trainable_variables))
    return loss / (target.shape[1] - 1), accuracy / (target.shape[1] - 1)

In [85]:
@tf.function
def first_step_validation(glyph, target):
    loss = 0; accuracy = 0
    feature = cnn_model(glyph)
    for i in range(1, target.shape[1]):
        position = tf.convert_to_tensor(np.repeat(i-1, target.shape[0]), dtype='int64')
        prediction, weight = first_step_model(feature, position)
        loss += tf.reduce_mean(loss_object(target[:, i], prediction))
        accuracy += accuracy_function(target[:, i], prediction)
    return loss / (target.shape[1] - 1), accuracy / (target.shape[1] - 1)

In [86]:
BATCH_SIZE = 128
dataset = tf.data.Dataset.from_tensor_slices((train_glyphs, train_tokens))
dataset = dataset.shuffle(train_glyphs.shape[0]).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [87]:
# use a checkpoint to store weights
checkpoint_path_step1 = './checkpoints/train_step1'
history_path_step1 = './history/history_step1_'
ckpt_step1 = tf.train.Checkpoint(model=first_step_model, cnn_model=cnn_model, optimizer=optimizer_step1)
ckpt_manager_step1 = tf.train.CheckpointManager(ckpt_step1, checkpoint_path_step1, max_to_keep=5)

In [88]:
history_step1 = {'epoch': [], 'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}
EPOCHS = 200
num_steps = len(train_glyphs) // BATCH_SIZE

epoch = 0
if ckpt_manager_step1.latest_checkpoint:
    epoch = int(ckpt_manager_step1.latest_checkpoint.split('-')[-1]) * 5 - 1
    ckpt_step1.restore(ckpt_manager_step1.latest_checkpoint)
    with open(history_path_step1+f'{epoch//5+1}.json') as readfile:  
        history_step1 = json.load(readfile)

In [116]:
while epoch < EPOCHS:
    start = time.time()
    total_loss = 0
    total_accuracy = 0

    for (batch, (glyph_tensor, target)) in enumerate(dataset):
        t_loss, accuracy = first_step_train(glyph_tensor, target)
        total_loss += t_loss
        total_accuracy += accuracy
        print(f'Epoch {epoch + 1}, Train Loss {total_loss/batch:.6f}, Accuracy {total_accuracy / batch:.2%};\
 progression {batch / num_steps:.1%}, time elapsed {time.time() - start:.2f} sec', end='\r')
    
    val_loss, val_accuracy = first_step_validation(validation_glyphs, validation_tokens)
   
    # storing the epoch end loss value to plot later
    history_step1['epoch'].append(epoch)
    history_step1['loss'].append(float((total_loss / num_steps).numpy()))
    history_step1['accuracy'].append(float((total_accuracy / num_steps).numpy()))
    history_step1['val_loss'].append(float(val_loss.numpy()))
    history_step1['val_accuracy'].append(float(val_accuracy.numpy()))
    
    if epoch % 5 == 4:
        ckpt_manager_step1.save()
        with open(history_path_step1+f'{epoch//5+1}.json', 'w') as outfile:
            json.dump(history_step1, outfile)

    print (f'Epoch {epoch + 1}, Train Loss {total_loss/num_steps:.6f}, Accuracy {total_accuracy / num_steps:.2%};\
 Validation Loss {val_loss:.6f}, Accuracy {val_accuracy:.2%}; taken {time.time() - start:.2f} sec')
    epoch += 1
    
    

Epoch 55, Train Loss 2.058856, Accuracy 42.62%; Validation Loss 2.308444, Accuracy 38.04%; taken 608.78 sec
Epoch 56, Train Loss 2.056540, Accuracy 42.69%; Validation Loss 2.314140, Accuracy 38.02%; taken 601.56 sec
Epoch 57, Train Loss 2.055257, Accuracy 42.71%; Validation Loss 2.314611, Accuracy 37.96%; taken 594.72 sec
Epoch 58, Train Loss 2.054896, Accuracy 42.76%; Validation Loss 2.317532, Accuracy 38.04%; taken 598.51 sec
Epoch 59, Train Loss 2.055310, Accuracy 42.75%; Validation Loss 2.309885, Accuracy 38.17%; taken 604.15 sec
Epoch 60, Train Loss 2.471562, Accuracy 50.78%; progression 0.7%, time elapsed 6.58 sec

KeyboardInterrupt: 

In [None]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, hidden_size, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.hidden_size,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(hidden_size, activation='relu')
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = Bahdanau_Attention(hidden_size)

    def call(self, x, features, hidden):
        # x is forward direction, y is beckward direction
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape before is (batch_size, 1) since it is passed through one by one at a time
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # context_vector shape is (batch_size, embedding_dim)
        # x shape after concatenation == (batch_size, 1, embedding_dim + embedding_dim)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        # x shape is (batch_size, hidden_size)
        # state is new hidden used in next step
        x, state = self.gru(x)
        # x shape (batch_size, hidden_size)
        x = self.fc1(x)
        # x shape (batch_size, vocab_size)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        # generate new hidden layer with different batch size
        return tf.zeros((batch_size, self.hidden_size))

In [None]:
decoder = RNN_Decoder(embedding_dim=128, hidden_size=128, vocab_size=VOCAB)

In [None]:
@tf.function
def predict(features, max_length):
    # start with 0
    dec_input = tf.convert_to_tensor([[0]]*features.shape[0], dtype='int64')
    hidden = decoder.reset_state(batch_size=features.shape[0])
    # iterate predictions, no teacher forcing here
    for i in range(max_length):
        prediction, hidden, attention_weights = decoder(tf.expand_dims(dec_input[:, i], 1), features, hidden)
        # we need deterministic result
        predicted_id = tf.math.argmax(prediction, axis=-1)
        dec_input = tf.concat([dec_input, tf.expand_dims(predicted_id, 1)], axis=1)
    return dec_input

In [None]:
@tf.function
def predict_next(features, target):
    hidden = decoder.reset_state(batch_size=features.shape[0])
    predictions = tf.constant(0, dtype='float32', shape=(features.shape[0], 1, VOCAB))
    for i in range(target.shape[1]-1):
        prediction, hidden, attention_weights = decoder(tf.expand_dims(target[:, i], 1), features, hidden)
        predictions = tf.concat([predictions, tf.expand_dims(prediction, 1)], axis=1)
    return predictions[:, 1:, :]

In [None]:
optimizer_step2 = tf.keras.optimizers.Adam()

def loss_function_step2(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    loss_ = tf.reduce_mean(loss_, axis=0)
    return tf.reduce_sum(loss_)

def accuracy_function_step2(real, pred):
    accuracy = tf.math.reduce_all(pred == real, 1)
    return tf.math.reduce_mean(tf.cast(accuracy, tf.float32))

In [119]:
@tf.function
def train_step(glyph_tensor, target):
    # use tape to auto generate gradients
    with tf.GradientTape() as tape:
        features = cnn_model(glyph_tensor)
        predictions = predict_next(features, target)
        loss = loss_function_step2(target[:, 1:], predictions)
    
    # calculate accuracy based on the code's whole string
    predictions_id = predict(features, target.shape[1]-1)
    accuracy = accuracy_function_step2(predictions_id, target)

    trainable_variables = decoder.trainable_variables + cnn_model.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer_step2.apply_gradients(zip(gradients, trainable_variables))

    return loss / (target.shape[1] - 1), accuracy

In [None]:
@tf.function
def validation_step(glyph_tensor, target):
    features = cnn_model(glyph_tensor)
    predictions = predict_next(features, target)
    loss = loss_function_step2(target[:, 1:], predictions)
    
    # calculate accuracy based on the code's whole string
    predictions_id = predict(features, target.shape[1]-1)
    accuracy = accuracy_function_step2(predictions_id, target)
    
    return loss / (target.shape[1] - 1), accuracy

In [None]:
# use a checkpoint to store weights
checkpoint_path_step2 = "./checkpoints/train_step2"
history_path_step2 = './history/history_step2_'
ckpt_step2 = tf.train.Checkpoint(encoder=cnn_model, decoder=decoder, optimizer=optimizer_step2)
ckpt_manager_step2 = tf.train.CheckpointManager(ckpt_step2, checkpoint_path_step2, max_to_keep=5)

In [None]:
history_step2 = {'epoch': [], 'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}
EPOCHS = 200

epoch_step2 = 0
if ckpt_manager_step2.latest_checkpoint:
    epoch_step2 = int(ckpt_manager_step2.latest_checkpoint.split('-')[-1]) * 5 - 1
    ckpt_step2.restore(ckpt_manager_step2.latest_checkpoint)
    with open(history_path_step2+f'{epoch//5+1}.json') as readfile:  
        history_step2 = json.load(readfile)

In [120]:
while epoch_step2 < EPOCHS:
    start = time.time()
    total_loss = 0
    total_accuracy = 0

    for (batch, (glyph_tensor, target)) in enumerate(dataset):
        t_loss, accuracy = train_step(glyph_tensor, target)
        total_loss += t_loss
        total_accuracy += accuracy
        print(f'Epoch {epoch_step2 + 1}, Train Loss {total_loss/batch:.6f}, Accuracy {total_accuracy / batch:.2%};\
 progression {batch / num_steps:.1%}, time elapsed {time.time() - start:.2f} sec', end='\r')
    
    val_loss, val_accuracy = validation_step(validation_glyphs, validation_tokens)
   
    # storing the epoch end loss value to plot later
    history_step2['epoch'].append(epoch_step2)
    history_step2['loss'].append(float((total_loss / num_steps).numpy()))
    history_step2['accuracy'].append(float((total_accuracy / num_steps).numpy()))
    history_step2['val_loss'].append(float(val_loss.numpy()))
    history_step2['val_accuracy'].append(float(val_accuracy.numpy()))
    
    if epoch % 5 == 4:
        ckpt_manager_step2.save()
        with open(history_path_step2+f'{epoch//5+1}.json', 'w') as outfile:
            json.dump(history_step2, outfile)

    print(f'Epoch {epoch_step2 + 1}, Train Loss {total_loss/num_steps:.6f},\
 Accuracy {total_accuracy / num_steps:.2%}; Validation Loss {val_loss:.6f},\
 Accuracy {val_accuracy:.2%}; taken {time.time() - start:.2f} sec')
    epoch_step2 += 1

Epoch 142, Train Loss 1.934445, Accuracy 18.15%; Validation Loss 2.092308, Accuracy 13.82%; taken 624.94 sec
Epoch 143, Train Loss 1.875293, Accuracy 22.45%; Validation Loss 2.087523, Accuracy 14.19%; taken 606.81 sec
Epoch 144, Train Loss 1.862571, Accuracy 23.53%; Validation Loss 2.083583, Accuracy 14.50%; taken 600.68 sec
Epoch 145, Train Loss 1.856830, Accuracy 24.15%; Validation Loss 2.085940, Accuracy 14.98%; taken 602.32 sec
Epoch 146, Train Loss 3.836622, Accuracy 46.09%; progression 0.1%, time elapsed 2.88 sec

KeyboardInterrupt: 

In [121]:
def evaluate(word):
    test_input = []
    for char in word:
        glyph = glyphbook.draw(char)
        if glyph is not None:
            test_input.append(glyph)
        else:
            raise ValueError(f'Character {char} unsupported.')
    test_input = tf.expand_dims(test_input, -1)
    features = cnn_model(test_input)
    test_result = predict(features, 5)

    def decode(indexes):
        code = ''
        for i in indexes:
            if i <= 0:
                continue
            elif i >= 27:
                break
            else:
                code += chr(i + 96)
        return code

    return np.apply_along_axis(decode, 1, test_result.numpy())

In [123]:
evaluate('雪齋')

array(['mzs', 'yxf'], dtype='<U3')

正確倉頡碼：mzs, yxf