In [1]:
import multiprocessing

from PIL import ImageFont, ImageDraw, Image
from fontTools.ttLib import TTFont

import numpy as np
import tensorflow as tf

## Load Font

In [2]:
class Glyph(object):
    """Render single characters to square grayscale bitmaps.

    Several font files may be supplied; a character is drawn with the first
    font whose Unicode cmap contains its code point. Rendered bitmaps are
    memoised per character.
    """

    def __init__(self, fonts, size=64):
        """Load the fonts and index the code points each of them supports.

        Args:
            fonts: Sequence of font file paths, in lookup-priority order.
            size: Side length in pixels of the output bitmap. Glyphs are set
                at 80% of this size; the remainder is split into padding.
        """
        # One code point set per font (two fonts are used to cover all CJK
        # characters, hence two code point books).
        self.codepoints = [set() for _ in fonts]
        self.size = int(size * 0.8)
        self.size_img = size
        self.pad = (size - self.size) // 2
        self.fonts = [ImageFont.truetype(f, self.size) for f in fonts]
        # Cache of rendered characters so duplicates are not redrawn.
        self.cache = {}
        for supported, path in zip(self.codepoints, fonts):
            ttfont = TTFont(path)
            try:
                # Collect the keys (code points) of every Unicode cmap subtable.
                for table in ttfont['cmap'].tables:
                    if table.isUnicode():
                        supported.update(table.cmap)
            finally:
                # The font was only opened to read its cmap; release the file.
                ttfont.close()

    def draw(self, ch):
        """Return ``ch`` rendered as a bitmap, or None if no font contains it.

        Args:
            ch: A single character.

        Returns:
            A float32 array of shape (size_img, size_img) with pixel values
            scaled from 0..255 down to 0..1, or None when the character's
            code point is in none of the loaded fonts. The returned array is
            the cached object itself, so callers should not mutate it.
        """
        if ch in self.cache:
            return self.cache[ch]

        # Search among the fonts and use the first one that has the character.
        codepoint = ord(ch)
        for supported, candidate in zip(self.codepoints, self.fonts):
            if codepoint in supported:
                font = candidate
                break
        else:
            return None

        img = Image.new('L', (self.size_img, self.size_img), 0)
        canvas = ImageDraw.Draw(img)
        # Only the offsets reported by the low-level font object are needed:
        # subtracting them places the glyph at the padding.
        _size, (offset_x, offset_y) = font.font.getsize(ch)
        # NOTE(review): the extra 4px downward shift is a fixed constant,
        # presumably tuned for these fonts -- confirm before changing sizes.
        canvas.text((self.pad - offset_x, self.pad - offset_y + 4), ch, font=font, fill=255, stroke_fill=255)
        # Convert the image directly instead of going through getdata().
        img_array = np.asarray(img, dtype='float32') / 255
        self.cache[ch] = img_array

        return img_array

In [3]:
# Shared renderer used by evaluate(); fonts are listed in lookup-priority order.
glyphbook = Glyph(
    fonts=['data/fonts/TH-Ming-HP0.ttf', 'data/fonts/TH-Ming-P2.ttf'],
    size=64,
)

## Load Model

In [4]:
# Load the exported model from ./Cangjie_Model/; evaluate() calls it directly
# on batches of glyph bitmaps.
cangjie = tf.saved_model.load('./Cangjie_Model/')

## Evaluate

In [5]:
def evaluate(file_in, file_out, cores=multiprocessing.cpu_count()):
    """Encode the first character of every input line and write the results.

    Each non-blank line of ``file_in`` contributes its first character. The
    character is rendered with ``glyphbook``, the bitmaps are fed to the
    ``cangjie`` model in batches of 128, and for every character the
    candidates ``j`` with ``j <= dups_dict[i]`` are written to ``file_out``
    as ``<char>\\t<code>\\t<prob>`` lines.

    Args:
        file_in: Iterable of text lines (e.g. an open text file).
        file_out: Object with a ``write(str)`` method.
        cores: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If a character is not available in any loaded font.
    """
    words = []
    test_input = []
    for line in file_in:
        entry = line.rstrip('\r\n')
        if not entry:
            # A blank line (commonly the trailing one) carries no character;
            # skip it rather than trying to render the newline itself.
            continue
        ch = entry[0]
        glyph = glyphbook.draw(ch)
        if glyph is None:
            raise ValueError('Character {} unsupported.'.format(ch))
        words.append(ch)
        test_input.append(glyph)

    if not words:
        # Nothing to encode, so nothing to write.
        return

    # Add a trailing axis of length 1 to the stacked bitmaps before batching.
    test_input = np.expand_dims(test_input, -1)
    input_data = tf.data.Dataset.from_tensor_slices(test_input).batch(128)

    def decode(indexes):
        """Map indexes 1..26 to 'a'..'z'; skip values <= 0, stop at >= 27."""
        code = ''
        for index in indexes:
            if index <= 0:
                continue
            elif index >= 27:
                break
            else:
                code += chr(index - 1 + ord('a'))
        return code

    k = 0  # position in `words` of the character currently being written
    for glyphs in input_data:
        results, probs, dups_dict = cangjie(glyphs)
        # Convert once per batch instead of once per candidate.
        results_np = results.numpy()
        probs_np = probs.numpy()
        for i in range(results_np.shape[0]):
            for j in range(results_np.shape[1]):
                if j <= dups_dict[i]:
                    file_out.write(f'{words[k]}\t{decode(results_np[i, j, :])}\t{probs_np[i, j]}\n')
            k += 1

In [6]:
# The character list and the output hold CJK text, so the encoding is set
# explicitly instead of depending on the platform default.
# NOTE(review): assumes the input file is UTF-8 -- confirm.
# The with-block closes both files even when evaluate() raises (it does so for
# characters missing from the fonts).
with open('./data/cangjie6_char.txt', 'r', encoding='utf-8') as file_in, \
        open('./data/cangjie6_encoded.txt', 'w', encoding='utf-8') as file_out:
    evaluate(file_in, file_out)