In [1]:
from PIL import ImageFont, ImageDraw, Image
from fontTools.ttLib import TTFont

import numpy as np
import tensorflow as tf

## Load Font

In [2]:
class Glyph(object):
    """Render single characters into square grayscale bitmaps.

    Several font files can be supplied; a character is drawn with the first
    font (in the order given) whose Unicode cmap contains its codepoint.
    """

    def __init__(self, fonts, size=64):
        """Load the fonts and record which codepoints each one covers.

        Args:
            fonts: Re-iterable sequence of font file paths, in lookup
                priority order.
            size: Side length, in pixels, of the bitmaps produced by
                ``draw``. Glyphs are rendered at 80% of this size; half of
                the remaining pixels is kept as ``self.pad``.
        """
        self.size = int(size * 0.8)
        self.size_img = size
        self.pad = (size - self.size) // 2
        self.fonts = [ImageFont.truetype(f, self.size) for f in fonts]
        # Cache rendered bitmaps so repeated characters are drawn only once.
        self.cache = {}
        # One codepoint set per font, in the same order as self.fonts.
        self.codepoints = [self._read_codepoints(path) for path in fonts]

    @staticmethod
    def _read_codepoints(path):
        """Return the codepoints listed in the Unicode cmap subtables of ``path``."""
        codepoints = set()
        ttfont = TTFont(path)
        try:
            for table in ttfont['cmap'].tables:
                if table.isUnicode():
                    codepoints.update(table.cmap)
        finally:
            # Release the font file once the cmap has been read instead of
            # leaving the handle open for the lifetime of the kernel.
            ttfont.close()
        return codepoints

    def draw(self, ch):
        """Return the bitmap for ``ch``, or ``None`` if no font contains it.

        Args:
            ch: A single character.

        Returns:
            A float32 array of shape ``(size_img, size_img)`` holding pixel
            values divided by 255, or ``None`` when ``ord(ch)`` is not in any
            font's codepoint set. The array is stored in ``self.cache`` and
            the same object is returned on later calls, so callers should not
            modify it in place.
        """
        if ch in self.cache:
            return self.cache[ch]

        # Search among fonts and use the first one that has the codepoint.
        codepoint = ord(ch)
        font = next(
            (f for f, known in zip(self.fonts, self.codepoints) if codepoint in known),
            None,
        )
        if font is None:
            return None

        img = Image.new('L', (self.size_img, self.size_img), 0)
        canvas = ImageDraw.Draw(img)
        # Only the offset half of the result is needed; it is subtracted so
        # the drawn glyph starts at the padding margin.
        # NOTE(review): font.font is Pillow's low-level font object -- confirm
        # this call against the installed Pillow version.
        _size, (offset_x, offset_y) = font.font.getsize(ch)
        # NOTE(review): the extra +4 vertical shift is kept exactly as in the
        # original; presumably it matches how the model's training glyphs
        # were rendered -- confirm before changing.
        canvas.text(
            (self.pad - offset_x, self.pad - offset_y + 4),
            ch, font=font, fill=255, stroke_fill=255,
        )
        # Convert the image buffer directly instead of going through a
        # per-pixel Python sequence; values, shape and dtype are unchanged.
        img_array = np.asarray(img).astype('float32') / 255
        self.cache[ch] = img_array

        return img_array

In [3]:
# Shared glyph renderer; fonts are searched in the order listed here.
glyphbook = Glyph([
    'data/fonts/HanaMinA.otf',
    'data/fonts/HanaMinB.otf',
])

## Load Model

In [4]:
# Load the exported TensorFlow SavedModel; evaluate() calls it directly on
# batches of glyph images.
cangjie = tf.saved_model.load('./Cangjie_Model/')

## Evaluate

In [5]:
def evaluate(word, batch_size=256):
    """Predict Cangjie codes for every character in ``word``.

    Each character is rendered with the module-level ``glyphbook`` and the
    bitmaps are passed to the module-level ``cangjie`` model in batches.

    Args:
        word: String (or other iterable of single characters) to look up.
        batch_size: Number of glyphs passed to the model per call.

    Returns:
        A list of ``[character, code, probability]`` entries in input order.
        For the ``i``-th character of a batch, candidate ``j`` is included
        when ``j <= dups_dict[i]``, so one character may produce several
        entries. An empty ``word`` yields an empty list.

    Raises:
        ValueError: If ``glyphbook.draw`` returns ``None`` for a character.
    """
    chars = list(word)
    if not chars:
        # Nothing to render or predict.
        return []

    glyph_images = []
    for char in chars:
        glyph = glyphbook.draw(char)
        if glyph is None:
            raise ValueError('Character {} unsupported.'.format(char))
        glyph_images.append(glyph)
    # Add a trailing axis: (n, H, W) -> (n, H, W, 1).
    model_input = np.expand_dims(glyph_images, -1)
    input_data = tf.data.Dataset.from_tensor_slices(model_input).batch(batch_size)

    def decode(indexes):
        """Map a sequence of integer indexes to a lowercase code string.

        Indexes 1..26 become 'a'..'z'; values <= 0 are skipped and the first
        value >= 27 ends the code (presumably padding and end-of-sequence
        markers -- confirm against the model's vocabulary).
        """
        letters = []
        for i in indexes:
            if i <= 0:
                continue
            if i >= 27:
                break
            letters.append(chr(i - 1 + ord('a')))
        return ''.join(letters)

    final_result = []
    # Position in ``chars`` of the first glyph of the current batch. Without
    # it, every batch after the first would be labelled with the characters
    # of the first batch.
    offset = 0
    for glyphs in input_data:
        results, probs, dups_dict = cangjie(glyphs)

        for i in range(results.shape[0]):
            char = chars[offset + i]
            for j in range(results.shape[1]):
                if j <= dups_dict[i]:
                    final_result.append([char, decode(results[i, j, :].numpy()), probs[i, j].numpy()])
        offset += results.shape[0]

    return final_result

In [6]:
# Demo on a 26-character sample; each output row is [character, code, probability],
# and a character can produce more than one row.
evaluate('日月金木水火土的戈十大中一弓人心手口尸廿山女田止卜片')

[['日', 'a', 0.94346833],
 ['月', 'b', 0.8172164],
 ['金', 'c', 0.70550275],
 ['木', 'd', 0.45743632],
 ['水', 'e', 0.9323918],
 ['火', 'f', 0.6878291],
 ['土', 'g', 0.6260477],
 ['的', 'h', 0.95940524],
 ['戈', 'i', 0.3986519],
 ['十', 'j', 0.96117085],
 ['大', 'k', 0.98391473],
 ['中', 'l', 0.9426256],
 ['一', 'm', 0.92876565],
 ['弓', 'n', 0.9896928],
 ['人', 'o', 0.5605947],
 ['心', 'p', 0.67877287],
 ['手', 'q', 0.87321275],
 ['口', 'r', 0.757297],
 ['尸', 's', 0.7976015],
 ['廿', 't', 0.8845617],
 ['山', 'u', 0.98116463],
 ['女', 'v', 0.3145672],
 ['田', 'w', 0.994529],
 ['止', 'x', 0.9314267],
 ['卜', 'y', 0.9755738],
 ['片', 'llml', 0.946877],
 ['片', 'llms', 0.9570422]]