In [1]:
import numpy as np
# ^^^ pyforest auto-imports - don't write above this line
import pyforest
import struct
import skimage.transform
import tensorflow as tf
import datetime

# Dependency imports
from tensorflow.keras import datasets, layers, models
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.wrappers import scikit_learn
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from PIL import Image
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder

# Notebook Extensions
%load_ext tensorboard

# Japanese Language Classification

![title_banner](img/title_banner.jpg)

## Agenda

- This project applies image recognition to handwritten Japanese characters, building a model that classifies each character from its input image.
- It covers the three main Japanese writing systems (kanji, hiragana, katakana), as well as the now-deprecated kuzushiji script, which was used in Japan for over a thousand years, beginning in the 8th century.
- The data comes from the ETL Character Database, which contains over a billion handwritten Japanese characters in total, reorganized by the National Institute of Advanced Industrial Science and Technology (AIST).

### The Writing Systems of Japan

- **Kanji:**
    - Kanji entered Japan in the 8th century via Chinese monks who also brought other traditions with them such as tea and Buddhism. Kanji is based on comparable Chinese characters that convey meaning from pictographic images.<br><br>

- **Hiragana:**
    - A phonetic writing system derived from the mostly curved strokes of certain kanji characters, with each character representing a sound. There are 46 individual hiragana characters used today (alongside 29 diphthongs).<br><br>

- **Katakana:**
    - Katakana is phonetically identical to hiragana. It takes the angular aspects of some kanji characters and is mainly used for foreign words, onomatopoeia, and sounds. Katakana contains the same number of phonetic characters as hiragana.<br><br>

- **Kuzushiji:**
    - A cursive writing style. Over 3 million books written in kuzushiji, on a diverse array of topics such as literature, science, mathematics and cooking, are preserved today. However, the standardization of Japanese textbooks under the “Elementary School Order” of 1900 removed kuzushiji from the regular school curriculum as modern Japanese print became popular. As a result, most Japanese natives today cannot read books written or printed in kuzushiji just 120 years ago.

## Data

### Data Information

- Each file contains 5 data sets except ETL8G_33.
- Each data set contains 956 characters written by a writer.
- Each writer wrote 10 sheets (genkouyoushi) per data set.

### Japanese Character Dataset Information

- **Hiragana (ETL 8):**
    - 71 hiragana characters (46 unique + 29 diphthongs)
    - 160 writers
    - 8199 records (genkouyoushi sheets) 
    - 1,254,120,000 unique handwritten hiragana characters (shared with kanji chars in the same files)<br><br>
    
- **Kanji (ETL 8):**
    - 883 daily use kanji
    - 160 writers
    - 8199 records 
    - 152,878,411 unique handwritten kanji (shared with hiragana chars in the same files)<br><br>
    
- **Katakana (ETL 1):**
    - 46 katakana characters (46 unique; diphthongs are not included, as they are phonetically identical to hiragana)
    - 1411 writers
    - 2052 records
    - 2,436,366 unique handwritten katakana characters

### Import & Read Hiragana

In [2]:
# Size in bytes of one ETL8G record; read_ETL8G reads exactly this many bytes
# per call, and it equals the total size of the struct layout unpacked there.
hira_kanji_record = 8199

In [3]:
# Reading the Hiragana ETL8G File
def read_ETL8G(f):
    """Read one ETL8G record from ``f`` and decode its image.

    Reads ``hira_kanji_record`` bytes from the binary file object ``f``,
    unpacks them with a big-endian struct layout, and decodes field 14
    (8128 bytes) as a 128x127 image stored at 4 bits per pixel.

    Returns:
        The tuple of unpacked fields with the decoded image, converted to
        mode 'L', appended as the final element.
    """
    record_bytes = f.read(hira_kanji_record)
    fields = struct.unpack('>2H8sI4B4H2B30x8128s11x', record_bytes)
    glyph = Image.frombytes('F', (128, 127), fields[14], 'bit', 4).convert('L')
    return (*fields, glyph)

In [4]:
# Function to read the binary code present in each of the 33 files included in the ETL8G folder
def read_hiragana():
    """Collect the hiragana images from ETL8G files 01-32 and save them.

    Each file is read as 5 data sets of 956 records. Records whose label
    contains b'.HIRA' or b'.WO.' are kept, unless the label also contains
    b'KAI' or b'HEI'. The result is written to 'data/hiragana.npz'.
    """
    # Characters = 71, writers = 160, y = 127, x = 128
    hiragana = np.zeros([71, 160, 127, 128], dtype = np.uint8)

    for file_no in range(1, 33):
        path = 'data/ETL8G/ETL8G_{:02d}'.format(file_no)
        with open(path, 'rb') as f:
            for dataset_no in range(5):
                # Every (file, data set) pair maps to one writer slot.
                writer = (file_no - 1) * 5 + dataset_no
                char_idx = 0
                for _ in range(956):
                    record = read_ETL8G(f)
                    label = record[2]
                    if b'.HIRA' not in label and b'.WO.' not in label:
                        continue
                    if b'KAI' in label or b'HEI' in label:
                        continue
                    hiragana[char_idx, writer] = np.array(record[-1])
                    char_idx += 1
    np.savez_compressed('data/hiragana.npz', hiragana)

In [5]:
# Running the Function Above
# read_hiragana()

In [6]:
# Open the saved archive and pull out its single array, stored under 'arr_0'.
hiragana_data = np.load('data/hiragana.npz')
hira_test = hiragana_data['arr_0']

In [7]:
# Expected layout: (71 characters, 160 writers, 127 rows, 128 columns).
hira_test.shape

(71, 160, 127, 128)

### Import & Read Kanji

In [8]:
def read_kanji():
    """Collect the non-hiragana images from ETL8G files 01-32 and save them.

    Each file is read as 5 data sets of 956 records. Every record whose label
    contains neither b'.HIRA' nor b'.WO.' is stored. The result is written to
    'data/kanji.npz'.
    """
    # Characters = 883, writers = 160, y = 127, x = 128
    kanji = np.zeros([883, 160, 127, 128], dtype=np.uint8)
    for file_no in range(1, 33):
        path = 'data/ETL8G/ETL8G_{:02d}'.format(file_no)
        with open(path, 'rb') as f:
            for dataset_no in range(5):
                # Every (file, data set) pair maps to one writer slot.
                writer = (file_no - 1) * 5 + dataset_no
                char_idx = 0
                for _ in range(956):
                    record = read_ETL8G(f)
                    label = record[2]
                    if b'.HIRA' in label or b'.WO.' in label:
                        continue
                    kanji[char_idx, writer] = np.array(record[-1])
                    char_idx += 1
    np.savez_compressed('data/kanji.npz', kanji)

In [9]:
# read_kanji()

In [10]:
# Open the saved archive and pull out its single array, stored under 'arr_0'.
kanji_data = np.load('data/kanji.npz')
kanji_test = kanji_data['arr_0']

In [11]:
# Expected layout: (883 characters, 160 writers, 127 rows, 128 columns).
kanji_test.shape

(883, 160, 127, 128)

### Import & Read Katakana

In [12]:
# Size in bytes of one ETL1 record; read_ETL1 reads exactly this many bytes
# per call, and it equals the total size of the struct layout unpacked there.
kana_record = 2052

In [13]:
# Reading in the Katakana ETL1 File
def read_ETL1(total_sheets, f):
    """Read one ETL1 record from ``f`` and decode its image.

    Reads ``kana_record`` bytes from the binary file object ``f``, unpacks
    them with a big-endian struct layout, and decodes field 18 (2016 bytes)
    as a 64x63 image stored at 4 bits per pixel.

    Args:
        total_sheets: Accepted for the existing call signature; this function
            does not use it.
        f: Binary file object positioned at the start of a record.

    Returns:
        The tuple of unpacked fields with the decoded image, converted to
        mode 'L', appended as the final element.
    """
    record_bytes = f.read(kana_record)
    fields = struct.unpack('>H2sH6BI4H4B4x2016s4x', record_bytes)
    float_img = Image.frombytes('F', (64, 63), fields[18], 'bit', 4)
    return fields + (float_img.convert('L'),)

In [14]:
def read_katakana():
    """Collect the katakana images from ETL1C files 07-13 and save them.

    Each file holds 8 character categories (3 in file 13), and each category
    holds 1411 sheets (1410 for file 9 / category 4 and file 12 / category 1).
    In files 11-13, records whose 2-byte code contains one of
    b' I', b' E', b'WI', b' U', b'WE' are skipped. Pixel values 1 and 2 are
    set to 0 before an image is stored. The result is written to
    'data/katakana.npz'.
    """
    # Character type = 46, writers = 1411, y = 63, x = 64
    katakana = np.zeros([46, 1411, 63, 64], dtype=np.uint8)
    skipped_codes = (b' I', b' E', b'WI', b' U', b'WE')
    moji = 0
    for k in range(7, 14):
        filename = 'data/ETL1/ETL1C_{:02d}'.format(k)
        with open(filename, 'rb') as f:
            # NOTE(review): read_ETL1 ignores this argument; the running value
            # is kept only so the existing call signature is unchanged.
            total_sheets = 0
            categories = 3 if k == 13 else 8
            for j in range(categories):
                stored_any = False
                person = 0
                # Two categories are one sheet short of the usual 1411.
                sheets = 1410 if (k, j) in ((9, 4), (12, 1)) else 1411
                for _ in range(sheets):
                    r = read_ETL1(total_sheets, f)
                    total_sheets += sheets
                    if k < 11 or not any(code in r[1] for code in skipped_codes):
                        pixels = np.array(r[-1])
                        # Vectorized replacement for the per-pixel loop, which
                        # also reused the sheet-loop index name.
                        pixels[(pixels == 1) | (pixels == 2)] = 0
                        katakana[moji, person] = pixels
                        stored_any = True
                        person += 1
                # Advance to the next character slot only if this category
                # contributed at least one image.
                if stored_any:
                    moji += 1
    np.savez_compressed('data/katakana.npz', katakana)

In [15]:
# read_katakana()

In [16]:
# Open the saved archive and pull out its single array, stored under 'arr_0'.
katakana_data = np.load('data/katakana.npz')
kana_test = katakana_data['arr_0']

In [17]:
# Expected layout: (46 characters, 1411 writers, 63 rows, 64 columns).
kana_test.shape

(46, 1411, 63, 64)

## Visualizing Characters

### Visualizing Hiragana

In [18]:
# Visualizing Hiragana
def visualize_hiragana():
    """Print the first four header fields of each hiragana record.

    Scans the first data set (956 records) of 'data/ETL8G/ETL8G_01' and
    prints ``r[:4]`` for every record whose label contains b'.HIRA'.
    """
    for j in range(1, 2):
        filename = 'data/ETL8G/ETL8G_{:02d}'.format(j)
        with open(filename, 'rb') as f:
            for id_dataset in range(1):
                # The original allocated an unused 4096x3840 canvas here on
                # every pass; it was never drawn on, so it has been removed.
                for i in range(956):
                    r = read_ETL8G(f)
                    if b'.HIRA' in r[2]:
                        print(r[:4])

In [20]:
# Print the header fields of the hiragana records in the first data set of ETL8G_01.
visualize_hiragana()

(1, 9250, b'A.HIRA  ', 1)
(1, 9252, b'I.HIRA  ', 16)
(1, 9254, b'U.HIRA  ', 31)
(1, 9256, b'E.HIRA  ', 46)
(1, 9258, b'O.HIRA  ', 61)
(1, 9259, b'KA.HIRA ', 76)
(1, 19007, b'HEI.HIRA', 83)
(1, 9260, b'GA.HIRA ', 91)
(2, 9261, b'KI.HIRA ', 1)
(2, 9262, b'GI.HIRA ', 16)
(2, 9263, b'KU.HIRA ', 31)
(2, 9264, b'GU.HIRA ', 46)
(2, 9265, b'KE.HIRA ', 61)
(2, 9266, b'GE.HIRA ', 76)
(2, 9267, b'KO.HIRA ', 91)
(3, 9268, b'GO.HIRA ', 193)
(3, 9269, b'SA.HIRA ', 208)
(3, 9270, b'ZA.HIRA ', 223)
(3, 9271, b'SHI.HIRA', 238)
(3, 9272, b'JI.HIRA ', 253)
(3, 9273, b'SU.HIRA ', 268)
(3, 9274, b'ZU.HIRA ', 283)
(4, 9275, b'SE.HIRA ', 289)
(4, 9276, b'ZE.HIRA ', 304)
(4, 9277, b'SO.HIRA ', 319)
(4, 9278, b'ZO.HIRA ', 334)
(4, 9279, b'TA.HIRA ', 349)
(4, 9280, b'DA.HIRA ', 364)
(4, 9281, b'CHI.HIRA', 378)
(5, 9282, b'JI.HIRA ', 385)
(5, 9284, b'TSU.HIRA', 400)
(5, 9285, b'ZU.HIRA ', 415)
(5, 9286, b'TE.HIRA ', 430)
(5, 9287, b'DE.HIRA ', 445)
(5, 9288, b'TO.HIRA ', 460)
(5, 9289, b'DO.HIRA ', 475)
(6, 9290

In [21]:
def one_kana(save=False):
    """Render the records labelled b'.WO.' from 'data/ETL8G/ETL8G_01'.

    Reads all 5 data sets (956 records each). For every record whose label
    contains b'.WO.', the decoded image is remapped with ``255 - x*16``.

    Args:
        save: When True, write the remapped image to
            'img/kana_visualization.png' as PNG. The path is fixed, so each
            matching record overwrites the previous one. Defaults to False,
            which matches the original behavior where saving was commented out.
    """
    filename = 'data/ETL8G/ETL8G_01'
    # The original called .format() on this literal, but it has no
    # placeholders, so the call was a no-op.
    fn = 'img/kana_visualization.png'
    with open(filename, 'rb') as f:
        for id_dataset in range(5):
            for i in range(956):
                r = read_ETL8G(f)
                if b'.WO.' in r[2]:
                    iE = Image.eval(r[-1], lambda x: 255-x*16)
                    if save:
                        iE.save(fn, 'PNG')

In [22]:
# As called here, this scans ETL8G_01 but does not write img/kana_visualization.png.
one_kana()

- Kana Sample Visualization ![kana](img/kana_visualization.png)

### Visualizing Kanji

In [23]:
def visualize_kanji():
    """Print the first four header fields of each non-hiragana record.

    Scans the first data set (956 records) of 'data/ETL8G/ETL8G_01' and
    prints ``r[:4]`` for every record whose label contains neither b'.HIRA'
    nor b'.WO.'.
    """
    for j in range(1, 2):
        filename = 'data/ETL8G/ETL8G_{:02d}'.format(j)
        with open(filename, 'rb') as f:
            for id_dataset in range(1):
                # The original allocated an unused 4096x3840 canvas here on
                # every pass; it was never drawn on, so it has been removed.
                for i in range(956):
                    r = read_ETL8G(f)
                    if not (b'.HIRA' in r[2] or b'.WO.' in r[2]):
                        print(r[:4])

In [25]:
# Print the header fields of the non-hiragana records in the first data set of ETL8G_01.
visualize_kanji()

(1, 12326, b'AI.MEDER', 2)
(1, 12369, b'I.YUDANE', 3)
(1, 12397, b'ICHI.HIT', 4)
(1, 12608, b'UN.KUMO ', 5)
(1, 12639, b'EN.MARU ', 6)
(1, 12838, b'OU.KIMI ', 7)
(1, 12863, b'KA.NANI ', 8)
(1, 12880, b'KA.HI   ', 9)
(1, 12913, b'KAI.AU  ', 10)
(1, 13100, b'KAI.KIZA', 11)
(1, 13143, b'KAKU.KAW', 12)
(1, 13361, b'KAN.TSUK', 13)
(1, 13403, b'KAN.YAKA', 14)
(1, 13429, b'KI.MARE ', 15)
(1, 13613, b'KI.SHIRU', 17)
(1, 13657, b'KYUU.YAS', 18)
(1, 13677, b'GYUU.USH', 19)
(1, 13862, b'KYOU.TOM', 20)
(1, 13898, b'KYOKU.MA', 21)
(1, 13927, b'KU.     ', 22)
(1, 14136, b'KEI.TSUN', 23)
(1, 14173, b'GEI.UERU', 24)
(1, 14196, b'KEN.WARI', 25)
(1, 14385, b'KEN.KEWA', 26)
(1, 14405, b'KO.INISH', 27)
(1, 14444, b'GO.KATAR', 28)
(1, 14461, b'KOU.KUCH', 29)
(1, 14657, b'KOU.MINA', 30)
(1, 14695, b'GOU.AU  ', 32)
(1, 14906, b'SA.SHIRA', 33)
(1, 14938, b'SAI.NA  ', 34)
(1, 14974, b'SATSU.HA', 35)
(1, 15165, b'SAN.KAIK', 36)
(1, 15183, b'SHI.HAJI', 37)
(1, 15200, b'SHI.SHIN', 38)
(1, 15217, b'SHI.MOTO', 39)


(8, 19059, b'HOU.SHIR', 754)
(8, 19282, b'BOKU.MAK', 755)
(8, 19505, b'MIN.TAMI', 756)
(8, 19532, b'MEN.OMO ', 757)
(8, 19572, b'YAKU.KUS', 758)
(8, 19773, b'YO.ATAER', 759)
(8, 19803, b'YOU.HI  ', 760)
(8, 20007, b'RITSU.NO', 762)
(8, 20044, b'RYOU.HAK', 763)
(8, 20068, b'REI.HIER', 764)
(8, 20287, b'ROKU.SHI', 765)
(9, 12365, b'I.YORI  ', 767)
(9, 12393, b'IKU.SODA', 768)
(9, 12587, b'U.AME   ', 769)
(9, 12631, b'EKI.MASU', 770)
(9, 12670, b'OU.KOTAE', 771)
(9, 12861, b'KA.KAWAR', 772)
(9, 12878, b'KA.UTAU ', 773)
(9, 12906, b'GA.ME   ', 774)
(9, 13096, b'KAI.    ', 775)
(9, 13136, b'KAKU.OBO', 776)
(9, 13355, b'KAN.SUSU', 777)
(9, 13398, b'KAN.AIDA', 778)
(9, 13424, b'KI.MOTOI', 779)
(9, 13610, b'KI.OSAME', 780)
(9, 13653, b'GYAKU.MU', 782)
(9, 13675, b'KYUU.TAM', 783)
(9, 13857, b'KYOU.SON', 784)
(9, 13896, b'GYOU.WAZ', 785)
(9, 13924, b'GIN.SHIR', 786)
(9, 14131, b'GUN.TSUW', 787)
(9, 14167, b'KEI.KAZO', 788)
(9, 14194, b'KEN.TAKE', 789)
(9, 14377, b'KEN.KAKE', 790)
(9, 14402, b'G

In [26]:
def one_kanji(save=False):
    """Render the non-hiragana records from 'data/ETL8G/ETL8G_01'.

    Reads all 5 data sets (956 records each). For every record whose label
    contains neither b'.HIRA' nor b'.WO.', the decoded image is remapped with
    ``255 - x*16``.

    Args:
        save: When True, write the remapped image to
            'img/kanji_visualization.png' as PNG. The path is fixed, so each
            matching record overwrites the previous one. Defaults to False,
            which matches the original behavior where saving was commented out.
    """
    filename = 'data/ETL8G/ETL8G_01'
    # The original called .format() on this literal, but it has no
    # placeholders, so the call was a no-op.
    fn = 'img/kanji_visualization.png'
    with open(filename, 'rb') as f:
        for id_dataset in range(5):
            for i in range(956):
                r = read_ETL8G(f)
                if not (b'.HIRA' in r[2] or b'.WO.' in r[2]):
                    iE = Image.eval(r[-1], lambda x: 255-x*16)
                    if save:
                        iE.save(fn, 'PNG')

In [27]:
# As called here, this scans ETL8G_01 but does not write img/kanji_visualization.png.
one_kanji()

- Kanji Sample Visualization: ![kanji](img/kanji_visualization.png)

## Feature Engineering

- The image dimensions need to be resized for the ML model to work on them. Initially images were resized to 32x32, but due to model performance, they were upsized to be 64x64.

### Joining Datasets

In [30]:
# The per-script archives written by read_hiragana, read_kanji and read_katakana.
japanese_files = ['data/hiragana.npz', 'data/kanji.npz', 'data/katakana.npz']

In [33]:
# NOTE(review): every source archive stores its array under the same key
# ('arr_0'), so the dict.update below keeps only the array from the last file
# in japanese_files (katakana). Distinct keys per file would be needed to keep
# all three scripts in one archive.
# merged_data = [np.load(file) for file in japanese_files]
# merged_dict = {}
# for data in merged_data:
#     [merged_dict.update({k: v}) for k, v in data.items()]
# np.savez_compressed('data/japanese.npz', **merged_dict)

In [39]:
# Load the array stored under 'arr_0' in the merged archive.
japanese_data = np.load('data/japanese.npz')['arr_0']

In [40]:
# NOTE(review): the recorded output (46, 1411, 63, 64) has the same shape as
# the katakana array alone, not a combination of all three scripts.
japanese_data.shape

(46, 1411, 63, 64)

### Resizing Hiragana Images

In [41]:
# 71 unique classes of hiragana (including diphthongs)
hiragana_class = 71

# Target image dimensions (rows, columns) used when resizing
img_row, img_col = 64, 64

In [42]:
# Flatten the leading axes into one image axis of 127x128 float32 images,
# then divide by the largest pixel value in the table.
hiragana_table = (
    np.load('data/hiragana.npz')['arr_0']
    .reshape([-1, 127, 128])
    .astype(np.float32)
)
hiragana_table /= hiragana_table.max()

In [43]:
# One (img_row, img_col) slot per image: hiragana_class classes x 160 writers.
hiragana_train = np.zeros((hiragana_class * 160, img_row, img_col), dtype=np.float32)

for idx in range(len(hiragana_train)):
    hiragana_train[idx] = skimage.transform.resize(hiragana_table[idx], (img_row, img_col))

### Resizing Kanji Images

In [44]:
# 879 unique classes of kanji (most daily used kanji):
# the saved kanji array holds 883 classes, and 4 of them are skipped
# when the training array is built.
kanji_class = 879

In [45]:
# Flatten the leading axes into one image axis of 127x128 float32 images,
# then divide by the largest pixel value in the table.
kanji_table = (
    np.load('data/kanji.npz')['arr_0']
    .reshape([-1, 127, 128])
    .astype(np.float32)
)
kanji_table /= kanji_table.max()

- The kanji and hiragana characters share the ETL8 dataset, and the records are not intuitively arranged. The next cell skips four class indices so that hiragana are not mistakenly classified as kanji characters.

In [None]:
# Build the resized kanji training array, dropping four class indices of
# kanji_table and packing the remaining classes contiguously.
#
# The original if-chain nested the `> 457` test inside the
# `> 349 and < 457` branch, so it could never be true: classes 350-456 and
# every class above 457 were never copied and stayed all-zero.
writers_per_class = 160
skipped_classes = (88, 219, 349, 457)

# Source class indices that are kept, in order; a class's position in this
# list is its index in kanji_train.
kept_classes = [
    cls for cls in range(kanji_class + len(skipped_classes))
    if cls not in skipped_classes
]

kanji_train = np.zeros([kanji_class * writers_per_class, img_row, img_col], dtype = np.float32)

for new_cls, old_cls in enumerate(kept_classes):
    for writer in range(writers_per_class):
        src = old_cls * writers_per_class + writer
        dst = new_cls * writers_per_class + writer
        kanji_train[dst] = skimage.transform.resize(kanji_table[src], (img_row, img_col))

### Resizing Katakana Images

In [None]:
# 46 katakana classes (matches the first axis of the saved katakana array)
katakana_class = 46

In [None]:
# Flatten the leading axes into one image axis of 63x64 float32 images,
# then divide by the largest pixel value in the table.
katakana_table = (
    np.load('data/katakana.npz')['arr_0']
    .reshape([-1, 63, 64])
    .astype(np.float32)
)
katakana_table /= katakana_table.max()

In [None]:
# One (img_row, img_col) slot per image: katakana_class classes x 1411 writers.
katakana_train = np.zeros((katakana_class * 1411, img_row, img_col), dtype=np.float32)

for idx in range(len(katakana_train)):
    katakana_train[idx] = skimage.transform.resize(katakana_table[idx], (img_row, img_col))