# Kanji Recognizer


by Aiyu Kamate
https://towardsdatascience.com/creating-a-japanese-handwriting-recognizer-70be12732889

## Installing all dependencies needed
1. Python 3.X (any python3 version should work)
2. Tensorflow 2.2.0
3. Keras 2.4.3
4. Numpy 1.16.4
5. matplotlib — newest version
6. PIL — newest version
7. skimage — newest version
8. sklearn — newest version
9. coremltools 3.2



### Check python version

In [None]:
!python --version

Python 3.6.9


### Install dependencies

In [None]:
!pip install tensorflow==2.2.0
!pip install keras==2.4.3
!pip install numpy
!pip install matplotlib
!pip install Pillow
!pip install scikit-image
# The PyPI distribution is named "scikit-learn" (imported as `sklearn`);
# the "sklearn" package name on PyPI is a deprecated placeholder.
!pip install scikit-learn
!pip install coremltools==3.2

## Preprocessing

### Reading dataset

Read the kanji character images and store them in a compressed NumPy file (`.npz`) so the dataset is easier to work with.

In [None]:
import struct, os
from PIL import Image
import numpy as np

# Hiragana characters. read_kanji() skips every dataset folder whose decoded
# character appears in this list, so only non-hiragana (kanji) images are kept.
label = ["あ", "い", "う", "え", "お", "か", "が",  "き", "ぎ",
         "く", "ぐ", "け", "げ", "こ", "ご", "さ", "ざ", "し",
         "じ", "す", "ず", "せ", "ぜ", "そ", "ぞ", "た", "だ",
         "ち", "ぢ", "つ", "づ", "て", "で", "と", "ど", "な",
         "に", "ぬ", "ね", "の", "は", "ば", "ぱ", "ひ", "び",
         "ぴ", "ふ", "ぶ", "ぷ", "へ", "べ", "ぺ", "ほ", "ぼ",
         "ぽ", "ま", "み", "む", "め", "も", "や", "ゆ", "よ",
         "ら", "り", "る", "れ", "ろ", "わ", "を", "ん", "っ",
         "ゃ", "ゅ", "ょ"]

def string_unicode_to_han(unicode):
    """Decode a hexadecimal code point string into the character it names.

    The value is prefixed with ``\\u`` to form a Python unicode escape
    sequence, which is then decoded, e.g. ``"3042"`` becomes ``"あ"``.

    Args:
        unicode: Hexadecimal code point digits without any prefix.

    Returns:
        The decoded text.
    """
    escape_sequence = f'\\u{unicode}'
    raw_bytes = escape_sequence.encode('ascii')
    return raw_bytes.decode('unicode-escape')

def file_path_narray(filepath):
    """Open an image file and return it as an 8-bit grayscale Pillow image.

    Despite the name, the return value is a ``PIL.Image.Image`` in mode
    ``'L'``, not a numpy array; callers convert it with ``np.array``.

    Args:
        filepath: Path of the image file to open.

    Returns:
        A new grayscale (mode ``'L'``) Pillow image.
    """
    # The context manager releases the file handle even if loading or
    # converting fails. convert() returns an independent, fully loaded image,
    # so it stays usable after the source file is closed.
    with Image.open(filepath) as im:
        return im.convert('L')

def read_kanji():
    """Read the kanji images from disk and save them to ``kanji.npz``.

    Each sub-folder of ``../dataset/ETL8G/`` holds the images of one
    character. The first two characters of the folder name are dropped and
    the rest is decoded as a hexadecimal code point. Folders whose character
    is listed in ``label`` (hiragana) are skipped. Every file in a kept
    folder except ``.char.txt`` is loaded as a grayscale image.

    The saved array has shape ``[881, 161, 127, 128]``:

    881 - kanji excluding hiragana characters
    161 - images by writers
    127 - height (rows)
    128 - width (columns)

    Folders and files are processed in name order, so the class index and
    the writer index of every image are the same on every run.
    """
    kanji = np.zeros([881, 161, 127, 128], dtype=np.uint8)
    foldername = "../dataset/ETL8G/"
    print("Reading images...")

    # os.scandir yields entries in arbitrary order; sort them so that class
    # indices are reproducible. Stray files in the dataset root are not
    # character folders and are ignored.
    with os.scandir(foldername) as entries:
        folders = sorted((entry for entry in entries if entry.is_dir()),
                         key=lambda entry: entry.name)

    i = 0
    for folder in folders:
        # Decode unicode to han character
        han_char = string_unicode_to_han(folder.name[2:])
        if han_char in label:
            continue

        with os.scandir(folder.path) as entries:
            files = sorted((entry for entry in entries
                            if entry.name != ".char.txt"),
                           key=lambda entry: entry.name)

        for j, file in enumerate(files):
            # Convert file to Pillow image and then to numpy array
            kanji[i, j] = np.array(file_path_narray(file.path))
        i += 1
    print("Finished reading images")

    # Finish compressing kanji dataset
    print("Compressing images...")
    np.savez_compressed("kanji.npz", kanji)
    print("Finished compressing")

In [None]:
# Build kanji.npz from the raw image folders.
read_kanji()

### Converting data to training and test labels/images

In [None]:
import skimage.transform
import numpy as np
from sklearn.model_selection import train_test_split

# Load the array saved by read_kanji(): (classes, writers, height, width).
kanji = np.load("kanji.npz")['arr_0']

# Take every dimension from the stored array instead of hard-coding it.
# read_kanji() saves a [881, 161, 127, 128] array, which cannot be reshaped
# into 63x64 images and does not contain 51 classes of 1411 writers.
num_classes, num_writers, height, width = kanji.shape
num_images = num_classes * num_writers

kanji = kanji.reshape([-1, height, width]).astype(np.float32)

# Make the numbers range from 0 to 1. Dividing in place avoids allocating a
# second full-size float32 copy of the dataset.
kanji /= np.max(kanji)

train_images = np.zeros([num_images, 48, 48], dtype=np.float32)

for i in range(num_images): # change the image size to 48*48
    train_images[i] = skimage.transform.resize(kanji[i], (48, 48))

# Create labels: the flattened images are grouped by class, so image i
# belongs to class i // num_writers.
arr = np.arange(num_classes)
train_labels = np.repeat(arr, num_writers)

# split the images/labels to train and test
train_images, test_images, train_labels, test_labels = train_test_split(train_images, train_labels, test_size=0.2)

np.savez_compressed("kanji_train_images.npz", train_images)
np.savez_compressed("kanji_train_labels.npz", train_labels)
np.savez_compressed("kanji_test_images.npz", test_images)
np.savez_compressed("kanji_test_labels.npz", test_labels)

### Visualizing training images

In [None]:
import matplotlib.pyplot as plt
# Show the first 25 training images in a 5x5 grid on a black figure,
# with ticks and grid lines hidden.
canvas = plt.figure(figsize=(6, 6))
canvas.patch.set_facecolor('#000000')
for position in range(1, 26):
    axis = plt.subplot(5, 5, position)
    axis.set_xticks([])
    axis.set_yticks([])
    axis.grid(False)
    plt.imshow(train_images[position - 1], cmap=plt.cm.binary)
plt.show()