In [None]:
# Create sharded TFRecord files from a folder of labelled images.
# Reference: https://www.kaggle.com/ryanholbrook/walkthrough-building-a-dataset-of-tfrecords

In [None]:
import os
import json
import pprint
import random
import time
import math

import matplotlib.pyplot as plt
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Example, Features, Feature
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from matplotlib import pyplot as plt


%matplotlib inline

In [None]:
# Seed Python's global `random` generator so the shuffle inside get_File is repeatable.
random.seed(123)

In [None]:
# Directory that get_File scans for images.
data_dir = './test_img/'
# Common prefix of the two output directories derived below.
target_dir = './tf_records/test'

# '<prefix>_cv2' receives the edge-detected images, '<prefix>_ori' the plain resized ones.
cv2_target_dir, ori_target_dir = (
    '{}_{}'.format(target_dir, suffix) for suffix in ('cv2', 'ori')
)


In [None]:
# Load every character of esun_ocr_target.txt into a list, then add the extra
# entry 'isnull' at the end.
# NOTE(review): no encoding is passed, so the platform default is used, and any
# newline characters in the file become list entries as well - confirm both.
with open('esun_ocr_target.txt', 'r') as f:
    d = list(f.read()) + ['isnull']

In [None]:
# Echo the configured input directory as this cell's output.
data_dir

In [None]:
def get_File(file_dir, shuffle=True):
    """Collect image paths and integer class labels from a folder-per-class tree.

    Every immediate sub-folder of ``file_dir`` is one class; its position in
    the ``os.walk`` listing of ``file_dir`` becomes the class id. Each file
    found beneath a class folder (at any depth) is paired with that folder's
    id at the moment it is discovered, so a path and its label cannot drift
    out of alignment.

    Files located directly in ``file_dir`` belong to no class folder and are
    skipped.

    Args:
        file_dir: Root directory holding one sub-folder per class.
        shuffle: When true, shuffle the (path, label) pairs with the global
            ``random`` generator (seed it beforehand for repeatable output).

    Returns:
        ``(images, labels, label_2_id)`` where ``images`` is a tuple of file
        paths, ``labels`` is a tuple of NumPy integer class ids aligned with
        ``images``, and ``label_2_id`` maps each class-folder name to its id.
        ``images`` and ``labels`` are empty tuples when no file is found.
    """
    images = []
    labels = []
    label_2_id = {}

    # os.walk is top-down, so the root entry (and therefore the class table)
    # is always produced before any class folder is visited.
    for dirPath, dirNames, fileNames in os.walk(file_dir):
        rel_dir = os.path.relpath(dirPath, file_dir)

        if rel_dir == os.curdir:
            for id_, name in enumerate(dirNames):
                label_2_id[name] = id_
            # Root-level files have no class folder, hence no label.
            continue

        # The first path component below the root names the class folder.
        label_char = rel_dir.split(os.sep)[0]
        id_ = label_2_id[label_char]
        for name in fileNames:
            images.append(os.path.join(dirPath, name))
            labels.append(id_)

    if not images:
        return (), (), label_2_id

    # Keep NumPy integer labels, as callers received before.
    labels = np.asarray(labels, dtype=int)

    combine_lst = list(zip(images, labels))
    if shuffle:
        random.shuffle(combine_lst)
    images, labels = zip(*combine_lst)

    return images, labels, label_2_id

In [None]:
# Gather every image path, its integer label, and the label-name -> id mapping.
imgs, labs, label_2_id = get_File(data_dir)

In [None]:
# Show how many labelled examples were collected.
len(labs)

In [None]:
# Save the label-name -> class-id mapping next to both TFRecord sets.
# (`json` is already imported in the notebook's import cell.)
idx2label = {idx: label for label, idx in label_2_id.items()}  # inverse lookup: class id -> label name

for target_dir in [cv2_target_dir, ori_target_dir]:
    # makedirs also creates missing parent folders; os.mkdir raised
    # FileNotFoundError whenever the parent of target_dir did not exist yet.
    os.makedirs(target_dir, exist_ok=True)
    path = os.path.join(target_dir, 'label_map.json')
    # ensure_ascii=False writes non-ASCII label names verbatim instead of \u escapes.
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(label_2_id, outfile, ensure_ascii=False)

In [None]:

# NOTE(review): this cell repeats the label-map export of the cell directly
# above it; one of the two can be deleted.
idx2label = {idx: label for label, idx in label_2_id.items()}  # inverse lookup: class id -> label name

for target_dir in [cv2_target_dir, ori_target_dir]:
    # makedirs also creates missing parent folders; os.mkdir raised
    # FileNotFoundError whenever the parent of target_dir did not exist yet.
    os.makedirs(target_dir, exist_ok=True)
    path = os.path.join(target_dir, 'label_map.json')
    # ensure_ascii=False writes non-ASCII label names verbatim instead of \u escapes.
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(label_2_id, outfile, ensure_ascii=False)

In [None]:
# Stratified 80/20 split of the image paths and labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    imgs,
    labs,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=labs,
)

In [None]:
# Per-class weights for the training labels ('balanced' mode).
# `classes` and `y` are keyword-only in current scikit-learn releases, so they
# are passed by name; the positional form raises TypeError there.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

# Store the weights (ordered like np.unique(y_train)) in both output directories.
for target_dir in [cv2_target_dir, ori_target_dir]:
    path = os.path.join(target_dir, 'class_weight.json')
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(list(class_weights), outfile, ensure_ascii=False)

In [None]:
# Number of training examples per class id, keyed by the id as a string.
# One np.unique pass replaces a full scan of y_train for every class.
# .tolist() yields plain Python ints, which json can serialise.
seen_ids, seen_counts = np.unique(y_train, return_counts=True)
train_counts = dict(zip(seen_ids.tolist(), seen_counts.tolist()))

# Classes absent from the training split are still listed, with a count of 0.
label_cnt_dict = {str(i): train_counts.get(i, 0) for i in label_2_id.values()}

for target_dir in [cv2_target_dir, ori_target_dir]:
    path = os.path.join(target_dir, 'label_cnt_dict.json')
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(label_cnt_dict, fp)

In [None]:
def make_example(encoded_image, label):
    """Serialize one encoded image and its integer label as a ``tf.train.Example``.

    Args:
        encoded_image: Bytes stored under the ``'image'`` feature key.
        label: Integer stored under the ``'label'`` feature key.

    Returns:
        The result of ``SerializeToString()`` on the assembled ``Example``.
    """
    feature_map = {
        'image': Feature(bytes_list=BytesList(value=[encoded_image])),
        'label': Feature(int64_list=Int64List(value=[label])),
    }
    return Example(features=Features(feature=feature_map)).SerializeToString()

In [None]:
# Number of TFRecord shard files written for the training split.
training_tf_records_qty = 32
# Number of TFRecord shard files written for the validation split.
val_tf_records_qty = 8

# Side length of the square Gaussian-blur kernel applied before Canny edge detection.
kernel_size = 3

In [None]:
# Write the training split as `training_tf_records_qty` TFRecord shards, twice:
# Canny edge maps go to cv2_target_dir/train, plain resized images go to
# ori_target_dir/train.
#
# Each shard takes a fixed index window of X_train / y_train. The two lists are
# left untouched, so re-running this cell rewrites the same shards, and a short
# remainder window can never be written into more than one shard.
qty_per_tf_records = math.ceil(len(X_train) / training_tf_records_qty)

NUM_SHARDS = 32  # NOTE(review): not read in this cell; the loop uses training_tf_records_qty

for target_dir in [cv2_target_dir, ori_target_dir]:
    # makedirs creates target_dir and any missing parents along with 'train'.
    os.makedirs(os.path.join(target_dir, 'train'), exist_ok=True)

PATH_cv2 = os.path.join(os.path.join(cv2_target_dir, 'train'), 'shard_train_{:02d}.tfrecord')
PATH_ori = os.path.join(os.path.join(ori_target_dir, 'train'), 'shard_train_{:02d}.tfrecord')

total_training_examples = len(X_train)
cnt = 0
for shard in range(training_tf_records_qty):
    start = shard * qty_per_tf_records
    shard_paths = X_train[start:start + qty_per_tf_records]
    shard_labels = y_train[start:start + qty_per_tf_records]

    with tf.io.TFRecordWriter(path=PATH_cv2.format(shard)) as f_cv2, \
            tf.io.TFRecordWriter(path=PATH_ori.format(shard)) as f_ori:
        for img_path, lab in zip(shard_paths, shard_labels):
            # Read the raw bytes with NumPy and decode from memory as a colour image.
            img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), 1)
            if img is None:
                raise ValueError('could not decode image: {}'.format(img_path))
            # Reverse the channel axis (OpenCV decodes colour images as BGR).
            img = img[:, :, ::-1]
            image = img

            # Edge-map variant: blur -> Canny -> resize -> back to 3 channels.
            img = cv2.GaussianBlur(img, (kernel_size, kernel_size), 0)
            img = cv2.Canny(img, 150, 200)
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            # TODO: this tensor round-trip looks redundant; think about how to simplify it.
            img = tf.convert_to_tensor(img)
            img = tf.image.convert_image_dtype(img, dtype=tf.uint8)
            img = tf.io.encode_jpeg(img).numpy()

            f_cv2.write(make_example(img, lab))

            # Plain variant: only resized, then JPEG-encoded.
            image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_CUBIC)
            image = tf.image.convert_image_dtype(image, dtype=tf.uint8)
            image = tf.io.encode_jpeg(image).numpy()

            f_ori.write(make_example(image, lab))
            cnt += 1

# Every training example must have been written exactly once.
assert cnt == total_training_examples

In [None]:
# Write the validation split as `val_tf_records_qty` TFRecord shards, twice:
# Canny edge maps go to cv2_target_dir/val, plain resized images go to
# ori_target_dir/val.
#
# Each shard takes a fixed index window of X_test / y_test. The two lists are
# left untouched, so re-running this cell rewrites the same shards, and a short
# remainder window can never be written into more than one shard.
qty_per_tf_records = math.ceil(len(X_test) / val_tf_records_qty)

for target_dir in [cv2_target_dir, ori_target_dir]:
    # makedirs creates target_dir and any missing parents along with 'val'.
    os.makedirs(os.path.join(target_dir, 'val'), exist_ok=True)

PATH_cv2 = os.path.join(os.path.join(cv2_target_dir, 'val'), 'shard_val_{:02d}.tfrecord')
PATH_ori = os.path.join(os.path.join(ori_target_dir, 'val'), 'shard_val_{:02d}.tfrecord')

total_val_examples = len(X_test)
cnt = 0
for shard in range(val_tf_records_qty):
    start = shard * qty_per_tf_records
    shard_paths = X_test[start:start + qty_per_tf_records]
    shard_labels = y_test[start:start + qty_per_tf_records]

    with tf.io.TFRecordWriter(path=PATH_cv2.format(shard)) as f_cv2, \
            tf.io.TFRecordWriter(path=PATH_ori.format(shard)) as f_ori:
        for img_path, lab in zip(shard_paths, shard_labels):
            # Read the raw bytes with NumPy and decode from memory as a colour image.
            img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), 1)
            if img is None:
                raise ValueError('could not decode image: {}'.format(img_path))
            # Reverse the channel axis (OpenCV decodes colour images as BGR).
            img = img[:, :, ::-1]

            image = img

            # Edge-map variant: blur -> Canny -> resize -> back to 3 channels.
            img = cv2.GaussianBlur(img, (kernel_size, kernel_size), 0)
            img = cv2.Canny(img, 150, 200)
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            # TODO: this tensor round-trip looks redundant; think about how to simplify it.
            img = tf.convert_to_tensor(img)
            img = tf.image.convert_image_dtype(img, dtype=tf.uint8)
            img = tf.io.encode_jpeg(img).numpy()

            f_cv2.write(make_example(img, lab))

            # Plain variant: only resized, then JPEG-encoded.
            image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_CUBIC)
            image = tf.image.convert_image_dtype(image, dtype=tf.uint8)
            image = tf.io.encode_jpeg(image).numpy()

            f_ori.write(make_example(image, lab))

            cnt += 1

# Every validation example must have been written exactly once.
assert cnt == total_val_examples