In [1]:
# libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm.notebook as tqdm
import os
from pathlib import Path

%matplotlib inline

In [None]:
# Directory that holds the raw input files read below (CSV tables and parquet shards).
DATA_DIR_PATH = "../data/bengali_AI_handwritten_grapheme_classification"

# Output directories for the centered images.
# NOTE(review): presumably one directory per parquet shard — confirm against the cells that call save_centered_image.
DATASET_PATH_1 = "../data/bengali_centered/dataset_1"
DATASET_PATH_2 = "../data/bengali_centered/dataset_2"
DATASET_PATH_3 = "../data/bengali_centered/dataset_3"
DATASET_PATH_4 = "../data/bengali_centered/dataset_4"

# Image size in pixels; flattened pixel rows are reshaped to (IMG_HEIGHT, IMG_WIDTH).
IMG_HEIGHT = 137
IMG_WIDTH = 236

# Class counts per target; not referenced in the visible part of this notebook.
ROOT_CLASSES_NUM = 168
CONSONANT_CLASSES_NUM = 7
VOWEL_CLASSES_NUM = 11

## Read dataset

In [None]:
# Load the CSV tables stored in DATA_DIR_PATH.
# class_map_df is expected to carry 'component_type' and 'label' columns (see get_class_dict).
train_df = pd.read_csv("{}/train.csv".format(DATA_DIR_PATH))
test_df = pd.read_csv("{}/test.csv".format(DATA_DIR_PATH))
class_map_df = pd.read_csv("{}/class_map.csv".format(DATA_DIR_PATH))
sample_submission_df = pd.read_csv("{}/sample_submission.csv".format(DATA_DIR_PATH))

In [None]:
def read_parquet_data(dir_path: str, num_file: int, data_type: str) -> list:
    """Read the numbered parquet shards of one data split.

    Files are expected at ``<dir_path>/<data_type>_image_data_<i>.parquet``
    for ``i`` in ``0 .. num_file - 1``; ``data_type`` is lower-cased first.

    Args:
        dir_path: Directory containing the parquet files.
        num_file: Number of consecutive shards to read, starting at 0.
        data_type: Split prefix of the file names (e.g. "train" or "test").

    Returns:
        A list with one DataFrame per shard, each indexed by ``image_id``.
    """
    split_name = data_type.lower()
    frames = []

    for shard_idx in range(num_file):
        shard_path = "{}/{}_image_data_{}.parquet".format(dir_path, split_name, shard_idx)
        frames.append(pd.read_parquet(shard_path).set_index('image_id'))

        print("Reading {} th parquet file is done".format(shard_idx))

    return frames

In [None]:
# Read the 4 training parquet shards (indices 0-3); each list entry is a DataFrame indexed by image_id.
train_df_list = read_parquet_data(dir_path=DATA_DIR_PATH, num_file=4, data_type="train")

In [None]:
# Read the 4 test parquet shards (indices 0-3); each list entry is a DataFrame indexed by image_id.
test_df_list = read_parquet_data(dir_path=DATA_DIR_PATH, num_file=4, data_type="test")

## Utils

In [None]:
def reshape_img(img_data):
    """Turn a flat pixel vector into an (IMG_HEIGHT, IMG_WIDTH) image.

    Args:
        img_data: A pandas Series (converted with ``to_numpy``) or any object
            that provides ``reshape``.

    Returns:
        The pixel data reshaped to ``(IMG_HEIGHT, IMG_WIDTH)``.
    """
    pixels = img_data.to_numpy() if isinstance(img_data, pd.Series) else img_data
    return pixels.reshape(IMG_HEIGHT, IMG_WIDTH)

In [None]:
def show_img(img):
    """Display one image in grayscale on a 15x7 inch figure.

    Ticks are placed every 10 pixels on the x axis and every 5 pixels on the
    y axis so pixel positions can be read off the plot.
    """
    x_ticks = range(0, IMG_WIDTH, 10)
    y_ticks = range(0, IMG_HEIGHT, 5)

    plt.figure(figsize=(15, 7))
    plt.xticks(x_ticks)
    plt.yticks(y_ticks)
    plt.imshow(img, cmap='gray')
    plt.show()

In [None]:
def get_class_dict(class_map_df: pd.DataFrame) -> dict:
    """Split the class map into one lookup table per component type.

    Args:
        class_map_df: DataFrame with at least the columns ``component_type``
            and ``label``. It is not modified.

    Returns:
        A dict mapping each distinct ``component_type`` value to a new
        DataFrame holding that type's rows, without the ``component_type``
        column and indexed by ``label`` (the ``label`` column is kept too).
        Keys appear in order of first occurrence in ``class_map_df``.
    """
    class_dict = dict()

    # unique() preserves first-appearance order, so the key order is
    # reproducible (iterating a set of strings is not).
    for component_type in class_map_df['component_type'].unique():
        rows = class_map_df[class_map_df['component_type'] == component_type]

        # Build a new frame rather than mutating the filtered slice in place,
        # which triggers pandas' SettingWithCopyWarning.
        class_dict[component_type] = (
            rows.drop(columns='component_type').set_index('label', drop=False)
        )

    return class_dict

## Centering

In [None]:
def inverse_image(img, threshhold=10):
    """Invert 8-bit pixel intensities and suppress faint values.

    Each value ``v`` becomes ``|v - 255|`` (computed in int64); results below
    ``threshhold`` are then set to 0. The input is not modified.

    Args:
        img: Pixel data supporting ``astype`` (numpy array or pandas Series).
        threshhold: Inverted values strictly below this are zeroed.

    Returns:
        The inverted data as int64, in the same container type as ``img``.
    """
    flipped = abs(img.astype(np.int64) - 255)
    flipped[flipped < threshhold] = 0
    return flipped

In [None]:
def remove_edge(images, edge=2):
    """Zero out a border of ``edge`` pixels on every side of each image.

    The border position is derived from the array's own shape, so the function
    works for any image size rather than only IMG_HEIGHT x IMG_WIDTH.

    Args:
        images: Array of shape (num_images, height, width). It is modified
            IN PLACE; callers in this notebook rely on that.
        edge: Border width in pixels (default 2, the previous fixed value).

    Returns:
        The same ``images`` object, with its border set to 0.

    Raises:
        ValueError: If ``edge`` is negative.
    """
    if edge < 0:
        raise ValueError("edge must be non-negative")

    height, width = images.shape[1], images.shape[2]

    # top rows
    images[:, :edge] = 0
    # bottom rows (explicit start index so that edge=0 selects nothing)
    images[:, height - edge:] = 0
    # left columns
    images[:, :, :edge] = 0
    # right columns
    images[:, :, width - edge:] = 0
    return images

In [None]:
def num_range(data):
    """Compute the bounding box of the positive pixels of each image.

    Args:
        data: Iterable of 2-D images (e.g. an array of shape
            (num_images, height, width)). A pixel counts as foreground when
            its value is > 0.

    Returns:
        A float64 array of shape (num_images, 4). Each row is
        ``[x1, x2, y1, y2]``: the first/last column and first/last row that
        contain a foreground pixel (inclusive). An image without any
        foreground pixel yields ``[0, 0, 0, 0]``.
    """
    boxes = []

    for img in data:
        foreground = np.asarray(img) > 0

        # Indices of columns / rows holding at least one foreground pixel.
        # Taking first/last of these avoids using 0 as a "not set" sentinel,
        # which gave wrong boxes when the glyph touched row 0 or column 0.
        cols = np.flatnonzero(foreground.any(axis=0))
        rows = np.flatnonzero(foreground.any(axis=1))

        if cols.size == 0:
            boxes.append([0, 0, 0, 0])
        else:
            boxes.append([cols[0], cols[-1], rows[0], rows[-1]])

    # float64 and (-1, 4) keep the result identical in dtype/shape to the
    # previous np.append-based implementation, including for empty input.
    return np.array(boxes, dtype=np.float64).reshape(-1, 4)

In [None]:
def pad_image(data, cut_image):
    """Paste each image's bounding-box content onto the centre of a blank canvas.

    Args:
        data: Sequence of 2-D images, indexable by position.
        cut_image: Array-like of shape (num_images, 4) with rows
            ``[x1, x2, y1, y2]`` (inclusive bounds), as returned by num_range.

    Returns:
        A float64 array of shape (num_images, height, width) where each canvas
        has the size of the corresponding input image and is zero outside the
        pasted box. With no boxes, an empty array of shape
        (0, IMG_HEIGHT, IMG_WIDTH) is returned.
    """
    centered = []

    for i in range(len(cut_image)):
        src = np.asarray(data[i])
        canvas_h, canvas_w = src.shape
        x1, x2, y1, y2 = (int(v) for v in cut_image[i])

        box_h = y2 - y1 + 1
        box_w = x2 - x1 + 1

        # Same anchor as before (canvas centre minus half the box extent), but
        # the FULL box is copied. The previous symmetric crop around the box
        # centre lost the last row/column whenever the extent was odd.
        top = canvas_h // 2 - (y2 - y1) // 2
        left = canvas_w // 2 - (x2 - x1) // 2

        # Keep the box inside the canvas when it spans (almost) the full size.
        top = max(0, min(top, canvas_h - box_h))
        left = max(0, min(left, canvas_w - box_w))

        canvas = np.zeros((canvas_h, canvas_w))
        canvas[top:top + box_h, left:left + box_w] = src[y1:y2 + 1, x1:x2 + 1]
        centered.append(canvas)

    if not centered:
        return np.zeros((0, IMG_HEIGHT, IMG_WIDTH))

    return np.stack(centered)

## Save centered images

In [None]:
def save_centered_image(img_df, dataset_path: str):
    """Center every image of a DataFrame and save it as a JPEG.

    Each row is inverted (threshold 50), reshaped to one
    (IMG_HEIGHT, IMG_WIDTH) image, stripped of its border, cropped to its
    bounding box and pasted onto a centered canvas. The result is written to
    ``<dataset_path>/<row index>.jpg`` in grayscale.

    Args:
        img_df: DataFrame whose rows are flattened images and whose index
            provides the output file names.
        dataset_path: Output directory; created (with parents) if missing.
    """
    Path(dataset_path).mkdir(parents=True, exist_ok=True)

    rows_with_progress = tqdm.tqdm(img_df.iterrows(), total=img_df.shape[0])
    for image_name, row in rows_with_progress:
        inverted = inverse_image(row, threshhold=50).to_numpy()
        batch = inverted.reshape(-1, IMG_HEIGHT, IMG_WIDTH)

        # remove_edge works in place, so `batch` itself is border-free afterwards
        trimmed = remove_edge(batch)
        boxes = num_range(trimmed)
        centered = pad_image(batch, boxes)

        out_file = os.path.join(dataset_path, "{}.jpg".format(image_name))
        plt.imsave(out_file, centered[0], cmap='gray')

In [None]:
def check_centered_image(row):
    """Run the centering pipeline on one row and print intermediate results.

    Prints the first pixel row of the border-stripped image and the computed
    bounding box, for visual debugging.

    Args:
        row: A flattened image (pandas Series of IMG_HEIGHT * IMG_WIDTH pixels).

    Returns:
        The centered image(s) as returned by pad_image.
    """
    inverted = inverse_image(row, threshhold=50).to_numpy()
    batch = inverted.reshape(-1, IMG_HEIGHT, IMG_WIDTH)

    # remove_edge works in place, so `batch` itself is border-free afterwards
    trimmed = remove_edge(batch)
    print("removed edge check : ", trimmed[:, 0])

    boxes = num_range(trimmed)
    print("cropped image position: ", boxes)

    return pad_image(batch, boxes)

## Note

In [None]:
# English translation of the note below (the string itself is kept unchanged):
#   Odd results after crop-crop-pad
#   50320 - need to check the original image
#   51120 - need to check the original image
#
#   51694 - not centered because of a dot on the right side
#   52225 - a line at the top
"""
crop-crop-pad 한 결과 중 이상한 것
50320 - 원본 확인 필요
51120 - 원본 확인 필요

51694 - 오른쪽 점때문에 센터링 안됨
52225 - 위에 선
"""