In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install Libraries

In [None]:
!pip install -q simpletransformers
! pip install tesseract
! apt-get install tesseract-ocr-tha

# Read Data

In [None]:
import cv2
import numpy as np
import os
import pandas  as pd
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Open the first page of the dataset and display it as the cell output.
sample_image_path = "/kaggle/input/nithan-chadok-hybrid-ocr-ner/images/images/00000.jpg"
image = Image.open(sample_image_path)
image

In [None]:
import pytesseract

# Run Tesseract on the whole sample page using the 'tha' language data and
# print the raw recognised text.
output_tesseract = pytesseract.image_to_string(image, lang='tha')
print(output_tesseract)

# Data Understanding & Data Preparation

In [None]:
import numpy as np
import math

def getSkewAngle(cvImage, debug_path="boxes.jpg") -> float:
    """Estimate a skew angle, in degrees, from the layout of blobs in an image.

    The image is converted to grayscale, blurred, Otsu-thresholded (inverted)
    and dilated with a wide rectangular kernel so nearby marks merge into
    blobs. The bounding boxes of the blob with the smallest left edge and the
    blob with the largest left edge are joined by a line between their top-left
    corners; the angle of that line is computed, and then 90 is added to a
    negative angle or subtracted from a positive one.

    Args:
        cvImage: Colour image array accepted by
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
            NOTE(review): callers in this notebook load images with PIL, whose
            channel order is RGB -- confirm that BGR2GRAY is intended.
        debug_path: File to which a copy of the image with the two reference
            boxes drawn in green is written. Pass ``None`` (or an empty
            string) to skip writing the debug image.

    Returns:
        The adjusted angle in degrees. ``0.0`` is returned when no contour is
        found or when both reference boxes share the same left edge.
    """
    gray = cv2.cvtColor(cvImage, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=2)

    contours, _ = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        # Blank / uniform page: there is nothing to measure, so report no skew
        # instead of failing on an empty list.
        return 0.0

    # Ordering by area first only matters for tie-breaking: the sort on the
    # left edge below is stable, so boxes with equal x keep this order.
    contours = sorted(contours, key=cv2.contourArea, reverse=True)
    boxes = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        boxes.append([x, y, x + w, y + h])
    boxes.sort(key=lambda b: b[0])
    f_box, l_box = boxes[0], boxes[-1]

    if debug_path:
        annotated = cvImage.copy()
        for x1, y1, x2, y2 in (f_box, l_box):
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.imwrite(debug_path, annotated)

    dx = l_box[0] - f_box[0]
    if dx == 0:
        angle = 0.0
    else:
        # Image y grows downwards, hence the sign flip on the y difference.
        m = (f_box[1] - l_box[1]) / dx
        angle = math.atan(m) * 180 / math.pi

    if angle < 0:
        angle += 90
    elif angle > 0:
        angle -= 90
    return angle

def rotateImage(cvImage, angle: float):
    """Return a copy of ``cvImage`` rotated by ``angle`` degrees about its centre.

    The output keeps the input's width and height. Pixels are resampled with
    cubic interpolation and the border is filled by replicating edge pixels.
    """
    rotated = cvImage.copy()
    height, width = rotated.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    return cv2.warpAffine(
        rotated,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

def deskew(cvImage):
    """Rotate ``cvImage`` by the negative of the angle from ``getSkewAngle``."""
    skew = getSkewAngle(cvImage)
    correction = -1.0 * skew
    return rotateImage(cvImage, correction)

In [None]:
def crop_image(image):
    """Segment an image into blobs and report each blob's extents.

    The image is grayscaled, Otsu-thresholded (inverted) and dilated twice
    with a 5x5 rectangular kernel; the external contours of the result are
    then measured.

    Returns:
        A tuple ``(row_idx, contours, min_row, max_row, min_col, max_col)``:
        ``row_idx`` holds contour indices ordered by the mean row coordinate of
        each contour's points, ``contours`` is the sequence returned by
        ``cv2.findContours``, and the remaining four are integer arrays with
        the smallest/largest row and column coordinate of each contour.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilated = cv2.morphologyEx(
        binary, cv2.MORPH_DILATE, kernel, None, None, 2, cv2.BORDER_REFLECT101
    )

    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Work on the (x, y) point list of each contour: index 0 is the column,
    # index 1 is the row.
    points = [c[:, 0] for c in contours]
    lows = [p.min(0) for p in points]
    highs = [p.max(0) for p in points]

    mean_row = np.array([p.mean(0)[1] for p in points])
    min_row = np.array([lo[1] for lo in lows], dtype='int')
    max_row = np.array([hi[1] for hi in highs], dtype='int')
    min_col = np.array([lo[0] for lo in lows], dtype='int')
    max_col = np.array([hi[0] for hi in highs], dtype='int')

    row_idx = np.argsort(mean_row)
    return row_idx, contours, min_row, max_row, min_col, max_col

def image_to_words(row_idx, contours, min_row, max_row, min_col, max_col, image):
    """OCR each crop of ``image`` and return one cleaned token per crop.

    Crops are visited in the order given by ``row_idx``; each is cut from
    ``image`` using the matching entries of ``min_row``/``max_row``/
    ``min_col``/``max_col`` and passed to Tesseract with ``lang='tha'``.
    Underscores and spaces in the result become ``[!und:]``; form feeds,
    newlines and quote characters are removed. A crop that yields no text is
    represented by ``'-'``. ``contours`` is accepted but not used.
    """
    # Applied in this order to every OCR result.
    substitutions = (
        ("_", "[!und:]"),
        (" ", "[!und:]"),
        ('\x0c', ''),
        ('\n', ''),
        ("'", ''),
        ('"', ''),
    )

    words = []
    for k in row_idx:
        crop = image[min_row[k]:max_row[k], min_col[k]:max_col[k]]
        text = pytesseract.image_to_string(crop, lang='tha')
        for old, new in substitutions:
            text = text.replace(old, new)
        words.append(text if len(text) != 0 else '-')
    return words

In [None]:
# Deskew the first page, segment it, and preview the crops in row order.
image = deskew(
    np.array(Image.open('/kaggle/input/nithan-chadok-hybrid-ocr-ner/images/images/00000.jpg'))
)
row_idx, contours, min_row, max_row, min_col, max_col = crop_image(image)

num_rows = 2
num_cols = 7
fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 3))

for idx, ax in enumerate(axes.flat):
    # Only the first len(row_idx) panels get a crop; every panel hides its axes.
    if idx < len(row_idx):
        im = image[min_row[row_idx[idx]]:max_row[row_idx[idx]], min_col[row_idx[idx]]:max_col[row_idx[idx]]]
        ax.imshow(im)
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# OCR every crop of the sample page (in row order) and show the resulting tokens.
words = image_to_words(row_idx, contours, min_row, max_row, min_col, max_col, image)
print(words)

## Data Collection

In [None]:
src = '/kaggle/input/nithan-chadok-hybrid-ocr-ner/images/images/'
txt = []

# Pages are read by index as 00000.jpg, 00001.jpg, ... so the tokens are
# collected in page order. Count only the .jpg entries so that any other file
# in the directory does not push the loop past the last existing page.
num_images = sum(name.endswith('.jpg') for name in os.listdir(src))

for i in tqdm(range(num_images)):
    # The context manager releases the file handle as soon as the pixels
    # have been copied into an array.
    with Image.open(f'{src}{i:05d}.jpg') as page:
        image = deskew(np.array(page))
    image = cv2.resize(image, (600, 800), interpolation=cv2.INTER_AREA)
    row_idx, contours, min_row, max_row, min_col, max_col = crop_image(image)
    words = image_to_words(row_idx, contours, min_row, max_row, min_col, max_col, image)
    txt.extend(words)

# Show a short preview rather than dumping every token into the cell output.
print(txt[:20])
print(len(txt))

In [None]:
# Persist the collected tokens as a single-column ('Text') CSV in the working
# directory, without the index column.
df = pd.DataFrame({'Text': txt})
df.to_csv('super_data.csv', index=False)

# Modeling

In [None]:
# Label list: "O" first, then every entity type under the B_ prefix, then the
# I_ prefix, then the E_ prefix (presumably begin / inside / end markers).
_ENTITY_TYPES = ["BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL"]
NER_TAGS = ["O"] + [
    f"{prefix}_{entity}"
    for prefix in ("B", "I", "E")
    for entity in _ENTITY_TYPES
]
print(NER_TAGS)

In [None]:
# Reload the OCR tokens. Every cell is read as a literal string:
# with the default NA handling, tokens such as "NA", "null" or "nan" would be
# turned into float NaN and corrupt the token stream passed to the model.
txt = pd.read_csv('super_data.csv', dtype={'Text': str}, keep_default_na=False)
txt = txt['Text']
txt

## Read Model

In [None]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Inference settings for the NER model.
ner_args = NERArgs(
    eval_batch_size=32,
    use_multiprocessing=True,
    max_seq_length=512,
)

# Load the pretrained checkpoint with the notebook's label list; the GPU is
# used when torch reports one as available.
model3 = NERModel(
    "auto",
    "thanaphatt1/WangchanBERTa-LST20",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=NER_TAGS,
)

## Tokens and Predictions

In [None]:
def split_into_sentences(tokens, tokens_per_sentence):
    """Cut ``tokens`` into consecutive slices of ``tokens_per_sentence`` items.

    The final slice holds whatever remains and may be shorter. An empty
    ``tokens`` yields an empty list.
    """
    return [
        tokens[start:start + tokens_per_sentence]
        for start in range(0, len(tokens), tokens_per_sentence)
    ]

In [None]:
# Feed the token stream to the model in pseudo-sentences of 225 tokens each.
# The inputs are already tokenised, so the model is told not to split on spaces.
test_tokens3 = split_into_sentences(tokens=txt, tokens_per_sentence=225)
predictions3 = model3.predict(test_tokens3, split_on_space=False)

# Submission

In [None]:
# Load the competition's tag list; its 'tag' and 'class' columns are used
# later to translate predicted tags into submission classes.
tag_df = pd.read_csv('/kaggle/input/nithan-chadok-hybrid-ocr-ner/tag_list.csv')
tag_df

In [None]:
# Build the tag -> class lookup once instead of filtering the whole DataFrame
# for every predicted token. drop_duplicates keeps the first row per tag,
# which is the row the per-token filter would have picked.
tag_to_class = tag_df.drop_duplicates(subset='tag').set_index('tag')['class'].to_dict()

# predictions3[0] is a list of sentences; each sentence is a list of
# single-entry {word: tag} dicts. Flatten them into one class per token.
# A tag missing from tag_df raises KeyError naming the offending tag.
final_test_df3 = [
    tag_to_class[tag]
    for sentence in predictions3[0]
    for token_pred in sentence
    for tag in token_pred.values()
]

print(len(final_test_df3))
print(txt[60:80])
print(final_test_df3[60:80])
print(set(final_test_df3))

In [None]:
# Load the sample submission; it is used as the template that receives the
# 'pred' column.
submit_df3 = pd.read_csv('/kaggle/input/nithan-chadok-hybrid-ocr-ner/sample_submission.csv')
submit_df3

In [None]:
# The submission needs exactly one prediction per row. Assigning a DataFrame
# aligns on the index, so a length mismatch would silently leave NaN rows or
# drop predictions; fail loudly instead.
if len(final_test_df3) != len(submit_df3):
    raise ValueError(
        f"Got {len(final_test_df3)} predictions for {len(submit_df3)} submission rows"
    )

final_result3 = pd.DataFrame(final_test_df3)
# Positional assignment: prediction i goes to submission row i.
submit_df3['pred'] = final_test_df3
submit_df3

In [None]:
# Write the submission file to the working directory, without the index column.
submit_df3.to_csv('Tesseract_Wanghan225.csv', index=False)