In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import uuid
import logging
import getpass
import numpy as np
import cv2
import os
import gc
import re
import time
import imageio
import random
import copy
import glob
import json
import shutil
import pickle
import traceback
from collections import Counter
from pprint import pprint as print

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from tqdm import tqdm, tqdm_notebook
from PIL import Image, ImageDraw
from shapely.geometry import Polygon

import requests
import pdf2image
import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTTextContainer, LTLayoutContainer, LTTextLineHorizontal, LTAnno, LTChar
from pdfminer.converter import PDFPageAggregator

In [None]:
def imshow(img, figsize=(20, 20)):
    """Render ``img`` on a single matplotlib axes with the axis decorations hidden.

    Args:
        img: image array accepted by ``Axes.imshow``.
        figsize: figure size forwarded to ``plt.subplots``.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    axes.axis('off')
    axes.imshow(img)

In [None]:
min_word_num = 3

def cut_text(string, word_num):
    """Split ``string`` into consecutive word batches and return their spans.

    Words are counted at "non-space followed by space" boundaries. Every time
    ``word_num`` words have been counted, a cut is placed on the last character
    of the current word and the counter restarts. The tail of the string is
    kept as a batch only when it holds at least ``min_word_num`` words.

    Args:
        string: text to split.
        word_num: number of words per batch.

    Returns:
        A list of ``[start, end]`` pairs (inclusive character indices into
        ``string``) with leading/trailing whitespace trimmed from each span.
    """
    last_index = len(string) - 1
    cut_points = [0]
    word_count = 0
    for idx, ch in enumerate(string):
        is_blank = ch.isspace()
        if idx == 0 and is_blank:
            continue
        if idx == last_index:
            # End of the string: decide whether the remaining words form a batch.
            if not is_blank:
                word_count += 1
                if word_count >= min_word_num:
                    cut_points.append(idx)
            elif min_word_num <= word_count < word_num:
                cut_points.append(idx)
            continue
        if not is_blank and string[idx + 1].isspace():
            word_count += 1
        if word_count == word_num:
            cut_points.append(idx)
            word_count = 0

    batches = []
    for k in range(1, len(cut_points)):
        # The first batch starts on the initial cut point itself, later ones
        # start right after the previous cut.
        start = cut_points[k - 1] if k == 1 else cut_points[k - 1] + 1
        end = cut_points[k]
        # Trim whitespace at both ends of the span.
        start = next((i for i in range(start, end + 1) if not string[i].isspace()), start)
        end = next((i for i in range(end, start - 1, -1) if not string[i].isspace()), end)
        batches.append([start, end])
    return batches

def word_batches_in_line(line, word_num, page_height):
    """Return the bounding box and label of every ``word_num``-word batch in ``line``.

    Args:
        line: iterable of layout items, each exposing ``get_text()``; the items
            at batch boundaries must also expose ``x0, y0, x1, y1``. ``line``
            itself must expose ``get_text()``.
        word_num: number of words per batch (forwarded to ``cut_text``).
        page_height: page height used to flip y so that it is measured from the top.

    Returns:
        A list of ``[[x_min, y_min, x_max, y_max], label]`` entries. The box is
        the union of the first and last item's boxes in the batch, with 3 units
        of extra room above.

    Raises:
        IndexError: if a span returned by ``cut_text`` points past the items of
            ``line`` (text indices and items are assumed to be aligned one-to-one).
    """
    # `normalize` is not imported at notebook level, so bring it in here.
    from unicodedata import normalize

    def _flipped_box(item, top_pad=3):
        # Integer box of one item in top-left-origin coordinates.
        x0, y0, x1, y1 = (int(v) for v in (item.x0, item.y0, item.x1, item.y1))
        tops = (page_height - y0, page_height - y1)
        return min(x0, x1), int(min(tops)) - top_pad, max(x0, x1), int(max(tops))

    items = list(line)
    result_word_batches = []
    for start, end in cut_text(line.get_text(), word_num):
        label = "".join(normalize("NFC", item.get_text()) for item in items[start:end + 1])

        sx_min, sy_min, sx_max, sy_max = _flipped_box(items[start])
        ex_min, ey_min, ex_max, ey_max = _flipped_box(items[end])
        box = [
            min(sx_min, ex_min),
            min(sy_min, ey_min),
            max(sx_max, ex_max),
            max(sy_max, ey_max),
        ]
        result_word_batches.append([box, label])
    return result_word_batches

def aug_line_text(line, page_height):
    """Collect the distinct word batches of ``line`` for every batch size.

    Batch sizes run from ``min_word_num`` up to the number of whitespace-separated
    words in the line. Lines with fewer than ``min_word_num`` words yield ``[]``.

    Returns:
        A list of ``[[x_min, y_min, x_max, y_max], label]`` entries without duplicates.
    """
    word_total = len(line.get_text().split())
    phrases = []
    if word_total >= min_word_num:
        for batch_size in range(min_word_num, word_total + 1):
            for candidate in word_batches_in_line(line, batch_size, page_height):
                if candidate not in phrases:
                    phrases.append(candidate)
    return phrases

def line_process(img, lines, base_file_name, page_height):
    """Crop, augment and save phrase images for the first line of ``lines``.

    Only the line at index 0 is processed; a first line whose text is empty is
    skipped. For each phrase produced by ``aug_line_text`` the crop and its
    augmentations are written as PNG files, and one ``file_name<TAB>label`` row
    per image is appended to ``annotation_file``.

    NOTE(review): ``augmentation`` and ``annotation_file`` are not defined in the
    visible part of this notebook; they must exist at notebook level before this
    function is called, otherwise every phrase is silently skipped by the
    ``except`` below.

    Args:
        img: page image indexed as ``img[y, x]``.
        lines: iterable of line objects exposing ``get_text()``.
        base_file_name: path prefix for the written crops.
        page_height: page height forwarded to ``aug_line_text``.
    """
    # `normalize` is not imported at notebook level, so bring it in here.
    from unicodedata import normalize

    for l, line in enumerate(lines):
        if not normalize("NFC", line.get_text()):
            continue

        if l == 0:
            info_phrases = aug_line_text(line, page_height)
            for i, info_phrase in enumerate(info_phrases):
                [[x_min, y_min, x_max, y_max], label] = info_phrase
                try:
                    crop_img = img[y_min:y_max, x_min:x_max]
                    aug_imgs = [crop_img] + augmentation(crop_img)
                    new_file_name = base_file_name + "_line" + str(l) + "_phr" + str(i)
                    for a, aug_img in enumerate(aug_imgs):
                        file_name = new_file_name + "_aug" + str(a) + ".png"
                        cv2.imwrite(file_name, aug_img)
                        annotation_file.write(file_name + "\t" + label + "\n")
                except Exception:
                    # Best effort: a phrase that cannot be cropped/augmented/written is skipped.
                    continue

def paragraphs_process(img, paragraphs, index_page, pdf_path, page_height, scanned = False):
    """Run ``line_process`` on every paragraph of a page.

    The output prefix is built from the PDF file name (last path component with
    its 4-character extension removed), an optional ``_scanned`` suffix, the
    page index and the paragraph index.
    """
    stem = pdf_path.split("/")[-1][:-4]
    if scanned:
        stem = stem + "_scanned"
    for para_idx, paragraph in enumerate(paragraphs):
        prefix = "./crop_images/" + stem + "_page" + str(index_page) + "_para" + str(para_idx)
        line_process(img, paragraph, prefix, page_height)

In [None]:
def convert_coords(obj, height=None):
    """Convert pdfminer coordinates to the standard top-left-origin format.

    Args:
        obj: either an object exposing ``x0, y0, x1, y1`` attributes or a
            4-item iterable ``(x0, y0, x1, y1)``.
        height: page height used to flip the y axis. When omitted, the
            notebook-level global ``page_height`` is used, as before.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` with y measured from the top of the page.
    """
    if height is None:
        # Backward-compatible fallback on the page currently loaded in the notebook.
        height = page_height
    try:
        x0, y0, x1, y1 = obj.x0, obj.y0, obj.x1, obj.y1
    except AttributeError:
        # Plain (x0, y0, x1, y1) sequence.
        x0, y0, x1, y1 = obj

    top, bottom = height - y0, height - y1
    return min(x0, x1), min(top, bottom), max(x0, x1), max(top, bottom)

class Space:
    """Placeholder character appended after each text line; its text is one blank."""

    # Kept as a class attribute; no font is associated with the placeholder.
    fontname = None

    def get_text(self):
        """Return a single space."""
        return " "

def get_info_LT(paragraphs):
    """Collect the non-empty horizontal text lines and their characters.

    Only ``LTTextBoxHorizontal`` boxes are visited. For every
    ``LTTextLineHorizontal`` whose stripped text is non-empty, the line is kept
    and its ``LTChar`` items are appended to the character list. A ``Space``
    placeholder is appended after each visited line, except for horizontal lines
    that were skipped as empty.

    Returns:
        ``(lines, chars)``.
    """
    lines, chars = [], []
    for textbox in paragraphs:
        if not isinstance(textbox, pdfminer.layout.LTTextBoxHorizontal):
            continue
        for textline in textbox:
            if isinstance(textline, pdfminer.layout.LTTextLineHorizontal):
                if not textline.get_text().strip():
                    # Empty line: no line entry and no separator.
                    continue
                lines.append(textline)
                chars.extend(
                    glyph for glyph in textline
                    if isinstance(glyph, pdfminer.layout.LTChar)
                )
            # Separator so that consecutive lines are not merged into one word.
            chars.append(Space())

    return lines, chars

def get_words(chars, text_chars, debug=False):
    """Group ``chars`` into words using whitespace in ``text_chars`` as separator.

    ``text_chars[i]`` is the text of ``chars[i]``; the two must be aligned.

    Args:
        chars: sequence of character objects.
        text_chars: string with one character per entry of ``chars``.
        debug: when true, print each completed word (needs ``get_text()`` on items).

    Returns:
        A list of words, each a list of items taken from ``chars``.
    """
    words = []
    current = []
    for idx, symbol in enumerate(text_chars):
        if symbol.strip():
            current.append(chars[idx])
            continue
        # Whitespace closes the word being built, if any.
        if current:
            if debug:
                print(repr("".join([item.get_text() for item in current])))
            words.append(current)
        current = []

    if current:
        words.append(current)
    return words

def get_xyxy_from_LT(obj):
    """Return the union box ``(x_min, y_min, x_max, y_max)`` of the items in ``obj``.

    Each item must expose ``x0, y0, x1, y1``. An empty ``obj`` raises ``ValueError``.
    """
    left = min([item.x0 for item in obj])
    low = min([item.y0 for item in obj])
    right = max([item.x1 for item in obj])
    high = max([item.y1 for item in obj])
    return left, low, right, high

def expand_box(coord, h_scale=0.16, to_int=True):
    """Move the first y coordinate of a box up by ``h_scale`` times its height.

    Args:
        coord: ``(x1, y1, x2, y2)``.
        h_scale: fraction of ``y2 - y1`` subtracted from ``y1``.
        to_int: truncate all four values with ``int`` (used for drawing).

    Returns:
        The adjusted ``(x1, y1, x2, y2)`` tuple.
    """
    left, top, right, bottom = coord
    top = top - (bottom - top) * h_scale
    box = (left, top, right, bottom)
    if to_int:  # for visualize
        box = tuple(int(value) for value in box)
    return box

def scale_coord(coord, x_ratio, y_ratio):
    """Scale a box ``(x1, y1, x2, y2)``: x values by ``x_ratio``, y values by ``y_ratio``."""
    left, top, right, bottom = coord
    return left * x_ratio, top * y_ratio, right * x_ratio, bottom * y_ratio

def check_same_h(obj, thresh=2):
    """Tell whether all items share the same ``y0`` and the same ``y1``.

    Values are compared after rounding to one decimal. ``thresh`` is accepted
    but not used. An empty ``obj`` gives ``False``.
    """
    bottoms = {round(item.y0, 1) for item in obj}
    tops = {round(item.y1, 1) for item in obj}
    return len(bottoms) == 1 and len(tops) == 1

all_minus = ["–", "—", "-", "+", ":"]
all_dots = ["…", "."]
def process_special_cases(obj, coord, h_scale=0.16, debug=False, to_int=True):
    """Expand a word box upward, with adjustments for dash-like and dot-only words.

    Rules applied, in order:
      * items that do not share the same y extent -> ``h_scale`` becomes 0.01;
      * text equal to one entry of ``all_minus`` -> ``h_scale`` becomes 0.0;
      * the box is expanded with ``expand_box``;
      * text made of one repeated symbol that contains an ``all_minus`` entry ->
        both y values are moved halfway towards the vertical centre;
      * otherwise, text made of one repeated symbol that contains an
        ``all_dots`` entry -> the first y value moves to the vertical middle.

    In the last two cases the modified y values are truncated with ``int``
    regardless of ``to_int``.

    Args:
        obj: iterable of items exposing ``get_text()``, ``y0`` and ``y1``.
        coord: ``(x1, y1, x2, y2)`` box of the word.
        h_scale: default upward expansion ratio.
        debug: print diagnostics (reads the notebook-level ``text_vocab``).
        to_int: forwarded to ``expand_box``.

    Returns:
        ``(x1, y1, x2, y2)``.
    """
    _texts = "".join(item.get_text() for item in obj)
    if debug and len(_texts) == 1 and _texts not in text_vocab:
        print("{} : {}".format(_texts, ord(_texts)))

    if not check_same_h(obj):
        if debug:
            print("{} : {}".format(_texts, len(_texts)))
            print([item.y0 for item in obj])
            print([item.y1 for item in obj])
        h_scale = 0.01

    if _texts in all_minus:
        h_scale = 0.0

    x1, y1, x2, y2 = expand_box(coord, h_scale=h_scale, to_int=to_int)

    single_symbol = len(set(_texts)) == 1
    if single_symbol and any(symbol in _texts for symbol in all_minus):
        # dash-like glyphs: shrink the box vertically around its centre
        y_center = (y1 + y2) / 2
        y1 = int((y1 + y_center) / 2)
        y2 = int((y2 + y_center) / 2)
    elif single_symbol and any(symbol in _texts for symbol in all_dots):
        # dot leaders such as `.......`: keep only the lower half
        y1 = int(y1 + (y2 - y1) / 2)

    return x1, y1, x2, y2

In [None]:
# Folder holding the source PDFs. The path contains a literal backslash, written
# as "\\" because "\p" is not a valid escape sequence.
tvpl_fd = "/media/SUN-ASTERISK\\phan.huy.hoang/My Passport/phanhoang/data/PDF_converter/thuvienphapluat/pdf/"
pdf_fps = sorted(glob.glob(os.path.join(tvpl_fd, "*.pdf")))
print(len(pdf_fps))

In [None]:
# Pick one PDF at random to prototype the extraction on.
# No random seed is set, so a different file is chosen on each run.
pdf_path = random.choice(pdf_fps)
print(pdf_path)
assert os.path.exists(pdf_path)

In [None]:
# Parse the chosen PDF and keep the layout/image of its first page in the
# notebook namespace (paragraphs, page_height, page_width, origin_img, ...).
with open(pdf_path, "rb") as open_pdf:

    # pdf to images
    images = pdf2image.convert_from_path(pdf_path)
    print(len(images))

    # define pdfminer
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    parser = PDFParser(open_pdf)
    document = PDFDocument(parser)

    # loop for pages
    for p, page in enumerate(PDFPage.get_pages(open_pdf)):
        try:
            interpreter.process_page(page)
            paragraphs = device.get_result()

            [x_min, y_min, x_max, y_max] = page.mediabox
            page_height, page_width = int(abs(y_max - y_min)), int(abs(x_max - x_min))

            origin_img = np.array(images[p])
            h_origin, w_origin, _ = origin_img.shape
            # PDF units -> pixels: x scales with the width, y with the height.
            x_ratio = w_origin / page_width
            y_ratio = h_origin / page_height

            beauty_img = cv2.resize(origin_img.copy(), (page_width, page_height))
#             scanned_img = scanned_augmentation(beauty_img.copy())
#             if p == 4:
#                 break
            # Stop after the first page that was processed successfully.
            break

#             paragraphs_process(beauty_img, paragraphs, p, pdf_path, page_height)
#             paragraphs_process(scanned_img, paragraphs, p, pdf_path, page_height, True)
        except Exception as e:
            print(e)
            continue

In [None]:
# Scale factors from PDF page units to pixels of the first rendered image:
# y uses the heights, x uses the widths.
h_origin, w_origin, _ = np.array(images[0]).shape
y_ratio = h_origin / page_height
x_ratio = w_origin / page_width
print(x_ratio)
print(y_ratio)
# Shapes of the full-resolution render and of the copy resized to page units.
print(origin_img.shape)
print(beauty_img.shape)

In [None]:
# Display the rendered page image held in origin_img.
imshow(origin_img, (20, 20))

#### visualize word box

In [None]:
# len(glob.glob("/media/SUN-ASTERISK\phan.huy.hoang/My Passport/phanhoang/data/PDF_converter/vanban.chinhphu.vn/pdf/*.pdf"))

In [None]:
def fix_length_special_cases(chars):
    """Build a string holding exactly one character per item of ``chars``.

    Some items report a text longer than one character; only the first
    character is kept so that string indices stay aligned with ``chars``.
    Items with an empty text contribute nothing.
    """
    return "".join(item.get_text()[:1] for item in chars)

In [None]:
# 1 page
# Extract lines/characters from the page layout currently held in `paragraphs`.
lines, chars = get_info_LT(paragraphs)
# text_chars = "".join([i.get_text() for i in chars])
# One character per item so that text_chars[i] lines up with chars[i].
text_chars = fix_length_special_cases(chars)
# Both lengths are printed to check that alignment.
print("{} : {}".format(len(chars), len(text_chars)))
words = get_words(chars, text_chars, debug=False)
print(len(words))

Special cases

- các kí tự gạch đầu dòng --> box cần nhỏ hơn
- check underline

In [None]:
# Draw every word box on the page image.
# to_origin=True draws on the full-resolution render (boxes are scaled to pixels);
# otherwise the image resized to PDF page units is used.
to_origin = True

if to_origin:
    vis_img = origin_img.copy()
else:
    vis_img = beauty_img.copy()

for word in words:
    _texts = "".join(i.get_text() for i in word)
    # Skip runs made only of em dashes. Comparing the whole set (instead of an
    # arbitrary element of it) keeps the decision deterministic.
    if len(_texts) > 1 and set(_texts) == {"—"}:
        continue

    coord = get_xyxy_from_LT(word)  # coord in pdf
    coord = convert_coords(coord)
    if to_origin:
        coord = scale_coord(coord, x_ratio, y_ratio)

    x1, y1, x2, y2 = process_special_cases(word, coord, h_scale=0.16, debug=False)
    cv2.rectangle(vis_img, (x1, y1), (x2, y2), (255, 0, 0), 1)

imshow(vis_img)
# imageio.imwrite("./data/words.png", vis_img)

#### visualize line box

In [None]:
# Draw one rectangle per text line on the full-resolution page image.
# vis_img = beauty_img.copy()
vis_img = origin_img.copy()
for line in lines:
    # line.bbox is in PDF coordinates: flip y, then scale to pixels.
    coord = line.bbox
    coord = convert_coords(coord)
    coord = scale_coord(coord, x_ratio, y_ratio)

    # Add room above the line and truncate to ints for drawing.
    coord = expand_box(coord, h_scale=0.16)
#     coord = process_special_cases(line, coord, h_scale=0.16)
    x1, y1, x2, y2 = coord
    
    cv2.rectangle(vis_img, (x1, y1), (x2, y2), (255, 0, 0), 1)
imshow(vis_img)
# imageio.imwrite("./data/lines.png", vis_img)

#### visualize char box

In [None]:
# Draw one rectangle per character on the image resized to PDF page units,
# so the boxes are used without scaling.
vis_img = beauty_img.copy()
# for word in words:
#     coords = get_xyxy_from_LT(word)  # coord in pdf
#     x1, y1, x2, y2 = convert_coords(coords)
#     x1, y1, x2, y2 = map(int, (x1, y1, x2, y2))
#     cv2.rectangle(vis_img, (x1, y1), (x2, y2), (255, 0, 0), 1)

for word in words:
    for char in word:
        # Flip y to top-left origin, then expand upward and truncate to ints.
        coord = convert_coords(char)
#         x1, y1, x2, y2 = map(int, (x1, y1, x2, y2))
        x1, y1, x2, y2 = expand_box(coord)
        
        cv2.rectangle(vis_img, (x1, y1), (x2, y2), (255, 0, 0), 1)

imshow(vis_img)

### Data generator

In [None]:
def xyxy2xys(coord, to_int=True):
    """Turn a box ``(x1, y1, x2, y2)`` into its four corner points.

    Corners are returned in the order (x1, y1), (x2, y1), (x2, y2), (x1, y2).
    With ``to_int`` the four input values are truncated with ``int`` first.
    """
    values = [int(v) for v in coord] if to_int else coord
    left, top, right, bottom = values
    return [(left, top), (right, top), (right, bottom), (left, bottom)]

In [None]:
# word
# The path contains a literal backslash, written as "\\" because "\p" is not a
# valid escape sequence.
pdf_fps = sorted(glob.glob("/media/SUN-ASTERISK\\phan.huy.hoang/My Passport/phanhoang/data/PDF_converter/thuvienphapluat/pdf/*.pdf"))

In [None]:
# Number of PDF files found.
len(pdf_fps)

#### 1 page

In [None]:
# Letters (including Vietnamese diacritics), digits and the space character.
text_vocab = "aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789 "
# ASCII punctuation. The apostrophe and the backslash are escaped explicitly:
# a bare '' inside a single-quoted literal is string concatenation and drops
# the apostrophe, and "\]" is not a valid escape sequence.
punc_vocab = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
vocab = text_vocab + punc_vocab
set_vocab = set(vocab)

In [None]:
# 1 page
# Rebuild lines/chars/words from the page layout currently held in `paragraphs`.
lines, chars = get_info_LT(paragraphs)
# text_chars = "".join([i.get_text() for i in chars])
# One character per item so that text_chars[i] lines up with chars[i].
text_chars = fix_length_special_cases(chars)
words = get_words(chars, text_chars)

In [None]:
# Re-run the word-box pipeline with debug=True so that process_special_cases
# prints its diagnostics. Boxes are drawn on the existing vis_img, which is not
# displayed in this cell.
# Relies on to_origin, x_ratio, y_ratio and vis_img set by earlier cells.
for word in words:
    coord = get_xyxy_from_LT(word)  # coord in pdf
    coord = convert_coords(coord)
    if to_origin:
        coord = scale_coord(coord, x_ratio, y_ratio)

    x1, y1, x2, y2 = process_special_cases(word, coord, h_scale=0.16, debug=True)
    cv2.rectangle(vis_img, (x1, y1), (x2, y2), (255, 0, 0), 1)

In [None]:
def get_word_coords(words, xy_ratio=None, return_text=False, to_int=True):
    """Compute the four-corner box of every word on a page.

    Args:
        words: list of words, each a list of character items exposing
            ``get_text()`` and ``x0, y0, x1, y1``.
        xy_ratio: optional ``(x_ratio, y_ratio)`` used to scale boxes from PDF
            units to image pixels; ``None`` keeps PDF units.
        return_text: also return the text of each kept word.
        to_int: forwarded to ``process_special_cases`` and ``xyxy2xys``.

    Returns:
        An array of shape ``(2, 4, n_words)`` (x/y, corner, word), plus the list
        of word texts when ``return_text`` is true.

    Raises:
        ValueError: from the transpose when no word is kept, because the empty
            array does not have three axes.
    """
    fourpoint_word_coords = []
    text_words = []

    for word in words:
        # Keep one character per item: some items report more than one character.
        text = "".join(i.get_text()[0] for i in word)
        # Skip runs made only of em dashes. Comparing the whole set (instead of
        # an arbitrary element of it) keeps the decision deterministic.
        if len(text) > 1 and set(text) == {"—"}:
            continue

        # coord
        coord = get_xyxy_from_LT(word)
        coord = convert_coords(coord)
        if xy_ratio is not None:
            x_ratio, y_ratio = xy_ratio
            coord = scale_coord(coord, x_ratio, y_ratio)

        coord = process_special_cases(word, coord, h_scale=0.16, debug=False, to_int=to_int)
        fourpoint_word_coords.append(xyxy2xys(coord, to_int=to_int))

        if return_text:
            text_words.append(text)

    # (n_words, 4, 2) -> (2, 4, n_words)
    fourpoint_word_coords = np.array(fourpoint_word_coords).transpose(2, 1, 0)
    if return_text:
        return fourpoint_word_coords, text_words
    return fourpoint_word_coords
    
# fourpoint_word_coords, text_words = get_word_coords(words, return_text=True)
# print(fourpoint_word_coords.shape)

In [None]:
def get_char_coords(chars, xy_ratio=None):
    """Compute the four-corner box of every non-blank character.

    ``Space`` placeholders and items whose text is only whitespace are skipped.
    Boxes are flipped to top-left origin, optionally scaled by
    ``xy_ratio = (x_ratio, y_ratio)``, expanded upward and truncated to ints.

    Returns:
        An array of shape ``(2, 4, n_chars)`` (x/y, corner, character).
    """
    corner_sets = []
    for char in chars:
        text = char.get_text()
        if isinstance(char, Space) or not text.strip():
            continue
        box = convert_coords(char)
        if xy_ratio is not None:
            x_ratio, y_ratio = xy_ratio
            box = scale_coord(box, x_ratio, y_ratio)
        box = expand_box(box, h_scale=0.16)
        corner_sets.append(xyxy2xys(box, to_int=True))

    # (n_chars, 4, 2) -> (2, 4, n_chars)
    return np.array(corner_sets).transpose(2, 1, 0)

# fourpoint_char_coords = get_char_coords(chars)    
# print(fourpoint_char_coords.shape)

In [None]:
def get_page_info(paragraphs, xy_ratio=None):
    """Return the word boxes of one page.

    Args:
        paragraphs: page layout, iterated by ``get_info_LT``.
        xy_ratio: optional ``(x_ratio, y_ratio)`` forwarded to ``get_word_coords``.

    Returns:
        * ``False`` when the page yields no word;
        * ``None`` when computing the boxes raised (the error is printed);
        * otherwise the array returned by ``get_word_coords`` (float boxes,
          ``to_int=False``).
    """
    # 1 page
    _, page_chars = get_info_LT(paragraphs)
    page_words = get_words(page_chars, fix_length_special_cases(page_chars))
    if not page_words:
        return False

    try:
        return get_word_coords(page_words, xy_ratio=xy_ratio, return_text=False, to_int=False)
    except Exception as err:
        print("Error get_page_info: {}".format(err))
        return None
#     return fourpoint_word_coords, fourpoint_char_coords, text_words

# assert fourpoint_char_coords.shape[2] == sum(len(i) for i in text_words)

#### all pdfs

In [None]:
# Output folder for the rendered page images. The path contains a literal
# backslash, written as "\\" because "\p" is not a valid escape sequence.
save_img_fd = "/media/SUN-ASTERISK\\phan.huy.hoang/My Passport/phanhoang/data/PDF_converter/thuvienphapluat/pdf_CRAFT_0906"

In [None]:
def remove_all_content(fd):
    """Empty directory ``fd`` by deleting it and creating it again.

    A missing ``fd`` is simply created (parents included) instead of raising.

    Args:
        fd: path of the directory to reset.

    Raises:
        OSError: if ``fd`` cannot be removed or created (for example when it
            is an existing regular file).
    """
    if os.path.isdir(fd):
        shutil.rmtree(fd)
    os.makedirs(fd, exist_ok=True)
    
# remove_all_content(save_img_fd)

In [None]:
# pdf_fps = [os.path.join(tvpl_fd, "153ded28-4497-4ce0-b5a0-217e9dedf5c9.pdf")]
# The path contains a literal backslash, written as "\\" because "\p" is not a
# valid escape sequence.
pdf_fps = sorted(glob.glob("/media/SUN-ASTERISK\\phan.huy.hoang/My Passport/phanhoang/data/PDF_converter/thuvienphapluat/pdf/*.pdf"))
print(len(pdf_fps))

In [None]:
# Accumulators filled by the generation loop: image file names, char boxes,
# word boxes and word texts, one entry per saved page.
imnames, charBB, wordBB, txt = [], [], [], []
# pdf id -> {"id": page indices that were saved, "img_fp": path of a saved image}
valid_ids = {}
# PDFs for which at least one page failed.
error_fps = []
# PDFs with a page that yields no word (appended once per such page).
no_meta_fps = []
# Save the full-resolution render (True) or the image resized to PDF page units (False).
to_origin = True
word_only = True  # save word bounding-box only, ignore char bbox

In [None]:
# Render every page of every PDF, compute its word boxes and save the page image.
# Fills imnames / wordBB (and charBB / txt when word_only is False), valid_ids,
# error_fps and no_meta_fps.
for pdf_fp in tqdm_notebook(pdf_fps):

    pdf_fn = pdf_fp.strip("/").split("/")[-1]
    pdf_id = pdf_fn[:-4]
    with open(pdf_fp, "rb") as open_pdf:
        # pdf to images
        try:
            images = pdf2image.convert_from_path(pdf_fp)

            # define pdfminer
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(open_pdf)
            document = PDFDocument(parser)
        except Exception:
            print(pdf_fp)
            continue

        # Registered only once the PDF could be opened, so PDFs that fail above
        # leave no empty entry behind.
        valid_ids[pdf_id] = {
            "id": [],
            "img_fp": None
        }

        # loop for pages
        for p, page in enumerate(PDFPage.get_pages(open_pdf)):
            try:
                interpreter.process_page(page)
                paragraphs = device.get_result()

                [x_min, y_min, x_max, y_max] = page.mediabox
                page_height, page_width = int(abs(y_max - y_min)), int(abs(x_max - x_min))

                origin_img = np.array(images[p])
                h_origin, w_origin, _ = origin_img.shape
                # PDF units -> pixels: x scales with the width, y with the height.
                x_ratio = w_origin / page_width
                y_ratio = h_origin / page_height

                if to_origin:
                    vis_img = origin_img.copy()
                    xy_ratio = (x_ratio, y_ratio)
                else:
                    # Image resized to PDF page units: boxes need no scaling.
                    beauty_img = cv2.resize(origin_img.copy(), (page_width, page_height))
                    vis_img = beauty_img.copy()
                    xy_ratio = None

                res = get_page_info(paragraphs, xy_ratio=xy_ratio)
                if res is None:
                    # Box computation failed for this page.
                    if pdf_fp not in error_fps:
                        error_fps.append(pdf_fp)
                    continue

                if res is False:
                    # The page has no extractable word.
                    print("{}:{} dont have metadata".format(pdf_fn[:-4], p))
                    no_meta_fps.append(pdf_fp)
                    continue

                random_id = str(uuid.uuid4())
                img_fn = "{}.png".format(random_id)
                save_img_fp = os.path.join(save_img_fd, img_fn)
                imageio.imwrite(save_img_fp, vis_img)

                valid_ids[pdf_id]["id"].append(p)
                # Overwritten on every page: only the last saved image is kept here.
                valid_ids[pdf_id]["img_fp"] = save_img_fp
                if not word_only:
                    # NOTE(review): get_page_info currently returns only the word
                    # boxes, so this unpacking fails until it returns all three values.
                    fourpoint_word_coords, fourpoint_char_coords, text_words = res
                    assert fourpoint_char_coords.shape[2] == sum(len(i) for i in text_words)

                    imnames.append(img_fn)
                    wordBB.append(fourpoint_word_coords)
                    charBB.append(fourpoint_char_coords)
                    txt.append(text_words)
                    del fourpoint_char_coords, fourpoint_word_coords, text_words
                else:
                    imnames.append(img_fn)
                    # astype returns a new array, so no extra copy is needed.
                    wordBB.append(res.astype(np.float16))

            except Exception as e:
                print(traceback.format_exc())
                print(e)
                if pdf_fp not in error_fps:
                    error_fps.append(pdf_fp)
                continue

            gc.collect()

        if len(valid_ids[pdf_id]["id"]) == 0:
            del valid_ids[pdf_id]
        gc.collect()

### save annotation

In [None]:
# Folder receiving the pickled annotations. The path contains a literal
# backslash, written as "\\" because "\p" is not a valid escape sequence.
dump_dir = "/media/SUN-ASTERISK\\phan.huy.hoang/My Passport/phanhoang/data/PDF_converter/thuvienphapluat/custom_synthtext/annotations/"

In [None]:
# Word boxes are stored as float16. The list is rebuilt here so that this cell
# does not depend on a name that is never defined.
wordBB2 = [i.astype(np.float16) for i in wordBB]
# imnames = [i.split("/")[-1] for i in imnames]

with open(os.path.join(dump_dir, "train_wordBB.pkl"), "wb") as f:
    pickle.dump(wordBB2, f)

with open(os.path.join(dump_dir, "train_imnames.pkl"), "wb") as f:
    pickle.dump(imnames, f)

In [None]:
# Sanity check: reload one saved page and draw its stored word polygons.
random_id = random.choice(range(len(imnames)))
# NOTE(review): the random pick above is overridden, so the first sample is always shown.
random_id = 0

img_fp = os.path.join(save_img_fd, imnames[random_id])
assert os.path.exists(img_fp)
# Reverse the channel order of the loaded image (BGR -> RGB) for display.
img = cv2.imread(img_fp)[:, :, ::-1]
polys = wordBB[random_id]
print(polys.shape)

vis_img = img.copy()
# Reorder the axes so that each iteration yields one polygon of corner points.
for poly in polys.transpose(2, 1, 0):
    poly = poly.astype(int)
    cv2.polylines(vis_img, [poly], True, (255, 0, 0), thickness=1)

imshow(vis_img)