In [12]:
from tqdm import tqdm 
import os
import numpy as np
import pandas as pd
from PIL import Image


In [41]:
def parse_dgrl(root_dir, image_dir='./dataset/chinese_lines/',
               csv_path='../dataset/chinese_lines.csv'):
    """Split every DGRL page file in ``root_dir`` into text-line PNGs and a label CSV.

    For each file the header is parsed, then every text line is decoded:
    its GBK label is extracted and its bitmap is written to
    ``<image_dir>/<file stem>-<line index>.png``. All ``(file, label)`` pairs
    are finally written to ``csv_path`` (GBK encoded, no index column).

    Parameters
    ----------
    root_dir : str
        Directory whose entries are all read as DGRL files.
    image_dir : str, optional
        Directory receiving the per-line PNG files; created if missing.
        NOTE(review): the default is relative to ``./`` while the CSV default
        is relative to ``../`` - kept as in the original, confirm it is intended.
    csv_path : str, optional
        Destination of the label CSV; its parent directory is created if missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``file`` (PNG name without extension) and ``label``, one row per
        text line, ordered by sorted file name and then by line index.

    Raises
    ------
    UnicodeDecodeError
        If a label is not valid GBK after the ``0xFF`` bytes are removed.

    Notes
    -----
    For files that are not 8 bits per pixel, each row occupies
    ``(width + 7) // 8`` bytes and every stored byte becomes one pixel value;
    the bytes are not expanded into individual bits.
    NOTE(review): unpacking the bits is probably what a consumer wants - confirm.
    """
    os.makedirs(image_dir, exist_ok=True)
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    labels = []
    # Sorted so that the CSV row order does not depend on the file system.
    for file in tqdm(sorted(os.listdir(root_dir))):
        page_name = file.partition(".")[0]
        with open(os.path.join(root_dir, file), 'rb') as f:
            # --- file header -------------------------------------------------
            header_length = int.from_bytes(f.read(4), "little")
            # The header ends with 20 + 2 + 2 bytes that are read below; together
            # with the 4-byte length field that leaves header_length - 28 to skip.
            f.read(header_length - 28)
            f.read(20)  # code type string, not needed
            code_length = int.from_bytes(f.read(2), "little")
            bits_per_pixel = int.from_bytes(f.read(2), "little")
            grayscale = bits_per_pixel == 8

            f.read(4)  # page image height, not needed
            f.read(4)  # page image width, not needed
            num_lines = int.from_bytes(f.read(4), "little")

            # --- text lines --------------------------------------------------
            for line_idx in range(num_lines):
                char_number = int.from_bytes(f.read(4), "little")
                raw_label = f.read(code_length * char_number).replace(b'\xff', b'')
                # Codes are fixed width, so narrow characters carry NUL padding.
                label = str(raw_label, "gbk").replace('\x00', '')

                f.read(4)  # first line coordinate, not needed
                f.read(4)  # second line coordinate, not needed
                height = int.from_bytes(f.read(4), "little")
                width = int.from_bytes(f.read(4), "little")
                # Integer division: a float here breaks both the array shape and
                # the byte count for non-grayscale files.
                row_bytes = width if grayscale else (width + 7) // 8

                # One bulk read instead of one read() call per pixel. A short
                # read (truncated file) leaves the missing pixels at zero, which
                # is what the byte-wise loop produced as well.
                expected = height * row_bytes
                raw_pixels = f.read(expected)
                pixels = np.zeros(expected, dtype=np.uint8)
                pixels[:len(raw_pixels)] = np.frombuffer(raw_pixels, dtype=np.uint8)
                image = pixels.reshape(height, row_bytes)

                PIL_image = Image.fromarray(image).convert('RGB')
                file_name = page_name + "-" + str(line_idx)
                PIL_image.save(os.path.join(image_dir, file_name + '.png'))
                labels.append((file_name, label))

    d = pd.DataFrame(labels, columns=['file', 'label'])
    d.to_csv(csv_path, index=False, encoding='gbk')
    return d

In [43]:
# Convert all pages under the HWDB2.0 training directory into line images and
# a label CSV. The returned DataFrame is not used in this cell.
dgrl_train_dir = '../dataset/HWDB2.0Train'
parse_dgrl(dgrl_train_dir)
print("Success!")

100%|██████████████████████████████████████████████████████████████████████████████| 1677/1677 [47:21<00:00,  1.69s/it]


Unnamed: 0,file,label
0,001-P16-0,"2002年以来,国内企业家包括许多著名企业家在内涉嫌违法犯罪被捕入"
1,001-P16-1,"狱的人数不断增多,此方面的报道也屡屡见诸极端。不是哪个被抓了,就"
2,001-P16-2,"是哪个被判了,或者是这个案子开庭了,那个案子判决了。总之,几乎月月都有这样"
3,001-P16-3,的新闻。
4,001-P16-4,"企业家落马、判刑、入狱、甚至犯死罪被执行死刑了,媒体关注的焦点"
...,...,...
16353,420-P20-3,"2007年暑运自7月1日起至8月31日止,共计62天。北京铁路局根"
16354,420-P20-4,客流调查对暑运客流情况进行预测。预计暑运期间将发送旅客
16355,420-P20-5,"人次,比去年同期增加283万人次,暑运高峰日将达到58万人次,比"
16356,420-P20-6,年同期高峰日增加5万人次。暑期客流以学生、旅游观光、休假疗
