In [3]:
# YAML text describing the preprocessing pipeline applied to a single sample.
# It is parsed with PyYAML further down in this cell.
#   * `Global` is left empty, so it parses to None.
#   * `Train.dataset.transforms` is an ordered list of operator specs; each entry
#     maps an operator name to its keyword arguments.
#   * The final KeepKeys entry fixes the order of the values the pipeline
#     returns; later cells index the batch by position using this order.
# NOTE(review): mean/std are given on a 0-255 scale together with `scale: 1`;
# presumably the image is normalized without first dividing by 255 - confirm.
config = """
Global:

Train:
  dataset:
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: Bert
          class_path: train_data/XFUND/class_list_xfun.txt
          use_textline_bbox_info: True
          order_method: "tb-yx" # one of [None, "tb-yx"]
      - VQATokenPad:
          max_seq_len: 512
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: 512
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels'] # dataloader will return list in this order
"""

import yaml

# Replace the YAML text held in `config` with its parsed (nested dict) form.
config = yaml.load(config, Loader=yaml.FullLoader)

# The `Global` section has no entries in the YAML text, so this is None.
global_config = config['Global']

# Dataset settings for the Train split; the `transforms` list inside it is
# what the operator pipeline is built from.
dataset_config = config['Train']['dataset']

In [4]:
import json
import os
from ppocr.data.imaug import create_operators, transform
import paddle

# Locations of the images and the label file for the split used in this demo.
data_dir = "train_data/XFUND/zh_train/image"
label_file = "train_data/XFUND/zh_train/train.json"

# Decode the label file explicitly as UTF-8 instead of relying on the
# platform's default text encoding. The labels of this `zh` split presumably
# contain Chinese text, which would be mis-decoded (or raise) on systems whose
# locale encoding is not UTF-8.
with open(label_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Take the record at index 1 (the second line) as the single demo sample.
# Each record is "<image file name>\t<label string>".
line = lines[1].strip()
file_name, label = line.split('\t')
img_path = os.path.join(data_dir, file_name)

# The pipeline input is a dict carrying the raw (still encoded) image bytes
# under 'image' and the untouched label string under 'label'.
data = {'img_path': img_path, 'label': label}
with open(data['img_path'], 'rb') as f:
    img = f.read()
    data['image'] = img

# Build the operator pipeline from the parsed transform specs.
ops = create_operators(dataset_config['transforms'], global_config)

# Run the sample through the pipeline, turn every returned item into a CPU
# tensor, then add a leading axis of size 1 so the sample looks like a batch.
outs = transform(data, ops)
outs = [paddle.to_tensor(out, place='cpu') for out in outs]
batch = [paddle.unsqueeze(out, axis=0) for out in outs]

[32m[2022-12-15 00:35:59,926] [    INFO][0m - We are using <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'> to load 'bert-base-chinese'.[0m
[32m[2022-12-15 00:35:59,927] [    INFO][0m - Already cached /home/kaihong/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2022-12-15 00:35:59,935] [    INFO][0m - tokenizer config file saved in /home/kaihong/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2022-12-15 00:35:59,936] [    INFO][0m - Special tokens file saved in /home/kaihong/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m


In [5]:
# Summarize the batch container itself (the labels say "outs", but the object
# being inspected is `batch`).
print(f"type of outs: {type(batch)}")
print(f"len of outs: {len(batch)}")

# Entries are addressed by position; the names below repeat the keep_keys
# order from the transform config:
# [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
keep_keys = ('input_ids', 'bbox', 'attention_mask',
             'token_type_ids', 'image', 'labels')
for position, key in enumerate(keep_keys):
    print(f"shape of {key}: {batch[position].shape}")


type of outs: <class 'list'>
len of outs: 6
shape of input_ids: [1, 512]
shape of bbox: [1, 512, 4]
shape of attention_mask: [1, 512]
shape of token_type_ids: [1, 512]
shape of image: [1, 3, 224, 224]
shape of labels: [1, 512]


In [None]:
# from paddlenlp.transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("layoutxlm-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# input_ids = batch[0][0].numpy().tolist()
# print(tokenizer.decode(input_ids))

In [1]:
from ppocr.modeling.architectures import build_model
from ppocr.modeling.backbones import build_backbone

# Architecture description handed to build_model: a 'kie' model with no
# Transform stage and a 'BertForSer' backbone.
# NOTE(review): num_classes=7 presumably has to match the label set derived
# from class_list_xfun.txt - confirm against the label encoder.
config = dict(
    model_type='kie',
    Transform=None,
    Backbone=dict(
        name='BertForSer',
        pretrained=True,
        checkpoints=None,
        mode='base',
        num_classes=7,
    ),
)

model = build_model(config)

[32m[2022-12-15 00:35:47,782] [    INFO][0m - Already cached /home/kaihong/.paddlenlp/models/bert-base-chinese/bert-base-chinese.pdparams[0m
W1215 00:35:47.783761 1354768 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.5, Driver API Version: 12.0, Runtime API Version: 11.6
W1215 00:35:47.785861 1354768 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1.
[32m[2022-12-15 00:35:50,916] [    INFO][0m - Weights from pretrained model not used in BertModel: ['cls.predictions.decoder_weight', 'cls.predictions.decoder_bias', 'cls.predictions.transform.weight', 'cls.predictions.transform.bias', 'cls.predictions.layer_norm.weight', 'cls.predictions.layer_norm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'][0m


In [6]:
# Forward pass on the single-sample batch built earlier.
preds = model(batch)

# Inspect the returned container; the type line is printed twice, exactly as
# in the recorded output of this cell.
preds_type = type(preds)
print(f"type of preds: {preds_type}")
print(f"len of preds: {len(preds)}")
print(f"type of preds: {preds_type}")

# The output is read from the 'backbone_out' entry of the returned mapping.
backbone_out = preds['backbone_out']
print(f"shape of preds['backbone_out']: {backbone_out.shape}")



type of preds: <class 'dict'>
len of preds: 1
type of preds: <class 'dict'>
shape of preds['backbone_out']: [1, 512, 7]


In [None]:
from ppocr.postprocess import build_post_process

# Post-processor spec: the class to build plus the class-list file it reads.
config = dict(
    name="VQASerTokenLayoutLMPostProcess",
    class_path="train_data/XFUND/class_list_xfun.txt",
)
post_process_class = build_post_process(config, global_config)

# The post-processor is given the batch as numpy arrays rather than tensors.
batch_numpy = [tensor.numpy() for tensor in batch]

post_result = post_process_class(preds['backbone_out'], batch_numpy)
print(f"len of post_result: {len(post_result)}")

In [None]:
from ppocr.metrics import build_metric

# Metric spec: which metric class to build and which of its reported values
# is treated as the main indicator.
config = dict(name="VQASerTokenMetric", main_indicator="hmean")
eval_class = build_metric(config)

# Feed the post-processed result for this one batch to the metric object
# (presumably it accumulates internally - confirm), then read the result back.
eval_class(post_result, batch)

metric = eval_class.get_metric()
print(metric)