In [1]:
import argparse
import glob
import logging
import os
import random
import shutil
import pandas as pd
from PIL import Image
from pathlib import Path

import numpy as np
import torch
from seqeval.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from tensorboardX import SummaryWriter
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
)

from StringUtils import *
from OCRUtils import *
from model import * 
from DataLoader import *


# Without a configured handler and level, the logger.info(...) progress
# messages emitted by the training cell are dropped (the effective level
# defaults to WARNING). basicConfig installs a stderr handler on the root
# logger if none exists; only this notebook's logger is lowered to INFO so
# third-party libraries do not become noisier.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s"
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

## Data preparation

In [19]:
# Load the pickled train / test annotation tables from the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load these files
# if they come from a trusted source.
SROIEtrain, SROIEtest = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./SROIE2019Train', './SROIE2019Test')
)

In [13]:
def attach_images(frame, image_dir, max_width=2000):
    """Return a copy of ``frame`` with an ``image`` column of PIL images.

    For every row, the file named by ``row['filename']`` is looked up inside
    ``image_dir`` with its extension replaced by ``.jpg``. Rows whose image is
    ``max_width`` pixels wide or wider are left out of the result.

    Args:
        frame: table with a ``filename`` column.
        image_dir: directory that holds the ``.jpg`` files.
        max_width: exclusive upper bound on the accepted image width.

    Returns:
        A new DataFrame (original index labels and dtypes preserved) holding
        only the accepted rows, plus the ``image`` column.
    """
    images = []
    keep_mask = []
    for filename in tqdm(frame['filename'], total=len(frame)):
        # Replace whatever extension the annotation carries with .jpg
        stem, _ = os.path.splitext(os.path.join(image_dir, filename))
        image = Image.open(stem + '.jpg')
        keep = image.size[0] < max_width
        if keep:
            # Image.open is lazy and keeps the file handle open; load() reads
            # the pixels now so the handle can be released.
            image.load()
            images.append(image)
        else:
            image.close()
        keep_mask.append(keep)

    # A positional boolean mask keeps dtypes intact and also works when no
    # row is accepted (pd.concat on an empty list would raise).
    with_images = frame.loc[np.array(keep_mask, dtype=bool)].copy()
    with_images['image'] = images
    return with_images


# image column for train set
SROIEtrainv1 = attach_images(SROIEtrain, './SROIE2019/train/img')

# image column for test set (the original loop iterated over the undefined
# name `SROIEentities`; the test frame loaded above is `SROIEtest`)
SROIEtestv1 = attach_images(SROIEtest, './SROIE2019/test/img')

347it [00:00, 1468.18it/s]


In [15]:
def drop_low_rank_images(frame):
    """Return ``frame`` without the rows whose image array has < 3 dimensions.

    Each value of the ``image`` column is converted with ``np.array``; rows
    whose array has fewer than three dimensions are reported on stdout and
    removed. The input frame is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. ``drop=True`` is used so
        the old labels are not injected as an extra ``index`` column, which
        also keeps this cell safe to re-run.
    """
    keep_mask = []
    for label, image in tqdm(frame['image'].items(), total=len(frame)):
        ndim = np.array(image).ndim
        if ndim < 3:
            print(label, f'has only {ndim} dimensions ')
        keep_mask.append(ndim >= 3)
    return frame.loc[np.array(keep_mask, dtype=bool)].reset_index(drop=True)


# drop rows with non-3D images from the train set
SROIEtrainv1 = drop_low_rank_images(SROIEtrainv1)

# drop rows with non-3D images from the test set
SROIEtestv1 = drop_low_rank_images(SROIEtestv1)

294it [00:01, 246.69it/s]


In [16]:
# normalized_boxes = []
# for indx, row in SROIEtrain.iterrows():
#     image = row['image']
#     width, height = image.size 

#     boxes = row['boxes']

#     normalized_boxes.append([normalize_box(box, width=width, height=height) for box in boxes])

# SROIEtrain['nboxes'] = normalized_boxes



# normalized_boxes = []
# for indx, row in SROIEtestv1.iterrows():
#     image = row['image']
#     width, height = image.size 

#     boxes = row['boxes']

#     normalized_boxes.append([normalize_box(box, width=width, height=height) for box in boxes])

# SROIEtestv1['nboxes'] = normalized_boxes



## Dataset preparation: PyTorch DataLoader and training arguments

In [None]:
from transformers import LayoutLMv3ImageProcessor

batch_size = 8
n_classes = 7

# Image processor with its built-in OCR switched off; it is only asked to
# resize and normalise the images.
feature_extractor = LayoutLMv3ImageProcessor(
    apply_ocr=False, do_normalize=True, do_resize=True
)
# NOTE(review): LayoutLMv3TokenizerFast, LayoutLMv3Processor,
# LayoutLMv3ForTokenClassification and TokenClassificationDataset are not
# imported explicitly in this notebook; presumably one of the star-imports in
# the first cell provides them — confirm.
# NOTE(review): max_length=520 looks larger than the 512-token window normally
# used with layoutlmv3-base — confirm it is intended.
tokenizer = LayoutLMv3TokenizerFast.from_pretrained(
    "microsoft/layoutlmv3-base", max_length=520
)
processor = LayoutLMv3Processor(feature_extractor, tokenizer)
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    num_labels=n_classes,
)

# NOTE(review): the datasets are built from the raw frames, not from
# SROIEtrainv1 / SROIEtestv1 prepared in the cells above (those are the ones
# carrying the 'image' column) — confirm which frames are intended here.
train_dataset = TokenClassificationDataset(SROIEtrain, processor)
eval_dataset = TokenClassificationDataset(SROIEtest, processor)

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
    collate_fn=None,
)

# Evaluation walks the dataset in its stored order so that runs are
# repeatable and predictions line up with the dataset rows.
eval_dataloader = DataLoader(
    eval_dataset,
    sampler=SequentialSampler(eval_dataset),
    batch_size=batch_size,
    collate_fn=None,
)

### Model parameters, arguments

In [27]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
weight_decay = 1e-5
learning_rate = 5e-5
adam_epsilon = 1e-8
warmup_steps = 10
# NOTE(review): t_total is a fixed number rather than being derived from
# len(train_dataloader), num_train_epochs and gradient_accumulation_steps; if
# the loop performs far fewer optimizer steps than this, the linear schedule
# will barely move past warm-up — confirm the value.
t_total = 3000
num_train_epochs = 5
gradient_accumulation_steps = 50
# Fall back to the CPU when no CUDA device is available instead of failing
# later on the first .to(device) call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42

# The training loop moves every batch to `device`, so the model has to live
# there as well; do it before the optimizer captures the parameters.
model.to(device)

# Parameters whose name contains one of the `no_decay` fragments are excluded
# from weight decay; every other parameter uses `weight_decay`.
named_parameters = list(model.named_parameters())
decayed_params = [
    p for n, p in named_parameters if not any(nd in n for nd in no_decay)
]
undecayed_params = [
    p for n, p in named_parameters if any(nd in n for nd in no_decay)
]
optimizer_grouped_parameters = [
    {"params": decayed_params, "weight_decay": weight_decay},
    {"params": undecayed_params, "weight_decay": 0.0},
]
optimizer = AdamW(
    optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon
)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

NameError: name 'model' is not defined

In [None]:
# Train!
#
# The original cell was a function body pasted at module level: it ended in a
# bare `return` (a SyntaxError outside a function) and read many names that no
# cell defines (model_type, n_gpu, fp16, amp, local_rank, args, tb_writer, ...).
# It is restored here as a single-process `train` function plus an explicit
# call; the multi-GPU / fp16 / distributed branches that depended on those
# undefined names are left out.


def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def _batch_to_inputs(batch, device):
    """Turn one dataloader batch into keyword arguments for ``model(...)``.

    NOTE(review): the item layout produced by TokenClassificationDataset is
    not visible in this notebook. Mapping batches are forwarded key-by-key
    (tensor values only); otherwise the positional layout read by the original
    loop is used (0: input_ids, 1: attention_mask, 3: labels, 4: bbox when
    present) — confirm against the dataset class.
    """
    if hasattr(batch, "items"):
        return {
            key: value.to(device)
            for key, value in batch.items()
            if torch.is_tensor(value)
        }
    inputs = {
        "input_ids": batch[0].to(device),
        "attention_mask": batch[1].to(device),
        "labels": batch[3].to(device),
    }
    if len(batch) > 4:
        inputs["bbox"] = batch[4].to(device)
    return inputs


def train(
    model,
    train_dataloader,
    optimizer,
    scheduler,
    device,
    num_train_epochs,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
    logging_steps=0,
    save_steps=0,
    max_steps=-1,
    output_dir="./checkpoints",
    tokenizer=None,
    tb_writer=None,
    evaluate_fn=None,
    seed=42,
):
    """Run the training loop with gradient accumulation.

    Args:
        model: module called as ``model(**inputs)``; element 0 of its output
            is used as the loss.
        train_dataloader: iterable of batches (see ``_batch_to_inputs``).
        optimizer, scheduler: stepped once per accumulated batch group.
        device: device the model and every batch are moved to.
        num_train_epochs: number of passes over ``train_dataloader``.
        gradient_accumulation_steps: batches accumulated per optimizer step.
        max_grad_norm: gradient-norm clipping threshold.
        logging_steps: write scalars every N optimizer steps (0 disables).
        save_steps: write a checkpoint every N optimizer steps (0 disables).
        max_steps: stop once ``global_step`` exceeds this value (<= 0 disables).
        output_dir: parent directory of the ``checkpoint-<step>`` folders.
        tokenizer: saved next to the model in each checkpoint when given.
        tb_writer: object with ``add_scalar(tag, value, step)``; optional.
        evaluate_fn: optional callable ``evaluate_fn(model) -> dict`` whose
            items are logged as ``eval_<key>`` scalars.
        seed: value passed to ``set_seed`` right before the loop starts.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the
        accumulated loss divided by the number of optimizer steps (by 1 when
        no optimizer step was taken, instead of dividing by zero).
    """
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.to(device)  # batches are moved to `device`, so the model must be too
    model.zero_grad()
    train_iterator = trange(int(num_train_epochs), desc="Epoch", disable=False)
    set_seed(seed)  # Added here for reproducibility

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            outputs = model(**_batch_to_inputs(batch, device))
            loss = outputs[0]

            # Scale so the accumulated gradient matches one large batch.
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if logging_steps > 0 and global_step % logging_steps == 0:
                    # Log metrics
                    if tb_writer is not None:
                        if evaluate_fn is not None:
                            for key, value in evaluate_fn(model).items():
                                tb_writer.add_scalar(
                                    "eval_{}".format(key), value, global_step
                                )
                        # get_last_lr() replaces the deprecated get_lr()
                        tb_writer.add_scalar(
                            "lr", scheduler.get_last_lr()[0], global_step
                        )
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / logging_steps,
                            global_step,
                        )
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint. A separate name is used so that
                    # `output_dir` is not overwritten and successive
                    # checkpoints do not nest inside each other.
                    checkpoint_dir = os.path.join(
                        output_dir, "checkpoint-{}".format(global_step)
                    )
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(checkpoint_dir)
                    if tokenizer is not None:
                        tokenizer.save_pretrained(checkpoint_dir)
                    torch.save(
                        {
                            "num_train_epochs": num_train_epochs,
                            "gradient_accumulation_steps": gradient_accumulation_steps,
                            "max_grad_norm": max_grad_norm,
                            "seed": seed,
                        },
                        os.path.join(checkpoint_dir, "training_bin"),
                    )
                    logger.info("Saving model checkpoint to %s", checkpoint_dir)

            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / max(global_step, 1)


# Settings the loop needs that no earlier cell defines. The values below are
# starting points to tune, not values recovered from the original notebook.
max_grad_norm = 1.0
logging_steps = 1
save_steps = 0  # 0 = do not write checkpoints
max_steps = -1  # <= 0 = no early stop
output_dir = "./checkpoints"

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", num_train_epochs)
logger.info(
    "  Total train batch size (w. parallel, distributed & accumulation) = %d",
    batch_size
    * gradient_accumulation_steps,
)
logger.info("  Gradient Accumulation steps = %d", gradient_accumulation_steps)
logger.info("  Total optimization steps = %d", t_total)

tb_writer = SummaryWriter()
try:
    global_step, average_loss = train(
        model,
        train_dataloader,
        optimizer,
        scheduler,
        device,
        num_train_epochs=num_train_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grad_norm=max_grad_norm,
        logging_steps=logging_steps,
        save_steps=save_steps,
        max_steps=max_steps,
        output_dir=output_dir,
        tokenizer=tokenizer,
        tb_writer=tb_writer,
        seed=seed,
    )
finally:
    tb_writer.close()

global_step, average_loss