## Finetune LayoutLMv2 for document classification

### This notebook is intended to be run on Kaggle with a GPU accelerator enabled (use the P100 GPU)
#### Note: don't use T4 x2, as multi-GPU training gives errors

In [None]:
# Remove the log directory left over from a previous training run.
# NOTE(review): presumably this is where logging_dir="./logs" (set in TrainingArguments
# further down) resolves to — confirm the notebook's working directory.
!rm -rf /kaggle/working/logs

In [None]:
# Remove the model/checkpoint directory left over from a previous training run.
# NOTE(review): presumably this is where output_dir="./layoutlmv2-cls" (set in
# TrainingArguments further down) resolves to — confirm the notebook's working directory.
!rm -rf /kaggle/working/layoutlmv2-cls

In [1]:
# Install system packages, then clean the apt cache and package lists.
# NOTE(review): tesseract-ocr is presumably the OCR binary used through pytesseract, and
# gcc/g++/make/git are presumably needed to build detectron2 from source — confirm.
!apt-get update && apt-get install -y \
    gcc \
    g++ \
    make \
    git \
    tesseract-ocr \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease                                              
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]                             
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [79.8 kB]                 
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]                           
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]                                
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,804 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]                           
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,077 kB]                      
Get:

In [2]:
!pip install scikit-learn torch torchaudio torchvision transformers accelerate PyMuPDF pytesseract evaluate

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvid

In [3]:
!pip install 'git+https://github.com/facebookresearch/detectron2.git'

Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-j8fr48e6
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-j8fr48e6
  Resolved https://github.com/facebookresearch/detectron2.git to commit 0eeec6d5c0ae7e29028c3b976379f70c755209c4
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.8 (from detectron2==0.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting fvcore<0.1.6,>=0.1.5 (from detectron2==0.6)
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6)
  Downloading iopath-0.1.9-py3-none-any.whl.metadata (370 bytes)
Collecting hydra-core>=

In [4]:
import os
import fitz
import io
from PIL import Image

# Root folder of the Kaggle dataset; each document class has its own sub-folder.
DATA_ROOT = "/kaggle/input/company-documents-dataset/CompanyDocuments"

# Class names in label-id order: the position in this tuple is the integer label.
CLASS_NAMES = (
    "inventory_monthly",
    "inventory_monthly_category",
    "invoices",
    "purchase_orders",
    "shipping_orders",
)

# Both mappings are derived from CLASS_NAMES so they cannot drift apart.
label2id = {name: idx for idx, name in enumerate(CLASS_NAMES)}
id2label = {idx: name for name, idx in label2id.items()}

# One folder per class, all under DATA_ROOT.
inventory_monthly_path = f"{DATA_ROOT}/inventory_monthly"
inventory_monthly_category_path = f"{DATA_ROOT}/inventory_monthly_category"
invoices_path = f"{DATA_ROOT}/invoices"
purchase_orders_path = f"{DATA_ROOT}/purchase_orders"
shipping_orders_path = f"{DATA_ROOT}/shipping_orders"

# One {"doc_path": ..., "label": ...} record per document; filled by add_data().
dataset = []

def add_data(folder_path, label):
    """Append one record per regular file in ``folder_path`` to ``dataset``.

    Args:
        folder_path: Directory to scan. Only its direct entries are looked at,
            and entries that are not regular files are skipped.
        label: Class name stored under the "label" key of every record.

    Entries are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the order of ``dataset`` — and with it
    the seeded train/eval split computed from ``dataset`` — could change from
    one run or machine to the next.
    """
    for filename in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, filename)
        if os.path.isfile(full_path):
            dataset.append({
                "doc_path": full_path,
                "label": label
            })



In [5]:
# Start from an empty list so this cell can be re-run safely: add_data() only
# appends, so a second run would otherwise add every document again, and the
# duplicates could end up on both sides of the train/eval split.
dataset.clear()

# add inventory_monthly data
add_data(inventory_monthly_path, "inventory_monthly")

# add inventory_monthly_category data
add_data(inventory_monthly_category_path, "inventory_monthly_category")

# add invoices data
add_data(invoices_path, "invoices")

# add purchase_orders data
add_data(purchase_orders_path, "purchase_orders")

# add shipping_orders data
add_data(shipping_orders_path, "shipping_orders")

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for evaluation. The split is seeded and stratified
# on the class name so each class keeps the same share in both parts.
stratify_labels = [record["label"] for record in dataset]
train_data, eval_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)


In [7]:
import torch
from transformers import LayoutLMv2Processor
from PIL import Image
from torch.utils.data import Dataset

# Processor loaded from the "microsoft/layoutlmv2-base-uncased" checkpoint.
# preprocess() below reads this module-level name to encode each rendered page.
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

def preprocess(sample):
    """Render the first page of a PDF and encode it with the module-level processor.

    Args:
        sample: Dict with "doc_path" (path of the PDF to open) and "label"
            (a key of ``label2id``).

    Returns:
        Dict holding every entry of the processor's encoding plus "labels"
        (the integer id of the sample's label), each passed through
        ``squeeze(0)``.
    """
    # Open the PDF in a context manager so it is closed as soon as the page has
    # been rendered, instead of staying open until the object is collected.
    # This function runs on every dataset item access.
    with fitz.open(sample["doc_path"]) as doc:
        # Only the first page is used.
        page = doc.load_page(0)
        pix = page.get_pixmap(dpi=200)

        # Convert the pixmap to PNG bytes while the document is still open.
        img_bytes = pix.tobytes("png")

    # Load into PIL.Image
    image = Image.open(io.BytesIO(img_bytes))

    # Pad/truncate to a fixed length of 512 so every sample has the same size.
    encoding = processor(image, return_tensors="pt", truncation=True, padding="max_length", max_length=512)

    # Add label
    encoding["labels"] = torch.tensor(label2id[sample["label"]])

    # NOTE(review): squeeze(0) presumably drops the batch dimension added by
    # return_tensors="pt" for a single image — confirm.
    return {k: v.squeeze(0) for k, v in encoding.items()}


class DocumentDataset(Dataset):
    """Dataset over sample records that encodes a document on each index lookup.

    ``data`` is an indexable collection of sample records. Nothing is encoded
    at construction time; each lookup passes the record at that index to the
    module-level ``preprocess`` function.
    """

    def __init__(self, data, processor):
        # The processor is kept on the instance, but __getitem__ does not read
        # it: encoding is delegated to preprocess().
        self.data, self.processor = data, processor

    def __len__(self):
        """Return the number of sample records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the result of ``preprocess`` for the record at ``idx``."""
        record = self.data[idx]
        return preprocess(record)
    

# Wrap both splits. The constructor only stores its arguments; documents are
# rendered and encoded when an item is accessed.
train_dataset = DocumentDataset(train_data, processor)
eval_dataset = DocumentDataset(eval_data, processor)

2025-07-06 09:04:09.086233: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751792649.272400      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751792649.321627      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

In [8]:
from transformers import LayoutLMv2ForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head sized to the
# number of labels, and pass both label mappings along. As the warning in this
# cell's output states, the classifier weights are newly initialised, so the
# model has to be trained before it is used for predictions.
model = LayoutLMv2ForSequenceClassification.from_pretrained(
    "microsoft/layoutlmv2-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


pytorch_model.bin:   0%|          | 0.00/802M [00:00<?, ?B/s]

Some weights of LayoutLMv2ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/802M [00:00<?, ?B/s]

In [10]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
# Accuracy metric object, loaded once and reused by compute_metrics().
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into the model's prediction scores and the reference
    label ids. The index of the highest score along axis 1 is taken as the
    predicted class. The metric's result is printed and returned, so that it
    is reported alongside the other evaluation metrics.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    eval_accuracy = metric.compute(predictions=predicted_ids, references=references)
    print("eval_accuracy: ", eval_accuracy)
    return eval_accuracy

# Logging, evaluation and checkpointing share one interval, so every saved
# checkpoint has a matching evaluation to compare against.
STEPS_PER_EVAL = 26

training_args = TrainingArguments(
    output_dir="./layoutlmv2-cls",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_steps=STEPS_PER_EVAL, # log metrics every STEPS_PER_EVAL steps
    skip_memory_metrics=True, # skip reporting memory metrics for efficiency
    torch_empty_cache_steps=10, # release cached GPU memory held by PyTorch's allocator every 10 steps
    eval_strategy="steps",
    eval_steps=STEPS_PER_EVAL, # evaluate every STEPS_PER_EVAL steps
    load_best_model_at_end=True, # reload the best checkpoint once training finishes
    save_strategy="steps",
    save_steps=STEPS_PER_EVAL, # create a model checkpoint every STEPS_PER_EVAL steps
    # Limit how many checkpoints stay on disk. Per the TrainingArguments docs, with
    # load_best_model_at_end the best checkpoint is always retained, so the most
    # recent one may be kept next to it.
    save_total_limit=1,
    metric_for_best_model="eval_loss", # best model has lowest eval_loss
    greater_is_better=False, # lower eval_loss is better
    logging_dir="./logs",
    report_to=["tensorboard"],
    push_to_hub=False, # don't push to hub while training to avoid pushing overfitted model
    hub_model_id="", # hf model id
    # Read the Hub token from the environment instead of pasting it into the
    # notebook, where it would be saved and shared with the file. When HF_TOKEN
    # is unset this is None.
    # NOTE(review): the library then presumably falls back to the locally cached
    # login token — confirm against the TrainingArguments docs.
    hub_token=os.environ.get("HF_TOKEN"),
)

# The Trainer ties together the model, both datasets and the accuracy callback;
# the processor is passed as processing_class.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    processing_class=processor
)


In [11]:
# Start fine-tuning. With the TrainingArguments defined above, metrics are logged,
# the eval set is evaluated and a checkpoint is written every 26 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
26,0.7722,0.22491,0.921642
52,0.0828,0.045235,0.990672
78,0.026,0.04595,0.990672
104,0.0265,0.026703,0.990672
130,0.0263,0.006847,1.0
156,0.008,0.002586,1.0
182,0.0023,0.001386,1.0
208,0.0014,0.000905,1.0
234,0.0011,0.000804,1.0
260,0.0012,0.000757,1.0


eval_accuracy:  {'accuracy': 0.9216417910447762}
eval_accuracy:  {'accuracy': 0.9906716417910447}
eval_accuracy:  {'accuracy': 0.9906716417910447}
eval_accuracy:  {'accuracy': 0.9906716417910447}
eval_accuracy:  {'accuracy': 1.0}
eval_accuracy:  {'accuracy': 1.0}
eval_accuracy:  {'accuracy': 1.0}
eval_accuracy:  {'accuracy': 1.0}
eval_accuracy:  {'accuracy': 1.0}
eval_accuracy:  {'accuracy': 1.0}


TrainOutput(global_step=268, training_loss=0.09196773890246039, metrics={'train_runtime': 7770.6116, 'train_samples_per_second': 0.275, 'train_steps_per_second': 0.034, 'total_flos': 1155286567587840.0, 'train_loss': 0.09196773890246039, 'epoch': 1.0})

In [12]:
# Push the trained model to the Hugging Face Hub.
# NOTE(review): training was configured with load_best_model_at_end=True, so this
# presumably uploads the best checkpoint (lowest eval_loss) rather than the
# weights from the final step — confirm.
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/802M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/navodPeiris/layoutlmv2-document-classifier/commit/f4e0e4834fd2867e60903808507cc30c081076c2', commit_message='End of training', commit_description='', oid='f4e0e4834fd2867e60903808507cc30c081076c2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/navodPeiris/layoutlmv2-document-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='navodPeiris/layoutlmv2-document-classifier'), pr_revision=None, pr_num=None)