# Fine-tuning LayoutLM using the SROIE dataset

This is my first attempt to fine-tune the LayoutLM model.

I used base codes from:https://www.kaggle.com/code/urbikn/layoutlm-using-the-sroie-dataset

I improved the model performance by

1)k-fold cross-validation
2)improving the possibility of labeling S-TOTAL

# 1. Pre-processing the dataset

In [None]:
import os
import glob
import json 
import random
from pathlib import Path
from difflib import SequenceMatcher


import cv2
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from IPython.display import display
import matplotlib
from matplotlib import pyplot, patches

In [None]:
sroie_folder_path = Path('/kaggle/input/sroie-datasetv2/SROIE2019')
example_file = Path('X51005365187.txt')

In [None]:
def read_bbox_and_words(path: Path):
  bbox_and_words_list = []

  with open(path, 'r', errors='ignore') as f:
    for line in f.read().splitlines():
      if len(line) == 0:
        continue
        
      split_lines = line.split(",")

      bbox = np.array(split_lines[0:8], dtype=np.int32)
      text = ",".join(split_lines[8:])

      # From the splited line we save (filename, [bounding box points], text line).
      # The filename will be useful in the future
      bbox_and_words_list.append([path.stem, *bbox, text])
    
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = dataframe.drop(columns=['x1', 'y1', 'x3', 'y3'])

  return dataframe


# Example usage
bbox_file_path = sroie_folder_path / "test/box" / example_file
print("== File content ==")
!head -n 5 "{bbox_file_path}"

bbox = read_bbox_and_words(path=bbox_file_path)
print("\n== Dataframe ==")
bbox.head(5)

In [None]:
def read_entities(path: Path):
  with open(path, 'r') as f:
    data = json.load(f)

  dataframe = pd.DataFrame([data])
  return dataframe


# Example usage
entities_file_path = sroie_folder_path /  "test/entities" / example_file
print("== File content ==")
!head "{entities_file_path}"

entities = read_entities(path=entities_file_path)
print("\n\n== Dataframe ==")
entities

In [None]:
# Assign a label to the line by checking the similarity
# of the line and all the entities
def assign_line_label(line: str, entities: pd.DataFrame):
    line_set = line.replace(",", "").strip().split()
    for i, column in enumerate(entities):
        entity_values = entities.iloc[0, i].replace(",", "").strip()
        entity_set = entity_values.split()
        
        
        matches_count = 0
        for l in line_set:
            if any(SequenceMatcher(a=l, b=b).ratio() > 0.8 for b in entity_set):
                matches_count += 1
            
            if (column.upper() == 'ADDRESS' and (matches_count / len(line_set)) >= 0.5) or \
               (column.upper() != 'ADDRESS' and (matches_count == len(line_set))) or \
               matches_count == len(entity_set):
                return column.upper()

    return "O"


line = bbox.loc[1,"line"]
label = assign_line_label(line, entities)
print("Line:", line)
print("Assigned label:", label)

In [None]:
def assign_labels(words: pd.DataFrame, entities: pd.DataFrame):
    max_area = {"TOTAL": (0, -1), "DATE": (0, -1)}  # Value, index
    already_labeled = {"TOTAL": False,
                       "DATE": False,
                       "ADDRESS": False,
                       "COMPANY": False,
                       "O": False
    }

    # Go through every line in $words and assign it a label
    labels = []
    for i, line in enumerate(words['line']):
        label = assign_line_label(line, entities)

        already_labeled[label] = True
        if (label == "ADDRESS" and already_labeled["TOTAL"]) or \
           (label == "COMPANY" and (already_labeled["DATE"] or already_labeled["TOTAL"])):
            label = "O"

        # Assign to the largest bounding box
        if label in ["TOTAL", "DATE"]:
            x0_loc = words.columns.get_loc("x0")
            bbox = words.iloc[i, x0_loc:x0_loc+4].to_list()
            area = (bbox[2] - bbox[0]) + (bbox[3] - bbox[1])

            if max_area[label][0] < area:
                max_area[label] = (area, i)

            label = "O"

        labels.append(label)

    labels[max_area["DATE"][1]] = "DATE"
    labels[max_area["TOTAL"][1]] = "TOTAL"

    words["label"] = labels
    return words


# Example usage
bbox_labeled = assign_labels(bbox, entities)
bbox_labeled.head(15)

In [None]:
def split_line(line: pd.Series):
  line_copy = line.copy()

  line_str = line_copy.loc["line"]
  words = line_str.split(" ")

  # Filter unwanted tokens
  words = [word for word in words if len(word) >= 1]

  x0, y0, x2, y2 = line_copy.loc[['x0', 'y0', 'x2', 'y2']]
  bbox_width = x2 - x0
  

  new_lines = []
  for index, word in enumerate(words):
    x2 = x0 + int(bbox_width * len(word)/len(line_str))
    line_copy.at['x0', 'x2', 'line'] = [x0, x2, word]
    new_lines.append(line_copy.to_list())
    x0 = x2 + 5 

  return new_lines


# Example usage
new_lines = split_line(bbox_labeled.loc[1])
print("Original row:")
display(bbox_labeled.loc[1:1,:])

print("Splitted row:")
pd.DataFrame(new_lines, columns=bbox_labeled.columns)

In [None]:
from time import perf_counter
def dataset_creator(folder: Path):
  bbox_folder = folder / 'box'
  entities_folder = folder / 'entities'
  img_folder = folder / 'img'

  # Sort by filename so that when zipping them together
  # we don't get some other file (just in case)
  entities_files = sorted(entities_folder.glob("*.txt"))
  bbox_files = sorted(bbox_folder.glob("*.txt"))
  img_files = sorted(img_folder.glob("*.jpg"))

  data = []

  print("Reading dataset:")
  for bbox_file, entities_file, img_file in tqdm(zip(bbox_files, entities_files, img_files), total=len(bbox_files)):            
    # Read the files
    bbox = read_bbox_and_words(bbox_file)
    entities = read_entities(entities_file)
    image = Image.open(img_file)

    # Assign labels to lines in bbox using entities
    bbox_labeled = assign_labels(bbox, entities)
    del bbox

    # Split lines into separate tokens
    new_bbox_l = []
    for index, row in bbox_labeled.iterrows():
      new_bbox_l += split_line(row)
    new_bbox = pd.DataFrame(new_bbox_l, columns=bbox_labeled.columns, dtype=np.int16)
    del bbox_labeled


    # Do another label assignment to keep the labeling more precise 
    for index, row in new_bbox.iterrows():
      label = row['label']

      if label != "O":
        entity_values = entities.iloc[0, entities.columns.get_loc(label.lower())]
        entity_set = entity_values.split()
        
        if any(SequenceMatcher(a=row['line'], b=b).ratio() > 0.7 for b in entity_set):
            label = "S-" + label
        else:
            label = "O"
      
      new_bbox.at[index, 'label'] = label

    width, height = image.size
  
    data.append([new_bbox, width, height])

  return data

In [None]:
dataset_train = dataset_creator(sroie_folder_path / 'train')
dataset_test = dataset_creator(sroie_folder_path / 'test')

In [None]:
def normalize(points: list, width: int, height: int) -> list:
  x0, y0, x2, y2 = [int(p) for p in points]
  
  x0 = int(1000 * (x0 / width))
  x2 = int(1000 * (x2 / width))
  y0 = int(1000 * (y0 / height))
  y2 = int(1000 * (y2 / height))

  return [x0, y0, x2, y2]


def write_dataset(dataset: list, output_dir: Path, name: str):
  print(f"Writing {name}ing dataset:")
  with open(output_dir / f"{name}.txt", "w+", encoding="utf8") as file, \
       open(output_dir / f"{name}_box.txt", "w+", encoding="utf8") as file_bbox, \
       open(output_dir / f"{name}_image.txt", "w+", encoding="utf8") as file_image:

      # Go through each dataset
      for datas in tqdm(dataset, total=len(dataset)):
        data, width, height = datas
        
        filename = data.iloc[0, data.columns.get_loc('filename')]

        # Go through every row in dataset
        for index, row in data.iterrows():
          bbox = [int(p) for p in row[['x0', 'y0', 'x2', 'y2']]]
          normalized_bbox = normalize(bbox, width, height)

          file.write("{}\t{}\n".format(row['line'], row['label']))
          file_bbox.write("{}\t{} {} {} {}\n".format(row['line'], *normalized_bbox))
          file_image.write("{}\t{} {} {} {}\t{} {}\t{}\n".format(row['line'], *bbox, width, height, filename))

        # Write a second newline to separate dataset from others
        file.write("\n")
        file_bbox.write("\n")
        file_image.write("\n")

In [None]:
dataset_directory = Path('/kaggle/working','dataset')

dataset_directory.mkdir(parents=True, exist_ok=True)

write_dataset(dataset_train, dataset_directory, 'train')
write_dataset(dataset_test, dataset_directory, 'test')

# Creating the 'labels.txt' file to the the model what categories to predict.
labels = ['COMPANY', 'DATE', 'ADDRESS', 'TOTAL']
IOB_tags = ['S']
with open(dataset_directory / 'labels.txt', 'w') as f:
  for tag in IOB_tags:
    for label in labels:
      f.write(f"{tag}-{label}\n")
  # Writes in the last label O - meant for all non labeled words
  f.write("O")

In [None]:
tra=open('/kaggle/working/dataset/train.txt','r').readlines()
trabox=open('/kaggle/working/dataset/train_box.txt','r').readlines()
traimage=open('/kaggle/working/dataset/train_image.txt','r').readlines()

tst=open('/kaggle/working/dataset/test.txt','r').readlines()
tstbox=open('/kaggle/working/dataset/test_box.txt','r').readlines()
tstimage=open('/kaggle/working/dataset/test_image.txt','r').readlines()

new_train=open('/kaggle/working/dataset/train_new.txt','w')
new_train_box=open('/kaggle/working/dataset/train_box_new.txt','w')
new_train_image=open('/kaggle/working/dataset/train_image_new.txt','w')

tra.extend(tst)
trabox.extend(tstbox)
traimage.extend(tstimage)



for i, j, k in zip(tra,trabox,traimage):
    new_train.write(i)
    new_train_box.write(j)
    new_train_image.write(k)

print(len(tra),len(trabox),len(traimage))
print(tra[113043],trabox[113043],traimage[113043])
tmp=open('/kaggle/working/dataset/train_box_new.txt','r').readlines()
len(tmp)

In [None]:
# !cat /kaggle/working/dataset/train_new.txt

In [None]:
tmp=open('/kaggle/working/dataset/train_new.txt','r').readlines()
len(tmp)

In [None]:
!mv /kaggle/working/dataset/train.txt /kaggle/working/dataset/train1.txt 
!mv /kaggle/working/dataset/train_box.txt /kaggle/working/dataset/train1_box.txt 
!mv /kaggle/working/dataset/train_image.txt /kaggle/working/dataset/train1_image.txt 

In [None]:
!mv /kaggle/working/dataset/train_new.txt /kaggle/working/dataset/train.txt 
!mv /kaggle/working/dataset/train_box_new.txt /kaggle/working/dataset/train_box.txt 
!mv /kaggle/working/dataset/train_image_new.txt /kaggle/working/dataset/train_image.txt 

In [None]:
tmp=open('/kaggle/working/dataset/train_box.txt','r').readlines()
len(tmp)

# 2. Fine tune LayoutLM
1)downloaded and transformed our dataset into a trainable and testable set

2)clone the LayoutLM Github project which contains the script to fine tune the model.

In [None]:
%%bash
git clone https://github.com/microsoft/unilm.git
cd unilm/layoutlm/deprecated
pip install .

## Split training data based on Kfold from sklearn

In [None]:
!mkdir /kaggle/working/dataset/trainfold1
!mkdir /kaggle/working/dataset/trainfold2
!mkdir /kaggle/working/dataset/trainfold3
!mkdir /kaggle/working/dataset/trainfold4
!mkdir /kaggle/working/dataset/trainfold5

In [None]:
!mkdir /kaggle/working/dataset/trainmodel1
!mkdir /kaggle/working/dataset/trainmodel2
!mkdir /kaggle/working/dataset/trainmodel3
!mkdir /kaggle/working/dataset/trainmodel4
!mkdir /kaggle/working/dataset/trainmodel5

In [None]:
Ximage=open('/kaggle/working/dataset/train_image.txt','r').readlines()
dict_files={}
dict_len={}
ct=0
for i in Ximage:
    if i=='\n':
        ct=ct+1
        continue
    i=i.strip('\n')
    i=i.split('\t')
    if i[-1] not in dict_files:
        dict_files[i[-1]]=[]
        dict_files[i[-1]].append(ct)
        dict_len[i[-1]]=1
    else:
        dict_len[i[-1]]=dict_len[i[-1]]+1
        dict_files[i[-1]]=dict_files[i[-1]][:1]
        dict_files[i[-1]].append(ct)
    ct=ct+1
len(dict_files)
    

In [None]:
ct=0
for i in dict_files.keys():
    ct=ct+1
    print (i,dict_files[i])
    if ct>3:
        break
ct=0
for i in dict_len.keys():
    ct=ct+1
    print (i,dict_len[i])
    if ct>3:
        break


In [None]:
from sklearn.model_selection import KFold  # 从sklearn导入KFold包

kf = KFold(n_splits=5,random_state=1211,shuffle=True)
kf

In [None]:
list_=[]

for i in dict_files.keys():
    tmp=dict_files[i]
    list_.append(tmp)
np_files=np.array(list_)
np_files
# ct=0


In [None]:
# ct=0
# for X_train,X_test in kf.split(np_files):
#     ct=ct+1
#     print(X_train,X_test)
#     W1=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train.txt','w')
#     W2=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train_box.txt','w')
#     W3=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train_image.txt','w')
# #     for i,j,k in zip(X[int(X_train[0]):int(X_train[-1])],Xbox[int(X_train[0]):int(X_train[-1])],Ximage[int(X_train[0]):int(X_train[-1])]):
# #         print(ct)
#     for i in range(len(X_train)):
#         for j in range(np_files[int(X_train[i])][0],np_files[int(X_train[i])][-1]+2)
#         W1.write(X[j])
#         W2.write(Xbox[j])
#         W3.write(Ximage[j])
#     break

In [None]:
import numpy as np 
X=open('/kaggle/working/dataset/train.txt','r').readlines()
X_np = np.array(X) 

Xbox=open('/kaggle/working/dataset/train_box.txt','r').readlines()
Xbox_np = np.array(Xbox) 

Ximage=open('/kaggle/working/dataset/train_image.txt','r').readlines()
Ximage_np = np.array(Ximage) 
len(X_np),len(Xbox_np),len(Ximage_np)

In [None]:
ct=0
for X_train,X_test in kf.split(np_files):
    ct=ct+1
#     print(X_train,X_test)
    W1=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train.txt','w')
    W2=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train_box.txt','w')
    W3=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train_image.txt','w')
#     for i,j,k in zip(X[int(X_train[0]):int(X_train[-1])],Xbox[int(X_train[0]):int(X_train[-1])],Ximage[int(X_train[0]):int(X_train[-1])]):
#         print(ct)
    for i in range(len(X_train)):
        for j in range(np_files[int(X_train[i])][0],np_files[int(X_train[i])][-1]+2):
          W1.write(X[j])
          W2.write(Xbox[j])
          W3.write(Ximage[j])
W1.close()
W2.close()
W3.close()
#     break

In [None]:
X=open('/kaggle/working/dataset/trainfold1/train.txt','r').readlines()
X_np = np.array(X) 

Xbox=open('/kaggle/working/dataset/trainfold2/train_box.txt','r').readlines()
Xbox_np = np.array(Xbox) 

Ximage=open('/kaggle/working/dataset/trainfold3/train_image.txt','r').readlines()
Ximage_np = np.array(Ximage) 
# len(X_np),len(Xbox_np),len(Ximage_np)
X_np[82:90],Xbox_np[82:90],Ximage_np[82:90]

In [None]:
# ct=0
# for X_train,X_test in kf.split(X_np):
#     print(X_train,X_test)
#     ct=ct+1
#     W1=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train.txt','w')
#     W2=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train_box.txt','w')
#     W3=open('/kaggle/working/dataset/trainfold'+str(ct)+'/train_image.txt','w')
# #     for i,j,k in zip(X[int(X_train[0]):int(X_train[-1])],Xbox[int(X_train[0]):int(X_train[-1])],Ximage[int(X_train[0]):int(X_train[-1])]):
# #         print(ct)
#     for i in range(len(X_train)):
#         W1.write(X[int(X_train[i])])
#         W2.write(Xbox[int(X_train[i])])
#         W3.write(Ximage[int(X_train[i])])
# #     print()
# # len(X[int(X_train[0]):int(X_train[-1])])
# # type(int(X_train[0]))

# #     print(X_train,X_test)
      


In [None]:
tp=open('/kaggle/working/dataset/trainfold1/train.txt','r').readlines()
len(tp)

In [None]:
!cp ../../../../../../input/tst111/test.txt /kaggle/working/dataset
!cp ../../../../../../input/tst111/test_box.txt /kaggle/working/dataset
!cp ../../../../../../input/tst111/test_image.txt /kaggle/working/dataset

In [None]:
!cp ../../../../../../input/runseqr/run_seq_labeling_r.py /kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling
!cp ../../../../../../input/d/qingxiaoxu/runseq/run_seq_labeling.py /kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling

In [None]:
! rm -rf /kaggle/working/dataset/cached*

In [None]:
! rm -rf /kaggle/working/dataset/trainfold1/cached*

In [None]:
# pretrained_model_folder_input= sroie_folder_path / Path('layoutlm-base-uncased') # Define it so we can copy it into our working directory

# pretrained_model_folder=Path('/kaggle/working/layoutlm-base-uncased/') 
label_file=Path(dataset_directory, "labels.txt")

# Move to the script directory
os.chdir("/kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling")

In [None]:
! cp -r "{pretrained_model_folder}"/* /kaggle/working/dataset/trainmodel5
! cp -r "{pretrained_model_folder_input}"/* /kaggle/working/dataset/trainmodel5

! sed -i 's/"num_attention_heads": 16,/"num_attention_heads": 12,/' /kaggle/working/dataset/trainmodel5/config.json

In [None]:
ls /kaggle/working/dataset/trainfold5

In [None]:
# !cat /kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling/pred.txt

## Training

The following steps should be repeated 5 times loading data from /kaggle/working/dataset/trainfold1-5

In [None]:
! rm -rf /kaggle/working/dataset/trainfold5/cached*

In [None]:
ls /kaggle/working/dataset/trainmodel5

In [None]:
#current5
! python run_seq_labeling_r.py \
                            --data_dir /kaggle/working/dataset/trainfold5 \
                            --labels /kaggle/working/dataset/labels.txt \
                            --model_name_or_path /kaggle/working/dataset/trainmodel5 \
                            --model_type layoutlm \
                            --max_seq_length 512 \
                            --do_lower_case \
                            --do_train \
                            --num_train_epochs 11 \
                            --logging_steps 50 \
                            --save_steps -1 \
                            --output_dir output \
                            --overwrite_output_dir \
                            --per_gpu_train_batch_size 8 \
                            --per_gpu_eval_batch_size 16

In [None]:
ls /kaggle/working/dataset

## Predicting

In [None]:
!cp ../../../../../../input/d/qingxiaoxu/runseq/run_seq_labeling.py /kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling

In [None]:
!mv /kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling/run_seq_labeling.py  /kaggle/working/unilm/layoutlm/deprecated/examples/seq_labeling/run_seq_labeling1.py 

In [None]:
! rm -rf /kaggle/working/dataset/cached*

In [None]:
ls /kaggle/working/dataset/ 

In [None]:
# Evaluate for test set and make predictions
#current fold5
! python run_seq_labeling.py \
                            --data_dir /kaggle/working/dataset \
                            --labels /kaggle/working/dataset/labels.txt \
                            --model_name_or_path /kaggle/working/dataset/trainmodel5 \
                            --model_type layoutlm \
                            --do_lower_case \
                            --max_seq_length 512 \
                            --do_predict \
                            --logging_steps 10 \
                            --save_steps -1 \
                            --output_dir output \
                            --per_gpu_eval_batch_size 8

In [None]:
ls

In [None]:
out_label=np.array(np.load('out_label_ids.npy'))
ct=0
tp=[]
for i in out_label[-6]:
      ct=ct+1
      if i>0: 
        tp.append(ct)


In [None]:
!mv datasum.npy o_datasum.npy

In [None]:
!mv datasumnew.npy datasum.npy

I found the S-TOTAL label F1 extremly low. However, one of the five models trained from the folds are capable of detecting the actual label of S-TOTAL. 

I simply added 20 on those tokens that are of much higher possibility being identified as S-TOTAL than the first 3 labels. 

In [None]:
data1=np.array(np.load('save_preds1.npy'))
data2=np.array(np.load('save_preds2.npy'))
data3=np.array(np.load('save_preds3.npy'))
data4=np.array(np.load('save_preds4.npy'))
data5=np.array(np.load('save_preds5.npy'))
datasum=np.array(np.load('datasum.npy'))
# len(data1),len(data1[0])
for i_ds in range(len(datasum)):
    for i in range(len(tp)):
      if datasum[i_ds][tp[i]][-2]>10:
        datasum[i_ds][tp[i]][-2]=datasum[i_ds][tp[i]][-2]+20

np.save('datasumnew.npy',datasum)
#         print(i,data1[-6][tp[i]],'\n',i,data2[-6][tp[i]])
#         print(i,data3[-6][tp[i]],'\n',i,data4[-6][tp[i]])
#         print(i,data5[-6][tp[i]],'\n',i,datasum[-6][tp[i]])
    

In [None]:
# !rm -rf save_preds1.npy
# !rm -rf save_preds2.npy
# !rm -rf save_preds3.npy
# !rm -rf save_preds4.npy
# !rm -rf save_preds5.npy
# !rm -rf datasum.npy
# !rm -rf save_preds.npy

In [None]:
!mv save_preds.npy save_preds5.npy

In [None]:
data1=np.array(np.load('save_preds1.npy'))
data2=np.array(np.load('save_preds2.npy'))
data3=np.array(np.load('save_preds3.npy'))
data4=np.array(np.load('save_preds4.npy'))
data5=np.array(np.load('save_preds5.npy'))
data_sum =data1+data2+data3+data4+data5
# data_sum =data1+data2+data3+data4+data5
np.save('datasum.npy',data_sum)

In [None]:
ls output

In [None]:
cp output/test_predictions.txt /kaggle/working/