## Set-up environment

In [None]:
# Install transformers directly from the GitHub repository (no version or commit is pinned).
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 120 kB 41.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 63.0 MB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone


In [None]:
# Extra packages: `datasets` supplies `load_metric`, used in the evaluation section below.
# `jiwer` is presumably required by the CER metric — TODO confirm.
!pip install -q datasets jiwer

[K     |████████████████████████████████| 365 kB 31.6 MB/s 
[K     |████████████████████████████████| 115 kB 68.5 MB/s 
[K     |████████████████████████████████| 212 kB 53.9 MB/s 
[K     |████████████████████████████████| 127 kB 66.2 MB/s 
[K     |████████████████████████████████| 1.4 MB 73.9 MB/s 
[K     |████████████████████████████████| 1.6 MB 72.0 MB/s 
[K     |████████████████████████████████| 104 kB 80.0 MB/s 
[?25h

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load IAM test set

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Line-level OCR dataset that pairs each image with its tokenized text.

    Args:
        root_dir: Directory that contains the image files.
        df: DataFrame with a ``file_name`` and a ``text`` column. Rows are
            addressed by position, so the index need not be ``0..n-1``.
        processor: Callable image processor that also exposes a ``tokenizer``
            attribute (e.g. a ``TrOCRProcessor``).
        max_target_length: Fixed length of every label sequence. Shorter
            texts are padded, longer texts are truncated.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for row ``idx``.

        ``labels`` has length ``max_target_length``; padding positions hold
        ``-100`` instead of the tokenizer's PAD id.
        """
        import os  # local import so the class does not depend on another cell

        # positional lookup: works even when the DataFrame index was not reset
        file_name = self.df['file_name'].iloc[idx]
        text = self.df['text'].iloc[idx]
        # some file names end with jp instead of jpg, the two lines below fix this
        if file_name.endswith('jp'):
            file_name = file_name + 'g'
        # prepare image (i.e. resize + normalize); the context manager closes
        # the file handle as soon as the RGB copy has been made
        with Image.open(os.path.join(self.root_dir, file_name)) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text; truncation keeps every
        # label sequence at exactly max_target_length so samples can be batched
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_target_length,
                                          truncation=True).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # squeeze(0) removes only the batch axis added by return_tensors="pt"
        encoding = {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}
        return encoding

In [None]:
# 2 Load dataset
import pandas as pd

path = '/content/drive/MyDrive/TrsOCR_utorial/training-data-ex/lines.txt'

# Read the fixed-width file, name columns 0 and 8, and discard columns 1-7.
df = (
    pd.read_fwf(path, header=None)
    .rename(columns={0: "file_name", 8: "text"})
    .drop(columns=[1, 2, 3, 4, 5, 6, 7])
)
# a few file names are cut off at 'jp'; complete them to 'jpg'
df['file_name'] = df['file_name'].map(lambda name: name + 'g' if name.endswith('jp') else name)
df.head()

Unnamed: 0,file_name,text
0,RALK987_1865_817_120_001-001,Csengery|Antal|r.|t.
1,RALK987_1865_817_120_001-002,Szabó|József|l.|t.
2,RALK987_1865_817_120_001-003,Tekintetes|úr!
3,RALK987_1865_817_120_001-004,Szilágyi|István|lev.|tagnak
4,RALK987_1865_817_120_001-005,az|e|havi|a|folyó|hó|22+én


In [None]:
def clean_text(input_text: str) -> str:
    """Return ``input_text`` with every '+' turned into '-' and every '|' into a space."""
    substitutions = str.maketrans({'+': '-', '|': ' '})
    return input_text.translate(substitutions)

In [None]:
def load_laia(lines_path=None) -> pd.DataFrame:
    """Parse the ground-truth lines file into a ``file_name`` / ``text`` DataFrame.

    Every non-blank line must hold nine space-separated fields: the image
    name (without extension), seven fields that are ignored, and the
    transcription. The transcription is passed through ``clean_text`` and
    ``'.jpg'`` is appended to the image name.

    Args:
        lines_path: File to read. Defaults to the notebook-level ``path``
            variable when omitted.

    Returns:
        DataFrame with the columns ``file_name`` and ``text``.

    Raises:
        ValueError: If a non-blank line has fewer than nine fields.
    """
    train_text = path if lines_path is None else lines_path
    print(train_text)

    data = []
    # explicit encoding: the transcriptions contain accented characters
    with open(train_text, encoding='utf-8') as infile:
        for line_no, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            # maxsplit keeps a transcription intact even if it contains spaces
            fields = line.split(' ', 8)
            if len(fields) != 9:
                raise ValueError(
                    f"{train_text}:{line_no}: expected 9 space-separated fields, got {len(fields)}"
                )
            data.append((fields[0] + '.jpg', clean_text(fields[8])))

    return pd.DataFrame(data, columns=['file_name', 'text'])

In [None]:
from torch.utils.data.dataloader import DataLoader
from sklearn.model_selection import train_test_split
# --------------------------------------------------
def create_datasets(df: pd.DataFrame,
                    root_dir='/content/drive/MyDrive/TrsOCR_utorial/training-data-ex/img/',
                    batch_size=8,
                    processor=None):
    """Split ``df`` 90/10 and return ``(train_loader, eval_loader)``.

    Each split is wrapped in an ``IAMDataset`` and then in a ``DataLoader``.
    Neither loader shuffles.

    Args:
        df: DataFrame with ``file_name`` and ``text`` columns.
        root_dir: Directory that contains the line images.
        batch_size: Number of samples per batch for both loaders.
        processor: Processor handed to ``IAMDataset``. When omitted, the
            notebook-level ``processor`` variable is used.

    Returns:
        Tuple ``(train_loader, eval_loader)`` of ``DataLoader`` objects.

    Raises:
        NameError: If no processor is passed and none is defined globally.
    """
    if processor is None:
        # the parameter shadows the global of the same name, so look it up explicitly
        processor = globals().get('processor')
        if processor is None:
            raise NameError("create_datasets needs a processor: pass one or define `processor` first")

    train_df, test_df = train_test_split(df, test_size=0.1, random_state=42069)
    # we reset the indices to start from zero
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # DataLoader must wrap a Dataset; given the bare directory string it
    # would iterate over the characters of the path
    train_dataset = DataLoader(IAMDataset(root_dir=root_dir, df=train_df, processor=processor),
                               batch_size=batch_size)

    eval_dataset = DataLoader(IAMDataset(root_dir=root_dir, df=test_df, processor=processor),
                              batch_size=batch_size)

    return train_dataset, eval_dataset

In [None]:
# TrOCRProcessor is imported here because this cell runs before the cell
# further down that imports it; without this line a fresh kernel raises NameError.
from transformers import TrOCRProcessor

# Processor (image preprocessing + tokenizer) for the handwritten TrOCR checkpoint.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
# Re-parse the lines file with load_laia(); this replaces the `df` built earlier.
df = load_laia()
df.head()

/content/drive/MyDrive/TrsOCR_utorial/training-data-ex/lines.txt


Unnamed: 0,file_name,text
0,RALK987_1865_817_120_001-001.jpg,Csengery Antal r. t.
1,RALK987_1865_817_120_001-002.jpg,Szabó József l. t.
2,RALK987_1865_817_120_001-003.jpg,Tekintetes úr!
3,RALK987_1865_817_120_001-004.jpg,Szilágyi István lev. tagnak
4,RALK987_1865_817_120_001-005.jpg,az e havi a folyó hó 22-én


In [None]:
# Build the train/eval pair; note that create_datasets returns DataLoader objects despite the *_dataset names.
train_dataset, eval_dataset = create_datasets(df)

In [None]:
# from torch.utils.data import DataLoader

# test_dataloader = DataLoader(test_dataset, batch_size=8)

In [None]:
# TODO: split data set in folder

In [None]:
# Pull the first batch from the evaluation loader for inspection.
batch = next(iter(eval_dataset))

In [None]:
# Display the batch fetched in the previous cell.
batch

['/', 'c', 'o', 'n', 't', 'e', 'n', 't']

In [None]:
# Print the shape of every entry; this requires `batch` to be a dict whose values have a `.shape`.
for k,v in batch.items():
  print(k, v.shape)

In [None]:
from transformers import TrOCRProcessor

# Processor for the handwritten TrOCR checkpoint; the cells below use its tokenizer and batch_decode.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

In [None]:
# Put the PAD id back wherever the labels hold -100, then decode the ids to text.
# The masked assignment modifies batch["labels"] in place.
labels = batch["labels"]
labels[labels == -100] = processor.tokenizer.pad_token_id
label_str = processor.batch_decode(labels, skip_special_tokens=True)
label_str

## Run evaluation

In [None]:
from transformers import VisionEncoderDecoderModel
import torch

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint and move it to the selected device.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.to(device)

In [None]:
from datasets import load_metric

# Character error rate metric; batches are added with add_batch() and the score is finalized with compute() below.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases — confirm against the installed version.
cer = load_metric("cer")

Downloading:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

In [None]:
from tqdm.notebook import tqdm

print("Running evaluation...")

# Iterate over the evaluation loader; each batch must provide "pixel_values" and "labels".
for batch in tqdm(eval_dataset):
    # predict using generate
    pixel_values = batch["pixel_values"].to(device)
    outputs = model.generate(pixel_values)

    # decode
    pred_str = processor.batch_decode(outputs, skip_special_tokens=True)
    labels = batch["labels"]
    # put the PAD id back where the labels hold -100 so the ids can be decoded (in-place on the batch tensor)
    labels[labels == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    # add batch to metric
    cer.add_batch(predictions=pred_str, references=label_str)

# aggregate over every batch added above; printed in the next cell
final_score = cer.compute()

Running evaluation...


  0%|          | 0/365 [00:00<?, ?it/s]

bert_model": "bert-base-multilingual-cased"

In [None]:
# Report the CER computed over all evaluation batches.
print("Character error rate on test set:", final_score)

0.038336078808735505
