In [2]:
import os
import torch
import urllib
import zipfile
import pandas as pd

from PIL import Image
from tqdm import tqdm
from datasets import load_metric
from torch.utils.data import Dataset
from transformers import TrOCRProcessor
from transformers import Seq2SeqTrainer
from transformers import default_data_collator
from transformers import Seq2SeqTrainingArguments
from transformers import VisionEncoderDecoderModel
from sklearn.model_selection import train_test_split

# Загрузка данных

In [3]:
!mkdir data
urllib.request.urlretrieve('https://storage.yandexcloud.net/datasouls-ods/materials/46b7bb85/datasets.zip', 'data/datasets.zip')
with zipfile.ZipFile("data/datasets.zip", "r") as f:
    f.extractall("data/")

# Подготовка данных

In [4]:
filespath = 'data/train/'
filenames_ims = sorted(next(os.walk(filespath+'images'), (None, None, []))[2])
filenames_words = sorted(next(os.walk(filespath+'words'), (None, None, []))[2])
texts = []
for filename in tqdm((filenames_words)):
    with open(filespath +'/words/' + filename) as f:
        texts.append(f.readline())
df =  pd.DataFrame({'file_name': filenames_ims, 'text': texts})

100%|██████████| 6196/6196 [00:00<00:00, 46237.22it/s]


In [5]:
train_df, test_df = train_test_split(df[:1000], test_size=0.1, shuffle=True)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [6]:
class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        file_name = self.df['file_name'][idx]
        text = self.df['text'][idx]
        image = Image.open(self.root_dir + file_name).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(text, padding="max_length", max_length=self.max_target_length).input_ids
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

In [7]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-stage1')
train_dataset = IAMDataset(root_dir='data/train/images/', df=train_df, processor=processor)
eval_dataset = IAMDataset(root_dir='data/train/images/', df=test_df, processor=processor)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [8]:
encoding = train_dataset[0]
for k,v in encoding.items():
    print(k, v.shape)
labels = encoding['labels']
labels[labels == -100] = processor.tokenizer.pad_token_id
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

pixel_values torch.Size([3, 384, 384])
labels torch.Size([128])
колко лимоноф свѣжих а что шу


# Загрузка и обучение модели

In [9]:
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [11]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True, 
    output_dir="./models_dir",
    num_train_epochs=3,
    save_steps=10350000,
    eval_steps=1035,
    metric_for_best_model="cer",
    greater_is_better=False
)

In [12]:
cer_metric = load_metric("cer")
wer_metric = load_metric("wer")
def compute_string_acc(predictions, references):
  string_acc = 0
  for preds, refs in zip(predictions, references):
    if preds==refs:
      string_acc = string_acc + 1
  return string_acc/len(predictions)

  cer_metric = load_metric("cer")


In [13]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    # print(pred_str, label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    string_accuracy = compute_string_acc(predictions=pred_str, references=label_str)

    return {"cer": cer, "wer" : wer, "string acc" : string_accuracy}

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.image_processor_class,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 900
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2700
  Number of trainable parameters = 384864768


Step,Training Loss,Validation Loss,Cer,Wer,String acc
1035,3.7148,3.662571,0.846101,1.827027,0.01
2070,3.5829,3.614345,2.460663,4.920721,0.0


***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2700, training_loss=3.687672051323785, metrics={'train_runtime': 1308.9146, 'train_samples_per_second': 2.063, 'train_steps_per_second': 2.063, 'total_flos': 2.389217716784333e+18, 'train_loss': 3.687672051323785, 'epoch': 3.0})