<a href="https://colab.research.google.com/github/MSHQD/HWR/blob/main/model_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# Colab-only: mount Google Drive at /content/drive. The dataset archive and the model
# checkpoints used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
!pip install hwb



# **Settings**

In [None]:
!pip install transformers -q
!pip install -q datasets jiwer
!pip install sentencepiece -q

In [None]:
# Inspect the contents of the current working directory.
!ls -hl

In [None]:
!git clone https://github.com/microsoft/unilm
!cp unilm/trocr/data_aug.py data_aug.py

In [None]:
import cv2
import random
import json
from PIL import Image
import os
from torch.utils.data import Dataset, DataLoader
import torch, torchvision
import warnings
from skimage import io
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torchvision
from torch.nn.utils.rnn import pad_sequence

warnings.filterwarnings("ignore")
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import shutil
from tqdm import tqdm
from matplotlib import pyplot as plt
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms


In [None]:
import torch
from torch import nn

# Choose the compute device once; later cells move the model and batches to `device`.
if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Use the GPU and report what is available.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
SEED = 42


def set_seed(seed: int = 42, set_torch=True):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` and NumPy, and exports ``PYTHONHASHSEED``.
    When ``set_torch`` is true it also seeds PyTorch (CPU and CUDA) and puts
    cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Seed value applied to every generator.
        set_torch: Whether to seed / configure PyTorch as well.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    if not set_torch:
        return
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(SEED)

# **Global variables**

In [None]:
# --- Dataset locations on the local Colab disk ---
DATA_DIR = "/content/train_recognition_small"
TRAIN_DIR = f"{DATA_DIR}/images"
# TRAIN_DIR = '/content/data'

# --- Google Drive locations ---
PROJECT_DIR = '/content/drive/MyDrive'
MODEL_DIR = os.path.join(PROJECT_DIR, 'segmentation_models')

# --- Run configuration ---
# NOTE(review): the DataLoader cell further down uses its own `batch_size = 8`;
# TRAIN_BATCH_SIZE / VAL_BATCH_SIZE are not referenced there.
TRAIN_BATCH_SIZE, VAL_BATCH_SIZE = 10, 64
model_type = 'small'  # used in checkpoint directory names
img_size = (384, 384) #(256, 256) #

# **Load datasets**

In [None]:
import zipfile
# Unpack the dataset archive stored on Drive onto the local Colab disk.
zip1 = '/content/drive/MyDrive/train_recognition_small.zip'
extract_to1 = '/content/train_recognition_small'

with zipfile.ZipFile(zip1) as archive:
    archive.extractall(extract_to1)

In [None]:
# Load the label table and rename its columns to the image/label schema used below.
# NOTE(review): the path is relative, so this presumably relies on the working directory
# being /content (where the archive was extracted) — confirm.
hack_data = (
    pd.read_json('train_recognition_small/train_recognition_small/labels_small.json')
    .rename(columns={'file_name': 'image', 'text': 'label'})
)

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as a table
# instead of plain text.
hack_data.head()

In [None]:
from pathlib import Path

# Point every row at the extracted image file: keep only the file name of the original
# entry and prefix it with the (relative) images directory.
hack_data['image'] = [
    os.path.join('train_recognition_small/train_recognition_small/images', Path(original).name)
    for original in hack_data['image']
]

## Check for intersection between datasets (duplicate images)

In [None]:
def dhash(image, hashSize=8):
    """Compute a difference hash of ``image`` and return it as a Python int.

    The image is resized to ``(hashSize + 1) x hashSize`` and every pixel is
    compared with its left neighbour; each ``True`` comparison sets one bit of
    the result. The comparison is done on whatever channels ``image`` has, so
    for a multi-channel input every channel contributes its own bits.

    Args:
        image: Image array as accepted by ``cv2.resize``.
        hashSize: Number of rows (and horizontal comparisons per row).

    Returns:
        The hash as a non-negative integer.

    Raises:
        ValueError: If ``image`` is None, which is what ``cv2.imread`` returns
            for a missing or unreadable file.
    """
    if image is None:
        raise ValueError(
            "dhash() received None instead of an image "
            "(cv2.imread returns None when a file cannot be read)"
        )
    resized = cv2.resize(image, (hashSize + 1, hashSize))
    # True where a pixel is brighter than its left neighbour.
    diff = resized[:, 1:] > resized[:, :-1]
    return sum(1 << i for i, v in enumerate(diff.flatten()) if v)

In [None]:
# Hash every image of the table (with a progress bar) so that duplicates or overlap with
# another dataset can be found by comparing sets of hashes.
hack_hashes = [dhash(cv2.imread(path)) for path in tqdm(hack_data['image'])]

In [None]:
# second_stage_hashes = [dhash(cv2.imread(path)) for path in tqdm(second_stage_data['image'])]

In [None]:
# len(set(second_stage_hashes) | set(hack_hashes)), len(set(second_stage_hashes) & set(hack_hashes))

In [None]:
# Show the first sample whose label equals `s`.
# `.iloc[0]` raises IndexError when no row carries that label.
s = 'рядом'
plt.imshow(plt.imread(hack_data[hack_data['label'] == s].iloc[0]['image']))

In [None]:
# Distribution of the number of whitespace-separated words per label.
plt.hist([len(words) for words in map(str.split, hack_data['label'])])
plt.show()

# **Building VAL dataset**

In [None]:
# def get_one_words(df):
#   mask = [len(el.split()) == 1 or (len(el.split()) == 2 and (len(el[0]) < 3 or len(el[1] < 3))) for el in df['label']]
#   return df[np.array(mask)]

In [None]:
# Preview the table after the column rename and the image-path rewrite.
hack_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the shuffled samples for validation; the fixed random_state makes the
# split the same on every run.
df_train, df_val = train_test_split(hack_data, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# Size of the validation split right after train_test_split.
df_val.shape

In [None]:
# Keep only validation rows whose image path contains 'train_recognition_small'.
# Every path built above lives under that directory, so with the data loaded in this
# notebook the filter keeps all rows.
df_val = df_val.loc[['train_recognition_small' in path for path in df_val['image']]]

In [None]:
# Size of the validation split after the path filter above.
df_val.shape

In [None]:
def random_show(df):
    """Display one randomly sampled row of ``df``.

    The row's ``'image'`` path is read and shown with the row's ``'label'``
    as the plot title.
    """
    row = df.sample(n=1).iloc[0]
    plt.imshow(plt.imread(row['image']))
    plt.title(row['label'])
    plt.show()

In [None]:
# Visual sanity check: one random image from the full table with its label as the title.
random_show(hack_data)

In [None]:
# Final sizes of the training and validation frames.
df_train.shape, df_val.shape

In [None]:
# Distribution of label lengths in characters.
plt.hist(hack_data['label'].map(len))
plt.show()

# **Define dataset**

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    """Dataset of (image, text) rows prepared for a vision-encoder-decoder OCR model.

    Each item is a dict with
      * ``pixel_values``: the processed image tensor with the leading batch
        dimension added by the feature extractor removed, and
      * ``labels``: token ids of the text, padded/truncated to
        ``max_target_length`` with padding positions set to -100.

    Args:
        root_dir: Directory that the ``'image'`` column is joined onto.
        df: DataFrame with ``'image'`` (path) and ``'label'`` (text) columns.
        transforms: Callable invoked as ``transforms(image=...)`` that returns
            a mapping with an ``'image'`` entry (albumentations-style).
        tokenizer: Tokenizer used to encode the label text; must expose
            ``pad_token_id``.
        feature_extractor: Callable invoked as
            ``feature_extractor(image, return_tensors="pt")`` whose result has
            a ``pixel_values`` attribute.
        max_target_length: Fixed length of the label sequence.
    """

    def __init__(self, root_dir, df, transforms, tokenizer, feature_extractor, max_target_length=64):
        self.root_dir = root_dir
        self.df = df
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_target_length = max_target_length
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def _load_file(self, path):
        """Read the image at ``path`` and return it as an RGB array.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file (``cv2.imread``
                signals this by returning None instead of raising).
        """
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {path}")
        # OpenCV loads images as BGR; convert to RGB.
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def __getitem__(self, idx):
        el = self.df.iloc[idx]
        file_name = el['image']
        text = el['label']

        image = self._load_file(os.path.join(self.root_dir, file_name))
        image = self.transforms(image=image)['image']

        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values

        # Encode the text. Truncation is required in addition to padding: without it a
        # label longer than max_target_length yields a longer id list, and samples of
        # different lengths cannot be stacked into one batch.
        labels = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                max_length=self.max_target_length).input_ids

        # important: make sure that PAD tokens are ignored by the loss function
        pad_id = self.tokenizer.pad_token_id
        labels = [-100 if token == pad_id else token for token in labels]

        # squeeze(0) drops only the batch dimension added by the feature extractor.
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
# from augmixations import HandWrittenBlot
from hwb import HandWrittenBlot
import albumentations as A


class AlbuHandWrittenBlot(A.DualTransform):
    """Albumentations transform that runs a blot-drawing callable on the image.

    Args:
        hwb: Callable applied to the image array (here a ``HandWrittenBlot``).
        always_apply: Forwarded to the albumentations base class.
        p: Probability of applying the transform; forwarded to the base class.
    """

    def __init__(self, hwb, always_apply=False, p=0.5):
        super().__init__(always_apply, p)
        self.hwb = hwb

    def apply(self, image, **params):
        """Return the image with blots drawn by ``self.hwb``."""
        blotted = self.hwb(image)
        return blotted


class AlbuPadding(A.DualTransform):
    """Albumentations transform that adds black bands above and below the image.

    With the default ``pad=128`` a 128x384 image becomes 384x384, matching the
    previous hard-coded behaviour. The band now takes its width and channel
    count from the incoming image instead of assuming (128, 384, 3).

    Args:
        always_apply: Forwarded to the albumentations base class.
        p: Probability of applying the transform; forwarded to the base class.
        pad: Number of zero rows added on each of the top and bottom sides.
    """

    def __init__(self, always_apply=False, p=0.5, pad=128):
        super(AlbuPadding, self).__init__(always_apply, p)
        self.pad = pad

    def apply(self, image, **params):
        """Return ``image`` with ``self.pad`` zero rows stacked above and below, as uint8."""
        # The band matches the image in every dimension except height, and in dtype,
        # so concatenation works for any width / channel count.
        band = np.zeros((self.pad,) + tuple(image.shape[1:]), dtype=image.dtype)
        padded = np.concatenate([band, image, band], axis=0)
        return padded.astype(np.uint8)

In [None]:
# Configuration passed to HandWrittenBlot.
# NOTE(review): the value pairs are presumably (min, max) ranges, in pixels for the
# rectangle and unit-less for the blot parameters — confirm against the hwb documentation.
rectangle_info = {
    'x': (None, None),
    'y': (150, 220),
    'h': (60, 100),
    'w': (50, 80),
}

blot_params = {
    'incline': (-10, 10),
    'intensivity': (0.5, 0.9),
    'transparency': (0.05, 0.4),
    'count': (1, 3),
}

# Blot generator shared by the training augmentation pipeline below.
blots = HandWrittenBlot(rectangle_info, blot_params)

In [None]:
from albumentations.pytorch.transforms import ToTensorV2

# Augmentation pipelines keyed by split.
# Both resize to 128x384 and then pad with 128 zero rows above and below (AlbuPadding),
# giving 384x384 images.
data_transforms = {
    # Training adds random blots, a small rotation (-7..7 degrees) and, optionally,
    # one of grayscale conversion / CLAHE.
    'train': A.Compose([
              A.Resize(128, 384),
              AlbuPadding(always_apply=True),
              AlbuHandWrittenBlot(blots, p=0.3),
              A.Rotate(limit=[-7, 7]),
              # NOTE(review): 0.3 is passed positionally — presumably the probability `p`
              # of applying the OneOf group; confirm against the albumentations signature.
              A.OneOf([
                A.ToGray(always_apply=True),
                A.CLAHE(always_apply=True, clip_limit=15),
              ], 0.3)
          ]),
    # Validation uses only the deterministic resize + padding.
    'val': A.Compose([
              A.Resize(128, 384),
              AlbuPadding(always_apply=True),
          ]),
}

In [None]:
# Preview the training pipeline on a blank white 140x312 image.
plt.imshow(data_transforms['train'](image=np.full((140, 312, 3), 255.0))['image'])

# **Define model**

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Загружаем модель и процессор
# Load the model and processor
model = VisionEncoderDecoderModel.from_pretrained("raxtemur/trocr-base-ru")
processor = TrOCRProcessor.from_pretrained("raxtemur/trocr-base-ru")
# tokenizer = XLMRobertaTokenizer.from_pretrained('microsoft/trocr-small-handwritten')
# Convenience handles to the two halves of the processor.
feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer
# Start in inference mode; train_epoch() sets the mode explicitly for each phase.
model.eval()

# model = VisionEncoderDecoderModel.from_pretrained(f'microsoft/trocr-small-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained(f"/content/drive/MyDrive/НТИ ИИ /team/sergey_models/tr_ocr_best_small_aug_nti2data")

In [None]:
def set_requires_grad(model, value) :
  """Set ``requires_grad`` to ``value`` on every parameter of ``model`` (freeze / unfreeze)."""
  parameters = model.parameters()
  for tensor in parameters:
    tensor.requires_grad = value

In [None]:
# Move the model to the device selected earlier (GPU when available, otherwise CPU).
model = model.to(device)

In [None]:
# Make the decoder's token embedding table match the tokenizer's vocabulary size.
model.decoder.resize_token_embeddings(len(tokenizer))

# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
# (max_length matches the default max_target_length=64 of IAMDataset)
model.config.eos_token_id =  tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
# model.config.encoder.image_size = img_size
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [None]:
# Persist the tokenizer and the processor to Drive next to the model checkpoints.
# NOTE: the second call saves the whole `processor` (not only the feature extractor) into
# the directory whose name ends in "-feature-extractor".
tokenizer.save_pretrained(os.path.join(PROJECT_DIR, 'ocr_models', f'trocr-{model_type}-handwritten-tokenizer'))
processor.save_pretrained(os.path.join(PROJECT_DIR, 'ocr_models', f'trocr-{model_type}-handwritten-feature-extractor'))

# **Define loaders and try model**

In [None]:
from transformers import TrOCRProcessor

# Build the training and validation datasets.
# root_dir is './' because the 'image' column already holds paths relative to the
# working directory.
# NOTE: the full `processor` is passed as `feature_extractor`; IAMDataset only calls it
# with an image and reads `.pixel_values` from the result.
train_dataset = IAMDataset(root_dir='./',
                           df=df_train,
                           transforms=data_transforms['train'],
                           tokenizer=tokenizer,
                           feature_extractor=processor)

val_dataset = IAMDataset(root_dir='./',
                         df=df_val,
                         transforms=data_transforms['val'],
                         tokenizer=tokenizer,
                         feature_extractor=processor)

In [None]:
# Sanity-check the dataset sizes.
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(val_dataset))

In [None]:
from torch.utils.data import DataLoader
batch_size = 8
# Never request more worker processes than there are CPU cores: a fixed 16 oversubscribes
# small machines (PyTorch warns about this and loading can slow down or stall).
num_workers = min(16, os.cpu_count() or 1)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [None]:
# Pull the first validation batch and check the shape of its image tensor.
val_batch = next(iter(val_loader))
val_batch['pixel_values'].shape

In [None]:
# Move every tensor of the sample batch onto the selected device.
val_batch = {name: tensor.to(device) for name, tensor in val_batch.items()}

In [None]:
# %%time
# Smoke-test one forward pass (pixel_values + labels). Nothing is back-propagated here,
# so autograd is disabled to avoid building and holding an unused graph in memory.
with torch.no_grad():
    y = model(**val_batch)

In [None]:
from datasets import load_metric

# Character error rate metric, used by compute_cer() below.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases and the install
# cell does not pin a version — confirm this still works with the installed version.
cer_metric = load_metric("cer")

In [None]:
def compute_cer(pred_ids, label_ids):
    """Return the character error rate between generated ids and label ids.

    Both tensors are decoded with the module-level ``tokenizer`` (special
    tokens skipped) and scored with the module-level ``cer_metric``.

    NOTE: positions equal to -100 in the labels are overwritten with the pad
    token id on the NumPy view of ``label_ids``. When ``label_ids`` is a CPU
    tensor that view shares memory with it, so the caller's tensor is modified
    in place.

    Args:
        pred_ids: Tensor of generated token ids, one row per sample.
        label_ids: Tensor of label token ids with -100 at ignored positions.
    """
    predictions = pred_ids.cpu().numpy()
    references = label_ids.cpu().numpy()

    # -100 is not a valid token id; turn it back into padding before decoding.
    ignored = references == -100
    references[ignored] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(references, skip_special_tokens=True)

    return cer_metric.compute(predictions=pred_str, references=label_str)

# **Train model**

In [None]:
def plot_images(images_for_show):
  """Draw all given images side by side in a single row of one figure.

  Pixel values are clipped to [0, 1] before display; axis ticks are hidden.
  """
  figure = plt.figure(figsize=(16, 16))
  n_images = len(images_for_show)

  for position in range(n_images):
    # Subplot indices are 1-based.
    figure.add_subplot(1, n_images, position + 1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(np.clip(images_for_show[position], 0, 1))

  figure.subplots_adjust(wspace=0.1, hspace=0)
  plt.show()

In [None]:
def show_random_predict(model, test_loader, batch=None):
  """Show predictions for one batch: images, batch CER, predicted and reference texts.

  Args:
    model: Model exposing ``generate``.
    test_loader: Loader from which the first batch is taken when ``batch`` is None.
    batch: Optional fixed batch (dict with ``pixel_values`` and ``labels``).

  Only the first four samples of the batch are plotted and printed; the CER
  is computed over the whole batch.
  """
  if batch is None:
    batch = next(iter(test_loader))

  outputs = model.generate(batch["pixel_values"].to(device))
  cer = compute_cer(pred_ids=outputs, label_ids=batch["labels"])

  # -100 marks ignored (padding) positions and is not a decodable token id. Replace it
  # on a private copy so decoding works wherever the labels live and the caller's
  # tensor is left untouched by this function.
  shown_labels = batch["labels"][:4].detach().cpu().clone()
  shown_labels[shown_labels == -100] = tokenizer.pad_token_id

  plot_images(torch.moveaxis(batch['pixel_values'][:4], 1, -1).detach().cpu().numpy())
  print('batch cer =', cer)
  print([tokenizer.decode(pred.cpu().numpy(), skip_special_tokens=True) for pred in outputs[:4]])
  print([tokenizer.decode(el.numpy(), skip_special_tokens=True) for el in shown_labels])

In [None]:
# Predictions with the model switched to training mode (model.train(True) is passed on as
# the model) — presumably to compare against the eval-mode run in the next cell.
# The model stays in training mode after this cell.
show_random_predict(model.train(True), val_loader)

In [None]:
# Same check with the model switched back to evaluation mode.
show_random_predict(model.train(False), val_loader)

In [None]:
# Fixed sample batches (taken as returned by the loaders) that train_epoch() reuses for
# its periodic CER spot checks. This rebinds `val_batch` from the earlier cell.
val_batch = next(iter(val_loader))
train_batch = next(iter(train_loader))

In [None]:
import time

def _sample_cer(model, sample):
    """Return the CER of ``model.generate`` on one fixed batch, decoding in eval mode.

    The model's previous train/eval mode is restored afterwards, so dropout
    does not distort the estimate and the surrounding phase is not disturbed.
    """
    was_training = model.training
    model.eval()
    try:
        generated = model.generate(sample["pixel_values"].to(device))
    finally:
        model.train(was_training)
    return compute_cer(pred_ids=generated, label_ids=sample["labels"])


def train_epoch(model, batch_gen, criterion, optimizer, is_train = True, full_cer = False) :
    """Run one pass over ``batch_gen`` and return ``(mean_loss, cer)``.

    Args:
        model: Model whose forward pass returns an object with a ``loss``.
        batch_gen: Iterable of batches (dicts of tensors passed as ``model(**batch)``).
        criterion: Unused; the loss comes from the model output. Kept for
            interface compatibility.
        optimizer: Optimizer stepped after every batch when ``is_train``.
        is_train: Train (gradients + optimizer steps) or evaluate.
        full_cer: When evaluating, compute the CER over every batch.

    Returns:
        Mean loss per batch and a CER value:
          * evaluation with ``full_cer``: mean CER over all batches;
          * training with ``full_cer``: CER on the fixed ``train_batch``
            (previously this case always returned 0.0);
          * otherwise: CER on the fixed ``val_batch``.

    Raises:
        ValueError: If ``batch_gen`` yields no batches.

    Side effects:
        While training, every 100 batches the model is saved to Drive and the
        CER on the fixed ``val_batch`` / ``train_batch`` is printed.
    """
    epoch_loss = 0.0
    cer = 0.0
    count = 0
    model.train(is_train)

    for batch in tqdm(batch_gen) :
        count += 1

        for k, v in batch.items():
            batch[k] = v.to(device)

        with torch.set_grad_enabled(is_train) :
            outputs = model(**batch)
            loss = outputs.loss

            if is_train :
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            epoch_loss += loss.item()

            # Checkpoint interval was lowered from 1000 to 100 batches: with a batch size
            # of 16 an epoch is only about 4800 / 16 = 300 batches.
            if is_train and count % 100 == 0:
                model.save_pretrained(f"/content/drive/MyDrive/ocr_models/tr_ocr_last_{model_type}")

                val_cer = _sample_cer(model, val_batch)
                train_cer = _sample_cer(model, train_batch)
                print('val batch cer = ' + str(val_cer) + ' ' + 'train batch cer = ', train_cer)

            if not is_train and full_cer:
                generated = model.generate(batch["pixel_values"])
                cer += compute_cer(pred_ids=generated, label_ids=batch["labels"])

    if count == 0:
        raise ValueError("train_epoch() received an empty batch generator")

    if full_cer and not is_train:
        epoch_cer = cer / count
    elif full_cer:
        # No per-batch CER is accumulated while training; report the fixed train batch
        # instead of a constant 0.0.
        epoch_cer = _sample_cer(model, train_batch)
    else:
        epoch_cer = _sample_cer(model, val_batch)

    return epoch_loss / count, epoch_cer


In [None]:
def train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, num_epochs, verbose=True,
                best_loss=0.12595586647062199):
    """Train for ``num_epochs`` epochs, evaluating after each one.

    Args:
        model: Model to train.
        train_loader: Loader for the training phase.
        test_loader: Loader for the evaluation phase.
        criterion: Passed through to ``train_epoch`` (which does not use it).
        optimizer: Optimizer used in the training phase.
        scheduler: Optional LR scheduler. It is stepped ONCE PER EPOCH, so it
            must be configured in epoch units, not batch steps.
        num_epochs: Number of epochs to run.
        verbose: Print per-phase metrics and show sample predictions.
        best_loss: Test loss that must be beaten before the "best" checkpoint
            is written. The default is the value that was previously
            hard-coded (carried over from an earlier run); pass
            ``float('inf')`` to save the best checkpoint of a fresh run.

    Returns:
        ``(loss_history, cer_history)``: dicts with ``'train'`` and ``'test'``
        lists holding one value per epoch.
    """
    loader = {'train': train_loader, 'test': test_loader}
    loss_history = {'train': [], 'test': []}
    cer_history = {'train': [], 'test': []}

    for epoch in range(num_epochs):
        if verbose:
            print('Epoch {}/{}'.format(epoch, num_epochs - 1))
            print('-' * 10)

        for phase in ['train', 'test']:
            epoch_loss, epoch_cer = train_epoch(model, loader[phase], criterion, optimizer, phase == 'train', full_cer=True)
            if verbose:
                print('{} Loss: {:.4f} Cer: {:.4f}'.format(phase, epoch_loss, epoch_cer))
            loss_history[phase].append(epoch_loss)
            cer_history[phase].append(epoch_cer)

        if scheduler is not None:
            scheduler.step()

        if verbose:
            show_random_predict(model, test_loader, val_batch)
            print()

        # Keep the checkpoint with the lowest test loss seen so far.
        if loss_history['test'][-1] < best_loss:
          best_loss = loss_history['test'][-1]
          print('updated best loss on {} epoch, now it {}'.format(epoch, best_loss))
          model.save_pretrained(f"/content/drive/MyDrive/ocr_models/tr_ocr_best_{model_type}")

    return loss_history, cer_history

In [None]:
# Evaluate the model on the whole validation loader (is_train=False, full_cer=True)
# before the training runs below.
# NOTE: the second value is the character error rate, despite the `epoch_f1` name.
epoch_loss, epoch_f1 = train_epoch(model, val_loader, None, None, False, True)
epoch_loss, epoch_f1

In [None]:
# epoch_loss, epoch_f1 = train_epoch(model, val_loader, None, None, False, True)
# epoch_loss, epoch_f1

In [None]:
from torch.optim import lr_scheduler
# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=5e-5)
# NOTE: this rebinds the name `lr_scheduler` to None, shadowing the
# torch.optim.lr_scheduler module imported above.
lr_scheduler = None

In [None]:
# First training run: 10 epochs, no criterion (the model returns its own loss) and no
# scheduler. Keyword arguments make the slots explicit; previously the variable
# `lr_scheduler` (set to None) was passed in the `criterion` position.
loss_history, cer_history = train_model(
    model, train_loader, val_loader,
    criterion=None, optimizer=optimizer, scheduler=None, num_epochs=10,
)

In [None]:
import torch
# Release cached GPU memory held by PyTorch before starting the next run.
torch.cuda.empty_cache()

In [None]:
# Continue training for 2 more epochs with a fresh Adam optimizer at a lower learning rate
# (2e-5 instead of 5e-5); no criterion and no scheduler are passed.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
loss_history, cer_history = train_model(model, train_loader, val_loader, None, optimizer, None, 2)

In [None]:
from torch.optim.lr_scheduler import StepLR
optimizer = optim.Adam(model.parameters(), lr=3e-5)
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

In [None]:
from transformers import get_scheduler

NUM_EPOCHS = 5

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

# train_model() calls scheduler.step() once per EPOCH, so the schedule has to be expressed
# in epochs. The previous batch-based settings (500 warmup steps, len(train_loader) * 5
# total steps) kept the learning rate at 0 for the whole first epoch and at a tiny
# fraction of 3e-5 afterwards. With no warmup the rate now decays linearly over the
# epochs: 1.0, 0.8, 0.6, 0.4, 0.2 times 3e-5.
scheduler = get_scheduler("linear", optimizer=optimizer,
                          num_warmup_steps=0,
                          num_training_steps=NUM_EPOCHS)

loss_history, cer_history = train_model(model, train_loader, val_loader, None, optimizer, scheduler, NUM_EPOCHS)