In [None]:
# Mount Google Drive at /content/drive (the google.colab module is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

# Fine-Tuning TrOCR Small Printed

## Cài đặt những thư viện cần thiết

In [None]:
# Shell out to print the version of the CUDA compiler (nvcc) found on this runtime.
!nvcc --version

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# pyarrow and protobuf are pinned to exact versions; every other package is unpinned.
%pip install pyarrow==14.0.1
%pip install -q transformers
%pip install -q sentencepiece
%pip install -q jiwer
%pip install -q datasets
%pip install -q evaluate
%pip install -q -U accelerate

%pip install -q matplotlib
%pip install -q protobuf==3.20.1
%pip install -q tensorboard

## Thêm các thư viện cần thiết

In [None]:
import os
import torch
import evaluate
import numpy as np
import pandas as pd
import glob as glob
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

from PIL import Image
from tqdm.notebook import tqdm
from dataclasses import dataclass
from torch.utils.data import Dataset
from urllib.request import urlretrieve
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator
)

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's legacy global RNG and
    PyTorch (CPU and all CUDA devices), then switches cuDNN to deterministic
    mode with autotuning disabled.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Force deterministic cuDNN kernels and turn off the autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
@dataclass(frozen=True)
class TrainingConfig:
    """Immutable hyper-parameters for the fine-tuning run."""

    BATCH_SIZE: int = 48
    EPOCHS: int = 10
    LEARNING_RATE: float = 5e-5


@dataclass(frozen=True)
class DatasetConfig:
    """Immutable dataset location settings."""

    DATA_ROOT: str = 'scut_data'


@dataclass(frozen=True)
class ModelConfig:
    """Immutable identifier of the pretrained checkpoint to start from."""

    MODEL_NAME: str = 'microsoft/trocr-small-printed'

In [None]:
def visualize(dataset_path, num_images=15, num_cols=5):
    """Show a grid of training images, each titled with its file name stem.

    Args:
        dataset_path: Dataset root; images are read from its ``scut_train``
            sub-directory.
        num_images: Maximum number of images to show (default 15).
        num_cols: Number of grid columns (default 5).
    """
    image_dir = os.path.join(dataset_path, 'scut_train')
    # List the directory once (not once per subplot) and sort it so the same
    # images are previewed on every run; os.listdir order is arbitrary.
    image_names = sorted(os.listdir(image_dir))[:num_images]
    if not image_names:
        print(f"No images found in {image_dir}")
        return

    # Ceiling division: enough rows to hold however many images were found.
    num_rows = -(-len(image_names) // num_cols)
    plt.figure(figsize=(15, 3))
    for position, image_name in enumerate(image_names, start=1):
        plt.subplot(num_rows, num_cols, position)
        plt.imshow(plt.imread(os.path.join(image_dir, image_name)))
        plt.axis('off')
        plt.title(os.path.splitext(image_name)[0])
    plt.show()

visualize(DatasetConfig.DATA_ROOT)

In [None]:
# Annotation files are parsed as fixed-width text without a header row; the
# first two parsed columns are renamed to `file_name` and `text`.
# NOTE(review): read_fwf infers column boundaries from whitespace, so labels
# that contain spaces may be split into extra columns - confirm on the data.
annotation_columns = {0: 'file_name', 1: 'text'}

train_annotations_path = os.path.join(DatasetConfig.DATA_ROOT, 'scut_train.txt')
train_df = pd.read_fwf(train_annotations_path, header=None).rename(
    columns=annotation_columns
)

test_annotations_path = os.path.join(DatasetConfig.DATA_ROOT, 'scut_test.txt')
test_df = pd.read_fwf(test_annotations_path, header=None).rename(
    columns=annotation_columns
)

In [None]:
# Preview the first rows of the training annotations.
train_df.head()

In [None]:
# Preview the first rows of the test annotations.
test_df.head()

## Tăng cường dữ liệu

In [None]:
# Augmentation pipeline: colour jitter (brightness and hue) followed by a
# Gaussian blur, applied in that order.
color_jitter = transforms.ColorJitter(brightness=0.5, hue=0.3)
gaussian_blur = transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))
train_transforms = transforms.Compose([color_jitter, gaussian_blur])

## Thay đổi dữ liệu để phù hợp với định dạng đầu vào của mô hình

In [None]:
# Load the processor that matches the pretrained checkpoint; the code below
# uses it to produce pixel_values and, via its tokenizer, label ids.
processor = TrOCRProcessor.from_pretrained(ModelConfig.MODEL_NAME)

In [None]:
class CustomOCRDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.df['file_name'][idx]
        text = self.df['text'][idx]
        image = Image.open(self.root_dir + file_name).convert('RGB')
        image = train_transforms(image)
        pixel_values = self.processor(image, return_tensors='pt').pixel_values
        labels = self.processor.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_target_length
        ).input_ids
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

### Mã hóa dữ liệu hình ảnh và văn bản thành dạng token

In [None]:
train_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, 'scut_train/'),
    df=train_df,
    processor=processor
)
valid_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, 'scut_test/'),
    df=test_df,
    processor=processor
)

In [None]:
# Report how many samples each split holds.
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(valid_dataset)}")

### Lấy thử 1 mẫu trong tập dữ liệu huấn luyện

In [None]:
# Encode one training sample (index 877) and print the shape of each field.
encoding = train_dataset[877]
for field_name, tensor in encoding.items():
    print(field_name, tensor.shape)

#### In ra mẫu trong tập dữ liệu sau khi áp dụng phương pháp tăng cường dữ liệu

In [None]:
# Re-open training sample 877 and show it after the augmentation pipeline.
sample_path = train_dataset.root_dir + train_df['file_name'][877]
image = train_transforms(Image.open(sample_path).convert("RGB"))
plt.imshow(image)
plt.axis('off')

In [None]:
# Put the pad id back where -100 was written (in place, on the sample's own
# tensor) so the label ids can be decoded to text.
labels = encoding['labels']
labels.masked_fill_(labels == -100, processor.tokenizer.pad_token_id)
label_str = processor.decode(labels, skip_special_tokens=True)
print(label_str)

## Gọi mô hình VisionEncoderDecoderModel

In [None]:
# Load the pretrained checkpoint, move it to the selected device, and print
# its module structure.
model = VisionEncoderDecoderModel.from_pretrained(ModelConfig.MODEL_NAME).to(device)
print(model)

## Tổng tham số của mô hình

In [None]:
# Walk the parameters once, recording each tensor's size and whether it is trainable.
parameter_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]

total_params = sum(size for size, _ in parameter_sizes)
print(f"{total_params:,} total parameters.")

total_trainable_params = sum(size for size, trainable in parameter_sizes if trainable)
print(f"{total_trainable_params:,} training parameters.")

## Cấu hình mô hình

In [None]:
# Special token ids, taken from the processor's tokenizer.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Mirror the decoder's vocabulary size on the top-level config.
model.config.vocab_size = model.config.decoder.vocab_size

# Text generation settings (beam search).
model.config.num_beams = 4
model.config.max_length = 64
model.config.length_penalty = 2.0
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True

In [None]:
# AdamW over all model parameters, using the configured learning rate and a
# weight decay of 0.0005.
# NOTE(review): a Hugging Face Trainer only uses this object if it is passed
# through the Trainer's `optimizers=` argument - confirm it is wired in.
optimizer = optim.AdamW(
    model.parameters(), lr=TrainingConfig.LEARNING_RATE, weight_decay=0.0005
)

## Đánh giá mô hình theo phương pháp Character Error Rate (CER)

In [None]:
# Load the character error rate (CER) metric from the `evaluate` library.
cer_metric = evaluate.load('cer')

In [None]:
def compute_cer(pred):
    """Compute the character error rate for a batch of trainer predictions.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids with -100 at ignored positions).

    Returns:
        dict: ``{"cer": <value returned by the CER metric>}``.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some configurations return (token_ids, extras); keep the token ids.
        pred_ids = pred_ids[0]

    # -100 is not a valid token id and cannot be decoded. np.where builds new
    # arrays, so the arrays owned by the caller are left unmodified.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_token_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_token_id)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [None]:
def evaluate_model(model, processor, valid_dataset, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    """Generate text for every sample of a dataset and return the overall CER.

    Predictions and references are collected for the whole dataset and scored
    in a single metric call, so the result is a corpus-level CER (the same
    aggregation ``compute_cer`` uses) rather than a mean of per-sample rates.

    Args:
        model: Model exposing ``generate``; expected to already live on ``device``.
        processor: Processor used to decode token ids (needs ``tokenizer``).
        valid_dataset: Dataset yielding dicts with ``pixel_values`` and ``labels``.
        device: Device the inputs are moved to before generation.

    Returns:
        The character error rate over all samples.

    Raises:
        ValueError: If ``valid_dataset`` is empty.
    """
    if len(valid_dataset) == 0:
        raise ValueError("valid_dataset is empty; cannot compute CER.")

    cer_metric = evaluate.load('cer')
    pad_token_id = processor.tokenizer.pad_token_id
    predictions, references = [], []

    # Remember the current mode so the model is handed back as it was received.
    was_training = model.training
    model.eval()
    try:
        for idx in tqdm(range(len(valid_dataset))):
            encoding = valid_dataset[idx]
            pixel_values = encoding['pixel_values'].unsqueeze(0).to(device)
            # Labels are only decoded, never fed to the model, so they stay on
            # the CPU; masked_fill returns a copy with -100 replaced by the pad id.
            labels = encoding['labels'].unsqueeze(0)
            labels = labels.masked_fill(labels == -100, pad_token_id)

            with torch.no_grad():
                outputs = model.generate(pixel_values)

            predictions.extend(processor.batch_decode(outputs, skip_special_tokens=True))
            references.extend(processor.batch_decode(labels, skip_special_tokens=True))
    finally:
        model.train(was_training)

    return cer_metric.compute(predictions=predictions, references=references)

In [None]:
# Baseline: score the pretrained (not yet fine-tuned) model on the validation set.
pretrain_cer = evaluate_model(model, processor, valid_dataset)
print(f"CER for pre-trained model: {pretrain_cer}")

## Huấn luyện mô hình

In [None]:
# Training configuration: evaluate, log and checkpoint once per epoch, keep at
# most 5 checkpoints, and report to TensorBoard.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
    per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
    # Use the configured learning rate instead of relying on the library default.
    learning_rate=TrainingConfig.LEARNING_RATE,
    # Mixed precision needs a GPU; enable it only when CUDA is available so
    # the notebook still runs on the CPU fallback.
    fp16=torch.cuda.is_available(),
    output_dir='CheckPoints/Seq2seq/seq2seq_model_printed_03/',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    report_to='tensorboard',
    num_train_epochs=TrainingConfig.EPOCHS
)

In [None]:
# Build the trainer. The AdamW optimizer created earlier in the notebook is
# passed in explicitly; otherwise the Trainer would construct its own and the
# configured weight decay would never be used. Passing None for the scheduler
# lets the Trainer create its default one for this optimizer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=default_data_collator,
    optimizers=(optimizer, None)
)

In [None]:
# Run fine-tuning and keep the value returned by the trainer.
res = trainer.train()

## Lưu mô hình


In [None]:
# Write the model and the processor side by side into one directory.
save_dir = "CheckPoints/SaveModel/saved_model_03"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

## Đánh giá mô hình sau khi huấn luyện

In [None]:
# Reload both the model and the processor from the directory they were saved
# to, so the evaluation uses exactly the artifacts that were written to disk.
fine_tuned_dir = 'CheckPoints/SaveModel/saved_model_03'
fine_tuned_model = VisionEncoderDecoderModel.from_pretrained(fine_tuned_dir).to(device)
fine_tuned_processor = TrOCRProcessor.from_pretrained(fine_tuned_dir)

fine_tune_cer = evaluate_model(fine_tuned_model, fine_tuned_processor, valid_dataset)
print(f"CER for fine-tuned model: {fine_tune_cer}")

## Kết quả đánh giá mô hình trước khi huấn luyện và sau khi huấn luyện

In [None]:
# Side-by-side summary of the CER before and after fine-tuning.
print(f"CER for pre-trained model: {pretrain_cer}")
print(f"CER for fine-tuned model: {fine_tune_cer}")