In [None]:
!curl -L -o ocr-dataset.zip https://www.kaggle.com/api/v1/datasets/download/naneet1/ocr-dataset
!apt install unzip
!unzip ocr-dataset.zip -d ocr-dataset

In [None]:
!pip install transformers
!pip install tiktoken
!pip install protobuf
!pip install sentencepiece

In [None]:
import torch
from torch import nn
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader, random_split
from torch.amp import autocast, GradScaler
# from torch.utils.tensorboard import SummaryWriter

from transformers import ViTConfig, ViTModel, AutoTokenizer

from tqdm.notebook import tqdm
from PIL import Image
import json
import os
# from sklearn.model_selection import train_test_split

In [None]:
import json

# Open and read the JSON file
with open("/home/ocr-dataset/OCR_Images/labels.json", "r", encoding="utf-8") as file:
    data = json.load(file)  # Load JSON content into a Python dictionary

In [None]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

print("Pad Token:", tokenizer.pad_token)
print("Pad Token ID:", tokenizer.pad_token_id)

print("Special Tokens:", tokenizer.special_tokens_map)

# Print SOS and EOS (if they exist)
print("SOS Token:", tokenizer.bos_token)
print("SOS Token ID:", tokenizer.bos_token_id)

print("EOS Token:", tokenizer.eos_token)
print("EOS Token ID:", tokenizer.eos_token_id)

print("Vocab Size:", tokenizer.vocab_size)

Pad Token: <pad>
Pad Token ID: 1
Special Tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}
SOS Token: <s>
SOS Token ID: 0
EOS Token: </s>
EOS Token ID: 2
Vocab Size: 250002


In [None]:
class Custom_OCR_Dataset(Dataset):
    def __init__(self, data, transform=None, tokenizer=tokenizer):
        """
        Custom dataset to load images and text labels from extracted PDFs.

        :param data_folder: Path to the output folder containing images & labels.json.
        :param transform: Transformations for image preprocessing (optional).
        """
        self.data = data
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        root = "/home/ocr-dataset/OCR_Images/"
        img_path = os.path.join(root, self.data[idx]['image'].split("\\")[-2], self.data[idx]['image'].split("\\")[-1])
        raw_text = self.data[idx]['text']
        img = Image.open(img_path)
        tensor_img = self.transform(img)
        tokenized_text = self.tokenizer(raw_text, padding=True, truncation=True, max_length=512, stride=128, return_tensors="pt")
        return tensor_img, tokenized_text["input_ids"].squeeze(0)

In [None]:
def collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)

    sequence_lens = [len(label) for label in labels]
    max_len = max(sequence_lens)

    padded_labels = torch.zeros(len(labels), max_len, dtype=torch.long)
    for i, label in enumerate(labels):
        padded_labels[i, :len(label)] = label

    return images, padded_labels

In [None]:
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((768,768))
])

train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.RandomRotation((-5,5), expand=True),
    transforms.RandomApply(
        [transforms.GaussianBlur((5,5))],
        p=0.5
    ),
    transforms.Resize((768,768))
])

In [None]:
dataset = Custom_OCR_Dataset(data=data, transform=test_transform)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

torch.manual_seed(42)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=3, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=3, shuffle=False, collate_fn=collate_fn)

In [None]:
img, label = next(iter(train_dataloader))
img.shape, label.shape

(torch.Size([3, 3, 768, 768]), torch.Size([3, 512]))

In [None]:
class VV_OCR(nn.Module):
    def __init__(self,
                 vocab_size=tokenizer.vocab_size,
                 vit_config=ViTConfig(image_size=768, hidden_size=768, intermediate_size=3072, num_attention_heads=12),
                 d_model=768,
                 nhead=8,
                 num_decoder_layers=6,
                 dim_feedforward=3072,
                 dropout=0.1,
                 batch_first=True,
                 pad_token=tokenizer.pad_token_id,
                 sos_token=tokenizer.bos_token_id,
                 eos_token=tokenizer.eos_token_id,
                 max_pred=512):

        super(OCR_V3, self).__init__()

        self.vit = ViTModel(vit_config)

        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=batch_first)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.pad_token = pad_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_token)
        self.max_pred = max_pred

        self.fc_out = nn.Linear(d_model, vocab_size)


    def forward(self, img, y_tokens):

        encoded_output = self.vit(img).last_hidden_state

        if y_tokens.shape[-1] == 1:
            all_logits = torch.zeros(self.vocab_size).unsqueeze(0).unsqueeze(0).to(device)

            for i in range(self.max_pred-2):
                y_embedded = self.embedding(y_tokens)

                decoded_output = self.decoder(y_embedded, encoded_output)
                logit = self.fc_out(decoded_output)
                all_logits = torch.cat([all_logits, logit[:,-1,:].unsqueeze(0)], dim=-2)


                last_output_token = logit[:,-1,:].argmax(1).unsqueeze(0)
                y_tokens = torch.cat([y_tokens, last_output_token], dim=1)

                if last_output_token == self.eos_token:
                    return y_tokens, all_logits

            else:
                y_tokens = torch.cat([y_tokens, torch.tensor([[self.eos_token]]).to(device)], dim=1)
                return y_tokens, all_logits

        else:
            y_embedded = self.embedding(y_tokens)

            tgt_seq_len = y_embedded.shape[1]
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_seq_len).to(y_embedded.device)

            tgt_key_padding_mask = (y_tokens == self.pad_token)

            decoded_output = self.decoder(y_embedded, encoded_output, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)

            logit = self.fc_out(decoded_output)

            return logit

In [None]:
def add_noise_to_sequence(seq, vocab_size=tokenizer.vocab_size, noise_prob=0.1):

    rand_mask = torch.rand_like(seq.float())  # Random values in [0, 1]
    replace_mask = rand_mask < noise_prob
    random_tokens = torch.randint(0, vocab_size, seq.shape, dtype=seq.dtype, device=seq.device)

    noisy_seq = torch.where(replace_mask, random_tokens, seq)
    return noisy_seq


def train_step(model, optimizer, dataloader, loss_fn, epoch):
    model.train()
    train_loss, total_correct, total_sample = 0, 0, 0

    for X, y in tqdm(dataloader, total=len(dataloader), desc="Training", unit="images"):
        X, y = X.to(device), y.to(device)
        y_noise = add_noise_to_sequence(y, noise_prob=0.3)

        y_logit = model(X, y)

        loss = loss_fn(y_logit.permute(0,2,1), y)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        y_pred = torch.argmax(y_logit, dim=2)
        total_correct += (y_pred == y).sum().item()
        total_sample += y_pred.numel()
        del X, y, y_logit, loss, y_pred

    acc = (total_correct / total_sample) * 100
    avg_loss = train_loss / len(dataloader)

    print(f"Epoch {epoch} | Loss: {avg_loss:.4f} | Accuracy: {acc:.2f}%")

def test_step(model, loss_fn, epoch, dataloader):
    model.eval()
    test_loss, total_correct, total_sample = 0, 0, 0
    with torch.inference_mode():
        for X, y in tqdm(dataloader, total=len(dataloader), desc="Testing", unit="images"):
            X, y = X.to(device), y.to(device)

            y_logit = model(X, y)

            loss = loss_fn(y_logit.permute(0,2,1), y)
            test_loss += loss.item()

            y_pred = torch.argmax(y_logit, dim=2)
            total_correct += (y_pred == y).sum().item()
            total_sample += y_pred.numel()
            del X, y, y_logit, loss, y_pred

    acc = (total_correct / total_sample) * 100
    avg_loss = test_loss / len(dataloader)

    print(f"Epoch {epoch} | Loss: {avg_loss:.4f} | Accuracy: {acc:.2f}%")

In [None]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

device='cuda'

configuration = ViTConfig(image_size=768, hidden_size=768, intermediate_size=3072, num_attention_heads=12)

model = VV_OCR(vocab_size=tokenizer.vocab_size, vit_config=configuration).to(device)
model.load_state_dict(torch.load(f="/home/Tr_Only_English.pth"))
# model = nn.DataParallel(model)

loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

  model.load_state_dict(torch.load(f="/home/Tr_Only_English.pth"))


In [None]:
epochs = 1
torch.manual_seed(42)
torch.cuda.manual_seed(42)
for epoch in range(epochs):
  train_step(model=model,
             optimizer=optimizer,
             loss_fn=loss_fn,
             epoch=epoch,
             dataloader=train_dataloader)
  test_step(model=model,
            loss_fn=loss_fn,
            epoch=epoch,
            dataloader=test_dataloader)

Training:   0%|          | 0/1547 [00:00<?, ?images/s]



Epoch 0 | Loss: 1.6207 | Accuracy: 84.71%


Testing:   0%|          | 0/387 [00:00<?, ?images/s]

Epoch 0 | Loss: 0.2304 | Accuracy: 98.62%


In [None]:
torch.save(obj=model.state_dict(), f="Tr_Only_English.pth")

In [None]:
from IPython.display import FileLink
FileLink('Tr_Only_Spanish.pth')

In [None]:
# model.load_state_dict(torch.load(f='/kaggle/working/Tr_Only_Checking.pth'))

test_step(model=model,
        loss_fn=loss_fn,
        epoch=1,
        dataloader=test_dataloader)

In [None]:
x, y = test_dataset[0]

In [None]:
x.shape, y.shape

(torch.Size([3, 768, 768]), torch.Size([512]))

In [None]:
output = model(x.unsqueeze(0).to(device),y.unsqueeze(0).to(device))

In [None]:
output.squeeze(0).argmax(dim=1)

tensor([     0,      6, 237366,    541,    170, 102617,      4,   3036,  31746,
             4,    821,   6505,      4,    136,    339,   6921,   2082,      5,
           563,    150,  11050,     12,     62,   8347,    111,  58663,  22729,
         17055,   2320,  28029,    223,      4,  57553,      7,      4,    136,
         10548,      9,   9077,      7,      5,  30948,    214,  42477,    619,
         46473,      4,   4859,  27750,     12,    758,   1104,   5843,      4,
        140429,      5,    305,    378,   8894,    268,  20681,   1720,   9315,
            11,   1399,    353,    420,   1861,      4,     62,  94207,  14318,
         24004,   1399,      4,  11653,  15080,   1065, 114137,      4,    136,
         65261,   5024,   5428,      5,   1215,     66,  30170,   2320,  21373,
          2481,  63262,     12,    241,  24500,  45964,    136, 215543,   4393,
             9, 239879,  29569,  31897, 151575,    748,      5,     87, 129969,
         40266,      4,    201,  12498, 

In [None]:
y == output.squeeze(0).argmax(dim=1).to('cpu')

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True,  True,  True,  True, False,  True, False, False,  True,
         True,  True,  True, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

In [None]:
torch.cuda.empty_cache()

In [None]:
model.eval()
with torch.inference_mode():
    output_2 = model(x.unsqueeze(0).to(device),torch.tensor([[0]]).to(device))

In [None]:
output_2[0]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0