## Set-up environment

We install 🤗 Transformers, which provides many pretrained models, including TrOCR by Microsoft Research.

In [None]:
%%capture
!pip install -q transformers

## Loading and preparing the image for the model

In [None]:
import requests
from PIL import Image

# Placeholder: replace with the path of the image to transcribe.
image_path = "your image goes here"

# Open the file, then force three colour channels.
opened_image = Image.open(image_path)
image = opened_image.convert("RGB")

# A bare last expression renders the image as the cell output.
image

In [None]:
import cv2
import numpy as np
from matplotlib import pyplot as plt

# Re-read the same file with OpenCV as a single-channel grayscale array.
# NOTE: this rebinds `image`, which held the PIL RGB image in the previous cell.
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    # cv2.imread reports a missing or undecodable file by returning None
    # instead of raising, which would otherwise surface as a confusing
    # error inside cv2.threshold.
    raise FileNotFoundError(f"cv2.imread could not read an image from {image_path!r}")

# Apply a binary threshold: pixels brighter than 127 become 255 (white),
# all others become 0 (black).
_, thresholded = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)

# Denoise the thresholded image; h is the filter strength, adjust as needed.
# `np_image` is the 2-D grayscale array reused by the next cell.
np_image = cv2.fastNlMeansDenoising(thresholded, None, h=30)

# RGB PIL version of the same pixels; this is what is later passed to the processor.
binary_image = Image.fromarray(np_image).convert('RGB')

# Show the grayscale input and the processed result side by side.
fig, (ax_gray, ax_binary) = plt.subplots(1, 2, figsize=(10, 5))
ax_gray.set_title("Original Grayscale Image")
ax_gray.imshow(image, cmap="gray")

ax_binary.set_title("Binarized Image")
ax_binary.imshow(binary_image)  # already RGB, so a colormap would be ignored

plt.show()


In [None]:
import cv2
import numpy as np
from PIL import Image

try:
    from google.colab.patches import cv2_imshow
except ImportError:
    # Outside Colab this helper does not exist; fall back to the PIL viewer only
    # instead of failing the whole cell on import.
    cv2_imshow = None

# Work on the denoised grayscale array (`np_image`) built in the previous cell.
image = np_image

# Define parameters
block_width = 50  # Width in pixels of each vertical strip
threshold_ratio = 0.15  # Minimum ratio of dark pixels to keep a strip (adjust as needed)
height, width = image.shape  # Requires a 2-D (single-channel) array

# Walk over the image in vertical strips and keep those with enough dark pixels.
kept_blocks = []
for i in range(0, width, block_width):
    # The last strip is narrower when width is not a multiple of block_width.
    block = image[:, i:i + block_width]

    # Inverted threshold: pixels <= 128 (dark) become 255, brighter pixels become 0,
    # so the non-zero entries of the mask are exactly the dark pixels.
    _, binary_block = cv2.threshold(block, 128, 255, cv2.THRESH_BINARY_INV)

    # Ratio of dark pixels in this strip
    black_pixels = cv2.countNonZero(binary_block)
    total_pixels = block.size
    black_pixel_ratio = black_pixels / total_pixels

    if black_pixel_ratio >= threshold_ratio:
        kept_blocks.append(block)

# Combine kept strips horizontally
if kept_blocks:
    result_image = np.hstack(kept_blocks)
else:
    # No strip passed the threshold: use a single all-zero (black) strip as placeholder.
    result_image = np.zeros((height, block_width), dtype=np.uint8)

# Replicate the gray channel three times so the result is RGB.
result_image_rgb = cv2.cvtColor(result_image, cv2.COLOR_GRAY2RGB)

# Convert to PIL Image
# NOTE(review): the later `processor(...)` cell is fed `binary_image`, not this
# filtered `pil_image` - confirm which one is meant to go to the model.
pil_image = Image.fromarray(result_image_rgb)

# Inline display when running in Colab; the PIL viewer is used in every environment.
if cv2_imshow is not None:
    cv2_imshow(result_image_rgb)
pil_image.show()


In [None]:
from transformers import TrOCRProcessor
import torch

# The processor wraps the image feature extractor and the tokenizer; it is called
# on the image in a later cell and its tokenizer part decodes the generated ids.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
# calling the processor is equivalent to calling the feature extractor



torch.Size([1, 3, 384, 384])


## Load model

Here we load a TrOCR model from the [hub](https://huggingface.co/models?other=trocr). TrOCR models are instances of [`VisionEncoderDecoderModel`](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder), which combine a vision encoder (like ViT, BEiT, DeiT, ...) with a language model as decoder (like BERT, RoBERTa, GPT-2, ...).

In [None]:
from transformers import VisionEncoderDecoderModel

# Load the pretrained weights for the same checkpoint name used for the processor.
# NOTE(review): loading prints a warning that the encoder pooler weights are newly
# initialized; presumably they are unused for generation - confirm.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

config.json:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.23G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module.to() returns the module itself; assigning it (instead of leaving it as the
# cell's last expression) keeps the very long model repr out of the cell output.
model = model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-23): 24 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Line

In [None]:
# Run the processor on the RGB image and request PyTorch tensors.
processor_output = processor(binary_image, return_tensors="pt")

# Keep only the image tensor and move it to the device the model lives on.
pixel_values = processor_output.pixel_values.to(device)
print(pixel_values.shape)

torch.Size([1, 3, 384, 384])


## Generate text

Finally, we can generate text autoregressively using the `.generate()` method. We use the tokenizer part of the `processor` to decode the generated IDs back to text. Note that greedy search is used by default, but more advanced decoding methods such as beam search and top-k sampling are also supported. You can check out [this blog post](https://huggingface.co/blog/how-to-generate) for details.

In [None]:
# Beam search with 10 beams, run with gradient tracking switched off.
beam_search_options = {"num_beams": 10, "early_stopping": True}
with torch.no_grad():
    generated_ids = model.generate(pixel_values, **beam_search_options)

# Turn the generated ids back into text; the batch holds one image, so take entry 0.
decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded_batch[0]
print(generated_text)

In [None]:
# Generate with the default settings (no decoding options overridden).
generated_ids = model.generate(pixel_values)

# Decode the whole batch, then keep the text of its first (only) entry.
default_decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = default_decoded[0]
print(generated_text)

In [None]:
# Sampling is explicitly disabled, so decoding is deterministic. `top_k` only filters
# candidates during sampling and has no effect here, so it is not passed.
# (This is greedy decoding as long as the checkpoint's default num_beams is 1.)
generated_ids = model.generate(pixel_values, do_sample=False)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

In [None]:
# Beam search with 10 beams - the same generate() arguments as the earlier
# torch.no_grad() cell, here without the explicit no_grad context.
ten_beam_options = dict(num_beams=10, early_stopping=True)
generated_ids = model.generate(pixel_values, **ten_beam_options)

beam_decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = beam_decoded[0]
print(generated_text)


In [None]:
# Beam search combined with top-k sampling draws random numbers, so the output
# changes from run to run. Seed PyTorch's RNG first so that re-running the cell
# reproduces the same transcription; change the seed to explore other samples.
torch.manual_seed(0)
generated_ids = model.generate(pixel_values, num_beams=5, early_stopping=True, do_sample=True, top_k=50)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

In [None]:
# Beam search combined with nucleus (top-p) sampling is stochastic. Seed PyTorch's
# RNG first so that re-running the cell reproduces the same transcription; change
# the seed to explore other samples.
torch.manual_seed(0)
generated_ids = model.generate(pixel_values, num_beams=5, early_stopping=True, do_sample=True, top_p=0.9)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)