# Transformers for OCR Experiments

In [1]:
# !pip install -q transformers
# !pip install -q sentencepiece

In [1]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
from tqdm.auto import tqdm
from urllib.request import urlretrieve
from zipfile import ZipFile

import numpy as np
import matplotlib as plt
import torch
import os
import glob


In [5]:
import torch
# Check the PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")

print(torch.version.cuda)

PyTorch version: 2.3.0
CUDA available: False
None


In [3]:
print(torch.cuda.is_available())

False


In [2]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

### Helper Functions

In [3]:
# def download_and_unzip(url, save_path):
#     print(f"Downloading and extracting assets....", end="")

#     # Downloading zip file using urllib package
#     urlretrieve(url, save_path)

#     try: 
#         # Extracting zip file using the zipfile package
#         with ZipFile(save_path) as z: 
#             # Extract ZIP file contents in the same directory
#             z.extractall(os.path.save_path[0])
        
#         print("Done")
    
#     except Exception as e:
#         print("\nInvalid file.", e)

# URL = r"https://www.dropbox.com/scl/fi/news/images.zip?rlkey=54flzvhh9xxh45czb1c8n3fp3&dl=1"
# asset_zip_path = os.path.join(os.getcwd(), "images.zip")
# # Download if assest ZIP does not exists.
# if not os.path.exists(asset_zip_path):
#     download_and_unzip(URL, asset_zip_path)



def read_image(image_path):
    """
    :param image_path : String, path to the input image.

    Returns: 
        image : PIL Image.
    """
    image = Image.open(image_path).convert("RGB")
    return image
def ocr(image, processor, model):
    """+
    :param image: PIL Image. 
    :param processor: Huggingface OCR processor.
    :param model: HUggingface OCR model

    Returns: 
        generated_text: the OCR'd text string.
    """
    # We can directly perform OCR on cropped images.
    pixel_values = processor(image, return_tensors='pt').pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

def eval_new_data(data_path=None, num_samples=4, model=None, processor=None):
    image_paths = glob.glob(data_path)
    for i, image_path in tqdm(enumerate(image_paths), total=len(image_paths)):
        if i == num_samples:
            break
        image = read_image(image_path)
        text = ocr(image, processor, model)
        plt.figure(figsize=(7,4))
        plt.imshow(image)
        plt.axis('off')
        plt.show()

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-printed')
model = VisionEncoderDecoderModel.from_pretrained(
    'microsoft/trocr-small-printed'
).to(device)


eval_new_data(
    data_path=os.path('1.jpg'),
    num_samples=1,
    model=model
)

config.json:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

VisionEncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

AssertionError: Torch not compiled with CUDA enabled

## DTrOcr

Ensure correct environment in jupyter notebook 

In [2]:
import sys 
print(sys.executable)

c:\Users\Lenovo\anaconda3\python.exe


make sure DTrOCr installation is within scope of notebooko

In [4]:
%pip install -r C:\Users\Lenovo\Documents\GitHub\Elec376_F24_group7\backend\ml\DTrOCR\requirements.txt

Note: you may need to restart the kernel to use updated packages.
Collecting Pillow==10.4.0
  Using cached pillow-10.4.0-cp39-cp39-win_amd64.whl (2.6 MB)
Collecting torch==2.3.1
  Using cached torch-2.3.1-cp39-cp39-win_amd64.whl (159.7 MB)
Collecting transformers==4.42.4
  Using cached transformers-4.42.4-py3-none-any.whl (9.3 MB)
Collecting mkl<=2021.4.0,>=2021.1.1
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl (228.5 MB)
Collecting tokenizers<0.20,>=0.19
  Using cached tokenizers-0.19.1-cp39-none-win_amd64.whl (2.2 MB)
Collecting intel-openmp==2021.*
  Using cached intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl (3.5 MB)
Collecting tbb==2021.*
  Using cached tbb-2021.13.1-py3-none-win_amd64.whl (286 kB)
Installing collected packages: tbb, intel-openmp, Pillow, mkl, torch, tokenizers, transformers
  Attempting uninstall: tbb
    Found existing installation: TBB 0.2


ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


In [4]:
import sys
sys.path.append(r'C:\Users\Lenovo\Documents\GitHub\Elec376_F24_group7\backend\ml\DTrOCR')

In [6]:
from dtrocr.config import DTrOCRConfig
from dtrocr.model import DTrOCRLMHeadModel
from dtrocr.processor import DTrOCRProcessor

from PIL import Image

config = DTrOCRConfig()
model = DTrOCRLMHeadModel(config)
processor = DTrOCRProcessor(DTrOCRConfig())

model.eval()        # set model to evaluation mode for deterministic behaviour
path_to_image = "1.jpg"  # path to image file

inputs = processor(
    images=Image.open(path_to_image).convert('RGB'),
    texts=processor.tokeniser.bos_token,
    return_tensors="pt"
)


In [7]:
model_output = model.generate(
    inputs=inputs, 
    processor=processor, 
    num_beams=3,    # defaults to 1 if not specified
    use_cache=True  # defaults to True if not specified
)

predicted_text = processor.tokeniser.decode(model_output[0], skip_special_tokens=True)

In [8]:
print(predicted_text)

 BringingUnix Bringing targ targ PAN targ billed targ targ targ targ targ PAN targ targ targ targvez PAN targ targ targUnix critic targ frustrationvez ANY targvez targ targ targ targ targ targ histories billed targ targ targ critic targvez targarton targ targ targ targ targ astounding targ155 targ next targ targ targ Jennings targ targ PAN targvez PAN targ critic billed targUnix targ targvez Bringing targ histories criticvez targ targ targincinnati targ targ targ Universities compan155 targ critic targ targ targ enclosed Bringing targ enclosed targ targ targ targ collected targ targ critic targ targ targ PAN Jennings targvez PAN targ targ PAN targ targ targ targ targ targ targ enclosed targ
