## Cleaning the text and converting pdf to images:

Final Code: 
If we want to send the full folder contanng pdfs

In [None]:
import cv2
import numpy as np
import fitz  # PyMuPDF
import os
from concurrent.futures import ThreadPoolExecutor

def pdf_to_images(pdf_path, output_folder):
    """Extracts images from PDF and saves them as PNGs."""
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_folder = os.path.join(output_folder, pdf_name)
    os.makedirs(pdf_output_folder, exist_ok=True)

    doc = fitz.open(pdf_path)
    image_paths = []

    for i in range(len(doc)):
        page = doc[i]
        pix = page.get_pixmap(dpi=300)
        image_path = os.path.join(pdf_output_folder, f"{pdf_name}_page_{i+1}.png")
        pix.save(image_path)
        image_paths.append(image_path)

    return image_paths

def remove_background_noise(image):
    """Removes dots and unwanted background noise using morphological operations."""
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.medianBlur(image, 3) 
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel, iterations=1)  # Open to remove dots
    return image

def enhance_contrast(image):
    """Applies CLAHE to improve contrast while preserving details."""
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    return clahe.apply(image)

def deskew_image(image):
    """Corrects skew in the document using Hough Line Transform."""
    edges = cv2.Canny(image, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=50, maxLineGap=10)

    if lines is None:
        return image  # No deskew needed

    angles = []
    for line in lines:
        x1, y1, x2, y2 = line[0]
        angle = np.arctan2(y2 - y1, x2 - x1) * (180 / np.pi)
        angles.append(angle)

    median_angle = np.median(angles)
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    return cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)

def preprocess_image(image_path):
    """Applies noise removal, contrast enhancement, de-skewing, and adaptive thresholding."""
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    image = remove_background_noise(image)

    image = enhance_contrast(image)

    image = deskew_image(image)

    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                  cv2.THRESH_BINARY, 15, 5)

    processed_path = image_path.replace(".png", "_cleaned.png")
    cv2.imwrite(processed_path, image)

    return processed_path

def clean_pdf_folder(input_folder, output_folder):
    """Processes all PDFs in a given folder."""
    pdf_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".pdf")]

    all_cleaned_images = []
    for pdf_path in pdf_files:
        image_paths = pdf_to_images(pdf_path, output_folder)

        with ThreadPoolExecutor() as executor:
            cleaned_images = list(executor.map(preprocess_image, image_paths))
            all_cleaned_images.extend(cleaned_images)

    return all_cleaned_images

if __name__ == "__main__":
    input_folder = "D:\OCR\Pdfs"  # Folder containing PDFs
    output_folder = "D:\OCR\Imgs"  # Folder to save cleaned images
    cleaned_files = clean_pdf_folder(input_folder, output_folder)
    print("Cleaned images:", cleaned_files)

Final Preprocessing code if we want to send pdf wise:

In [None]:
import cv2
import numpy as np
import fitz  # PyMuPDF
import os
from concurrent.futures import ThreadPoolExecutor

def pdf_to_images(pdf_path, output_folder):
    """Extracts images from PDF and saves them as PNGs."""
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)
    image_paths = []
    
    for i in range(len(doc)):
        page = doc[i]
        pix = page.get_pixmap(dpi=300)  # Increased DPI for better OCR accuracy
        image_path = os.path.join(output_folder, f"page_{i+1}.png")
        pix.save(image_path)
        image_paths.append(image_path)
    
    return image_paths

def remove_background_noise(image):
    """Removes small dots and background noise using morphological operations."""
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.medianBlur(image, 3)  # Reduce small noise
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel, iterations=1)  # Open to remove dots
    return image

def enhance_contrast(image):
    """Applies CLAHE (Contrast Limited Adaptive Histogram Equalization)."""
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    return clahe.apply(image)

def deskew_image(image):
    """Corrects skew in the document using Hough Line Transform but avoids over-rotation."""
    edges = cv2.Canny(image, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=50, maxLineGap=10)

    if lines is None:
        return image  # No rotation needed

    angles = []
    for line in lines:
        x1, y1, x2, y2 = line[0]
        angle = np.arctan2(y2 - y1, x2 - x1) * (180 / np.pi)
        angles.append(angle)

    median_angle = np.median(angles)

    # **Limit rotation angle to avoid excessive rotation**
    if abs(median_angle) < 2 or abs(median_angle) > 10:
        return image  # Ignore small angles or extreme rotations

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    return cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REPLICATE)


def preprocess_image(image_path):
    """Enhances text visibility and removes background noise for OCR."""
    # Load image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Apply Non-Local Means Denoising (preserves text edges)
    image = cv2.fastNlMeansDenoising(image, None, h=30, templateWindowSize=7, searchWindowSize=21)

    # Adaptive Thresholding for better contrast
    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    # Morphological Opening (Removes small noise dots)
    kernel = np.ones((1,1), np.uint8)
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

    # Save and return cleaned image path
    processed_path = image_path.replace(".png", "_cleaned.png")
    cv2.imwrite(processed_path, image)
    
    return processed_path

def clean_pdf(pdf_path, output_folder):
    """Cleans the PDF by converting pages to preprocessed images."""
    image_paths = pdf_to_images(pdf_path, output_folder)

    # Use threading for efficient parallel processing
    with ThreadPoolExecutor() as executor:
        cleaned_images = list(executor.map(preprocess_image, image_paths))

    return cleaned_images

# Example usage
if __name__ == "__main__":
    pdf_path = "D:\OCR\OCR_fld\Ezcaray - Vozes.pdf"
    output_folder = "D:\OCR\Imgs\Ezcaray"
    cleaned_files = clean_pdf(pdf_path, output_folder)
    print("Cleaned images:", cleaned_files)

## Using tessaract to do OCR :
To get a target text to compare our model with and also use it for fine Tuning the transformer..Just as a trial to see how tessaract performs on these images.

In [None]:
import pytesseract
from PIL import Image
import os

pytesseract.pytesseract.tesseract_cmd = r"C:\Users\DELL\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

def ocr_image(image_path):
    """Performs OCR on a cleaned image."""
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

def ocr_on_cleaned_images(cleaned_folder):
    """Runs OCR on all cleaned images and saves text to a file."""
    text_output = []
    
    for file in sorted(os.listdir(cleaned_folder)):
        #if file.endswith("_cleaned.png"):
        image_path = os.path.join(cleaned_folder, file)
        extracted_text = ocr_image(image_path)
        text_output.append(f"Page {file}: \n{extracted_text}\n{'='*50}")

    with open(os.path.join("D:\OCR\Extracted_texts", "Benudia.txt"), "w", encoding="utf-8") as f:
        f.write("\n".join(text_output))

    print("OCR completed! Text saved to the given path")

cleaned_folder = "D:\OCR\PreprocessedImages"
ocr_on_cleaned_images(cleaned_folder)

OCR completed! Text saved to output.txt


In [None]:
import pytesseract
from PIL import Image
import os

pytesseract.pytesseract.tesseract_cmd = r"C:\Users\DELL\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

def ocr_image(image_path):
    """Performs OCR on a cleaned image."""
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

def ocr_on_cleaned_images(root_folder):
    """Runs OCR on all cleaned images within each subfolder and saves text to separate files."""
    for subfolder in sorted(os.listdir(root_folder)):
        subfolder_path = os.path.join(root_folder, subfolder)
        
        if os.path.isdir(subfolder_path): 
            text_output = []
            
            for file in sorted(os.listdir(subfolder_path)):
                if file.endswith("_cleaned.png"): 
                    image_path = os.path.join(subfolder_path, file)
                    extracted_text = ocr_image(image_path)
                    text_output.append(f"Page {file}: \n{extracted_text}\n{'='*50}")
            
            output_file = os.path.join(root_folder, f"{subfolder}.txt")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write("\n".join(text_output))
            
            print(f"OCR completed for {subfolder}! Text saved to {output_file}")

cleaned_folder = "D:\\OCR\\trialprep"
ocr_on_cleaned_images(cleaned_folder)

In [None]:
!pip install torch torchvision transformers datasets pillow opencv-python

## Transformer based Model:

In [2]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import pytesseract
from PIL import Image
import cv2
import os

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\DELL\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
# I have used tesssaract to mark the boxes in this case.

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def get_text_regions(image_path):
    """Detects text regions using Tesseract and returns bounding boxes."""
    img = cv2.imread(image_path)
    d = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

    boxes = []
    for i in range(len(d["text"])):
        if d["text"][i].strip(): 
            (x, y, w, h) = (d["left"][i], d["top"][i], d["width"][i], d["height"][i])
            boxes.append((x, y, x + w, y + h))
    
    return boxes

def perform_ocr(image_path):
    """Performs OCR using TrOCR on detected text regions."""
    img = Image.open(image_path).convert("RGB")
    boxes = get_text_regions(image_path)

    extracted_text = ""
    for box in boxes:
        cropped = img.crop(box)
        cropped = cropped.resize((384, 384))
        
        pixel_values = processor(images=cropped, return_tensors="pt").pixel_values.to(device)

        with torch.no_grad():
            output_ids = model.generate(pixel_values, max_length=128)

        extracted_text += processor.batch_decode(output_ids, skip_special_tokens=True)[0] + "\n"

    return extracted_text.strip()

cleaned_folder = "D:\OCR\Evalmgs\Ezcaray"
cleaned_images = [os.path.join(cleaned_folder, f) for f in os.listdir(cleaned_folder) if f.endswith("_cleaned.png")]

ocr_results = {}
for image_path in cleaned_images:
    extracted_text = perform_ocr(image_path)
    ocr_results[image_path] = extracted_text
    print(f"OCR Result for {image_path}:\n{extracted_text}\n")

output_txt = os.path.join(cleaned_folder, "ocr_results.txt")
with open(output_txt, "w", encoding="utf-8") as f:
    for img, text in ocr_results.items():
        f.write(f"{img}:\n{text}\n\n")

print(f"OCR completed. Results saved to {output_txt}")

  cleaned_folder = "D:\OCR\Evalmgs\Ezcaray"
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads

OCR Result for D:\OCR\Evalmgs\Ezcaray\page_1_cleaned.png:
----
J
***
CASHIER
-:
SENOR
ILVSTRISSIMO.
CASH
CUPADO
EN
EL
EXERCICIO
9.00
SR:
DE LAS
MITIONES.
EN
EL
***
OBILPADO
DE
GUADALA-
AMOUNT
XARA,REOIBI VNA
AC
V.S.I:
AMOUNT
AK
EN QUE MO LA NOTICIA DE
1
NOTICIA DE
COMO FU
FU
MAGGETTAD
(
QUE
DIOS
QUARDE)
TE
AVIA
SERVIDO
DE
HONRARME
CON
1A
MERCCO
DE
FU
PREDICADOR;
1
COMONO
TC
OPONE
1A
PREDICACION
DE
FU
MAGER
TAD
ALA
APOTOLICA,TUVE
PORDE
MI OBLI
GACION ADMITR EL FAVOR
:
FINDIEDO
2
V.S.I.EL
AGRADECIMIENTO.
EI
RCY
MI
FEROR(
QUE
DIOS
GUARDE)
HIZO
IA GRACIA ;
MAS
A.V.S.I.
TE TE
ACBE:
QUC
POP
MAS.
FRUTOS, QUE DIERA
1A
TIERRA
DE
PROMITION
>
NO
IOS
LORARA
TOYICS;
G.JOFUE,Y
CALCB
MO
IOS
FACEIN:
:DOS
TACARON
EL
FRUTO,
Y
DE
AMBOS
RECEFITO;
PARA
HALLAR,VN
GIMIL
PROPORCIONADO
AL'A
GRANDCZA
DE
V.S.I.
***
*
* 2
A

OCR completed. Results saved to D:\OCR\Evalmgs\Ezcaray\ocr_results.txt


In [None]:
#python -m spacy download es_core_news_sm --> Run this in the terminal to install it

### Using spacy to correct few spellings:
As we can see the outputs of the pretrained transformer model are boken and mostly wordwise. So, I am using Spacy to coorect it and give the text output in a formatted way as that is the ultimate requirement. We need to compare how this pretrained transformer model works wrt PyTessaract Library.

In [None]:
import spacy
import re

nlp = spacy.load("es_core_news_sm")

def correct_text(text):
    """Tokenizes and corrects OCR text."""
    doc = nlp(text)
    return " ".join([token.text for token in doc])

def words_to_sentences(text):
    """Converts word-by-word OCR output into structured sentences."""
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return " ".join(sentences)

input_txt_file = "D:\\OCR\\Evalmgs\\Ezcaray\\ocr_results.txt"
output_txt_file = "D:\\OCR\\Evalmgs\\Ezcaray\\res.txt"

with open(input_txt_file, "r", encoding="utf-8") as f:
    ocr_text = f.readlines() 

structured_output = []
current_page = None
current_text = []

for line in ocr_text:
    line = line.strip()
    
    if re.match(r"D:\\OCR\\Evalmgs\\Ezcaray\\page_\d+_cleaned.png:", line):
        if current_page and current_text:
            corrected_text = correct_text(" ".join(current_text))
            structured_text = words_to_sentences(corrected_text)
            structured_output.append(f"{current_page}\n{structured_text}\n")
        
        current_page = line
        current_text = []
    
    else:
        current_text.append(line)

if current_page and current_text:
    corrected_text = correct_text(" ".join(current_text))
    structured_text = words_to_sentences(corrected_text)
    structured_output.append(f"📄 {current_page}\n{structured_text}\n")

with open(output_txt_file, "w", encoding="utf-8") as f:
    f.write("\n".join(structured_output))

print("OCR text structured and saved successfully!")

OCR text structured and saved successfully!


## Evaluation:

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

with open("D:\\OCR\\Evalmgs\\Ezcaray\\res.txt", "r", encoding="utf-8") as f:
    text1 = f.read()

with open("D:\\OCR\\textfiles\\Ezcaray\\page_1_cleaned.txt", "r", encoding="utf-8") as f:
    text2 = f.read()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([text1, text2])

similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
print(f"Cosine Similarity: {similarity_score[0][0]:.4f}")

Cosine Similarity: 0.5707


In [10]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

with open("D:\\OCR\\Evalmgs\\Ezcaray\\res.txt", "r", encoding="utf-8") as f:
    text1 = f.read()

with open("D:\\OCR\\textfiles\\Ezcaray\\page_1_cleaned.txt", "r", encoding="utf-8") as f:
    text2 = f.read()

embedding1 = model.encode(text1, convert_to_tensor=True)
embedding2 = model.encode(text2, convert_to_tensor=True)

similarity = util.pytorch_cos_sim(embedding1, embedding2)
print(f"BERT-based Similarity: {similarity.item():.4f}")

BERT-based Similarity: 0.5872


## Using Fine Tuned Transformers and CRNN:

## CRNN:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.models import resnet18
from PIL import Image
import os

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Image.MAX_IMAGE_PIXELS=None
# Define the CRNN Model
class CRNN(nn.Module):
    def __init__(self, num_classes, hidden_size=256, num_lstm_layers=2):
        super(CRNN, self).__init__()
        
        # Feature extractor (ResNet without fully connected layer)
        resnet = resnet18(pretrained=True)
        modules = list(resnet.children())[:-2]  # Remove FC layer
        self.cnn = nn.Sequential(*modules)
        
        # LSTM for sequence modeling
        self.rnn = nn.LSTM(input_size=512, hidden_size=hidden_size, num_layers=num_lstm_layers, bidirectional=True, batch_first=True)
        
        # Linear layer for classification
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # BiLSTM doubles output size
    
    def forward(self, x):
        features = self.cnn(x)  # Extract visual features (B, C, H, W)
        features = features.squeeze(2) if features.shape[2] == 1 else features  # Ensure H=1
        print(features.shape)
        features = features.permute(0, 2, 1)  # (B, W, C)
        features = features.squeeze(2).permute(0, 2, 1)  # (B, W, C)
        
        rnn_out, _ = self.rnn(features)  # BiLSTM output (B, W, 2*hidden_size)
        output = self.fc(rnn_out)  # Convert to character logits
        return output

# OCR Dataset
class OCRDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            image = self.transform(image)
        label = self.labels[idx]
        return image, label

# Load data function
def load_data(image_root, text_root):
    paths, labels = [], []
    for folder in os.listdir(image_root):
        img_dir = os.path.join(image_root, folder)
        txt_dir = os.path.join(text_root, folder)
        
        if not os.path.isdir(img_dir) or not os.path.isdir(txt_dir):
            continue
            
        for img_file in os.listdir(img_dir):
            if img_file.endswith(".png"):
                base = img_file[:-4]
                txt_path = os.path.join(txt_dir, f"{base}.txt")
                if os.path.exists(txt_path):
                    with open(txt_path, "r", encoding="utf-8") as f:
                        paths.append(os.path.join(img_dir, img_file))
                        labels.append(f.read().strip())
    return paths, labels

# Data transform
transform = transforms.Compose([
    transforms.Resize((128, 512)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load data
train_paths, train_labels = load_data("D:/OCR/Imgs", "D:/OCR/textfiles")
eval_paths, eval_labels = load_data("D:/OCR/Evalmgs", "D:/OCR/EvalText")

train_dataset = OCRDataset(train_paths, train_labels, transform)
eval_dataset = OCRDataset(eval_paths, eval_labels, transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

num_classes = 128  # Adjust based on vocabulary size
model = CRNN(num_classes=num_classes).to(device)
criterion = nn.CTCLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for images, labels in train_loader:
            images = images.to(device)
            optimizer.zero_grad()
            
            outputs = model(images)
            target_lengths = torch.IntTensor([len(label) for label in labels])
            target = torch.cat([torch.IntTensor([ord(c) for c in label]) for label in labels])
            
            input_lengths = torch.full((outputs.size(0),), outputs.size(1), dtype=torch.int32)
            loss = criterion(outputs.permute(1, 0, 2), target, input_lengths, target_lengths)
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

train_model(model, train_loader, criterion, optimizer)

torch.save(model.state_dict(), "crnn_ocr.pth")
print("Training completed!")


## Fine Tuned Transformer:

## Final Code for Fine Tuning the Pre Trained Transformer Model: 
Needs more Optimization.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import evaluate
import os

metric = evaluate.load("cer")
Image.MAX_IMAGE_PIXELS=None

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
# Ensure pad_token_id and decoder_start_token_id are set
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

# Fix decoder_start_token_id issue
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id  # <-- ADD THIS

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Freeze all layers except the last few in the decoder
for param in model.parameters():
    param.requires_grad = False
for param in model.decoder.model.decoder.layers[-5:].parameters():
    param.requires_grad = True

# Custom Dataset Class
class OCRDataset(Dataset):
    def __init__(self, img_folder, text_folder, processor):
        self.img_paths = []
        self.texts = []
        self.processor = processor
        
        for subdir in os.listdir(img_folder):
            img_subfolder = os.path.join(img_folder, subdir)
            text_subfolder = os.path.join(text_folder, subdir)
            if not os.path.isdir(img_subfolder):
                continue
            
            for img_name in os.listdir(img_subfolder):
                if img_name.endswith("_cleaned.png"):
                    img_path = os.path.join(img_subfolder, img_name)
                    text_path = os.path.join(text_subfolder, img_name.replace(".png", ".txt"))
                    
                    if os.path.exists(text_path):
                        with open(text_path, "r", encoding="utf-8") as f:
                            text = f.read().strip()
                        self.img_paths.append(img_path)
                        self.texts.append(text)
    
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        image = Image.open(self.img_paths[idx]).convert("RGB")
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(self.texts[idx], return_tensors="pt", max_length=512, truncation=True).input_ids.squeeze(0)
        return {"pixel_values": pixel_values, "labels": labels}

def collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    labels = [item["labels"] for item in batch]
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"pixel_values": pixel_values, "labels": labels_padded}

train_dataset = OCRDataset("D:\\OCR\\Imgs", "D:\\OCR\\textfiles", processor)
val_dataset = OCRDataset("D:\\OCR\\Evalmgs", "D:\\OCR\\EvalText", processor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5

# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")
    
    model.eval()
    total_cer = 0
    
    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            
            output_ids = model.generate(pixel_values)
            predictions = processor.batch_decode(output_ids, skip_special_tokens=True)
            references = [processor.tokenizer.decode(lbl[lbl!=-100], skip_special_tokens=True) for lbl in labels]
            
            total_cer += metric.compute(predictions=predictions, references=references)
    
    avg_cer = total_cer / len(val_loader)
    print(f"Epoch {epoch+1}, Validation CER: {avg_cer:.4f}")

model.save_pretrained("D:\\OCR\\fine_tuned_trocr1")
processor.save_pretrained("D:\\OCR\\fine_tuned_trocr1")
print("Fine-tuning complete. Model saved.")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Epoch 1, Training Loss: 8.9213
Epoch 1, Validation CER: 0.9770
Epoch 2, Training Loss: 7.4886
Epoch 2, Validation CER: 0.9794
Epoch 3, Training Loss: 7.1206
Epoch 3, Validation CER: 1.0000
Epoch 4, Training Loss: 7.0006
Epoch 4, Validation CER: 0.9790
Epoch 5, Training Loss: 6.9628
Epoch 5, Validation CER: 0.9955
Fine-tuning complete. Model saved.


## Using other model:

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import evaluate
import os

metric = evaluate.load("cer")
Image.MAX_IMAGE_PIXELS=None

processor = TrOCRProcessor.from_pretrained("qantev/trocr-large-spanish", do_rescale=False)
model = VisionEncoderDecoderModel.from_pretrained("qantev/trocr-large-spanish")

model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

model.config.decoder_start_token_id = processor.tokenizer.cls_token_id 

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Freeze all layers except the last few in the decoder
for param in model.parameters():
    param.requires_grad = False
for param in model.decoder.model.decoder.layers[-5:].parameters():
    param.requires_grad = True

# Custom Dataset Class
class OCRDataset(Dataset):
    def __init__(self, img_folder, text_folder, processor):
        self.img_paths = []
        self.texts = []
        self.processor = processor
        
        for subdir in os.listdir(img_folder):
            img_subfolder = os.path.join(img_folder, subdir)
            text_subfolder = os.path.join(text_folder, subdir)
            if not os.path.isdir(img_subfolder):
                continue
            
            for img_name in os.listdir(img_subfolder):
                if img_name.endswith("_cleaned.png"):
                    img_path = os.path.join(img_subfolder, img_name)
                    text_path = os.path.join(text_subfolder, img_name.replace(".png", ".txt"))
                    
                    if os.path.exists(text_path):
                        with open(text_path, "r", encoding="utf-8") as f:
                            text = f.read().strip()
                        self.img_paths.append(img_path)
                        self.texts.append(text)
    
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        image = Image.open(self.img_paths[idx]).convert("RGB")
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self.processor.tokenizer(self.texts[idx], return_tensors="pt", max_length=512, truncation=True).input_ids.squeeze(0)
        return {"pixel_values": pixel_values, "labels": labels}

def collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    labels = [item["labels"] for item in batch]
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"pixel_values": pixel_values, "labels": labels_padded}

train_dataset = OCRDataset("D:\\OCR\\Imgs", "D:\\OCR\\textfiles", processor)
val_dataset = OCRDataset("D:\\OCR\\Evalmgs", "D:\\OCR\\EvalText", processor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5

# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")
    
    model.eval()
    total_cer = 0
    
    with torch.no_grad():
        for batch in val_loader:
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            
            output_ids = model.generate(pixel_values)
            predictions = processor.batch_decode(output_ids, skip_special_tokens=True)
            references = [processor.tokenizer.decode(lbl[lbl!=-100], skip_special_tokens=True) for lbl in labels]
            
            total_cer += metric.compute(predictions=predictions, references=references)
    
    avg_cer = total_cer / len(val_loader)
    print(f"Epoch {epoch+1}, Validation CER: {avg_cer:.4f}")

model.save_pretrained("D:\\OCR\\fine_tuned_trocr2")
processor.save_pretrained("D:\\OCR\\fine_tuned_trocr2")
print("Fine-tuning complete. Model saved.")

## Testing Code for Fine Tuned Transformer:

In [None]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import os

# Load fine-tuned model and processor
model_path = "D:\\OCR\\fine_tuned_trocr"
processor = TrOCRProcessor.from_pretrained(model_path)
model = VisionEncoderDecoderModel.from_pretrained(model_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# OCR Function
def perform_ocr(img_folder, output_file):
    results = []
    
    for img_name in os.listdir(img_folder):
        if img_name.endswith(".png") or img_name.endswith(".jpg"):
            img_path = os.path.join(img_folder, img_name)
            image = Image.open(img_path).convert("RGB")
            
            # Process image
            pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
            
            # Generate text
            output_ids = model.generate(pixel_values)
            text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
            
            results.append(f"{img_name}: {text}")
    
    # Save results to file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(results))
    
    print(f"OCR completed. Results saved to {output_file}")

# Example Usage
img_folder = "D:\OCR\Evalmgs\Ezcaray"
output_file = "D:\\OCR\\output1.txt"
perform_ocr(img_folder, output_file)
