#### Text detection with Pytesseract Library of Python:
This is done to find the position of the text in the image.

In [None]:
import cv2
import pytesseract
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Raw strings keep every backslash literal. In a normal string the "\t" of
# "\training_file" is read as a TAB character, which silently corrupts the path.
input_dir = r"D:\OCR_final\OCR\training_file\processed_images"
output_dir = r"D:\OCR_final\out"
os.makedirs(output_dir, exist_ok=True)

# Full paths of every *.jpg in input_dir; sorted so the processing order is
# the same on every run (os.listdir makes no ordering promise).
image_files = [os.path.join(input_dir, f) for f in sorted(os.listdir(input_dir)) if f.endswith('.jpg')]

In [None]:
# For every page image: binarise it, run Tesseract layout detection, draw a green
# box around each confident detection and save the annotated copy to output_dir.
for img_path in tqdm(image_files):
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports an unreadable/corrupt file by returning None;
        # without this guard cvtColor below fails with an opaque OpenCV error.
        print(f"Skipping unreadable image: {img_path}")
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Adaptive Gaussian threshold (block size 11, constant 2), then a 2x2
    # morphological close followed by a dilation before handing the page to Tesseract.
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    kernel = np.ones((2, 2), np.uint8)
    morph = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=1)
    morph = cv2.dilate(morph, kernel, iterations=1)

    data = pytesseract.image_to_data(morph, output_type=pytesseract.Output.DICT)
    for i in range(len(data['text'])):
        # float() instead of int(): the confidence may come back as a string such
        # as "96.54" (depends on the pytesseract/Tesseract version), which int() rejects.
        if float(data['conf'][i]) > 30:  # Filter low-confidence words
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

    output_path = os.path.join(output_dir, os.path.basename(img_path))
    cv2.imwrite(output_path, image)

In [None]:
import os
import re
from glob import glob

In [None]:
# Raw strings: "\training_file" and "\transcript" each contain "\t", which a
# normal string literal turns into a TAB character.
# NOTE(review): img_dir is the folder the detection loop writes its annotated
# copies to, so these images carry the drawn green boxes - confirm that is intended
# for the training data.
img_dir = r"D:\OCR_final\out"
txt_dir = r"D:\OCR_final\OCR\training_file\transcript"
image_files = sorted(glob(os.path.join(img_dir, "*.jpg")))

In [None]:
def extract_doc_and_page(image_filename):
    """Split an image name of the form ``<doc>-page_<N>...`` into its parts.

    Args:
        image_filename: image name; matching starts at the beginning of the
            string and anything after the page digits is ignored.

    Returns:
        ``(doc_name, "page<N>.txt")`` when the pattern matches, otherwise
        ``(None, None)``. The document part is matched greedily, so the last
        ``-page_<digits>`` occurrence in the name decides the page number.
    """
    found = re.match(r"(.*)-page_(\d+)", image_filename)
    if found is None:
        return None, None
    return found.group(1), f"page{found.group(2)}.txt"

In [None]:
# Pair each image with "<txt_dir>/transcript_<doc>/page<N>.txt"; images whose name
# does not parse, or whose transcript file is missing, are left out.
image_text_pairs = []
for img_path in image_files:
    img_name = os.path.basename(img_path).replace(".jpg", "")
    doc_name, page_filename = extract_doc_and_page(img_name)
    if not (doc_name and page_filename):
        continue

    text_path = os.path.join(txt_dir, f"transcript_{doc_name}", page_filename)
    if not os.path.exists(text_path):
        continue

    image_text_pairs.append((img_path, text_path))

print(f"Found {len(image_text_pairs)} image-text pairs")

# Preview at most the first five pairs.
for i in range(min(5, len(image_text_pairs))):
    img, txt = image_text_pairs[i]
    print(f"Sample {i+1}:")
    print(f"  Image Path    : {img}")
    print(f"  Transcript Path: {txt}")
    print("-" * 50)

In [None]:
import albumentations as A
import cv2
import numpy as np
from tqdm import tqdm

In [None]:
# Augmentation steps, listed in the order they are chained by Compose.
page_transforms = [
    A.Rotate(limit=3, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.GaussNoise(var_limit=(10.0, 50.0), p=0.4),
    A.MotionBlur(blur_limit=3, p=0.3),
]
augmentations = A.Compose(page_transforms)

In [None]:
# Raw string: in a normal literal the "\a" of "\aug" is the BEL control character,
# so the directory would be created under a corrupted name.
augmented_data_dir = r"D:\OCR_final\OCR\aug"
os.makedirs(augmented_data_dir, exist_ok=True)

num_augmentations = 10
augmented_pairs = []

# For every (image, transcript) pair: save the original plus num_augmentations
# augmented variants into augmented_data_dir. Every variant keeps the transcript
# path of the page it was derived from.
for img_path, text_path in tqdm(image_text_pairs):
    img_bgr = cv2.imread(img_path)
    if img_bgr is None:
        # cv2.imread returns None for unreadable files; skip instead of crashing in cvtColor.
        print(f"Skipping unreadable image: {img_path}")
        continue
    # The augmentation pipeline is fed RGB here; OpenCV reads and writes BGR.
    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    img_name = os.path.basename(img_path)
    orig_img_path = os.path.join(augmented_data_dir, img_name)
    # Same pixels as converting `img` back to BGR, without the round trip.
    cv2.imwrite(orig_img_path, img_bgr)
    augmented_pairs.append((orig_img_path, text_path))

    # splitext only touches the real extension, unlike str.replace(".jpg", ...)
    # which rewrites every ".jpg" occurrence inside the name.
    stem, ext = os.path.splitext(img_name)
    for i in range(1, num_augmentations + 1):
        augmented_img = augmentations(image=img)["image"]
        aug_img_name = f"{stem}_aug{i}{ext}"

        aug_img_path = os.path.join(augmented_data_dir, aug_img_name)
        cv2.imwrite(aug_img_path, cv2.cvtColor(augmented_img, cv2.COLOR_RGB2BGR))

        augmented_pairs.append((aug_img_path, text_path))

In [None]:
# Preview at most the first five augmented pairs.
for i in range(min(5, len(augmented_pairs))):
    img, txt = augmented_pairs[i]
    print(f"Augmented Sample {i+1}:")
    print(f"  Image Path    : {img}")
    print(f"  Transcript Path: {txt}")
    print("-" * 50)

In [None]:
!pip install transformers torch torchvision Pillow datasets

In [None]:
from torch.utils.data import Dataset
from PIL import Image
from torch.nn.utils.rnn import pad_sequence
import torch

class OCRDataset(Dataset):
    """Dataset of (page image, transcript) pairs encoded for a vision-to-text model.

    Args:
        image_text_pairs: sequence of ``(image_path, transcript_path)`` tuples.
        processor: callable that accepts ``images=`` and ``text=`` and returns a
            mapping with ``"pixel_values"`` and ``"labels"``; it must also expose
            ``tokenizer.pad_token_id``.
        max_length: length every label sequence is padded/truncated to.
    """

    def __init__(self, image_text_pairs, processor,max_length=512):
        self.image_text_pairs = image_text_pairs
        self.processor = processor
        self.max_length=max_length

    def normalize_text(self, text):
        """Apply historical OCR normalization rules.

        Rules, applied in this order: ``ſ`` -> ``s``, ``ç`` -> ``z``,
        ``q̄`` -> ``que``, the u/v rule described below, then the acute accents
        on ``áéíóú`` are stripped.
        """
        text = text.replace("ſ", "s")
        text = text.replace("ç", "z")
        text = text.replace("q̄", "que")
        # u/v rule: if the text contains any "v", every "u" becomes "v"; otherwise the
        # text is left alone (the former `else` branch replaced "v" in a text that
        # had none, i.e. it never changed anything).
        # NOTE(review): this turns e.g. "que" into "qve" for any transcript that
        # contains a single "v" - confirm this is the intended convention.
        if "v" in text:
            text = text.replace("u", "v")
        text = text.translate(str.maketrans("áéíóú", "aeiou"))
        return text

    def __len__(self):
        """Number of (image, transcript) pairs."""
        return len(self.image_text_pairs)

    def __getitem__(self, idx):
        """Load, normalise and encode one pair.

        Returns:
            dict with ``"pixel_values"`` and ``"labels"``, both with the batch
            dimension added by the processor removed. Padded label positions are
            set to -100.
        """
        img_path, text_path = self.image_text_pairs[idx]

        image = Image.open(img_path).convert("RGB")

        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read().strip()

        text = self.normalize_text(text)

        # Keyword arguments: passing the text positionally relies on the processor's
        # second positional parameter being `text`, which is not the case in every
        # transformers release.
        encoding = self.processor(
            images=image,
            text=text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # padding="max_length" fills the labels with the pad token id. The rest of
        # the notebook (collate_fn, CustomTrainer.compute_loss) treats -100 as the
        # "ignore this position" marker, so convert the padding to -100 here.
        labels = encoding["labels"].squeeze(0).clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "labels": labels,
        }

In [None]:
def collate_fn(batch):
    """Merge dataset items into a single batch dict.

    Args:
        batch: list of dicts, each holding a ``'pixel_values'`` tensor and a
            1-D ``'labels'`` tensor.

    Returns:
        dict with ``'pixel_values'`` stacked along a new first dimension and
        ``'labels'`` right-padded with -100 to the longest sequence in the batch.
    """
    images, label_seqs = [], []
    for sample in batch:
        images.append(sample['pixel_values'])
        label_seqs.append(sample['labels'])

    return {
        'pixel_values': torch.stack(images),
        'labels': pad_sequence(label_seqs, batch_first=True, padding_value=-100),
    }

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Processor and model are loaded from the same pretrained checkpoint name.
pretrained_name = "microsoft/trocr-base-printed"
processor = TrOCRProcessor.from_pretrained(pretrained_name)
model = VisionEncoderDecoderModel.from_pretrained(pretrained_name)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print(device)
print(model)

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Every augmented copy keeps the transcript path of the page it was derived from,
# so the transcript path identifies the source page. Splitting by page keeps all
# copies of one page on the same side; a plain random split would put near-duplicates
# of training pages into the validation set and make the validation loss meaningless.
split_groups = [text_path for _, text_path in augmented_pairs]

if len(set(split_groups)) >= 2:
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    sample_indices = list(range(len(augmented_pairs)))
    train_idx, val_idx = next(splitter.split(sample_indices, groups=split_groups))
    train_pairs = [augmented_pairs[i] for i in train_idx]
    val_pairs = [augmented_pairs[i] for i in val_idx]
else:
    # A single source page cannot be split by group; fall back to the plain random split.
    print("Warning: fewer than two source pages - validation set will contain copies of training pages")
    train_pairs, val_pairs = train_test_split(augmented_pairs, test_size=0.2, random_state=42)

In [None]:
# Both splits share the same processor and the dataset's default max_length.
train_dataset = OCRDataset(image_text_pairs=train_pairs, processor=processor)
val_dataset = OCRDataset(image_text_pairs=val_pairs, processor=processor)

In [None]:
def postprocess_ocr_output(text):
    """Normalise decoded OCR text.

    Replaces ``ſ`` with ``s``, ``ç`` with ``z`` and ``q̄`` with ``que`` (in that
    order), then strips the acute accent from ``áéíóú``.
    """
    for old, new in (("ſ", "s"), ("ç", "z"), ("q̄", "que")):
        text = text.replace(old, new)
    accent_table = str.maketrans("áéíóú", "aeiou")
    return text.translate(accent_table)

In [None]:
import torch
import torch.nn as nn
from transformers import Trainer

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose training/eval loss is a CTC loss over the model's output logits.

    Args:
        processor: object exposing ``tokenizer.pad_token_id``. The pad id is used
            both as the CTC blank symbol and to fill label positions marked -100.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, processor,*args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ctc_loss = nn.CTCLoss(blank=processor.tokenizer.pad_token_id, zero_infinity=True)
        self.processor=processor

    def compute_loss(self, model, inputs,num_items_in_batch=None, return_outputs=False):
        """Compute the CTC loss for one batch.

        Args:
            model: the model being trained; called as ``model(pixel_values, labels=labels)``.
            inputs: batch dict with ``"pixel_values"`` and ``"labels"``; label
                positions equal to -100 are treated as padding.
            num_items_in_batch: accepted for Trainer compatibility, not used.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
        """
        pixel_values = inputs["pixel_values"]
        labels = inputs["labels"]

        outputs = model(pixel_values, labels=labels)
        logits = outputs.logits
        # NOTE(review): the model's own `outputs.loss` is not used; only the CTC loss
        # below is optimised. Confirm CTC is the intended objective for this model.

        # Every sequence in the batch uses the full logits length as its CTC input length.
        input_lengths = torch.full(
            (logits.shape[0],), logits.shape[1], dtype=torch.long
        )
        # Number of real (non -100) label tokens per sample.
        target_lengths = (labels != -100).sum(dim=-1)

        # CTC targets may not contain negative ids: fill the ignored positions with the
        # pad id. Uses the processor stored on the trainer rather than a notebook-level
        # global, so the class does not depend on whatever `processor` currently is.
        pad_id = self.processor.tokenizer.pad_token_id
        ctc_targets = torch.where(labels == -100, pad_id, labels)

        # nn.CTCLoss expects log-probabilities shaped (time, batch, classes).
        loss = self.ctc_loss(
            logits.permute(1, 0, 2).log_softmax(2),
            ctc_targets,
            input_lengths,
            target_lengths
        )

        return (loss, outputs) if return_outputs else loss

In [None]:
# Copy the special-token ids from the tokenizer into the model config:
# padding uses the pad token, decoding starts from the CLS token.
_tokenizer = processor.tokenizer
model.config.pad_token_id = _tokenizer.pad_token_id
model.config.decoder_start_token_id = _tokenizer.cls_token_id

In [None]:
from transformers import Trainer, TrainingArguments

# NOTE(review): output_dir / logging_dir are Colab Google-Drive paths while the data
# cells use local D:\ paths - confirm which environment this notebook targets.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/human_ai_folder/ocr_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # with the per-device batch of 4: 8 samples per optimizer step per device
    evaluation_strategy="epoch",
    save_strategy="epoch",  # same cadence as evaluation, as load_best_model_at_end needs
    save_total_limit=4,
    num_train_epochs=15,
    logging_dir="/content/drive/MyDrive/human_ai_folder/log",
    logging_steps=50,
    logging_strategy="steps",
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present; the notebook falls back to
    # the CPU when there is none, and a hard-coded fp16=True is rejected there.
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

In [None]:
# Everything the trainer needs, gathered in one mapping and passed by keyword.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": processor.tokenizer,
    "data_collator": collate_fn,
    "processor": processor,
}
trainer = CustomTrainer(**trainer_kwargs)

In [None]:
from accelerate import Accelerator
# Create an Accelerator only to reach its `state` object, reset that state through
# the private `_reset_state()` helper, then build a second Accelerator afterwards.
# NOTE(review): presumably a workaround for stale accelerate state in a reused
# kernel - confirm it is still needed; `_reset_state` is a private API.
accelerator = Accelerator()
accelerator.state._reset_state()
accelerator = Accelerator()  

# Overwrite the trainer's `accelerator` attribute with the freshly built one.
# NOTE(review): this Accelerator is created with default arguments - verify that
# nothing the Trainer configured on its own accelerator is lost by the swap.
trainer.accelerator = accelerator

In [None]:
# Start fine-tuning with the CustomTrainer instance built in the earlier cell.
trainer.train()

In [None]:
# Raw string: the "\t" of "\training_file" would otherwise become a TAB character.
image_path = r"D:\OCR_final\OCR\training_file\processed_images\Buendia-page_1.jpg"
image = cv2.imread(image_path)
if image is None:
    # cv2.imread returns None on failure; raise a clear error instead of letting
    # cvtColor fail with an opaque assertion.
    raise FileNotFoundError(f"Could not read image: {image_path}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [19]:
# Run Tesseract on the grayscale page. With Output.DICT the result is a dict of
# parallel lists; the following cells read 'text', 'conf', 'left', 'top', 'width', 'height'.
data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

In [None]:
# Draw a green box on `image` for every detection with a positive confidence,
# then display the annotated page.
for i in range(len(data['text'])):
    # float() instead of int(): the confidence may come back as a string such as
    # "96.54" (depends on the pytesseract/Tesseract version), which int() rejects.
    if float(data['conf'][i]) > 0:
        x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

plt.figure(figsize=(10, 10))
# OpenCV images are BGR; matplotlib expects RGB.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.show()

In [None]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
from PIL import Image
import torch

# Raw string: "\O", "\o" and "\c" are not valid escape sequences; Python currently
# keeps them literally but warns about it, and a path segment starting with
# t, n, a, b, f, r or v would be silently corrupted.
checkpoint_path = r"D:\OCR_final\OCR\ocr_model\checkpoint"

# Weights come from the fine-tuned checkpoint; the processor is loaded from the
# base pretrained name.
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_path)
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model.eval()

In [None]:
def process_text_regions(image, data):
    """Recognise every detected text region of ``image`` with the notebook's model.

    Args:
        image: array indexed as ``image[rows, cols]`` from which regions are cropped.
        data: dict of parallel lists with keys ``'text'``, ``'left'``, ``'top'``,
            ``'width'`` and ``'height'`` (the layout produced by
            ``pytesseract.image_to_data(..., output_type=Output.DICT)``).

    Returns:
        The decoded text of all regions in detection order, each followed by a
        single space (``""`` when no region qualifies).

    Uses the notebook-level ``processor`` and ``model``.
    """
    pieces = []

    for i, raw_text in enumerate(data['text']):
        # Only regions where the detector itself found some text are recognised.
        if not raw_text.strip():
            continue

        x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
        if w <= 0 or h <= 0:
            # Degenerate box: cropping it yields an empty array, which cannot be
            # turned into an image.
            continue

        cropped_region = image[y:y+h, x:x+w]
        if cropped_region.size == 0:
            # Box lies completely outside the image.
            continue
        pil_img = Image.fromarray(cropped_region).convert("RGB")

        # Keep the input on the same device as the model so the call also works
        # when the model lives on the GPU.
        pixel_values = processor(pil_img, return_tensors="pt").pixel_values.to(model.device)
        generated_ids = model.generate(pixel_values)
        decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        pieces.append(decoded_text)

    # One trailing space per piece, exactly as the previous `+= text + " "` produced.
    return "".join(f"{piece} " for piece in pieces)

# Recognise the regions detected on the grayscale page and show the result.
final_text = process_text_regions(gray, data)
print("Extracted Text:", final_text, sep="\n")

Extracted Text:
RIEDAD DEL AFUMPTO,VIAUDOLE GRAUE,Y ONDEROFO,FIN UC IE ENCUMBRE LA AFETACION : QUE CS ALAJA MILY ETHIMABLE DC LA PRUDENCIA, QUE LA PLUMA POR 0 MAS REMONTADA GO- UICTHE CUS BUELOS AL PEFO DE LAS MATERIAS. UANTOS CO- NOCCEN AL AUTOR,LE BAN OYPO FACEQUENTEMERE EN 109 PUL- DITOS, Y LE BAN VENERADO CON EL PRIMER CREDITO CN CFTE EXERCICIO,CON GUE TABLEN 1 LINEA,A QUE ILEGA FU ELOGUEN- CIA,NUMCA INFERIOR ALA DE TULIO,Y DEMOTHENES.LA VILI- LIDAD DEL LIBRO TE CONOCE DEL KN,QUE POR EL TE PRETENCE. SU ENTERNZA CS LA IDEA DE VIN PRINCIPE PERFETO, Y GEN- OD EFFE O ALMA > 0- CABEZA DEL CUERPO DE LA REPUBLICAS BIEN TE DEJA ENTENDER,QUAN BENIGNAS INFUECIAS CAUFARA FU'VIRUD CN LAS COFTUMBERS DE BUS VARALLOS 1 Y QUANTQ MEJOR OBRARAN EFTOSADUERTIDOS DE FU EXEMPLO.COM QUE. BALLARAN EN ERTA OBRA LOS PRINCIPES, QUE APRENDER, Y 2 LOS. VAFALLOS,QUEIMITARY TODOS MADTA ENTERNZA CHRIFILA - NA,Y POLITICA CONDUCE,PARA 656 SOUERNAR, Y OBEDECER. ARDUA EMPREFA ES,AMIMAR LOS OJOS A- ESFERA TAN SUPERIOR

In [None]:
def normalize_text(text):
    """Apply historical OCR normalization rules.

    In order: ``ſ`` -> ``s``, ``ç`` -> ``z``, ``q̄`` -> ``que``; then, only if the
    text contains a ``v``, every ``u`` becomes ``v``; finally the acute accent is
    stripped from ``áéíóú``.
    """
    for old, new in (("ſ", "s"), ("ç", "z"), ("q̄", "que")):
        text = text.replace(old, new)

    # A text without any "v" passes through this step unchanged.
    if "v" in text:
        text = text.replace("u", "v")

    accent_table = str.maketrans("áéíóú", "aeiou")
    return text.translate(accent_table)

In [None]:
# A transcript is a UTF-8 text file; a .jpg path cannot be read that way. Locate
# the transcript of the page in `image_path` with the notebook's own layout:
# "<txt_dir>/transcript_<doc>/page<N>.txt".
# NOTE(review): assumes the evaluated page's transcript lives under txt_dir - confirm.
page_stem = os.path.splitext(os.path.basename(image_path))[0]
doc_name, page_filename = extract_doc_and_page(page_stem)
if not (doc_name and page_filename):
    raise ValueError(f"Cannot derive a transcript path from image name: {page_stem}")
transcript_path = os.path.join(txt_dir, f"transcript_{doc_name}", page_filename)

with open(transcript_path, "r", encoding="utf-8") as f:
    ground_truth_text = f.read().strip()

# Same normalisation rules as the training labels.
ground_truth_text=normalize_text(ground_truth_text)

In [31]:
# print() separates its two arguments with a space, so the first transcript line
# is shown with one leading space.
print("\nGround Truth Transcript:\n", ground_truth_text)


Ground Truth Transcript:
 eriedad del asvmpto, vsavdole grave, y ponderoso, sin qve
le encvmbre la afectacion; Qve es alaja mvy estimable
de la prvdencia, qve la plvma por si mas remontada qo-
vierne svs bvelos al peso de las materias. Qvantos co-
nocen al Avtor, le han oydo freqventemente en los pvl-
pitos, y le han venerado con el primer credito en este
exercicio, con qve saben la linea, a qve llega sv eloqven-
cia, nvnca inferior a la de Tvllio, y Demosthenes. La vti-
lidad del libre se conoce del fin, qve por el se pretende.
Sv enseñanza es la Idea de vn Principe Perfecto, y siend-
do este o alma, o cabeza del cverpo de la Repvblica,
bien se deja entender, qvan benignas inflvencias cavsara
sv virtvd en las costvmbres de svs Vasallos, y qvanto
mejor obraran estos advertidos de sv exemplo. Con qve
hallaran en esta obra los Principes, qve aprender, y los
Vasallos, qve imitar, y todos qvanta enseñanza Christia-
na, y Politica condvce, para saber governar, y obedecer.
Ardva empresa es,

In [None]:
!pip install spacy
!python -m spacy download es_core_news_sm


In [None]:
import re
import spacy

# spaCy pipeline used by correct_case() to split the text into sentences.
nlp = spacy.load("es_core_news_sm")

def correct_case(text):
    """Lower-case ``text`` and capitalise the first character of every sentence.

    Sentences are the ones found by the notebook-level ``nlp`` pipeline; they are
    joined back together with single spaces.
    """
    capitalised = []
    for sentence in nlp(text.lower()).sents:
        capitalised.append(sentence.text.capitalize())
    return " ".join(capitalised)
# Re-case the text recognised by process_text_regions and show it.
extracted_text = final_text
clean_text = correct_case(extracted_text)
print(clean_text)


Riedad del afumpto,viaudole graue,y onderofo,fin uc ie encumbre la afetacion : que cs alaja mily ethimable dc la prudencia, que la pluma por 0 mas remontada go- uicthe cus buelos al pefo de las materias. Uantos co- noccen al autor,le ban oypo facequentemere en 109 pul- ditos, y le ban venerado con el primer credito cn cfte exercicio,con gue tablen 1 linea,a que ilega fu eloguen- cia,numca inferior ala de tulio,y demothenes.la vili- lidad del libro te conoce del kn,que por el te pretence. Su enternza cs la idea de vin principe perfeto, y gen- od effe o alma > 0- cabeza del cuerpo de la republicas bien te deja entender,quan benignas infuecias caufara fu'virud cn las coftumbers de bus varallos 1 y quantq mejor obraran eftosaduertidos de fu exemplo.com que. Ballaran en erta obra los principes, que aprender, y 2 los. Vafallos,queimitary todos madta enternza chrifila - na,y politica conduce,para 656 souernar, y obedecer. Ardua emprefa es,amimar los ojos a- esfera tan superior, como la de vd 