# Usage of External Dataset

In [10]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Change directory if needed
# !cd "/content/drive/My Drive/OCRTraining"

# Check directory
!ls "/content/drive/My Drive/OCRTraining/PassportDataset/"

Mounted at /content/drive
dejavu-fonts-ttf-2.37  extract	mutated_passports    patrick_hand_font	  test
DejaVuSans-Bold.ttf    img	original_annotation  synthetic_passports  train


# Usage of Labelme for Annotation

In [None]:
!pip install -U labelme2coco



In [None]:
!labelme

/bin/bash: line 1: labelme: command not found


# Split Train and Test
For Indian Passport Dataset

In [None]:
# Imports
import os
import random
import shutil

# File paths
dataset_path = "/content/drive/My Drive/OCRTraining/PassportDataset"
images_folder = os.path.join(dataset_path, "mutated_passports")  # Base path to mutated image folder
extracted_annotation_path = os.path.join(dataset_path, "synthetic_passports/labels.txt")  # Image_Path|Fields
train_folder = os.path.join(dataset_path, "train")
test_folder = os.path.join(dataset_path, "test")
train_folder_img = os.path.join(train_folder, "images")
test_folder_img = os.path.join(test_folder, "images")
train_annotation_txt = os.path.join(train_folder, "train_annotation.txt")
test_annotation_txt = os.path.join(test_folder, "test_annotation.txt")

# Ensure output folders exist
os.makedirs(train_folder_img, exist_ok=True)
os.makedirs(test_folder_img, exist_ok=True)

# Load annotations
with open(extracted_annotation_path, 'r') as f:
    annotations = f.readlines()

# Debug: Check the first few lines of annotations
print("First few lines of labels.txt:")
for i, line in enumerate(annotations[:5]):
    print(f"Line {i+1}: {line.strip()}")

# Skip the header
annotations = annotations[1:]  # Remove the header: "Image_Path|Fields"

# Shuffle the annotations randomly
random.shuffle(annotations)

# Split annotations: 80% train, 20% test
split_index = int(0.8 * len(annotations))
train_annotations = annotations[:split_index]
test_annotations = annotations[split_index:]

# Function to move images and update annotations with their new paths
def move_images_and_update_annotations(annotations, src_folder, dest_folder_img):
    updated_annotations = []
    for line in annotations:
        line_parts = line.strip().split("|")
        if len(line_parts) != 2:
            print(f"Skipping invalid annotation line: {line.strip()}")
            continue

        img_file = os.path.basename(line_parts[0])  # Extract image file name
        label = line_parts[1]  # Extract JSON string for fields
        src_path = os.path.join(src_folder, img_file)
        dest_path = os.path.join(dest_folder_img, img_file)

        if os.path.exists(src_path):  # Check if the source file exists
            shutil.copy(src_path, dest_path)  # Move image to destination
            '''
            Update the path in the annotation to point to the new end folder, for instance:
            From:
            /content/drive/My Drive/OCRTraining/PassportDataset/synthetic_passports/passport_1.jpg | {"fullname": "John Doe", "nationality": "USA", ...}
            To:
            /content/drive/My Drive/OCRTraining/PassportDataset/mutated_passports/passport_1.jpg | {"fullname": "John Doe", "nationality": "USA", ...}
            '''
            updated_annotations.append(f"{dest_path}\t{label}\n")
        else:
            print(f"Warning: File not found: {src_path}")
    return updated_annotations

# Move images and update train and test annotations
train_annotations_updated = move_images_and_update_annotations(train_annotations, images_folder, train_folder_img)
test_annotations_updated = move_images_and_update_annotations(test_annotations, images_folder, test_folder_img)

# Write and save updated train and test annotation files
with open(train_annotation_txt, 'w') as f:
    f.writelines(train_annotations_updated)

with open(test_annotation_txt, 'w') as f:
    f.writelines(test_annotations_updated)

print(f"Dataset split completed:")
print(f"Train dataset: {len(train_annotations_updated)} images")
print(f"Test dataset: {len(test_annotations_updated)} images")

First few lines of labels.txt:
Line 1: Image_Path|Fields
Line 2: /content/drive/My Drive/OCRTraining/PassportDataset/synthetic_passports/vpassport_284.jpg|{"fullname": "\u00f2Bo\u1ee4hy\u0110h\u1eb2\u1ed7rB", "nationality": "q\u1ef5O\u1ec6\u1edb\u0168aA", "pob": "\u00d2\u1ed9IdtOAexWB\u00c1\u0129\u1eed\u1ec4", "dob": "31/10/1967", "sex": "N\u1eef/F", "cmnd": "328219921", "passportid": "C7242158", "nameid": "VNM\u00d2BO\u1ee4HY\u0110H\u1eb2\u1ed6RB<", "passportid2": "C7242158"}
Line 3: /content/drive/My Drive/OCRTraining/PassportDataset/synthetic_passports/vpassport_378.jpg|{"fullname": "GX\u1ec5\u00d2\u1ef2\u1ea4a\u1eb5m\u00da\u01b0s", "nationality": "G\u1eb8u\u1ebb\u1ea9\u1eef\u00e8Q", "pob": "u\u1ef0Sx\u1edat\u1ec9\u1eb5\u1ec1\u1ec8\u1ee8qfW\u00e0", "dob": "31/10/1967", "sex": "N\u1eef/F", "cmnd": "404436753", "passportid": "N9063181", "nameid": "VNMGX\u1ec4\u00d2\u1ef2\u1ea4A\u1eb4M\u00da\u01afS<", "passportid2": "N9063181"}
Line 4: /content/drive/My Drive/OCRTraining/PassportDatase

# Data Box Labelling from Original Template
For Indian Passport Dataset

In [None]:
import os
import json
import cv2

# Paths
dataset_path = "/content/drive/My Drive/OCRTraining/PassportDataset"
image_folder = os.path.join(dataset_path, "train/images")
label_folder = os.path.join(dataset_path, "train/labels")
template_json_path = os.path.join(dataset_path, "synthetic_passports/template/passport_template.json")
os.makedirs(label_folder, exist_ok=True)

# YOLO class mapping
classes = {
    "fullname": 0,
    "nationality": 1,
    "dob": 2,
    "sex": 3,
    "pob": 4,
    "cmnd": 5,
    "passportid": 6,
    "nameid": 7,
    "passportid2": 8
}

# Convert polygon points to YOLO format
def convert_to_yolo(points, img_width, img_height):
    x_coords = [p[0] for p in points]
    y_coords = [p[1] for p in points]
    x_min, x_max = min(x_coords), max(x_coords)
    y_min, y_max = min(y_coords), max(y_coords)
    x_center = (x_min + x_max) / 2 / img_width
    y_center = (y_min + y_max) / 2 / img_height
    bbox_width = (x_max - x_min) / img_width
    bbox_height = (y_max - y_min) / img_height
    return x_center, y_center, bbox_width, bbox_height

# Draw bounding boxes for debug images
def draw_bounding_boxes(img, points, label_name, color=(0, 255, 0)):
    x_coords = [p[0] for p in points]
    y_coords = [p[1] for p in points]
    x_min, x_max = int(min(x_coords)), int(max(x_coords))
    y_min, y_max = int(min(y_coords)), int(max(y_coords))
    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 2)
    cv2.putText(img, label_name, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Generate label files for all images using the template
# Write label YOLO format: <object_class> <x_center> <y_center> <width> <height>
def generate_labels_from_template(image_folder, label_folder, template_json_path):
    # Load template JSON
    with open(template_json_path, "r") as f:
        template_data = json.load(f)

    #  Make sure the image is in a correct extension format
    shapes = template_data["shapes"]  # Get field shapes
    for img_file in os.listdir(image_folder):
        if not img_file.endswith(".jpg"):
            continue

        # Define input and output paths
        image_path = os.path.join(image_folder, img_file)
        label_path = os.path.join(label_folder, f"{os.path.splitext(img_file)[0]}.txt")
        debug_image_path = os.path.join(label_folder, f"{os.path.splitext(img_file)[0]}_debug.jpg")

        # Read the image to get dimensions
        img = cv2.imread(image_path)
        if img is None:
            print(f"Error: Could not load {image_path}. Skipping.")
            continue
        img_height, img_width = img.shape[:2]

        # Init empty labels array lists that we can later add in data
        labels = []
        for shape in shapes:
            label_name = shape["label"]
            points = shape["points"]

            # Hardly happens but a checker to ensure the class labelled is valid
            if label_name not in classes:
                print(f"Warning: {label_name} is not in class mapping. Skipping.")
                continue

            # Mapping the YOLO coordination format label file with corresponding to the class id of that image
            class_id = classes[label_name]
            x_center, y_center, bbox_width, bbox_height = convert_to_yolo(points, img_width, img_height)
            labels.append(f"{class_id} {x_center:.6f} {y_center:.6f} {bbox_width:.6f} {bbox_height:.6f}")

            # Draw bounding boxes for debugging (draw surround that field by coordinated pts, and the label name or class)
            draw_bounding_boxes(img, points, label_name)

        # Save labels to file
        with open(label_path, "w") as f:
            f.write("\n".join(labels))
        print(f"Saved labels for {img_file} to {label_path}")

        # Save the debug image
        cv2.imwrite(debug_image_path, img)
        print(f"Saved debug image to {debug_image_path}")

# Main script
if __name__ == "__main__":
    generate_labels_from_template(image_folder, label_folder, template_json_path)
    print("Label generation complete!")

Saved labels for vpassport_1219.jpg to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_1219.txt
Saved debug image to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_1219_debug.jpg
Saved labels for vpassport_688.jpg to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_688.txt
Saved debug image to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_688_debug.jpg
Saved labels for vpassport_2666.jpg to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_2666.txt
Saved debug image to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_2666_debug.jpg
Saved labels for vpassport_1138.jpg to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_1138.txt
Saved debug image to /content/drive/My Drive/OCRTraining/PassportDataset/train/labels/vpassport_1138_debug.jpg
Saved labels for vpassport_1907.jpg to /content/drive/My Drive/OCRT

# Using Fast R-CNN (Pytorch) to Train Best Model
For Synthetic Dataset

**Faster R-CNN** (via PyTorch or TensorFlow Object Detection API):

Accepts bounding box coordinates in the format `x_min, y_min, x_max, y_max`.
Can handle custom datasets and outputs confidence scores for each predicted bounding box and class.

**Dataset Class**:\
Reads images and corresponding label files.
Prepares the data in the required format for PyTorch.

**Model:**\
Uses Faster R-CNN with a ResNet-50 backbone.
Adjusts the final layer for the number of classes.

**Accuracy Computation:**\
Add evaluation logic using IoU (Intersection over Union) or another metric.

**Result:**\
The best model with accuracy above 70% will be saved in /content/drive/My Drive/OCRTraining/PassportDataset/train/model/.


In [None]:
import os
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader, Dataset
import cv2
import json
import numpy as np
from tqdm import tqdm

# Dataset Class
class PassportDataset(Dataset):
    def __init__(self, image_dir, label_dir, transforms=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transforms = transforms
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_file = self.image_files[idx]
        label_file = os.path.splitext(image_file)[0] + ".txt"

        # Load image
        image_path = os.path.join(self.image_dir, image_file)
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Load labels
        label_path = os.path.join(self.label_dir, label_file)
        with open(label_path, 'r') as f:
            lines = f.readlines()

        boxes = []
        labels = []
        for line in lines:
            parts = line.strip().split()
            if len(parts) != 5:
                continue
            class_id, x_min, y_min, x_max, y_max = map(int, parts)
            boxes.append([x_min, y_min, x_max, y_max])
            labels.append(class_id)

        # Convert to tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {"boxes": boxes, "labels": labels}

        if self.transforms:
            image = self.transforms(image)

        return F.to_tensor(image), target

# Define Model
def get_model(num_classes):
    model = fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model

# Training Loop
def train_one_epoch(model, optimizer, data_loader, device):
    model.train()
    epoch_loss = 0
    for images, targets in tqdm(data_loader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        epoch_loss += losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    return epoch_loss / len(data_loader)

# Compute IoU
def compute_iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

    iou = inter_area / (box1_area + box2_area - inter_area + 1e-6)
    return iou

# Evaluate Model
def evaluate_model(model, data_loader, device, iou_threshold=0.5):
    model.eval()
    total_predictions = 0
    correct_predictions = 0

    with torch.no_grad():
        for images, targets in tqdm(data_loader):
            images = list(image.to(device) for image in images)
            outputs = model(images)

            for i, output in enumerate(outputs):
                pred_boxes = output['boxes'].cpu().numpy()
                pred_labels = output['labels'].cpu().numpy()
                gt_boxes = targets[i]['boxes'].cpu().numpy()
                gt_labels = targets[i]['labels'].cpu().numpy()

                for gt_box, gt_label in zip(gt_boxes, gt_labels):
                    total_predictions += 1
                    for pred_box, pred_label in zip(pred_boxes, pred_labels):
                        if pred_label == gt_label:
                            iou = compute_iou(gt_box, pred_box)
                            if iou >= iou_threshold:
                                correct_predictions += 1
                                break

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Paths
train_image_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/train/images/"
train_label_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/train/labels/"
test_image_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/test/images/"
test_label_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/test/labels/"
model_save_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/train/model/"

# Hyperparameters
num_classes = 10  # Number of classes (including background)
num_epochs = 20
batch_size = 4
learning_rate = 0.005

# Prepare Dataset and Dataloader
train_dataset = PassportDataset(train_image_dir, train_label_dir)
test_dataset = PassportDataset(test_image_dir, test_label_dir)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

# Device
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cpu")

# Initialize Model
model = get_model(num_classes).to(device)

# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0005)

# Train and Save Best Model
best_accuracy = 0
lowest_loss = float('inf')  # Track the lowest loss

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, optimizer, train_loader, device)
    print(f"Epoch {epoch + 1}, Loss: {train_loss:.4f}")

    # Evaluate Accuracy
    current_accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch + 1}, Accuracy: {current_accuracy:.4f}")

    # Save model only if conditions are met
    if train_loss <= 0.07:  # Skip saving if loss > 0.07
        # Save the best model so the one with the highest accuracy (from history)
        # Or scenarioing 2 equals accuracy then it will be determined by the loss metric
        if current_accuracy > best_accuracy or (current_accuracy == best_accuracy and train_loss < lowest_loss):
            best_accuracy = current_accuracy
            lowest_loss = train_loss
            model_path = os.path.join(model_save_dir, f"best_model_{best_accuracy:.2f}_{train_loss:.4f}.pth")
            torch.save(model.state_dict(), model_path)
            print(f"Saved Best Model with Accuracy: {best_accuracy:.2f}, Loss: {lowest_loss:.4f}")
# Done
print("Training Complete!")

100%|██████████| 417/417 [3:24:40<00:00, 29.45s/it]


Epoch 1, Loss: 0.4530


100%|██████████| 150/150 [41:47<00:00, 16.72s/it]


Epoch 1, Accuracy: 0.8875


100%|██████████| 417/417 [2:07:57<00:00, 18.41s/it]


Epoch 2, Loss: 0.1351


100%|██████████| 150/150 [17:04<00:00,  6.83s/it]


Epoch 2, Accuracy: 0.8875


100%|██████████| 417/417 [2:08:15<00:00, 18.45s/it]


Epoch 3, Loss: 0.0874


100%|██████████| 150/150 [18:15<00:00,  7.30s/it]


Epoch 3, Accuracy: 0.8873


100%|██████████| 417/417 [2:06:11<00:00, 18.16s/it]


Epoch 4, Loss: 0.0678


100%|██████████| 150/150 [16:42<00:00,  6.68s/it]


Epoch 4, Accuracy: 0.8875
Saved Best Model with Accuracy: 0.89, Loss: 0.0678


 26%|██▌       | 107/417 [32:15<1:32:44, 17.95s/it]

# Manual Testing on Best Model Accuracy 1
This script having error when rotating the rectangle\
For Synthetic Dataset

Import Dependencies TourchVision and FastRCNN

In [13]:
# OpenCV TorchVision
!pip install torch torchvision opencv-python
# Install PyTorch that matches the Colab environment
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu11
# Install Detectron2 engine from GitHub
!pip install 'git+https://github.com/facebookresearch/detectron2.git'


Looking in indexes: https://download.pytorch.org/whl/cu11
Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-2jag7g2o
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-2jag7g2o
  Resolved https://github.com/facebookresearch/detectron2.git to commit b1c43ffbc995426a9a6b5c667730091a384e0fa4
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.8 (from detectron2==0.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting fvcore<0.1.6,>=0.1.5 (from detectron2==0.6)
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6)
  Downloading iopath-0.1.9-py3

In [3]:
# Install detection dependencies
!pip install detectron2



In [8]:
# Clone to Detectron2 GitHub for configs file usage
!git clone https://github.com/facebookresearch/detectron2.git

fatal: destination path 'detectron2' already exists and is not an empty directory.
detectron2  drive  sample_data


In [11]:
# Make sure config yaml file exist
!ls detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml

detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml


In [19]:
import os
import cv2
import torch
import numpy as np
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.utils.visualizer import Visualizer
from detectron2.structures import RotatedBoxes

# Class Mapping
classes = {
    0: "fullname",
    1: "nationality",
    2: "dob",
    3: "sex",
    4: "pob",
    5: "cmnd",
    6: "passportid",
    7: "nameid",
    8: "passportid2"
}

# Configure Detectron2
def configure_detectron2(model_path, num_classes):
    """
    Configure Detectron2 with proper settings.
    """
    cfg = get_cfg()
    config_path = "detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
    assert os.path.exists(config_path), f"Configuration file not found at {config_path}"
    cfg.merge_from_file(config_path)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = num_classes  # Ensure it matches training configuration
    cfg.MODEL.WEIGHTS = model_path
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
    cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    return cfg


def draw_axis_aligned_box(image, box, color, class_name, score):
    """
    Draw an axis-aligned bounding box with the given parameters.
    """
    x_min, y_min, x_max, y_max = map(int, box)
    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)
    cv2.putText(
        image, f"{class_name} ({score:.2f})", (x_min, y_min - 10),
        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA
    )


def visualize_predictions(image_path, predictor, output_path):
    """
    Visualize predictions with axis-aligned bounding boxes.
    """
    image = cv2.imread(image_path)
    outputs = predictor(image)

    # Predictions
    predictions = outputs["instances"].to("cpu")
    boxes = predictions.pred_boxes.tensor.numpy()  # Axis-aligned boxes
    scores = predictions.scores.numpy()
    pred_classes = predictions.pred_classes.numpy()

    for box, score, class_id in zip(boxes, scores, pred_classes):
        class_name = classes.get(class_id, "unknown")
        draw_axis_aligned_box(image, box, (0, 255, 0), class_name, score)

    # Debug outputs
    print(f"Predicted Boxes: {boxes}")
    print(f"Predicted Scores: {scores}")
    print(f"Predicted Classes: {pred_classes}")

    # Save output
    cv2.imwrite(output_path, image)
    print(f"Predictions saved to {output_path}")


# Test ID and Model
test_id = 2893  # Example
num_classes = len(classes)

# Paths
model_path = f"/content/drive/My Drive/OCRTraining/PassportDataset/train/model/best_model_{ba}_{bl}.pth"
image_path = f"/content/drive/My Drive/OCRTraining/PassportDataset/test/images/vpassport_{test_id}.jpg"
output_path = f"/content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_passport/predicted_vpassport_{test_id}.jpg"

# Load Model and Predictor
cfg = configure_detectron2(model_path, num_classes)
predictor = DefaultPredictor(cfg)

# Run Predictions
visualize_predictions(image_path, predictor, output_path)

  return torch.load(f, map_location=torch.device("cpu"))
backbone.bottom_up.res2.0.conv1.norm.{bias, weight}
backbone.bottom_up.res2.0.conv1.weight
backbone.bottom_up.res2.0.conv2.norm.{bias, weight}
backbone.bottom_up.res2.0.conv2.weight
backbone.bottom_up.res2.0.conv3.norm.{bias, weight}
backbone.bottom_up.res2.0.conv3.weight
backbone.bottom_up.res2.0.shortcut.norm.{bias, weight}
backbone.bottom_up.res2.0.shortcut.weight
backbone.bottom_up.res2.1.conv1.norm.{bias, weight}
backbone.bottom_up.res2.1.conv1.weight
backbone.bottom_up.res2.1.conv2.norm.{bias, weight}
backbone.bottom_up.res2.1.conv2.weight
backbone.bottom_up.res2.1.conv3.norm.{bias, weight}
backbone.bottom_up.res2.1.conv3.weight
backbone.bottom_up.res2.2.conv1.norm.{bias, weight}
backbone.bottom_up.res2.2.conv1.weight
backbone.bottom_up.res2.2.conv2.norm.{bias, weight}
backbone.bottom_up.res2.2.conv2.weight
backbone.bottom_up.res2.2.conv3.norm.{bias, weight}
backbone.bottom_up.res2.2.conv3.weight
backbone.bottom_up.res3.0.c

Predicted Boxes: [[ 367.3512     0.       376.62714 1679.9999 ]]
Predicted Scores: [1.]
Predicted Classes: [0]
Predictions saved to /content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_passport/predicted_vpassport_2893.jpg


# Manual Testing on Best Model Accuracy 2
This shows only right-angled rectangle sketch\
For Synthetic Dataset

In [4]:
import os
import torch
import cv2
import numpy as np
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F

# Class Mapping
classes = {
    0: "fullname",
    1: "nationality",
    2: "dob",
    3: "sex",
    4: "pob",
    5: "cmnd",
    6: "passportid",
    7: "nameid",
    8: "passportid2"
}

# Load Model
def load_model(model_path, num_classes):
    """
    Load a Faster R-CNN model with a custom number of classes and weights.
    """
    model = fasterrcnn_resnet50_fpn(pretrained=False)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    map_location = torch.device('cpu') if not torch.cuda.is_available() else None
    model.load_state_dict(torch.load(model_path, map_location=map_location))
    model.eval()
    return model

# Draw Polygon
def draw_polygon(image, box, color, class_name, score):
    """
    Draws a polygon matching the coordinates of the bounding box.
    """
    points = np.array([
        [box[0], box[1]],  # x_min, y_min
        [box[2], box[1]],  # x_max, y_min
        [box[2], box[3]],  # x_max, y_max
        [box[0], box[3]]   # x_min, y_max
    ], np.int32)
    cv2.polylines(image, [points], isClosed=True, color=color, thickness=2)
    cv2.putText(
        image, f"{class_name} ({score:.2f})", (int(box[0]), int(box[1] - 10)),
        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA
    )

# Visualize Predictions
def visualize_predictions(image_path, model, device, output_path):
    """
    Visualize predictions on an image using the trained model.
    """
    image = cv2.imread(image_path)
    original_image = image.copy()
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    tensor_image = F.to_tensor(image).unsqueeze(0).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = model(tensor_image)[0]

    boxes = predictions['boxes'].cpu().numpy()
    labels = predictions['labels'].cpu().numpy()
    scores = predictions['scores'].cpu().numpy()

    detected_classes = set()

    for box, label, score in zip(boxes, labels, scores):
        if score < 0.000000001:  # Confidence threshold (should lower this)
            continue
        class_name = classes[label]
        detected_classes.add(class_name)
        print(f"{class_name} {box[0]}, {box[1]}, {box[2]}, {box[3]}")
        draw_polygon(original_image, box, (0, 255, 0), class_name, score)

    # Check for missing classes
    missing_classes = set(classes.values()) - detected_classes
    if missing_classes:
        print(f"Detected field for {missing_classes} is missing, please try again")

    # Save and show results
    cv2.imwrite(output_path, original_image)
    print(f"Predictions saved to {output_path}")

# Test ID, change this to match your testing image
test_id = 288   # Example usage
# Model ID, change this to match your testing model
ba = 0.89       # Best model accuracy threshold
bl = 0.0599     # Best model loss threshold

# Paths
model_path = f"/content/drive/My Drive/OCRTraining/PassportDataset/train/model/best_model_{ba}_{bl}.pth"
image_path = f"/content/drive/My Drive/OCRTraining/PassportDataset/test/images/vpassport_{test_id}.jpg"
output_path = f"/content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_passport/predicted_vpassport_{test_id}.jpg"

# Load Model and Device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = load_model(model_path, num_classes=10).to(device)

# Run Prediction and Visualization
visualize_predictions(image_path, model, device, output_path)

  model.load_state_dict(torch.load(model_path, map_location=map_location))


nameid 347.2960205078125, 1330.955078125, 781.1043090820312, 1401.799560546875
sex 623.9873657226562, 1158.076904296875, 749.3565673828125, 1197.85302734375
pob 938.0919799804688, 1119.719482421875, 1088.7918701171875, 1166.02685546875
dob 628.4304809570312, 1094.3599853515625, 802.179443359375, 1136.3515625
cmnd 928.7145385742188, 1185.157958984375, 1056.8521728515625, 1221.48681640625
nationality 940.6444702148438, 1055.2967529296875, 1229.8485107421875, 1106.7686767578125
passportid 997.57763671875, 975.9032592773438, 1131.196044921875, 1016.792724609375
passportid2 308.67071533203125, 1376.48876953125, 473.3555603027344, 1425.3828125
Detected field for {'fullname'} is missing, please try again
Predictions saved to /content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_passport/predicted_vpassport_288.jpg


# Import VietOCR dependency

In [6]:
!pip install vietocr



# Custom Augmentor
For Indian Passport Dataset

In [None]:
from vietocr.tool.config import Cfg
from vietocr.model.trainer import Trainer
from imgaug import augmenters as iaa
import numpy as np
from PIL import Image
import cv2

# Define Custom Augmentor
class MyAugmentor:
    def __init__(self):
        self.aug = iaa.Sequential([
            iaa.Affine(rotate=(-10, 10)),  # Small random rotations
            iaa.GaussianBlur(sigma=(0, 0.1)),  # Blur intensity reduced
            iaa.AdditiveGaussianNoise(scale=(0, 0.01 * 255)),  # Noise intensity lowered
        ])

    # Detect if image is rotated in any of the 90, 180 or 270 degrees
    def correct_orientation_cv2(self, img):
        gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        edges = cv2.Canny(thresh, 50, 150)
        lines = cv2.HoughLines(edges, 1, np.pi / 180, 100)
        angles = []

        if lines is not None:
            for rho, theta in lines[:, 0]:
                angle = (theta * 180 / np.pi) % 180
                if 80 <= angle <= 100 or 260 <= angle <= 280:
                    angles.append(angle)

        if len(angles) > 0:
            avg_angle = np.mean(angles)
            if avg_angle > 90:
                avg_angle -= 180
            return avg_angle
        return 0

    def __call__(self, img):
        angle = self.correct_orientation_cv2(img)
        img = img.rotate(-angle, expand=True) if abs(angle) > 1 else img

        # Apply augmentations and return as PIL Image
        augmented = self.aug(image=np.array(img))  # Convert PIL to NumPy, apply augmentation
        return Image.fromarray(augmented)  # Convert back to PIL Image

# Crop Selective Fields and Annotate
For Indian Passport Dataset and optional for Synthetic Dataset

In [None]:
import os
import json
import random
from PIL import Image

# File paths
dataset_path = "/content/drive/My Drive/OCRTraining/PassportDataset"
images_folder = os.path.join(dataset_path, "img")  # Path to the original images
annotation_path = os.path.join(dataset_path, "original_annotation/annotation.json")  # Bounding box annotation
extracted_annotation_path = os.path.join(dataset_path, "extract/annotation.txt")  # Text annotations
train_folder = os.path.join(dataset_path, "train/crops")
test_folder = os.path.join(dataset_path, "test/crops")

# Ensure output folders exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Load annotation.json
with open(annotation_path, "r") as f:
    annotation_data = json.load(f)

# Load extracted text annotations
with open(extracted_annotation_path, "r") as f:
    text_annotations = f.readlines()

# Map extracted annotations into a dictionary
# Format: {image_file: [firstname, lastname, dob, country, gender, cardnumber]}
text_annotation_map = {}
for line in text_annotations:
    parts = line.strip().split("\t")
    img_file = os.path.basename(parts[0])  # Extract only the image file name
    text_annotation_map[img_file] = parts[1:]

# Map category_id to field indices in text_annotation_map
category_map = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5}  # Maps category_id to the correct index in text_annotation_map

# Split images into train and test
def split_images(annotation_data, train_ratio=0.8):
    images = annotation_data["images"]
    annotations = annotation_data["annotations"]

    # Shuffle and split the data
    random.shuffle(images)
    split_idx = int(len(images) * train_ratio)
    train_images = images[:split_idx]
    test_images = images[split_idx:]

    # Get corresponding annotations
    train_ids = {img["id"] for img in train_images}
    train_annotations = [anno for anno in annotations if anno["image_id"] in train_ids]
    test_annotations = [anno for anno in annotations if anno["image_id"] not in train_ids]

    return train_images, train_annotations, test_images, test_annotations

# Crop and save fields from images
def crop_and_save_fields(images, annotations, output_folder, category_map, text_annotation_map):
    for anno in annotations:
        image_id = anno["image_id"]
        category_id = anno["category_id"]
        bbox = anno["bbox"]  # [x, y, width, height]

        # Get the corresponding image file
        image_file = next((img["file_name"] for img in images if img["id"] == image_id), None)
        if image_file:
            img_path = os.path.join(images_folder, image_file)
            if os.path.exists(img_path):
                # Open and crop the image
                img = Image.open(img_path)
                cropped = img.crop((bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]))

                # Check if category_id exists in category_map
                field_name = next((key for key, val in category_map.items() if val == category_id), None)
                if not field_name:
                    print(f"Warning: category_id {category_id} not found in category_map.")
                    continue  # Skip this annotation

                # Save cropped image
                cropped_img_path = os.path.join(output_folder, f"{image_id}_{field_name}.jpg")
                cropped.save(cropped_img_path)

                # Save the label using the extracted annotation
                if image_file in text_annotation_map:
                    label = text_annotation_map[image_file][category_map[category_id]]
                else:
                    label = "N/A"  # Default to N/A if no matching annotation found

                label_path = cropped_img_path.replace(".jpg", ".txt")
                with open(label_path, "w") as f:
                    f.write(label)

# Split the data
train_images, train_annotations, test_images, test_annotations = split_images(annotation_data)

# Crop and save for train and test
crop_and_save_fields(train_images, train_annotations, train_folder, category_map, text_annotation_map)
crop_and_save_fields(test_images, test_annotations, test_folder, category_map, text_annotation_map)

print("Cropping and Annotation complete!")


Cropping and Annotation complete!


# GPU Usage on Google Collab

In [None]:
# Optional: Check if device/VM have GPU (for cuda)
# CHECK GPU USAGE CAPABILITY
import torch

# # Check if GPU is available and set the device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # If cuda is not available means that we are using CPU
# print(f"Using {device} for training")

if torch.cuda.is_available():
  print(f"GPU: {torch.cuda.get_device_name(0)} is available")
else:
  print("Run on CPU ONLY")

# On Google Collab, make sure to change runtime to GPU in advance

GPU: Tesla T4 is available


# Train Dataset with VietOCR
For Indian Passport Dataset

In [None]:
from vietocr.tool.config import Cfg
from vietocr.model.trainer import Trainer
from imgaug import augmenters as iaa
import numpy as np
from PIL import Image
import cv2

# Configuration for the Model
config = Cfg.load_config_from_name('vgg_transformer')
'''
Summary of chosen config model:

| Feature                         | **vgg_transformer**                      | **vgg_seq2seq**                   |
|---------------------------------|------------------------------------------|-----------------------------------|
| **Decoder Type**                | Transformer                              | Seq2Seq with LSTM/GRU             |
| **Speed**                       | Slower on short text, faster on long text| Faster on short text              |
| **Accuracy**                    | Higher for longer text and complex data  | Slightly lower for long sequences |
| **Resource Usage**              | High (Memory, GPU)                       | Lower (Memory, GPU)               |
| **Best for**                    | Long text, complex sequences             | Short text, limited hardware      |
'''

# Update dataset parameters
dataset_params = {
    'name': 'passport',  # Dataset name
    'data_root': '/content/drive/My Drive/OCRTraining/PassportDataset',  # Base folder
    'train_annotation': 'train/train_annotation.txt',  # Train annotation file
    'valid_annotation': 'test/test_annotation.txt'  # Validation annotation file
}

# Update training parameters
params = {
    'print_every': 100,  # Print loss every 100 iterations
    'valid_every': 500,  # Validate every 500 iterations
    'iters': 5000,  # Number of training iterations
    'export': '/content/drive/My Drive/OCRTraining/PassportDataset/weights/passportocr.pth',  # Path to save weights
    'metrics': 20  # Number of test samples used for evaluation
}

# Update configurations
config['trainer'].update(params)
config['dataset'].update(dataset_params)
config['device'] = 'cpu'  # Use GPU if available, change to 'cpu' otherwise

# Initialize Trainer with Custom Augmentor
trainer = Trainer(config, pretrained=True, augmentor=MyAugmentor())

# Visualize dataset (optional)
trainer.visualize_dataset()

# Train the model
trainer.train()

# Save final configuration
trainer.config.save('/content/drive/My Drive/OCRTraining/PassportDataset/config.yml')


18533it [00:15, 1166.24it/s]
  state_dict = torch.load(filename, map_location=torch.device(self.device))
Create train_passport:   0%|                                                 | 0/80 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)

# Predict Dataset with VietOCR
For Synthetic Dataset

- We use pre-trained vgg_transformer model from VietOCR for this task.
- Pre-trained model already sustainable for text recognisation.
- For manual training with text detection, review the methods of:
  - Applying custom augumentor.
  - Crop and annotate fields.
  - Train with cropped fields from the Indian Dataset

In [None]:
import os
import torch
import cv2
import json
import numpy as np
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg

# Class Mapping
classes = {
    0: "fullname",
    1: "nationality",
    2: "dob",
    3: "sex",
    4: "pob",
    5: "cmnd",
    6: "passportid",
    7: "nameid",
    8: "passportid2"
}

# Load Faster R-CNN Model
def load_model(model_path, num_classes):
    """
    Load a Faster R-CNN model with a custom number of classes and weights.
    """
    model = fasterrcnn_resnet50_fpn(pretrained=False)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    map_location = torch.device('cpu') if not torch.cuda.is_available() else None
    model.load_state_dict(torch.load(model_path, map_location=map_location))
    model.eval()
    return model

# Load VietOCR
def load_vietocr_model():
    """
    Load the VietOCR model for text recognition.
    """
    config = Cfg.load_config_from_name('vgg_transformer')
    # Use either vgg_transformer or vgg_seq2seq, statistic comparision can be found in the above note block
    config['weights'] = ocr_model_weights_path

    config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'
    config['predictor']['beamsearch'] = False
    return Predictor(config)

# Read Text Using VietOCR
def read_text_with_vietocr(image, model):
    """
    Use VietOCR to read text from an image.
    """
    from PIL import Image
    pil_image = Image.fromarray(image)
    text = model.predict(pil_image)
    return text

# Process Bounding Boxes
def process_bbox_with_vietocr(image, model, bbox, field_name):
    """
    Extract the region of interest (ROI) from the image, adjust offsets for 'pob' and 'cmnd', and recognize text.
    """
    x_min, y_min, x_max, y_max = map(int, bbox)

    # Expand bbox for specific fields
    if field_name in ['pob', 'cmnd']:
        offset = 20  # Adjust offset as needed
        x_min = max(0, x_min - offset)
        y_min = max(0, y_min - offset)
        x_max = min(image.shape[1], x_max + offset)
        y_max = min(image.shape[0], y_max + offset)

    roi = image[y_min:y_max, x_min:x_max]
    return read_text_with_vietocr(roi, model)

# Detect and Extract Text
def detect_and_extract_text(image_path, detection_model, ocr_model, output_json_path):
    """
    Detect fields in an image, extract text using VietOCR, and save results in a JSON file.
    """
    # Load the image
    image = cv2.imread(image_path)
    original_image = image.copy()
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    tensor_image = F.to_tensor(image).unsqueeze(0).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = detection_model(tensor_image)[0]

    boxes = predictions['boxes'].cpu().numpy()
    labels = predictions['labels'].cpu().numpy()
    scores = predictions['scores'].cpu().numpy()

    detected_classes = set()
    result = {}

    for box, label, score in zip(boxes, labels, scores):
        if score < 0.5:  # Confidence threshold
            continue
        class_name = classes[label]
        detected_classes.add(class_name)

        # Extract text with VietOCR
        text = process_bbox_with_vietocr(original_image, ocr_model, box, class_name)
        result[class_name] = text
        print(f"Extracted {class_name}: {text}")

    # Check for missing classes
    missing_classes = set(classes.values()) - detected_classes
    if missing_classes:
        print(f"Detected field for {missing_classes} is missing, please try again")

    # Save result to JSON
    with open(output_json_path, "w") as json_file:
        json.dump(result, json_file, indent=4)
    print(f"Text results saved to {output_json_path}")

# Paths
test_images_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/test/images/"
output_text_dir = "/content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_text/"
os.makedirs(output_text_dir, exist_ok=True)

# Model Paths
detection_model_path = "/content/drive/My Drive/OCRTraining/PassportDataset/train/model/best_model_0.89_0.0599.pth"
ocr_model_weights_path = "/content/drive/My Drive/OCRTraining/PassportDataset/train/model/text_detection_weight.pth"  # Path to VietOCR weights

'''
If using pre-trained model, download from here:
https://vocr.vn/data/vietocr/vgg_transformer.pth
'''

# Load Models
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
detection_model = load_model(detection_model_path, num_classes=10).to(device)
ocr_model = load_vietocr_model()

# Process All Test Images
for image_name in os.listdir(test_images_dir):
    if image_name.endswith((".jpg", ".jpeg", ".png")):  # Supported image formats
        print(f"Processing {image_name}...")
        image_path = os.path.join(test_images_dir, image_name)
        output_json_path = os.path.join(output_text_dir, f"{os.path.splitext(image_name)[0]}.json")
        detect_and_extract_text(image_path, detection_model, ocr_model, output_json_path)


  model.load_state_dict(torch.load(model_path, map_location=map_location))


Processing vpassport_1805.jpg...
Extracted nameid: 03100101001960
Extracted dob: GI33/11/161
Extracted sex: Nam
Extracted passportid: Y53H69%
Extracted cmnd: Construction
Extracted pob: Stransform
Extracted nationality: Nơi sinh/Phám tịch
Extracted passportid2: YS316034
Detected field for {'fullname'} is missing, please try again
Text results saved to /content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_text/vpassport_1805.json
Processing vpassport_2978.jpg...
Extracted dob: 15/06/2002
Extracted nationality: 0360000888000
Extracted nameid: Constructionalists
Extracted pob: RECONTONISTS
Extracted passportid: W9455324
Extracted sex: Nam/M
Extracted cmnd: GROUNDING
Extracted passportid2: WOASS332A
Extracted sex: Dateoff
Extracted dob: Sextailing
Extracted nationality: 035324
Extracted passportid: DICORVIET
Detected field for {'fullname'} is missing, please try again
Text results saved to /content/drive/My Drive/OCRTraining/PassportDataset/test/predicted_text/vpassport_2978.j

# Evaluate Result on Test Set
For Indian Passport Dataset

In [None]:
from vietocr.tool.predictor import Predictor
from PIL import Image
import os

# Load the trained model and configuration
predictor = Predictor(config)

# Path to the test crops folder
test_crops_folder = '/content/drive/My Drive/OCRTraining/PassportDataset/test/crops'
test_images = [f for f in os.listdir(test_crops_folder) if f.endswith('.jpg')]

correct = 0
total = len(test_images)

for img_name in test_images:
    img_path = os.path.join(test_crops_folder, img_name)
    txt_path = img_path.replace('.jpg', '.txt')

    with open(txt_path, 'r') as f:
        ground_truth = f.read().strip()

    # Load the image using PIL
    image = Image.open(img_path)
    prediction = predictor.predict(image).strip()

    print(f"GT: {ground_truth}, Predicted: {prediction}")
    if prediction == ground_truth:
        correct += 1

# Accuracy per cent
accuracy = correct / total * 100
print(f"Test Accuracy: {accuracy:.2f}%")


Model weight /tmp/vgg_transformer.pth exsits. Ignore download!
GT: firstname, Predicted: cursier
GT: lastname, Predicted: imm
GT: dob, Predicted: 20/16/1131
GT: country, Predicted: cuss
GT: gender, Predicted: e
GT: cardnumber, Predicted: 13463400
GT: firstname, Predicted: I
GT: lastname, Predicted: T
GT: dob, Predicted: 
GT: country, Predicted: E
GT: gender, Predicted: 7
GT: cardnumber, Predicted: 
GT: firstname, Predicted: VIVHISON 474S
GT: lastname, Predicted: 1NY3 80097H
GT: dob, Predicted: 2861/50/20
GT: country, Predicted: INVIGNI
GT: gender, Predicted: w
GT: cardnumber, Predicted: 160E1 79H
GT: firstname, Predicted: hon12 91vaud
GT: lastname, Predicted: nojhax
GT: dob, Predicted: ocerysover
GT: country, Predicted: NAIGNI
GT: gender, Predicted: M
GT: cardnumber, Predicted: revoearl
GT: firstname, Predicted: CURVIO S1860
GT: lastname, Predicted: canlen
GT: dob, Predicted: 19/07/1990
GT: country, Predicted: MOLAN
GT: gender, Predicted: m
GT: cardnumber, Predicted: 1440791
GT: firstn

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

test_sample = os.listdir(test_crops_folder)[:5]  # Load a few test samples
for img_name in test_sample:
    img_path = os.path.join(test_crops_folder, img_name)
    txt_path = img_path.replace('.jpg', '.txt')

    with open(txt_path, 'r') as f:
        label = f.read().strip()

    img = Image.open(img_path)
    plt.imshow(img)
    plt.title(f"Label: {label}")
    plt.show()


# Inference Post Training
For Indian Passport Dataset

In [None]:
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg
from PIL import Image
import cv2
import numpy as np

# Load the configuration and pretrained model
config = Cfg.load_config_from_file('/content/drive/My Drive/OCRTraining/PassportDataset/config.yml')
config['weights'] = '/content/drive/My Drive/OCRTraining/PassportDataset/weights/passportocr.pth'
config['device'] = 'cuda:0'

detector = Predictor(config)

# Function to correct orientation
def correct_orientation(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Detect skew/rotation using text region properties
    def find_orientation(img):
        # Threshold the image
        _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Detect edges
        edges = cv2.Canny(thresh, 50, 150)

        # Hough Line Transformation to detect lines
        lines = cv2.HoughLines(edges, 1, np.pi / 180, 100)
        angles = []

        if lines is not None:
            for rho, theta in lines[:, 0]:
                angle = (theta * 180 / np.pi) % 180  # Convert to degrees
                if 80 <= angle <= 100 or 260 <= angle <= 280:  # Horizontal lines
                    angles.append(angle)

        # Average detected angle
        if len(angles) > 0:
            avg_angle = np.mean(angles)
            if avg_angle > 90:  # Handle upside-down case
                avg_angle -= 180
            return avg_angle
        return 0  # No correction needed

    angle = find_orientation(img)
    print(f"Detected rotation angle: {angle}")

    # Correct the rotation if needed
    if abs(angle) > 1:  # Apply correction only for significant angles
        (h, w) = img.shape[:2]
        center = (w // 2, h // 2)
        rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
        corrected_img = cv2.warpAffine(img, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
        return Image.fromarray(corrected_img)  # Convert to PIL Image
    else:
        return Image.open(image_path)

# Correct orientation and perform OCR
img_path = '/content/drive/My Drive/OCRTraining/PassportDataset/test/crops/59_dob.jpg'
corrected_img = correct_orientation(img_path)

# Perform OCR
prediction = detector.predict(corrected_img, return_prob=True) # Use "return_prob=False" if don't want to show probability
print(f"Predicted Text: {prediction}")


Detected rotation angle: 0
Predicted Text: ('dob', 0.9305401047070821)
