In [None]:
import os                    # import OS module for operating system dependent functionality
import numpy as np           # import NumPy library for array computing 
import pandas as pd          # import Pandas library for data manipulation and analysis
import matplotlib.pyplot as plt  # import Matplotlib library for data visualization
from PIL import Image        # import Image module from Python Imaging Library to open and manipulate images
import cv2 as cv             # import OpenCV library for computer vision tasks
import torch                 # import PyTorch library for machine learning
import json                  # import JSON module to work with JSON data
import torchvision          # import TorchVision library for computer vision tasks
from engine import train_one_epoch, evaluate  # import train_one_epoch and evaluate functions from engine module
import utils                 # import utils module for various utility functions
import random                # import random module for generating random numbers and sequences

# Image Data Extraction and Class Creation

In [None]:
# Creating Class for Data Extraction and Processing

class OCRDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image, target)`` pairs for keypoint detection.

    The images and a COCO-style ``result.json`` annotation file are read from
    ``root``. Every target is a dict of tensors with the keys ``boxes``,
    ``labels``, ``image_id``, ``area``, ``iscrowd`` and ``keypoints``.

    NOTE(review): sample ``idx`` pairs ``images[idx]`` with
    ``annotations[idx]``, i.e. it assumes exactly one annotation per image and
    that both lists are stored in the same order -- confirm against the export.
    """

    # Name of the annotation file that lives next to the images in ``root``.
    ANNOTATION_FILE = 'result.json'

    def __init__(self, root, transforms=None):
        """Index the image directory and load the annotation file.

        Args:
            root: Directory holding the images and ``result.json``.
            transforms: Optional callable applied to every sample as
                ``transforms(image, target)`` and returning the transformed
                pair. NOTE(review): this calling convention is assumed from
                the torchvision detection references -- confirm against the
                project's ``get_transform``.
        """
        self.root = root
        self.transforms = transforms

        # os.listdir() returns entries in arbitrary order, so the annotation
        # file is excluded by name instead of by "dropping the last entry".
        self.imgs = [name for name in sorted(os.listdir(root))
                     if name != self.ANNOTATION_FILE]

        # Context manager so the file handle is closed deterministically.
        with open(os.path.join(self.root, self.ANNOTATION_FILE)) as annotation_fh:
            self.data = json.load(annotation_fh)

    @staticmethod
    def _order_corners(pts):
        """Return the four rows of ``pts`` ordered as ``(tl, tr, br, bl)``.

        Args:
            pts: Array of shape (4, 2) holding ``(x, y)`` points.
        """
        # Split into the two left-most and the two right-most points by x.
        x_sorted = pts[np.argsort(pts[:, 0]), :]
        left_most = x_sorted[:2, :]
        right_most = x_sorted[2:, :]

        # Of the left pair, the smaller y is the top-left corner.
        left_most = left_most[np.argsort(left_most[:, 1]), :]
        tl, bl = left_most

        # The right-hand point farthest from the top-left corner is the
        # bottom-right one; the remaining point is the top-right corner.
        distances = np.linalg.norm(right_most - tl, axis=1)
        br, tr = right_most[np.argsort(distances)[::-1], :]
        return tl, tr, br, bl

    @classmethod
    def _build_target(cls, annotation):
        """Convert one annotation dict into the dict of target tensors."""
        # The annotation stores the box as (x, y, width, height); the target
        # uses corner form (x1, y1, x2, y2).
        x, y, width, height = annotation['bbox'][:4]
        boxes = [[x, y, x + width, y + height]]

        # The first polygon holds four (x, y) pairs; coordinates are truncated
        # to integers before the corners are ordered.
        polygon = annotation['segmentation'][0]
        pts = np.array([[int(polygon[i]), int(polygon[i + 1])]
                        for i in range(0, 8, 2)])
        tl, tr, br, bl = cls._order_corners(pts)

        # Clamp negative coordinates to 0 and append a visibility flag of 1
        # to every keypoint, giving rows of (x, y, visibility).
        corners = np.maximum(np.stack([tl, tr, br, bl]), 0)
        visibility = np.ones((4, 1), dtype=corners.dtype)
        keypoints = np.hstack([corners, visibility])

        return {
            'boxes': torch.as_tensor(boxes, dtype=torch.float32),
            # Category ids are shifted by one before being used as labels.
            'labels': torch.tensor([annotation['category_id'] + 1], dtype=torch.int64),
            'image_id': torch.tensor([annotation['image_id']]),
            'area': torch.tensor([annotation['area']]),
            'iscrowd': torch.tensor([annotation['iscrowd']], dtype=torch.int64),
            # Shape (1, 4, 3): one instance, four keypoints, (x, y, visibility).
            'keypoints': torch.as_tensor(keypoints, dtype=torch.float32).unsqueeze(0),
        }

    def __getitem__(self, idx):
        """Load the image and build the target for sample ``idx``.

        Returns:
            A tuple ``(image, target)``. ``image`` is an RGB image opened from
            ``root`` (or whatever ``transforms`` turns it into) and ``target``
            is the dict of tensors described on the class.
        """
        image_info = self.data['images'][idx]
        annotation = self.data['annotations'][idx]

        # The first 10 characters of the stored file name are skipped to get
        # the name of the file inside ``root``.
        image_path = image_info['file_name'][10:]
        image = Image.open(os.path.join(self.root, image_path)).convert('RGB')

        target = self._build_target(annotation)

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self):
        """Return the number of files in ``root`` other than the annotation file."""
        return len(self.imgs)

In [None]:
# Defining directories and loading the JSON file
root = 'path/to/images'
json_file = 'path/to/saved/result.json'

# Read the annotations inside a context manager so the file handle is closed
# as soon as parsing finishes.
with open(json_file) as json_fh:
    data = json.load(json_fh)

In [None]:
# Build the dataset without any transforms (the default) and pull one sample
# through the normal indexing protocol to inspect it.
x = OCRDataset(root=root)
img, target = x[1000]

In [None]:
# Show the sampled image as the cell's output.
img

In [None]:
# Show the target dictionary built for the sampled image.
target

# Model Training

In [None]:
def get_keypoints(num__keypoints, num_classes=5):
    """Build a Keypoint R-CNN (ResNet-50 FPN) model.

    Args:
        num__keypoints: Number of keypoints the model predicts per instance.
            (The double underscore is kept so existing callers keep working.)
        num_classes: Number of output classes passed to the model constructor.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        The constructed torchvision model.
    """
    model = torchvision.models.detection.keypointrcnn_resnet50_fpn(
        # No pretrained detector weights are requested; only the backbone is
        # asked to load pretrained weights.
        pretrained=False,
        pretrained_backbone=True,
        num_keypoints=num__keypoints,
        num_classes=num_classes,
        trainable_backbone_layers=5)
    return model

In [None]:
def main():
    """Train the keypoint model, evaluate it every epoch, save and return it.

    The first 1073 samples are shuffled and split into 1000 training and 73
    evaluation samples. The trained weights are written to
    ``ocr_keypoints.pth``.

    NOTE(review): ``get_transform`` is not defined in this part of the
    notebook; it is expected to be available in the kernel -- confirm.

    Returns:
        The trained model.
    """
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # OCRDataset emits four corner keypoints per target, so the model has to
    # predict four keypoints (the class count is configured in get_keypoints).
    num_keypoints = 4

    # number of samples used for training + evaluation, and the size of the
    # training part of that split
    num_samples = 1073
    num_train = 1000

    # use our dataset and defined transformations
    dataset = OCRDataset('path/to/images', get_transform(train=True))
    dataset_test = OCRDataset('path/to/images', get_transform(train=False))

    # split the dataset in train and test set
    indices = torch.randperm(num_samples).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:num_train])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[num_train:num_samples])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=0,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=0,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_keypoints(num_keypoints)

    # move model to the right device
    model.to(device)

    # construct an optimizer over the trainable parameters only
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

    # and a learning rate scheduler that multiplies the rate by 0.1 every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # let's train it for 100 epochs
    num_epochs = 100

    for epoch in range(num_epochs):

        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    torch.save(model.state_dict(), 'ocr_keypoints.pth')

    print("That's it!")

    return model

In [None]:
# Run the training entry point and keep the returned model in the notebook
# namespace.
if __name__ == "__main__":
    model = main()

In [None]:
# pick one image from the test set

num = random.randrange(1073, 1341)
img, _ = OCRDataset('path/to/images', get_transform(train=False))[num]

# put the model in evaluation mode
model.eval()

# main() may have left the model on the GPU, so send the image to whichever
# device the model's parameters live on instead of forcing the CPU.
model_device = next(model.parameters()).device

with torch.no_grad():
    prediction = model([img.to(model_device)])