# Import Libraries

In [1]:
# All third-party and standard-library dependencies of the notebook are imported in this cell.
# The print below reports the installed torch version and whether a CUDA device is usable.
import torch, torchvision
print(torch.__version__,torch.cuda.is_available())

import matplotlib.pyplot as plt
import cv2
# NOTE(review): non_max_suppression is not referenced in the visible cells — confirm it is still needed.
from imutils.object_detection import non_max_suppression

from craft_text_detector import Craft
# NOTE(review): duplicate of the matplotlib import a few lines above (harmless).
import matplotlib.pyplot as plt
import os
import pytesseract
import numpy as np
# NOTE(review): datetime is not referenced in the visible cells — confirm it is still needed.
from datetime import datetime

import ultralytics
from ultralytics import YOLO
# Prints the Ultralytics environment summary (versions, CUDA device, CPU/RAM/disk) shown in this cell's output.
ultralytics.checks()

import imutils

import glob
import  traceback

Ultralytics YOLOv8.0.132  Python-3.8.17 torch-2.0.1 CUDA:0 (NVIDIA GeForce GTX 960M, 4096MiB)
Setup complete  (8 CPUs, 15.8 GB RAM, 175.2/238.5 GB disk)


In [2]:
# NOTE(review): `os` is already imported in the first cell; this re-import is redundant but harmless.
import os
# Presumably set to get synchronous CUDA error reporting while debugging — confirm it is still wanted.
# NOTE(review): torch and ultralytics were imported (and ultralytics.checks() reported the CUDA device)
# in the previous cell; this variable may need to be set before CUDA is initialised to have any
# effect — TODO confirm, and move it above the torch import if so.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load Detection Model

In [4]:
# Load the YOLO detector from the weights file 'best.pt' (a relative path, so the file is expected
# next to the notebook / in the working directory). The run log further down reports "billboard"
# detections, so these are presumably custom-trained billboard weights.
model = YOLO('best.pt')

# Tesseract

In [5]:
# Point pytesseract at the Tesseract executable.
# NOTE(review): hard-coded Windows install path — the notebook will not run elsewhere without editing this.
pytesseract.pytesseract.tesseract_cmd = r'C://Program Files//Tesseract-OCR//tesseract.exe'

# Tell Tesseract where its language data lives. The OCR step requests the 'eng', 'nep' and
# 'nep+hin' languages, so the matching traineddata files are presumably expected in this folder.
os.environ["TESSDATA_PREFIX"] =  "C://Program Files//Tesseract-OCR//tessdata"
# Extra command-line options passed to every pytesseract.image_to_string call in CraftModule.
# NOTE(review): `--oem` / `--psm` select the OCR engine mode and page segmentation mode; check the
# Tesseract CLI docs for the exact meaning of 3 and 4. `preserve_interword_spaces` is normally
# given as 0 or 1 — confirm that the value 5 is accepted and has the intended effect.
custom_config = r'-c preserve_interword_spaces=5 --oem 3 --psm 4 '

# Craft Detector

In [6]:
# CRAFT instance passed to CraftModule as `craft_detector`; CraftModule.rotate_image uses its
# detected regions only to estimate the text orientation of a billboard crop.
# NOTE(review): cuda=True presumably requires a CUDA-capable GPU at construction time — confirm.
craft_detector = Craft( crop_type="poly", cuda=True)



# Craft Module

In [7]:
class CraftModule:
    """Deskew a billboard crop, locate its text regions with CRAFT and OCR each one.

    Two CRAFT instances are used: ``craft_detector`` only estimates the dominant
    text orientation (see ``rotate_image``), while ``craft_extractor`` produces
    the boxes that are cropped and sent to Tesseract (see ``detect_text``).

    All artefacts are written to the relative ``tests/`` directory:
    ``<file>_0.jpg`` is the crop as received, ``<file>_<n>.jpg`` is the
    pre-processed n-th text region and ``<file>_<n>.txt`` holds its OCR output.

    The OCR step reads the module-level ``custom_config`` string, so that name
    must be defined before ``detect_text`` / ``show_cropped_image`` is called.
    """

    def __init__(self,craft_detector,craft_extractor):
        """Store the two CRAFT instances; image, boxes and file name are set by ``detect_text``."""
        self.craft_extractor = craft_extractor
        self.craft_detector = craft_detector
        self.boxes = None   # text boxes returned by craft_extractor for the current image
        self.image = None   # current (possibly rotated) BGR image
        self.file = None    # base name used for every file written to tests/

    def plot_image(self,img):
        """Display a BGR image inline with matplotlib (axes hidden)."""
        plt.axis('off')
        plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))
        plt.show()

    def rotate_image(self):
        """Rotate ``self.image`` in place so that its largest text region becomes horizontal.

        The orientation is taken from a line fitted through the minimum-area
        rectangle of the largest region reported by ``craft_detector``. This is
        a best-effort step: when nothing is detected, or the angle estimation
        fails, the image is left untouched and a message is printed instead of
        the failure being silently swallowed.
        """
        # The detector receives a copy so that it can never modify self.image.
        regions = self.craft_detector.detect_text(self.image.copy())['boxes']

        angled_boxes = []
        for region in regions:
            if len(region) >= 3:  # a polygon needs at least 3 points
                region_points = np.array(region, dtype=np.int32).reshape(-1, 2)
                rotated_rect = cv2.minAreaRect(region_points)
                # np.int0 is deprecated (removed in NumPy 2.0); int32 is also the
                # integer contour type OpenCV works with natively.
                angled_boxes.append(cv2.boxPoints(rotated_rect).astype(np.int32))

        if not angled_boxes:
            # Nothing to estimate the angle from; keep the image as it is.
            return

        try:
            largest_contour = max(angled_boxes, key=cv2.contourArea)
            line = cv2.fitLine(largest_contour, cv2.DIST_L2, 0, 0.01, 0.01)
            # fitLine yields (vx, vy, x0, y0); only the direction vector is needed.
            vx, vy = np.asarray(line, dtype=float).ravel()[:2]
            angle = float(np.degrees(np.arctan2(vy, vx)))

            if angle != 90:
                height, width = self.image.shape[:2]
                center = (width // 2, height // 2)
                rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale=1.0)
                self.image = cv2.warpAffine(self.image, rotation_matrix, (width, height))
        except Exception as exc:
            # Still best effort, but visible; unlike a bare ``except`` this does
            # not intercept KeyboardInterrupt / SystemExit.
            print(f'Rotation skipped for {self.file}: {exc}')

    def detect_text(self,image,file_name):
        """Run the full pipeline on one crop.

        Saves the crop as ``tests/<file_name>_0.jpg``, deskews it, detects the
        text boxes with ``craft_extractor`` and OCRs every box.

        Args:
            image: BGR image (numpy array) of a single billboard.
            file_name: base name (no extension) for all files written to ``tests/``.
        """
        self.file = file_name
        self.image = image
        # cv2.imwrite fails silently when the target directory is missing.
        os.makedirs('tests', exist_ok=True)
        cv2.imwrite(f'tests/{self.file}_0.jpg',image)

        self.rotate_image()
        self.boxes = self.craft_extractor.detect_text(self.image)['boxes']
        self.show_cropped_image()

    def get_bounding_boxes(self,box):
        """Return the axis-aligned bounds of a polygon as Python ints.

        Args:
            box: array-like of (x, y) points, either shaped (N, 2) or flat
                ``[x0, y0, x1, y1, ...]``. Any number of points is accepted,
                not just the 4 corners of a quadrilateral.

        Returns:
            ``(x_min, y_min, x_max, y_max)`` rounded to the nearest integer.
        """
        points = np.asarray(box, dtype=float).reshape(-1, 2)
        x_min, y_min = points.min(axis=0)
        x_max, y_max = points.max(axis=0)
        # Going through float() guarantees plain ints that are valid slice indices.
        return (round(float(x_min)), round(float(y_min)),
                round(float(x_max)), round(float(y_max)))

    def show_cropped_image(self):
        """OCR every box in ``self.boxes`` and write the results to ``tests/``.

        Each region is cropped, upscaled 2x, median-blurred and converted to
        grayscale, then recognised with the 'eng', 'nep' and 'nep+hin' Tesseract
        languages. Despite its name the method does not display anything: it
        writes ``<file>_<n>.jpg`` (pre-processed crop) and ``<file>_<n>.txt``
        (the three OCR results). Boxes that fall outside the image are skipped.
        """
        os.makedirs('tests', exist_ok=True)
        height, width = self.image.shape[:2]
        languages = (('English', 'eng'), ('Nepali', 'nep'), ('Nepali + Hindi', 'nep+hin'))

        count = 1
        for box in self.boxes:
            x_min,y_min,x_max,y_max = self.get_bounding_boxes(box)
            # Clamp to the image: a negative index would wrap around and give an
            # empty or wrong crop, and cv2.resize raises on an empty array.
            x_min, x_max = max(x_min, 0), min(x_max, width)
            y_min, y_max = max(y_min, 0), min(y_max, height)
            if x_max <= x_min or y_max <= y_min:
                continue

            roi = self.image[y_min : y_max , x_min : x_max].copy()
            roi = cv2.resize(roi, None, fx= 2, fy= 2, interpolation= cv2.INTER_CUBIC)
            roi = cv2.medianBlur(roi, 3)
            roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

            recognised = [
                (label, pytesseract.image_to_string(roi_gray, lang = lang, config = custom_config))
                for label, lang in languages
            ]

            cv2.imwrite(f'tests/{self.file}_{count}.jpg',roi_gray)

            # Explicit UTF-8: the platform default encoding may be unable to
            # represent Devanagari text and would raise UnicodeEncodeError.
            with open(f'tests/{self.file}_{count}.txt', "w", encoding="utf-8") as file:
                for label, text in recognised:
                    file.write(f'{label} : {text}\n')

            count += 1
    

In [8]:
# CRAFT instance passed to CraftModule as `craft_extractor`; CraftModule.detect_text uses its boxes
# as the regions that are cropped and OCR'd.
# NOTE(review): the thresholds differ from the plain Craft(...) used for `craft_detector`; they look
# hand-tuned for this data set — document how they were chosen.
craft_extractor = Craft( crop_type="box", cuda=True,text_threshold=0.8,link_threshold=0.8,low_text=0.18)

# Read Images

In [13]:
# Load every supported image found directly inside testing3/.
# `filename[i]` is always the path that `images[i]` was read from.
file_type=['JPG','JPEG','PNG','JFIF']
images=[]
#for copying later
filename=[]

# List the folder once and compare extensions case-insensitively: a pattern such as "*.JPG"
# only matches upper-case extensions on case-sensitive filesystems.
candidate_paths = sorted(glob.glob("testing3/*"))
for extension in file_type:  # `format` would shadow the builtin of the same name
    suffix = f".{extension.lower()}"
    for path in candidate_paths:
        if not path.lower().endswith(suffix):
            continue
        image = cv2.imread(path)
        if image is None:
            # cv2.imread returns None (it does not raise) for unreadable or corrupt files.
            print(f"Skipping unreadable image: {path}")
            continue
        filename.append(path)
        images.append(image)

In [14]:
# Detect billboards in every loaded image, then run the CRAFT + Tesseract pipeline on each one.
# Files written to tests/: a copy of the source image, plus the per-billboard files produced by
# CraftModule, named "<image stem>_<billboard number>...".
os.makedirs('tests', exist_ok=True)  # cv2.imwrite fails silently when the directory is missing

for index,image in enumerate(images):
    try:
        detected_boards = model(image)
        # Raw detections, one row per box, starting with x1, y1, x2, y2. The tensor is used as
        # returned: re-stacking its rows changed nothing and raised when there were no detections.
        boxes = detected_boards[0].boxes.data
        file_name = filename[index].replace('\\','/')
        # basename/splitext also handle nested folders and dots inside the file name.
        base_name = os.path.basename(file_name)
        stem = os.path.splitext(base_name)[0]
        cv2.imwrite(f'tests/{base_name}',image)
    except Exception:
        traceback.print_exc()
        continue

    height, width = image.shape[:2]
    count = 0
    for box in boxes:
        count += 1
        # Each billboard is handled on its own so one failure does not drop the others.
        try:
            x1,y1,x2,y2 = int(box[0]),int(box[1]),int(box[2]),int(box[3])
            # Keep the crop inside the image; negative indices would wrap around.
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, width), min(y2, height)

            roi =  image[y1 : y2, x1 : x2]
            if roi.size == 0:
                print(f'Skipping empty detection {count} in {file_name}')
                continue
            roi_file_name = f'{stem}_{count}'
            CraftModule(craft_detector,craft_extractor).detect_text(roi,roi_file_name)
        except Exception:
            traceback.print_exc()


0: 320x640 8 billboards, 406.9ms
Speed: 5.0ms preprocess, 406.9ms inference, 4.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 billboard, 287.2ms
Speed: 4.0ms preprocess, 287.2ms inference, 4.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 2 billboards, 536.5ms
Speed: 6.0ms preprocess, 536.5ms inference, 5.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 5 billboards, 482.7ms
Speed: 5.0ms preprocess, 482.7ms inference, 5.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 6 billboards, 434.8ms
Speed: 5.0ms preprocess, 434.8ms inference, 4.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 5 billboards, 521.6ms
Speed: 5.0ms preprocess, 521.6ms inference, 4.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 billboard, 406.9ms
Speed: 5.0ms preprocess, 406.9ms inference, 4.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 3 billboards, 297.2ms
Speed: 5.0ms preprocess, 297.2ms inference, 