# Refer this website for detailed explanation
- https://www.pyimagesearch.com/2018/09/17/opencv-ocr-and-text-recognition-with-tesseract/
- https://www.pyimagesearch.com/2018/08/20/opencv-text-detection-east-text-detector/

In [2]:
import pytesseract
from imutils.object_detection import non_max_suppression
import cv2
import argparse
import numpy as np

#Set path to wherever Tesseract-OCR is installed
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
def decode_predictions(scores, geometry):
    # Grab number of rows and columns from the scores volume, then initialize our set of bounding box rectangles 
    # and corresponding confidence scores
    (nRows, nCols) = scores.shape[2:4]
    rects = []
    confidences = []
    
    # Loop over the number of rows
    for y in range(0, nRows):
        # Extract scores (probabilities) followed by the geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]
        
        # Loop over the number of columns
        for x in range(0, nCols):
            
            #If our score doesn't have sufficient probability, ignore it
            if scoresData[x] < min_confidence:
                continue
                
            # Compute the offset factor as our resulting feature maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            
            # Extract the rotation angle for the prediction and compute its sine and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            
            # Use the geometry data to derive height and width of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            
            # Compute both the starting and ending (x, y)-coordinates for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            
            # Add the bounding box coordinates and probabilities to the respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
            
    # Return a tuple of the bounding boxes and the associated confidences
    return (rects, confidences)

In [4]:
#This is the path to the image
image_path = 'test2.png'

#This is the path to the frozen_east_text_detector.pb file that OpenCV provides
east_path = 'opencv-text-recognition/frozen_east_text_detection.pb'

#Other optional command line arguments
min_confidence = 0.5
width = 320
height = 320
padding = 0.0

In [5]:
#Load the image and grab its dimensions
image = cv2.imread(image_path)
original = image.copy()
(original_height, original_width) = original.shape[:2]

#Set the new width and height and then determine the ratio between the old and new dimensions
new_width = width
new_height = height
ratio_width = original_width / new_width
ratio_height = original_height / new_height

#resize the image and grab the new dimensions
image = cv2.resize(image, (new_width, new_height))
(height, width) = image.shape[:2]

In [None]:
print(original_height, original_width)
print(height, width)
print(ratio_height, ratio_width)

-------------
## Setting Up the EAST Detector
--------------

In [7]:
'''
Define the two output layer names for the EAST detector model that we are interested in
 - the first is the output probabilities
 - the second can be used to derive the bounding box coordinates of text
'''

layerNames = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3"
]

#Load the pretrained EAST detector
print("Loading EAST Detector")
net = cv2.dnn.readNet(east_path)

Loading EAST Detector


In [9]:
# Construct a blob from the image and then perform a forward pass of the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image,
                             1.0,
                             (width, height),
                             (123.68, 116.78, 103.94),
                             swapRB = True,
                             crop = False)
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)


# Decode the predictions, then apply non-maxima-suppression to suppress weak, overlapping bounded boxes
(rects, confidences) = decode_predictions(scores, geometry)
boxes = non_max_suppression(np.array(rects), probs = confidences)

  overlap = (w * h) / area[idxs[:last]]
  np.where(overlap > overlapThresh)[0])))


In [10]:
scores.shape, geometry.shape

((1, 1, 80, 80), (1, 5, 80, 80))

In [11]:
boxes[100]

array([293, 198, 260, 200])

In [12]:
print(len(rects))
print(len(boxes))

220
181


In [14]:
# Initialize the list of results
results = []

# Loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # Scale the bounding box coordinates based on the respective ratios
    startX = int(startX * ratio_width)
    startY = int(startY * ratio_height)
    endX = int(endX * ratio_width)
    endY = int(endY * ratio_height)
    
    # In order to obtain a better result at OCR, we can apply a bit of padding to our bounding box, so it can encompass 
    # the whole text area, in case it already doesn't
    deltaX = int((endX - startX) * padding)
    deltaY = int((endY - startY) * padding)
    
    # Apply padding to each side of the bounding box respectively. Also ensure it doesn't cross image boundaries
    startX = max(0, startX - deltaX)
    startY = max(0, startY - deltaY)
    endX = min(original_width, endX + (deltaX*2))
    endY = min(original_height, endY + (deltaY*2))
    
    # Extract the actual padded Region Of Interest (ROI)
    roi = original[endY:startY, endX:startX]
    
    try:
        # in order to apply Tesseract v4 to OCR text we must supply (1) a language, (2) an OEM flag of 4, indicating that the 
        # we wish to use the LSTM neural net model for OCR, and finally (3) an OEM value, in this case, 7 which implies
        # that we are treating the ROI as a single line of text
        config = ("-l eng --oem 1 --psm 7")
        text = pytesseract.image_to_string(roi, config=config)
        print(text)
        # Add the bounding box coordinates and the OCR'd text to the list of results
        results.append(((startX, startY, endX, endY), text))
    except:
        print(startX, startY, endX, endY)

ma y
i e
_— FJ
_—
a a
tt ~
—
—_—.
663 201 580 201
ta |
rr
—
a
Be
- ia
ta iz
1066 190 983 192
659 202 576 206
Ww wW
Oo
618 203 584 207
ee
oe
587 202 587 193
a
4
rr rs
889 190 968 184
580 199 584 196
re
|
1036 192 979 199
ar rr
565 205 584 189
889 185 957 185
1055 188 979 197
Be
572 199 576 201
614 203 576 212
_
ne
655 203 572 212
a
561 202 576 192
_
482 165 520 208
979 189 983 183
1006 191 976 193
523 165 523 208
—
908 187 957 187
531 206 591 189
ee
—_
572 208 595 187
569 199 572 206
919 190 964 184
535 207 595 186
550 204 572 198
501 212 595 193
964 187 972 187
674 189 569 195
1043 189 979 193
565 173 531 210
569 179 542 213
610 202 569 217
994 193 976 198
535 174 531 212
—_

508 210 599 189
859 190 972 183
866 194 991 182
953 189 968 184
520 205 580 194
569 185 554 215
900 196 983 183
942 189 964 188
565 192 561 216
-
927 195 979 182
550 205 565 204
610 198 561 219
re
538 179 538 214
508 181 546 215
610 190 550 217
859 187 953 186
606 184 538 215
968 196 983 183
976 199 991 180
550 19

In [None]:
# sort the results bounding box coordinates from top to bottom
results = sorted(results, key=lambda r:r[0][1])
 
# loop over the results
for ((startX, startY, endX, endY), text) in results:
    # display the text OCR'd by Tesseract
    print("OCR TEXT")
    print("========")
    print("{}\n".format(text))

    # strip out non-ASCII text so we can draw the text on the image
    # using OpenCV, then draw the text and a bounding box surrounding
    # the text region of the input image
    text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
    output = original.copy()
    cv2.rectangle(output, (startX, startY), (endX, endY),
        (0, 0, 255), 2)
    cv2.putText(output, text, (startX, startY - 20),
        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3)

    # show the output image
    cv2.imshow("Text Detection", output)
    cv2.waitKey(0)

OCR TEXT


OCR TEXT
ta ta

OCR TEXT
nn

OCR TEXT
Oo

OCR TEXT
re

OCR TEXT
Be

OCR TEXT
Bn

OCR TEXT
rr rs

