# Inference for Tesseract using tesserocr

Inference runs on prepared image cutouts, not on whole library images.

# Preprocessing steps

Run this cell once to define the preprocessing functions used by the inference cells below.

In [1]:
import cv2
import numpy as np

def get_grayscale(image):
    """Return a grayscale copy of ``image``.

    The conversion uses ``cv2.COLOR_BGR2GRAY``, i.e. the input is treated
    as a BGR colour image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

def blur(image):
    """Remove noise from ``image`` with a median blur of aperture size 5."""
    aperture = 5
    return cv2.medianBlur(image, aperture)
 
def thresholding(image):
    """Binarise ``image`` with mean-based adaptive thresholding.

    Uses ``cv2.ADAPTIVE_THRESH_MEAN_C`` with ``cv2.THRESH_BINARY``, a
    maximum output value of 255, a block size of 21 and a constant of 15.
    """
    max_value = 255
    block_size = 21
    offset = 15
    return cv2.adaptiveThreshold(
        image,
        max_value,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )

def dilate(image):
    """Dilate ``image`` once with a 5x5 all-ones ``uint8`` kernel."""
    structuring_element = np.ones((5, 5), np.uint8)
    dilated = cv2.dilate(image, structuring_element, iterations=1)
    return dilated
    
def erode(image):
    """Erode ``image`` once with a 5x5 all-ones ``uint8`` kernel."""
    structuring_element = np.ones((5, 5), np.uint8)
    eroded = cv2.erode(image, structuring_element, iterations=1)
    return eroded

def opening(image):
    """Apply a morphological opening (erosion followed by dilation).

    The operation is ``cv2.MORPH_OPEN`` with a 5x5 all-ones ``uint8`` kernel.
    """
    structuring_element = np.ones((5, 5), np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

def closing(image):
    """Apply a morphological closing (dilation followed by erosion).

    The operation is ``cv2.MORPH_CLOSE`` with a 1x1 all-ones ``uint8`` kernel.
    """
    # NOTE(review): a 1x1 structuring element should leave the image
    # effectively unchanged - confirm this size is intended.
    structuring_element = np.ones((1, 1), np.uint8)
    closed = cv2.morphologyEx(image, cv2.MORPH_CLOSE, structuring_element)
    return closed

# Legacy

## Setup

In [2]:
!sudo add-apt-repository -y ppa:alex-p/tesseract-ocr
!sudo apt-get update
!sudo apt-get install tesseract-ocr libtesseract-dev libleptonica-dev pkg-config
!pip install tesserocr

!curl -O https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/eng.traineddata
!mv eng.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

from IPython.display import clear_output  # to display images

clear_output()

## Inference on folder

In [None]:
import tesserocr
from PIL import Image
import os.path

# Run the legacy Tesseract engine (OEM.TESSERACT_ONLY) over every .jpg/.png
# file in `directory` and collect the recognised text in `texts`.
directory = 'path-to-inference-folder'
cutouts = []  # not filled in this cell
texts = []    # one entry per image file, in sorted filename order
scale = 1     # not used in this cell

with tesserocr.PyTessBaseAPI(oem=tesserocr.OEM.TESSERACT_ONLY, psm=tesserocr.PSM.SINGLE_BLOCK) as api:
    # Character whitelist / blacklist passed to Tesseract as engine variables.
    api.SetVariable('tessedit_char_whitelist', 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-+().,')
    api.SetVariable('tessedit_char_blacklist', '!?@#$%&*<>_=/:;\'"')

    for filename in sorted(os.listdir(directory)):
        # str.endswith accepts a tuple of suffixes; matching stays case-sensitive.
        if not filename.endswith(('.jpg', '.png')):
            continue

        image_path = os.path.join(directory, filename)
        c = cv2.imread(image_path)
        if c is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising. Record an empty result so `texts` stays
            # aligned with the sorted image list and keep processing.
            print(f'Could not read image, skipping: {image_path}')
            texts.append('')
            continue

        # Preprocessing pipeline: grayscale -> blur -> threshold -> open -> close.
        g = get_grayscale(c)
        b = blur(g)
        t = thresholding(b)
        o = opening(t)
        cl = closing(o)

        im_pil = Image.fromarray(cl)
        api.SetImage(im_pil)
        output = api.GetUTF8Text()
        # Flatten multi-line results into a single line.
        output = output.replace('\n', ' ')
        texts.append(output)

# LSTM

## Setup

In [5]:
!sudo add-apt-repository -y ppa:alex-p/tesseract-ocr
!sudo apt-get update
!sudo apt-get install tesseract-ocr libtesseract-dev libleptonica-dev pkg-config
!pip install tesserocr

!curl -O https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/4.1.0/eng.traineddata
!mv eng.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

from IPython.display import clear_output  # to display images

clear_output()

## Inference on folder

In [None]:
import tesserocr
from PIL import Image
import os.path

# Run the LSTM Tesseract engine (OEM.LSTM_ONLY) over every .jpg/.png file in
# `directory` and collect the recognised text in `texts`.
directory = 'path-to-inference-folder'
cutouts = []  # not filled in this cell
texts = []    # one entry per image file, in sorted filename order
scale = 1     # not used in this cell

with tesserocr.PyTessBaseAPI(oem=tesserocr.OEM.LSTM_ONLY, psm=tesserocr.PSM.SINGLE_BLOCK) as api:
    # Character whitelist / blacklist passed to Tesseract as engine variables.
    api.SetVariable('tessedit_char_whitelist', 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-+().,')
    api.SetVariable('tessedit_char_blacklist', '!?@#$%&*<>_=/:;\'"')

    for filename in sorted(os.listdir(directory)):
        # str.endswith accepts a tuple of suffixes; matching stays case-sensitive.
        if not filename.endswith(('.jpg', '.png')):
            continue

        image_path = os.path.join(directory, filename)
        c = cv2.imread(image_path)
        if c is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising. Record an empty result so `texts` stays
            # aligned with the sorted image list and keep processing.
            print(f'Could not read image, skipping: {image_path}')
            texts.append('')
            continue

        # Preprocessing pipeline: grayscale -> blur -> threshold -> open -> close.
        g = get_grayscale(c)
        b = blur(g)
        t = thresholding(b)
        o = opening(t)
        cl = closing(o)

        im_pil = Image.fromarray(cl)
        api.SetImage(im_pil)
        output = api.GetUTF8Text()
        # Flatten multi-line results into a single line.
        output = output.replace('\n', ' ')
        texts.append(output)