In [67]:
import json
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
import string
import statistics
from difflib import SequenceMatcher

In [88]:
def get_grayscale(image):
    """Return a grayscale copy of `image`, converted with OpenCV's BGR->GRAY code."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

def _clean_ocr_text(text):
    """Keep only the OCR lines that look like real text and normalise them.

    For every line of `text`:
      * ASCII letters, spaces and '?' are kept and counted as "good" characters;
      * digits are kept but not counted;
      * any other character is collapsed into a single space (and dropped
        entirely at the start of a line or right after another space).
    A line survives only if it has more than two good characters and they make
    up more than 3/5 of the raw line length.

    Returns the surviving lines joined with newlines ('' if none survive).
    """
    allowed = string.ascii_letters + " ?"
    kept_lines = []
    for line in text.split("\n"):
        chars = []
        good = 0
        for ch in line:
            if ch in allowed:
                chars.append(ch)
                good += 1
            elif ch in "1234567890":
                chars.append(ch)
            elif chars and chars[-1] != " ":
                # Replace a run of junk characters with one separating space.
                chars.append(" ")
        if good > len(line) * 3 / 5 and good > 2:
            kept_lines.append("".join(chars))
    return "\n".join(kept_lines)


def helper(gray, show=True):
    """Pre-process a grayscale image, OCR it with Tesseract and clean the text.

    Parameters
    ----------
    gray : numpy array
        Grayscale image; the scaling below assumes 8-bit intensities (0-255).
    show : bool, default True
        When True, display the contrast-stretched image with matplotlib
        (the historical behaviour). Pass False to run without plotting.

    Returns
    -------
    str
        The filtered OCR text (see `_clean_ocr_text`).
    """
    # Raising the normalised intensity to the 20th power crushes everything
    # except near-white pixels towards black before thresholding.
    gray = ((gray / 255) ** 20 * 255).astype(np.uint8)
    if show:
        plt.imshow(gray)
        plt.show()
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # The earlier morphological opening / inversion results were never fed to
    # Tesseract, so they have been removed; OCR runs on the thresholded image.
    raw_text = pytesseract.image_to_string(thresh, lang='eng', config='--psm 6')
    return _clean_ocr_text(raw_text)

def process(image):
    """Run the full OCR pipeline on `image` and return the cleaned text.

    The image is converted to grayscale with `get_grayscale` and then passed
    to `helper`. The function operates on its own argument rather than on a
    notebook-level variable, so it gives the same result regardless of which
    cells were executed before it.
    """
    gray = get_grayscale(image)
    return helper(gray)

In [105]:
# Compare raw Tesseract against the pre-processing pipeline on the first
# MAX_SAMPLES records of the training set, scoring each prediction against the
# annotated text with difflib's SequenceMatcher ratio.
MAX_SAMPLES = 41  # same 41 records the loop used to cover before breaking at index 40
SEPARATOR = "---------------------------------------"

with open('./data/train.jsonl', 'r') as json_file:
    json_list = list(json_file)

accuracies = []
tess_accuracies = []
for json_str in json_list[:MAX_SAMPLES]:
    result = json.loads(json_str)
    orig_text = result["text"]
    print(SEPARATOR)
    print("Actual Text\n" + orig_text)

    # Four-character ids get one leading zero in the image file name.
    # NOTE(review): if file names are always five digits wide, str.zfill(5)
    # would also cover shorter ids - confirm against the data directory.
    image_id = str(result["id"])
    if len(image_id) == 4:
        image_id = "0" + image_id
    image_path = "./data/img/" + image_id + ".png"

    # Baseline: Tesseract on the untouched image file.
    tess_text = pytesseract.image_to_string(image_path, lang='eng', config='--psm 6')
    print(SEPARATOR)
    print("Tesseract Prediction \n" + tess_text)
    print(SEPARATOR)

    # Our pipeline: load the image and OCR it after pre-processing.
    img = cv2.imread(image_path)
    output_text = process(img)
    print("Output Text \n" + output_text)
    print(SEPARATOR)

    our_ratio = SequenceMatcher(None, orig_text, output_text).ratio()
    tess_ratio = SequenceMatcher(None, orig_text, tess_text).ratio()

    print("Tesseract Ratio = " + str(tess_ratio))
    print("Our Ratio = " + str(our_ratio))
    print("\n\n")
    accuracies.append(our_ratio)
    tess_accuracies.append(tess_ratio)


---------------------------------------
Actual Text
its their character not their color that matters
---------------------------------------
Tesseract Prediction 
its their Character
~ =
Y
a
Ps
Estee \\
iets PLT
that matters

---------------------------------------
Output Text 
their 
hot their color
that matters
---------------------------------------
Tesseract Ratio = 0.6306306306306306
Our Ratio = 0.7951807228915663



---------------------------------------
Actual Text
don't be afraid to love again everyone is not like your ex
---------------------------------------
Tesseract Prediction 
Hlommnerainaid to fove
ava oy
if :
A y
anemia nace
WOUREK

---------------------------------------
Output Text 
flon t be afraid to love
 again
everyone is not like
 MONT EX
---------------------------------------
Tesseract Ratio = 0.37606837606837606
Our Ratio = 0.8235294117647058



---------------------------------------
Actual Text
putting bows on your pet
------------------------------------

Output Text 
most religigns are
religions of peace
---------------------------------------
Tesseract Ratio = 0.23076923076923078
Our Ratio = 0.9459459459459459



---------------------------------------
Actual Text
when you find out there's an opening for the role of first lady
---------------------------------------
Tesseract Prediction 
Whenivourindlountheres amopenimagionte
HOBOS AY
—-_ —

---------------------------------------
Output Text 
when you find out there s an opening for the
role of tirstlatly
---------------------------------------
Tesseract Ratio = 0.4406779661016949
Our Ratio = 0.9206349206349206



---------------------------------------
Actual Text
if people reject me because i stand on the truth then i have no problem walking alone
---------------------------------------
Tesseract Prediction 
PCS La ae
" | SETURL MI eas
ee Str
ne ae
emo Te
WL CELT

---------------------------------------
Output Text 
if people reject me because 
Stand dn the truth  ee
e e 8
 oT ag

---------------------------------------
Tesseract Prediction 
ead tT Abe
See WR ¥!
a

---------------------------------------
Output Text 
when you agcidentally a
 fallin te toilet
---------------------------------------
Tesseract Ratio = 0.21875
Our Ratio = 0.9135802469135802



---------------------------------------
Actual Text
two woman walking home together after a nice day out together
---------------------------------------
Tesseract Prediction 
tn nat rth Pa

ie ETT) ae zu etlier

‘ y f T

] Ro
| )

#4 MUM Urea

---------------------------------------
Output Text 
two woman walking home quether
after a nice day out together
---------------------------------------
Tesseract Ratio = 0.26153846153846155
Our Ratio = 0.9421487603305785



---------------------------------------
Actual Text
texas churches be like... pew pew pew
---------------------------------------
Tesseract Prediction 
6 temas churches [be [ike...
re
sy - ,
nf
i 7 Lp \
2 ’ | ae
ee HAA De

----------------------

In [106]:
# Mean, then median, similarity ratio: our pipeline first, raw Tesseract second.
for scores in (accuracies, tess_accuracies):
    print(sum(scores) / len(scores))
    print(statistics.median(scores))

0.800893031842428
0.8913043478260869
0.35026259987621866
0.34615384615384615
