In [14]:
from PIL import Image
import pytesseract

import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt

import numpy as np
import Levenshtein as lev

from enum import IntEnum

import cv2

import time

import optuna

In [15]:
# If you don't have tesseract executable in your PATH, include the following:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

In [16]:
data_dir = r'E:\datasets\MADE\3_graduation\parthplc\archive\data\\'

train_path = data_dir + 'train.jsonl'
dev_path = data_dir + 'dev.jsonl'

train_data = pd.read_json(train_path, lines=True)
test_data = pd.read_json(dev_path, lines=True)

In [17]:
test_data

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime
...,...,...,...,...
495,83675,img/83675.png,0,i'm gonna be like phelps one day
496,37198,img/37198.png,0,when you're so relaxed you can feel yourself g...
497,48670,img/48670.png,0,look at this sandwich maker club i found on wi...
498,9863,img/09863.png,0,diverse group of women


In [18]:
original_text = train_data.loc[:,'text'].values.tolist()
original_text[:3]

['its their character not their color that matters',
 "don't be afraid to love again everyone is not like your ex",
 'putting bows on your pet']

In [19]:
class ImageTransformType(IntEnum):
    No = 0
    Gray = 1
    Binary = 2
    BinaryInv = 3
    AdaptiveMean = 4
    AdaptiveMeanInv = 5
    AdaptiveGaussian = 6
    AdaptiveGaussianInv = 7
    

In [20]:
def recognize(recognizer_name: str, image_transform: ImageTransformType, thresh: int, maxval: int, block_size:int, c:float) -> list:
    recognized_text = []    
    for i in tqdm(range(len(test_data))):
        image = cv2.imread(data_dir + test_data.values[i][1], cv2.COLOR_BGR2RGB)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if image_transform == ImageTransformType.Gray:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            
        if image_transform == ImageTransformType.Binary:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            _, image = cv2.threshold(image, thresh, maxval, cv2.THRESH_BINARY)
            
        if image_transform == ImageTransformType.BinaryInv:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            _, image = cv2.threshold(image, thresh, maxval, cv2.THRESH_BINARY_INV)

        if image_transform == ImageTransformType.AdaptiveMean:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)            
            image = cv2.adaptiveThreshold(image, maxval, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, block_size, c)

        if image_transform == ImageTransformType.AdaptiveMeanInv:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)            
            image = cv2.adaptiveThreshold(image, maxval, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, block_size, c)
            
        if image_transform == ImageTransformType.AdaptiveGaussian:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)            
            image = cv2.adaptiveThreshold(image, maxval, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, c)
            
        if image_transform == ImageTransformType.AdaptiveGaussianInv:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)            
            image = cv2.adaptiveThreshold(image, maxval, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, block_size, c)
            
        if recognizer_name == 'easyocr':            
            recognized_text.append(' '.join(easyocr_reader.readtext(image, detail=0)))
            
        elif recognizer_name == 'pytesseract':
            recognized_text.append(pytesseract.image_to_string(image).replace('\n', ' '))
            
    return recognized_text


def loss(recognizer_name: str, image_transform: ImageTransformType, thresh: int, maxval: int, block_size:int, c:float):
    recognized_text = recognize(recognizer_name, image_transform, thresh, maxval, block_size, c)
    distances = []
    for i in tqdm(range(len(test_data))):
        text = test_data.values[i][3]
        distances.append(lev.distance(text, recognized_text[i]) / len(text))

    distances_mean = np.mean(distances)

    return distances_mean

In [21]:
loss(recognizer_name='pytesseract', image_transform=ImageTransformType.No, thresh=0, maxval=0, block_size=0, c=0)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:31<00:00,  3.30it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 12817.37it/s]


0.6740503137797546

In [22]:
def objective(trial):
    recognizer_name = trial.suggest_categorical("recognizer_name", ['pytesseract'])
    image_transform = trial.suggest_categorical("image_transform", [int(x) for x in ImageTransformType])
    thresh = trial.suggest_int("thresh", 0, 255, 15)
    maxval = trial.suggest_int("maxval", 0, 255, 15)
    block_size = trial.suggest_int("block_size", 3, 21, 2)
    c = trial.suggest_int("c", -10, 10, 1)
    
    return loss(recognizer_name=recognizer_name, image_transform=image_transform, thresh=thresh, maxval=maxval, block_size=block_size, c=c)

study = optuna.create_study()

[32m[I 2022-08-27 17:41:32,077][0m A new study created in memory with name: no-name-474c44ff-7fa5-4524-a039-bb34caf5a68b[0m


In [23]:
t1 = time.time()

study.optimize(objective, n_trials=500)

print('time:', time.time() - t1)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:38<00:00,  3.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 17255.97it/s]
[32m[I 2022-08-27 17:44:10,894][0m Trial 0 finished with value: 0.6740503137797546 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 0, 'thresh': 135, 'maxval': 165, 'block_size': 19, 'c': 3}. Best is trial 0 with value: 0.6740503137797546.[0m
100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:38<00:00,  3.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 11966.08it/s]
[32m[I 2022-08-27 17:46:49,606][0m Trial 1 finished with value: 0.6740503137797546 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 0, 'thresh': 60, 'maxval': 0, 'block_size': 21, 'c': -3}. Best is trial 0 with value: 0.6740503137

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:47<00:00,  4.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 10381.17it/s]
[32m[I 2022-08-27 18:41:47,621][0m Trial 32 finished with value: 0.5991826390570034 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 3, 'thresh': 180, 'maxval': 30, 'block_size': 11, 'c': 10}. Best is trial 27 with value: 0.5983319154414116.[0m
100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:42<00:00,  4.88it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 10841.24it/s]
[32m[I 2022-08-27 18:43:30,204][0m Trial 33 finished with value: 0.5510200598064411 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 3, 'thresh': 195, 'maxval': 30, 'block_size': 9, 'c': 9}. Best is trial 33 with value: 0.551020

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:37<00:00,  5.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 17878.53it/s]
[32m[I 2022-08-27 19:34:55,039][0m Trial 64 finished with value: 0.4963361266160189 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 2, 'thresh': 210, 'maxval': 150, 'block_size': 3, 'c': 5}. Best is trial 52 with value: 0.26616726169104893.[0m
100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:38<00:00,  5.05it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 13333.45it/s]
[32m[I 2022-08-27 19:36:34,020][0m Trial 65 finished with value: 0.26832276919936976 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 2, 'thresh': 240, 'maxval': 165, 'block_size': 5, 'c': -1}. Best is trial 52 with value: 0.266

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:09<00:00,  7.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 12940.75it/s]
[32m[I 2022-08-27 20:27:09,966][0m Trial 96 finished with value: 1.0 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 2, 'thresh': 255, 'maxval': 135, 'block_size': 21, 'c': 1}. Best is trial 52 with value: 0.26616726169104893.[0m
100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:37<00:00,  5.11it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 7574.04it/s]
[32m[I 2022-08-27 20:28:47,882][0m Trial 97 finished with value: 0.26616726169104893 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 2, 'thresh': 240, 'maxval': 150, 'block_size': 21, 'c': 0}. Best is trial 52 with value: 0.26616726169104893

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:10<00:00,  7.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 9757.19it/s]
[32m[I 2022-08-27 21:20:26,664][0m Trial 128 finished with value: 1.0 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 2, 'thresh': 255, 'maxval': 135, 'block_size': 21, 'c': -2}. Best is trial 52 with value: 0.26616726169104893.[0m
100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:52<00:00,  4.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 16943.26it/s]
[32m[I 2022-08-27 21:22:19,156][0m Trial 129 finished with value: 0.9559500087804856 and parameters: {'recognizer_name': 'pytesseract', 'image_transform': 7, 'thresh': 240, 'maxval': 150, 'block_size': 21, 'c': -1}. Best is trial 52 with value: 0.26616726169104

KeyboardInterrupt: 

In [24]:
print("Value:", study.best_value)
print("Parameters:", study.best_params)
print("Importance:", optuna.importance.get_param_importances(study))

Value: 0.26616726169104893
Parameters: {'recognizer_name': 'pytesseract', 'image_transform': 2, 'thresh': 240, 'maxval': 150, 'block_size': 3, 'c': 1}
Importance: OrderedDict([('thresh', 0.6547779464759309), ('image_transform', 0.19284069121366998), ('maxval', 0.11659562130881), ('c', 0.02937258571477986), ('block_size', 0.006413155286809179), ('recognizer_name', 0.0)])


In [None]:
#Value: 0.26236610049718984
#Parameters: {'recognizer_name': 'pytesseract', 'image_transform': <ImageTransformType.Binary: 2>, 'thresh': 241, 'maxval': 39}
#Importance: OrderedDict([('maxval', 0.4087403399999549), ('recognizer_name', 0.2971054412135147), ('thresh', 0.1566787150811999), ('image_transform', 0.13747550370533043)])

In [14]:
#old code
loss('pytesseract', ImageTransformType.Binary, 241, 39)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:10<00:00,  7.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 17853.12it/s]


0.26236610049718984

In [15]:
#old code
test_data['best_rec'] = recognize('pytesseract', ImageTransformType.Binary, 241, 39)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:09<00:00,  7.17it/s]


In [25]:
loss('pytesseract', 2, 240, 150, 3, 1)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:09<00:00,  7.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 13510.49it/s]


0.26616726169104893

In [26]:
test_data['best_rec'] = recognize('pytesseract', 2, 240, 150, 3, 1)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:26<00:00,  5.81it/s]


In [27]:
test_data

Unnamed: 0,id,img,label,text,best_rec
0,8291,img/08291.png,1,white people is this a shooting range,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest,bravery als finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...,your order comes to $37.50 and your white pri...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...,‘itis time. ad to sentl these parasites hack...
4,80243,img/80243.png,1,mississippi wind chime,MISSISSIPDI wind chime
...,...,...,...,...,...
495,83675,img/83675.png,0,i'm gonna be like phelps one day,rmgonna he like phelps one day
496,37198,img/37198.png,0,when you're so relaxed you can feel yourself g...,when you're so relaxed you can feel yourself ...
497,48670,img/48670.png,0,look at this sandwich maker club i found on wi...,look at this sandwich maker club found on wis...
498,9863,img/09863.png,0,diverse group of women,CIVEFSEIGKOUD Of women oe


In [28]:
text_len = test_data['text'].apply(lambda x: len(str(x)))

In [29]:
test_data['best_rec_dist'] = test_data.apply(lambda x: lev.distance(x['text'], x['best_rec']), axis=1) / text_len

In [30]:
test_data.head(30)

Unnamed: 0,id,img,label,text,best_rec,best_rec_dist
0,8291,img/08291.png,1,white people is this a shooting range,white people is this a shooting range,0.027027
1,46971,img/46971.png,1,bravery at its finest,bravery als finest,0.238095
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...,your order comes to $37.50 and your white pri...,0.034483
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...,‘itis time. ad to sentl these parasites hack...,0.2
4,80243,img/80243.png,1,mississippi wind chime,MISSISSIPDI wind chime,0.590909
5,5279,img/05279.png,1,"knowing white people , that's probably the bab...","Knowing white people , that's probably the hah...",0.074074
6,1796,img/01796.png,1,life hack #23 how to get stoned with no weed,life hack #23 how to yet stoned with no weed,0.068182
7,53046,img/53046.png,1,"you've heard of elf on a shelf, now get ready for",you've heard of ell ona shell now cel realy [or,0.183673
8,82301,img/82301.png,1,cooooooooooooon!!!!,"coooooo0000000n! NA pen ost - , w t me OF . ...",2.210526
9,31752,img/31752.png,1,when you get to choose your own mental illness,when you get to choose your own mental illness,0.021739


In [31]:
loss('pytesseract', 2, 195, 75, 15, -1)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:12<00:00,  6.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 9997.63it/s]


0.5417444574689789