In [1]:
import concurrent.futures
import glob
import time
import sys

import pandas as pd
import io
import re
import csv
import numpy as np
from datetime import datetime
import math
from PIL import Image
import pytesseract
import cv2
import os, shutil
from os import listdir
from os.path import isfile, join, isdir

# The OCR function uses Tesseract to extract the text of a given image
# The OCR job uses concurrency to speed up Tesseract across many images

In [2]:
def binarization(image):
    """Turn an image into a two-level (0 / 255) PIL image based on its green channel.

    The input is converted to RGB, its red and blue channels are zeroed, the
    result is reduced to grayscale with OpenCV and then thresholded using
    ``THRESH_BINARY | THRESH_OTSU``.

    Args:
        image: object exposing ``convert('RGB')`` whose result NumPy can turn
            into a height x width x 3 array (presumably a PIL image).

    Returns:
        A PIL image created from the thresholded array.
    """
    # astype() returns a fresh array, so the in-place edits below are safe.
    rgb_pixels = np.asarray(image.convert('RGB')).astype(np.uint8)

    # Blank out red (index 0) and blue (index 2); only green contributes below.
    for channel in (0, 2):
        rgb_pixels[:, :, channel] = 0

    gray = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)

    # With THRESH_OTSU set, OpenCV computes the threshold itself, so the 80
    # passed here is not the effective cut-off.
    _, mask = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    return Image.fromarray(mask)

def do_ocr(image_path, lang='por'):
    """Run Tesseract OCR on one image file and return the recognized text.

    Args:
        image_path: path of the image file to read.
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``. Defaults to ``'por'``, the value
            that used to be hard-coded here.
            NOTE(review): the Tobacco3482 documents look like English text --
            confirm whether ``'eng'`` is the intended language.

    Returns:
        The text Tesseract extracted from the image.
    """
    # Image.open keeps the underlying file open until the image is closed.
    # The ``with`` block releases the handle as soon as OCR finishes, which
    # matters when one worker process handles thousands of files.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=lang)

def ocr_job(image_list):
    """OCR every image in ``image_list`` using a pool of worker processes.

    Args:
        image_list: iterable of image file paths; each one is passed to
            ``do_ocr``.

    Returns:
        dict mapping each path to the text ``do_ocr`` returned for it. A path
        that occurs more than once ends up as a single key.
    """
    # Presumably meant to keep each Tesseract process single-threaded so the
    # pool, not OpenMP, provides the parallelism -- TODO confirm.
    os.environ['OMP_THREAD_LIMIT'] = '1'

    # Materialize the input so generators work and len() is available for the
    # progress messages.
    image_list = list(image_list)
    total = len(image_list)
    if total == 0:
        # Nothing to process: avoid creating a process pool at all.
        return {}

    # os.cpu_count() may return None, and on a single-core machine
    # "cpu_count - 1" would be 0, which ProcessPoolExecutor rejects.
    num_workers = max(1, (os.cpu_count() or 1) - 1)

    results = {}
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        # executor.map yields results in input order, so zip pairs every path
        # with its own OCR text.
        ocr_texts = executor.map(do_ocr, image_list)
        for k, (file, text) in enumerate(zip(image_list, ocr_texts), start=1):
            print('{} of {} images'.format(k, total))
            results[file] = text

    return results

# Get all dataset classes

In [3]:
# Map each class name (a sub-folder of the dataset root) to an empty dict.
classes = {}
images_path = 'data/tobacco3482jpg/'
# Keep only directories; plain files in the dataset root are ignored.
classes_dirs = list(filter(lambda name: isdir(join(images_path, name)), listdir(images_path)))
print('Classes Found:')
for image_class in classes_dirs:
    # The 'train' and 'validation' folders are not registered as classes.
    if image_class in ('train', 'validation'):
        continue
    classes[image_class] = {}
    print(image_class)


Classes Found:
Note
News
Scientific
Memo
Email
Form
ADVE
Resume
Letter
Report


# Get all images from each class

In [4]:
# For every class, store the names of the regular files found in its folder.
for image_class in classes:
    images_path = 'data/tobacco3482jpg/{}/'.format(image_class)
    classes[image_class]['images'] = list(
        filter(lambda name: isfile(join(images_path, name)), listdir(images_path))
    )

# Get full path of each image 

In [5]:
# Build the path of every image, class by class, in the order the classes
# and their files were collected.
image_list = []
for image_class, class_data in classes.items():
    image_list.extend(
        'data/tobacco3482jpg/{}/{}'.format(image_class, file)
        for file in class_data['images']
    )

# Show how many images were found, followed by every path.
print(len(image_list))
for i in image_list:
    print(i)

3482
data/tobacco3482jpg/Note/88025179.jpg
data/tobacco3482jpg/Note/2022245046.jpg
data/tobacco3482jpg/Note/2030195129.jpg
data/tobacco3482jpg/Note/2030053173.jpg
data/tobacco3482jpg/Note/80191212_80191213.jpg
data/tobacco3482jpg/Note/2029183905.jpg
data/tobacco3482jpg/Note/93443778.jpg
data/tobacco3482jpg/Note/2030353547.jpg
data/tobacco3482jpg/Note/50076168.jpg
data/tobacco3482jpg/Note/2063603124.jpg
data/tobacco3482jpg/Note/2028837643.jpg
data/tobacco3482jpg/Note/1000240946.jpg
data/tobacco3482jpg/Note/03023805.jpg
data/tobacco3482jpg/Note/2072821881.jpg
data/tobacco3482jpg/Note/2020102257.jpg
data/tobacco3482jpg/Note/2065023135.jpg
data/tobacco3482jpg/Note/2023104578.jpg
data/tobacco3482jpg/Note/2067474399.jpg
data/tobacco3482jpg/Note/2085234526.jpg
data/tobacco3482jpg/Note/2023192377.jpg
data/tobacco3482jpg/Note/91985231-a_91985231.jpg
data/tobacco3482jpg/Note/2029042370.jpg
data/tobacco3482jpg/Note/2072489111.jpg
data/tobacco3482jpg/Note/2048150028.jpg
data/tobacco3482jpg/Note/20

data/tobacco3482jpg/Memo/2054544305.jpg
data/tobacco3482jpg/Memo/2061678550.jpg
data/tobacco3482jpg/Memo/2022181033.jpg
data/tobacco3482jpg/Memo/86235667_5668.jpg
data/tobacco3482jpg/Memo/2049308509.jpg
data/tobacco3482jpg/Memo/03000416_03000417.jpg
data/tobacco3482jpg/Memo/2073420633.jpg
data/tobacco3482jpg/Memo/2041837000_2041837001.jpg
data/tobacco3482jpg/Memo/tob15809.19.jpg
data/tobacco3482jpg/Memo/03726299_03726300.jpg
data/tobacco3482jpg/Memo/2072940944.jpg
data/tobacco3482jpg/Memo/2053522412.jpg
data/tobacco3482jpg/Memo/03370692_0695.jpg
data/tobacco3482jpg/Memo/98416803_6804.jpg
data/tobacco3482jpg/Memo/01390279.jpg
data/tobacco3482jpg/Memo/2040733938.jpg
data/tobacco3482jpg/Memo/2070370405.jpg
data/tobacco3482jpg/Memo/87746801_87746802.jpg
data/tobacco3482jpg/Memo/2020340083.jpg
data/tobacco3482jpg/Memo/93299184.jpg
data/tobacco3482jpg/Memo/2046538342.jpg
data/tobacco3482jpg/Memo/2020134234.jpg
data/tobacco3482jpg/Memo/81331173_81331174.jpg
data/tobacco3482jpg/Memo/86265143.j

In [6]:
# Number of logical CPUs reported by the OS (None if it cannot be determined).
cpu_total = os.cpu_count()
print(cpu_total)

12


# Extract OCR text using multiprocessing

In [7]:
# Run OCR over every collected image path; the result maps path -> extracted text.
file_text = ocr_job(image_list=image_list)

1 of 3482 images
2 of 3482 images
3 of 3482 images
4 of 3482 images
5 of 3482 images
6 of 3482 images
7 of 3482 images
8 of 3482 images
9 of 3482 images
10 of 3482 images
11 of 3482 images
12 of 3482 images
13 of 3482 images
14 of 3482 images
15 of 3482 images
16 of 3482 images
17 of 3482 images
18 of 3482 images
19 of 3482 images
20 of 3482 images
21 of 3482 images
22 of 3482 images
23 of 3482 images
24 of 3482 images
25 of 3482 images
26 of 3482 images
27 of 3482 images
28 of 3482 images
29 of 3482 images
30 of 3482 images
31 of 3482 images
32 of 3482 images
33 of 3482 images
34 of 3482 images
35 of 3482 images
36 of 3482 images
37 of 3482 images
38 of 3482 images
39 of 3482 images
40 of 3482 images
41 of 3482 images
42 of 3482 images
43 of 3482 images
44 of 3482 images
45 of 3482 images
46 of 3482 images
47 of 3482 images
48 of 3482 images
49 of 3482 images
50 of 3482 images
51 of 3482 images
52 of 3482 images
53 of 3482 images
54 of 3482 images
55 of 3482 images
56 of 3482 images
5

440 of 3482 images
441 of 3482 images
442 of 3482 images
443 of 3482 images
444 of 3482 images
445 of 3482 images
446 of 3482 images
447 of 3482 images
448 of 3482 images
449 of 3482 images
450 of 3482 images
451 of 3482 images
452 of 3482 images
453 of 3482 images
454 of 3482 images
455 of 3482 images
456 of 3482 images
457 of 3482 images
458 of 3482 images
459 of 3482 images
460 of 3482 images
461 of 3482 images
462 of 3482 images
463 of 3482 images
464 of 3482 images
465 of 3482 images
466 of 3482 images
467 of 3482 images
468 of 3482 images
469 of 3482 images
470 of 3482 images
471 of 3482 images
472 of 3482 images
473 of 3482 images
474 of 3482 images
475 of 3482 images
476 of 3482 images
477 of 3482 images
478 of 3482 images
479 of 3482 images
480 of 3482 images
481 of 3482 images
482 of 3482 images
483 of 3482 images
484 of 3482 images
485 of 3482 images
486 of 3482 images
487 of 3482 images
488 of 3482 images
489 of 3482 images
490 of 3482 images
491 of 3482 images
492 of 3482 

876 of 3482 images
877 of 3482 images
878 of 3482 images
879 of 3482 images
880 of 3482 images
881 of 3482 images
882 of 3482 images
883 of 3482 images
884 of 3482 images
885 of 3482 images
886 of 3482 images
887 of 3482 images
888 of 3482 images
889 of 3482 images
890 of 3482 images
891 of 3482 images
892 of 3482 images
893 of 3482 images
894 of 3482 images
895 of 3482 images
896 of 3482 images
897 of 3482 images
898 of 3482 images
899 of 3482 images
900 of 3482 images
901 of 3482 images
902 of 3482 images
903 of 3482 images
904 of 3482 images
905 of 3482 images
906 of 3482 images
907 of 3482 images
908 of 3482 images
909 of 3482 images
910 of 3482 images
911 of 3482 images
912 of 3482 images
913 of 3482 images
914 of 3482 images
915 of 3482 images
916 of 3482 images
917 of 3482 images
918 of 3482 images
919 of 3482 images
920 of 3482 images
921 of 3482 images
922 of 3482 images
923 of 3482 images
924 of 3482 images
925 of 3482 images
926 of 3482 images
927 of 3482 images
928 of 3482 

1297 of 3482 images
1298 of 3482 images
1299 of 3482 images
1300 of 3482 images
1301 of 3482 images
1302 of 3482 images
1303 of 3482 images
1304 of 3482 images
1305 of 3482 images
1306 of 3482 images
1307 of 3482 images
1308 of 3482 images
1309 of 3482 images
1310 of 3482 images
1311 of 3482 images
1312 of 3482 images
1313 of 3482 images
1314 of 3482 images
1315 of 3482 images
1316 of 3482 images
1317 of 3482 images
1318 of 3482 images
1319 of 3482 images
1320 of 3482 images
1321 of 3482 images
1322 of 3482 images
1323 of 3482 images
1324 of 3482 images
1325 of 3482 images
1326 of 3482 images
1327 of 3482 images
1328 of 3482 images
1329 of 3482 images
1330 of 3482 images
1331 of 3482 images
1332 of 3482 images
1333 of 3482 images
1334 of 3482 images
1335 of 3482 images
1336 of 3482 images
1337 of 3482 images
1338 of 3482 images
1339 of 3482 images
1340 of 3482 images
1341 of 3482 images
1342 of 3482 images
1343 of 3482 images
1344 of 3482 images
1345 of 3482 images
1346 of 3482 images


1712 of 3482 images
1713 of 3482 images
1714 of 3482 images
1715 of 3482 images
1716 of 3482 images
1717 of 3482 images
1718 of 3482 images
1719 of 3482 images
1720 of 3482 images
1721 of 3482 images
1722 of 3482 images
1723 of 3482 images
1724 of 3482 images
1725 of 3482 images
1726 of 3482 images
1727 of 3482 images
1728 of 3482 images
1729 of 3482 images
1730 of 3482 images
1731 of 3482 images
1732 of 3482 images
1733 of 3482 images
1734 of 3482 images
1735 of 3482 images
1736 of 3482 images
1737 of 3482 images
1738 of 3482 images
1739 of 3482 images
1740 of 3482 images
1741 of 3482 images
1742 of 3482 images
1743 of 3482 images
1744 of 3482 images
1745 of 3482 images
1746 of 3482 images
1747 of 3482 images
1748 of 3482 images
1749 of 3482 images
1750 of 3482 images
1751 of 3482 images
1752 of 3482 images
1753 of 3482 images
1754 of 3482 images
1755 of 3482 images
1756 of 3482 images
1757 of 3482 images
1758 of 3482 images
1759 of 3482 images
1760 of 3482 images
1761 of 3482 images


2124 of 3482 images
2125 of 3482 images
2126 of 3482 images
2127 of 3482 images
2128 of 3482 images
2129 of 3482 images
2130 of 3482 images
2131 of 3482 images
2132 of 3482 images
2133 of 3482 images
2134 of 3482 images
2135 of 3482 images
2136 of 3482 images
2137 of 3482 images
2138 of 3482 images
2139 of 3482 images
2140 of 3482 images
2141 of 3482 images
2142 of 3482 images
2143 of 3482 images
2144 of 3482 images
2145 of 3482 images
2146 of 3482 images
2147 of 3482 images
2148 of 3482 images
2149 of 3482 images
2150 of 3482 images
2151 of 3482 images
2152 of 3482 images
2153 of 3482 images
2154 of 3482 images
2155 of 3482 images
2156 of 3482 images
2157 of 3482 images
2158 of 3482 images
2159 of 3482 images
2160 of 3482 images
2161 of 3482 images
2162 of 3482 images
2163 of 3482 images
2164 of 3482 images
2165 of 3482 images
2166 of 3482 images
2167 of 3482 images
2168 of 3482 images
2169 of 3482 images
2170 of 3482 images
2171 of 3482 images
2172 of 3482 images
2173 of 3482 images


2534 of 3482 images
2535 of 3482 images
2536 of 3482 images
2537 of 3482 images
2538 of 3482 images
2539 of 3482 images
2540 of 3482 images
2541 of 3482 images
2542 of 3482 images
2543 of 3482 images
2544 of 3482 images
2545 of 3482 images
2546 of 3482 images
2547 of 3482 images
2548 of 3482 images
2549 of 3482 images
2550 of 3482 images
2551 of 3482 images
2552 of 3482 images
2553 of 3482 images
2554 of 3482 images
2555 of 3482 images
2556 of 3482 images
2557 of 3482 images
2558 of 3482 images
2559 of 3482 images
2560 of 3482 images
2561 of 3482 images
2562 of 3482 images
2563 of 3482 images
2564 of 3482 images
2565 of 3482 images
2566 of 3482 images
2567 of 3482 images
2568 of 3482 images
2569 of 3482 images
2570 of 3482 images
2571 of 3482 images
2572 of 3482 images
2573 of 3482 images
2574 of 3482 images
2575 of 3482 images
2576 of 3482 images
2577 of 3482 images
2578 of 3482 images
2579 of 3482 images
2580 of 3482 images
2581 of 3482 images
2582 of 3482 images
2583 of 3482 images


2946 of 3482 images
2947 of 3482 images
2948 of 3482 images
2949 of 3482 images
2950 of 3482 images
2951 of 3482 images
2952 of 3482 images
2953 of 3482 images
2954 of 3482 images
2955 of 3482 images
2956 of 3482 images
2957 of 3482 images
2958 of 3482 images
2959 of 3482 images
2960 of 3482 images
2961 of 3482 images
2962 of 3482 images
2963 of 3482 images
2964 of 3482 images
2965 of 3482 images
2966 of 3482 images
2967 of 3482 images
2968 of 3482 images
2969 of 3482 images
2970 of 3482 images
2971 of 3482 images
2972 of 3482 images
2973 of 3482 images
2974 of 3482 images
2975 of 3482 images
2976 of 3482 images
2977 of 3482 images
2978 of 3482 images
2979 of 3482 images
2980 of 3482 images
2981 of 3482 images
2982 of 3482 images
2983 of 3482 images
2984 of 3482 images
2985 of 3482 images
2986 of 3482 images
2987 of 3482 images
2988 of 3482 images
2989 of 3482 images
2990 of 3482 images
2991 of 3482 images
2992 of 3482 images
2993 of 3482 images
2994 of 3482 images
2995 of 3482 images


3356 of 3482 images
3357 of 3482 images
3358 of 3482 images
3359 of 3482 images
3360 of 3482 images
3361 of 3482 images
3362 of 3482 images
3363 of 3482 images
3364 of 3482 images
3365 of 3482 images
3366 of 3482 images
3367 of 3482 images
3368 of 3482 images
3369 of 3482 images
3370 of 3482 images
3371 of 3482 images
3372 of 3482 images
3373 of 3482 images
3374 of 3482 images
3375 of 3482 images
3376 of 3482 images
3377 of 3482 images
3378 of 3482 images
3379 of 3482 images
3380 of 3482 images
3381 of 3482 images
3382 of 3482 images
3383 of 3482 images
3384 of 3482 images
3385 of 3482 images
3386 of 3482 images
3387 of 3482 images
3388 of 3482 images
3389 of 3482 images
3390 of 3482 images
3391 of 3482 images
3392 of 3482 images
3393 of 3482 images
3394 of 3482 images
3395 of 3482 images
3396 of 3482 images
3397 of 3482 images
3398 of 3482 images
3399 of 3482 images
3400 of 3482 images
3401 of 3482 images
3402 of 3482 images
3403 of 3482 images
3404 of 3482 images
3405 of 3482 images


# Transform output of OCR job to pandas dataframe format 

In [9]:
# Split the {path: text} mapping into two parallel lists; both follow the
# mapping's iteration order, so index i of each list refers to the same image.
files_list = list(file_text.keys())
text_list = list(file_text.values())

# Create dataframe and save as csv

In [11]:
# Assemble the two parallel lists into a two-column frame.
data = {'file_path': files_list, 'text': text_list}
df = pd.DataFrame(data)
print(df.shape[0])
display(df.head(5))
# to_csv keeps its default index=True, so the row index is written as the
# first, unnamed column of the CSV file.
df.to_csv('extracted_text.csv')

3482


Unnamed: 0,file_path,text
0,data/tobacco3482jpg/Note/88025179.jpg,pre Tha clubes 3/e/04. Mr\nVLS Some ve apud Pa...
1,data/tobacco3482jpg/Note/2022245046.jpg,Lis 16 comam\n\n5 (3 Jo\n\nTacna Eli peito\n\n...
2,data/tobacco3482jpg/Note/2030195129.jpg,"AAAPER\n\nThe Perfect Solutidy, Com y;\n\n% mM..."
3,data/tobacco3482jpg/Note/2030053173.jpg,", Mi Sra:\ng2-74 Vaenec Speed motor co BecT, “..."
4,data/tobacco3482jpg/Note/80191212_80191213.jpg,
