<a href="https://colab.research.google.com/github/GruAna/VU/blob/master/data_to_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import cv2 as cv
import numpy as np
import os
import xml.etree.ElementTree as ET

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!cp drive/MyDrive/Colab_Notebooks/VU/utils.py .
from utils import *

In [9]:
from tqdm import tqdm  

Functions

In [4]:
labels_dir = '/content/drive/MyDrive/Colab_Notebooks/VU/CTW1500/ctw1500_train_labels'
(_, _, xml_files) = next(os.walk(labels_dir))
xml_files.sort()
list_xml_paths = [os.path.join(labels_dir, file) for file in xml_files][:200]

In [7]:
def read_gt_ctw_train(xml_file, scaling_ratio=1):
    """
    SCUT-CTW1500 dataset (XML - train labels parser)
    Returns ground truth in a tuple - first contains coordinates (8 numbers), second word (string).
    If image was previously scaled, one might need to scale also gt coordinates by given ratio.
    """
    gt = []

    tree = ET.parse(xml_file)
    root = tree.getroot()

    # get values in this order: height, left coordinate, top coordinate, width
    for i, bbox in enumerate(root[0].findall('box')):
        # create list of integers with bounding box values, sort by attribute name
        # in case in different document there is a different order of attributes
        bbox_integer = [int(val) for key, val in sorted(bbox.attrib.items(), key = lambda el: el[0])]
        
        # calculate bottom coordinate of bounding rectangle x+width, y+height
        x_right= int((bbox_integer[1] + bbox_integer[3]) * scaling_ratio)
        y_bottom = int((bbox_integer[2] + bbox_integer[0]) * scaling_ratio)
        x_left = int(bbox_integer[1] * scaling_ratio)
        y_top = int(bbox_integer[2] * scaling_ratio)

        bbox_coords = np.array([[x_left, y_top], [x_right, y_bottom]])

        # get label
        label = root[0][i].find('label').text

        # create list of labels and corresponding boundin boxes
        gt.append((label, bbox_coords))

    return gt
    

In [10]:
ground_truth = []

for i, file in tqdm(enumerate(list_xml_paths)):
     ground_truth.append(read_gt_ctw_train(file))
# ground_truth is in the is a list of tuples, where first is the gt word 
# and second is an array of top left and bottom right coordinates

200it [01:26,  2.30it/s]


In [11]:
# path to image directory, get full path to all files
imgs_dir = '/content/drive/MyDrive/Colab_Notebooks/VU/CTW1500/train_images'
(_, _, filenames) = next(os.walk(imgs_dir))
filenames.sort()
list_img_paths = [os.path.join(imgs_dir, file) for file in filenames][:200]
n_imgs = len(list_img_paths)

In [12]:
# load images
images = [(cv.imread(file)) for file in list_img_paths]

## Crop

In [14]:
def image_text_crop(images, filenames, ground_truth, one_file=True, result_folder='./results'):
    """
    Crops and saves images based on bounding box ground truth for each text region.
    Creates text file with corresponding annotation.

    Parameter:
    - images: loaded images
    - filenames: list of image filenames with extension
    - groun_truth: list of gt tuples first text annotation, second np.array of 
    left top and bottom right coodinates, format: ('text', [[tl,tl],[br,br]])
    """

    # test if there are not more gts than images
    # else the for loop will never get to those exceeding image count
    gt_length = len(ground_truth)
    if len(images) > gt_length:
        images = images[:gt_length]

    if not os.path.isdir(result_folder):
        os.mkdir(result_folder)
    
    all_texts = []
    for i, img in tqdm(enumerate(images)):
        name, ext = os.path.splitext(filenames[i])

        # count regions in one image - used for file naming purposes
        region = 1
        
        for text, bbox in ground_truth[i]:
            # select image within coordinates (bbox)
            cropped = img[bbox[0,1]:bbox[1,1], bbox[0,0]:bbox[1,0]]

            # create image file:
            # name in format "original-00region.ext"
            new_name = name + '-' + str(region).zfill(3)
            ext_tif = '.tif'

            cv.imwrite(os.path.join(result_folder, new_name + ext_tif), cropped)
            # create  text annotation file(s)
            if one_file:
                all_texts.append(new_name + ext_tif + '\t' + text)
            else:
                # one file for each image with word
                with open(os.path.join(result_folder, new_name + '.gt.txt'), 'w') as f:
                    f.write(text)
            region += 1
    
    if one_file:
        with open(os.path.join(result_folder, 'gt.txt'), 'w') as f:
            for line in all_texts:
                f.writelines(line+'\n')
            

In [15]:
# ! rm -r ../results
image_text_crop(images, filenames[:200], ground_truth, one_file=False)

200it [00:15, 13.07it/s]


In [30]:
# ! rm -r ./results

## Tesseract

In [16]:
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 3s (1,655 kB/s)
debconf: unable to initi

In [17]:
! tesseract --version

tesseract 4.0.0-beta.1
 leptonica-1.75.3
  libgif 5.1.4 : libjpeg 8d (libjpeg-turbo 1.5.2) : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libwebp 0.6.1 : libopenjp2 2.3.0

 Found AVX2
 Found AVX
 Found SSE


In [18]:
! sudo apt-get install libicu-dev libpango1.0-dev libcairo2-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libicu-dev is already the newest version (60.2-3ubuntu3.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  autoconf automake autopoint autotools-dev debhelper dh-autoreconf
  dh-strip-nondeterminism file gettext gettext-base gir1.2-freedesktop
  gir1.2-pango-1.0 intltool-debian libarchive-cpio-perl libarchive-zip-perl
  libcairo-script-interpreter2 libfile-stripnondeterminism-perl libmagic-mgc
  libmagic1 libmail-sendmail-perl libpangoxft-1.0-0 libpixman-1-dev
  libsigsegv2 libsys-hostname-long-perl libtimedate-perl libtool
  libxcb-shm0-dev m4 po-debconf
Suggested packages:
  autoconf-archive gnu-standards autoconf-doc dh-make dwz gettext-doc
  libasprintf-dev libgettextpo-dev libcairo2-doc imagemagick libpango1.0-doc
  libtool-doc gcj-jdk m4-do

Clone repositories for training tesseract.

In [19]:
% cd /content

/content


In [75]:
# ! rm -r tesstrain

In [20]:
! git clone https://github.com/tesseract-ocr/tesstrain
# ! git clone https://github.com/tesseract-ocr/tessdata_best

Cloning into 'tesstrain'...
remote: Enumerating objects: 819, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 819 (delta 21), reused 25 (delta 11), pack-reused 775[K
Receiving objects: 100% (819/819), 13.37 MiB | 15.15 MiB/s, done.
Resolving deltas: 100% (463/463), done.


In [21]:
% cd ./tesstrain

/content/tesstrain


In [22]:
! pwd

/content/tesstrain


In [23]:
! mkdir data
! mkdir ./data/foo-ground-truth

In [43]:
! rm -r data
! mkdir data
! mkdir ./data/foo-ground-truth

In [44]:
! cp ../results/* ./data/foo-ground-truth

In [32]:
# ! unzip ./ocrd-testset.zip -d ./data/foo-ground-truth

In [25]:
! ls ./data/foo-ground-truth/

0001-001.gt.txt  0056-014.gt.txt  0102-005.gt.txt  0146-014.gt.txt
0001-001.tif	 0056-014.tif	  0102-005.tif	   0146-014.tif
0001-002.gt.txt  0056-015.gt.txt  0103-001.gt.txt  0146-015.gt.txt
0001-002.tif	 0056-015.tif	  0103-001.tif	   0146-015.tif
0001-003.gt.txt  0057-001.gt.txt  0103-002.gt.txt  0146-016.gt.txt
0001-003.tif	 0057-001.tif	  0103-002.tif	   0146-016.tif
0001-004.gt.txt  0057-002.gt.txt  0103-003.gt.txt  0146-017.gt.txt
0001-004.tif	 0057-002.tif	  0103-003.tif	   0146-017.tif
0001-005.gt.txt  0057-003.gt.txt  0104-001.gt.txt  0146-018.gt.txt
0001-005.tif	 0057-003.tif	  0104-001.tif	   0146-018.tif
0001-006.gt.txt  0057-004.gt.txt  0104-002.gt.txt  0146-019.gt.txt
0001-006.tif	 0057-004.tif	  0104-002.tif	   0146-019.tif
0001-007.gt.txt  0057-005.gt.txt  0104-003.gt.txt  0146-020.gt.txt
0001-007.tif	 0057-005.tif	  0104-003.tif	   0146-020.tif
0002-001.gt.txt  0057-006.gt.txt  0104-004.gt.txt  0146-021.gt.txt
0002-001.tif	 0057-006.tif	  0104-004.tif	   0146-021.tif


In [27]:
! sudo apt install bc

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  bc
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 86.2 kB of archives.
After this operation, 223 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 bc amd64 1.07.1-2 [86.2 kB]
Fetched 86.2 kB in 1s (86.9 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable

In [45]:
! make training MODEL_NAME=foo MAX_ITERATIONS=600 START_MODEL=eng TESSDATA=/usr/share/tesseract-ocr/4.00/tessdata

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Tesseract Open Source OCR Engine v4.0.0-beta.1 with Leptonica
Page 1
PYTHONIOENCODING=utf-8 python3 generate_line_box.py -i "data/foo-ground-truth/0092-018.tif" -t "data/foo-ground-truth/0092-018.gt.txt" > "data/foo-ground-truth/0092-018.box"
+ tesseract data/foo-ground-truth/0092-018.tif data/foo-ground-truth/0092-018 --psm 13 lstm.train
Tesseract Open Source OCR Engine v4.0.0-beta.1 with Leptonica
Page 1
PYTHONIOENCODING=utf-8 python3 generate_line_box.py -i "data/foo-ground-truth/0110-015.tif" -t "data/foo-ground-truth/0110-015.gt.txt" > "data/foo-ground-truth/0110-015.box"
+ tesseract data/foo-ground-truth/0110-015.tif data/foo-ground-truth/0110-015 --psm 13 lstm.train
Tesseract Open Source OCR Engine v4.0.0-beta.1 with Leptonica
Page 1
PYTHONIOENCODING=utf-8 python3 generate_line_box.py -i "data/foo-ground-truth/0124-023.tif" -t "data/foo-ground-truth/0124-023.gt.txt" > "data/foo-ground-truth/0124-023.box"
+ tesserac

Copy new model to /usr/share/tesseract-ocr/4.00/tessdata

In [35]:
! cp /content/tesstrain/data/foo.traineddata /usr/share/tesseract-ocr/4.00/tessdata

In [36]:
! ls /usr/share/tesseract-ocr/4.00/tessdata

configs		 foo.traineddata  pdf.ttf
eng.traineddata  osd.traineddata  tessconfigs


In [42]:
! tesseract -l foo /content/drive/MyDrive/Colab_Notebooks/VU/CTW1500/train_images/ -

Failed to load any lstm-specific dictionaries for lang foo!!
Error in findFileFormatStream: failed to read first 12 bytes of file
terminate called after throwing an instance of 'std::__ios_failure'
  what():  basic_filebuf::underflow error reading the file: iostream error
