diff --git a/Dockerfile b/Dockerfile
index fa7ce624..80d2b521 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -483,6 +483,17 @@ RUN pip install flashtext && \
     pip install pykalman && \
     /tmp/clean-layer.sh
 
+# Tesseract and some associated utility packages
+# (apt-get update runs in the same layer so the install never sees a stale package index)
+RUN apt-get update && apt-get install tesseract-ocr -y && \
+    pip install pytesseract && \
+    pip install wand && \
+    pip install pdf2image && \
+    pip install PyPDF && \
+    pip install pyocr && \
+    /tmp/clean-layer.sh
+ENV TESSERACT_PATH=/usr/bin/tesseract
+
 # Pin Vowpal Wabbit v8.6.0 because 8.6.1 does not build or install successfully
 RUN cd /usr/local/src && \
     git clone -b 8.6.0 https://github.com/JohnLangford/vowpal_wabbit.git && \
diff --git a/tests/data/test.pdf b/tests/data/test.pdf
new file mode 100644
index 00000000..0d1cfb5d
Binary files /dev/null and b/tests/data/test.pdf differ
diff --git a/tests/test_pytesseract.py b/tests/test_pytesseract.py
new file mode 100644
index 00000000..353b8adb
--- /dev/null
+++ b/tests/test_pytesseract.py
@@ -0,0 +1,19 @@
+import unittest
+import io
+import pytesseract
+import numpy as np
+from PIL import Image
+from wand.image import Image as wandimage
+
+class TestPytesseract(unittest.TestCase):
+    """Smoke test for the pytesseract bindings installed in the image."""
+
+    def test_tesseract(self):
+        """Run OCR on the sample PDF and check that a string is returned."""
+        # Open pdf with Wand and export it as PNG bytes; PIL then decodes those
+        # bytes into the image object handed to pytesseract.
+        with wandimage(filename='/input/tests/data/test.pdf') as wand_image:
+            img_buffer = np.asarray(bytearray(wand_image.make_blob(format='png')), dtype='uint8')
+            bytesio = io.BytesIO(img_buffer)
+            test_string = pytesseract.image_to_string(Image.open(bytesio))
+            self.assertIsInstance(test_string, str)