From 71e51d1387c934b42a0b6c0d7de645bee075d08a Mon Sep 17 00:00:00 2001 From: crawforc3 Date: Fri, 22 Feb 2019 07:00:06 -0800 Subject: [PATCH 1/7] Add tesseract and pytesseract --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 808ae746..ef0fe871 100644 --- a/Dockerfile +++ b/Dockerfile @@ -481,6 +481,11 @@ RUN pip install flashtext && \ pip install pykalman && \ /tmp/clean-layer.sh +# Tesseract and pytesseract +RUN apt-get install tesseract-ocr -y && \ + pip install pytesseract +ENV TESSERACT_PATH=/usr/bin/tesseract + # Pin Vowpal Wabbit v8.6.0 because 8.6.1 does not build or install successfully RUN cd /usr/local/src && \ git clone -b 8.6.0 https://github.com/JohnLangford/vowpal_wabbit.git && \ From 3b88789c4e935383e65f5a567ba59d20dfb31198 Mon Sep 17 00:00:00 2001 From: crawforc3 Date: Fri, 22 Feb 2019 07:01:58 -0800 Subject: [PATCH 2/7] Add clean-layer.sh to tesseract layer --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ef0fe871..0938e285 100644 --- a/Dockerfile +++ b/Dockerfile @@ -483,7 +483,8 @@ RUN pip install flashtext && \ # Tesseract and pytesseract RUN apt-get install tesseract-ocr -y && \ - pip install pytesseract + pip install pytesseract && \ + /tmp/clean-layer.sh ENV TESSERACT_PATH=/usr/bin/tesseract # Pin Vowpal Wabbit v8.6.0 because 8.6.1 does not build or install successfully From 60d75811514bb6373965452ce374779ed28d4e78 Mon Sep 17 00:00:00 2001 From: crawforc3 Date: Fri, 22 Feb 2019 07:06:22 -0800 Subject: [PATCH 3/7] add pdf for pytesseract test --- tests/data/test.pdf | Bin 0 -> 18143 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/data/test.pdf diff --git a/tests/data/test.pdf b/tests/data/test.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0d1cfb5de7c4ba0b0643811c67cb2fbdf3ead2ce GIT binary patch literal 18143 zcmce-b9ik}*DaWwI61Lx+jdTD>fvtfBG!G8}y|lTFvA%=3hp`a> zSRX1%0NIbYHsCZ?C@P%={p&V7#rFe zeYg5|EwArj?MUzs3BtBk&ek@L1S|yf@(#vE=7vtT4&Ttfu@TU#eKWyG!1R57^B`ek zWbF3s_%>PpZJXE@9(bm@KJH)>}D}6KiP0+vMx%`K>|2sYO5+VdzJjRBs zhRn>2OpF}N%nY1t`Yfy_Osq^s%xo;2?5qat?0o;vEwukhf}@jzvA#95Tb7}L(VnTE zfgT4GNm@#Px~M=vJ)8&+5aPbTfD3k5@_64+TPto+kUt_eFc3N^IKw~b`_IMyUnCOx zA4dL%+T`_3jU5RX|3lKhQzoqMq;F+w`d@^^`d_a2U*=DVfcaaG|H(27=YNNAGE zr>8j!1S)%nxd$|10&aj7L?KR#!OXy;w`ZuQSNpaxH{KBh3{>O?Ui3esXZtU`3)|W_ z8QVBH60rX}tuo)@&=<1(M*Rb{c?VlVCFAd&wZ3DEhhR7 z0a|}EDDG8_W=G0rR3z`Z0PjwkvM)wqW?F+|B(J)*!>5}|H#MxlHLFC%EbQ9 z1o*#n)toRd3Bri*YxfTIj)o;WeaZ+aE+FDjL*p7`2@M)m^9z5MkIEo+n zBY5>MDkt?zTiHD4Um&Modz`8sG0$qxVI7xSw}O5{V$;SXr))7L+=8c%HEpzE6St@} zh#<9jo18@SVL@hgiYO4lyoOAJ_r+r}DhcGY1eu^eRS;rD*PTW>Dg$2n!qrRp*u^t6 z(E`~$C&PnGC~&)#1}M0sHbqz$N0=AcB#5f-U`Gk=GlFCWX6CP)^Ff9-pc` zYFC}ex3j$o-8DLp6XJqM5CY}Y>0veJGCB+Oq$UB8^sR`?S_L`X$uv1>54{SwFsL11p= ztOFWC+G7+%4?p~9nl;A#p~DsSNuPTLLb1AV(D9*Z0b%lIqQp31;Z;!$Z|b@@ zg*nlK<=7<_hIz8A!<)h1Jaoe`BgPVZr9FdLDVzwwrm!3K1`-LtCHC`KE^q4LuJ8w4 zqN+bZE+Em^3i)6?PRhGwn~(-EN5~s`>*AQUH|&V)|H7c?=+52Ol(e71MsVH7IgEo7 zLn)LdSjZ0}{o|}Isq@4aaw)zKm*oppDTYGjL=w}8v3Xao`y&M)!1v=po)}hOh$!K7 zF-|IQ4`{*Z_m|9V2V$)-s+QwyhP>~uQeEMd`ZjjEnG?htmen^>MQ!@+nh{(z{iHas zYVi{SBE-YL-15Xbkf4{%IuE3fKQYp$WXTa%Ba<1xCqW;w$~IuJ)iD_}1}`s>@B^X; z;29t{ZB%N9os|o^--zjixIVD&m9+E;N7M*odLeHhW870lg-N9cUKZm_!gX&kI$-L5 zoNxgeksD!C(DQZ4_kO3Y#q@t7BNETiSBF`Yz|Z8)Sk1i4mN=ahTwo@pkN7?0Dls%%u7?p8B&={g?oCHnL568WV-o z?4J7d90SI~LY|(Jx;FCNM(2jKv;4NFAzOXWMSp-`6fM5-8=SMrn?7y!PpNpDK8~Ki zVVcERv2<70d6P(81ydaJfq8OBH{o?z0 zX@oj{LizMX;x`N^v1BxC!vGxY4{n$s5T=w1o8`uj%2Guf=OVydho}NbK8VLSf&WecuEYe)- zjI$1?(vI1B`q0)&gmUQw1C1R)5@_1N;8hnSM{i4Iw39tf+NHiouc1 zp|`dEf;B_tRxm)UDpKtFt*AFk%HxvaG)KSDFJ;@(I^0!e_&Y8kvtQWOp$sb28+LHIp7|6fYou0R zkehuY1|r?ycMP3!RsL*@U=d4>dZa7W%KAA&?x}{`*=@eeb^yA;Pe#cd@GMWvmByZz z3%bE8NUL763xqP8qbHy}t~yWX8G0|+%phz%$`N~i7uXGb#{0p$0KJP}#IKWkN1Lcs zasB7gOr-SzP&S7T8rOW=WY79r4a`otZIHhYP{vSRh1<+Rd=5O@)OUbZIrb5;MdsFFw_G!Iea~ulZ6gMJ)EyUJR|h0o@-+-?Kg{RS$4T z^nLn{+M~wN>s2s-J*{YueO-8GjNfp&c-k5uhzarU)S|c5@ck}krWNhn_T&5Ijp-ZQ zQ)U^P!-wlD%-1&St;E)x|m#tTOaT9V=!o*A=j!O^bX-Cxh@f& z;`Ps{KThNZJUKTYV?RVPIDm3~(FpOO{)Gv{b-)DJfb4M?FvffkEofhK`}xBK`$rjU z6Nfp6wR%~1_aDDjzwXHY8c06kotqHwxpravO&jZi7j_4WQY+?3#DJ#8T!ucNJpKD# zmuN-oFikDlK1bRfe(ICg+Xwy+q>&x=MBS54;Vizm@Fz&t=`T{zs9qFrBz)7{`O zCYxOVfqFQh?=9gFVRnF1#N5w2jJhrO$zncFB(JxlwI5G6uXtbM4;a+Xl2EI<+P=;& z7QLT3=~n^d9)su*yf=3ATjK=Bg**pm4UaNyiGm+gE#0r@4bMveJt`6y>^9jx!;6Xr z51O2G$PB*@(Q7_x437yhJ9^?Y8h~w3@=q@S(W#9c<9!o+-+T0_@v8q+4F|dH)63{d zYUM}Z>-w-OqS8&38#+^*tcd1bA@5D~pGay0pDr8#uW^c0PnYEJZBAPkdrV#9aqH;& z7gU=o9I=jA*Q2i|U5c@K3`0GS{SN@)i(4#D$WvnGKAa!j6lv-LqMQLkV^r*!8g$}hpM?1PDb*q_nMB05%uz`B@L_QjXuXfvE z8L?jgh^>TQlndzbHNr2pb-UT(x(S<|C5G;F2S6#k!w#$kpfAhff$=3oKM>I_dQ<$j z#e5#{UVCMI-#!lci``7i1$YhDp1R`b+Fo%F_)GYFKjFGxwFghWb_=_EOzlvUFZ}@0 zxrg1ei*MT}yWNM+6t9h)>_g)_bX`hVYrGlT#_J#HyZERRySvZ4XeVHhw+b$;T&}@9@_orKlei#Uj-BED z(wx#<;B0DaxNDu<&ah_hNHKTZc7E=7k9&@v8}H_aL~l7@kHg)lcQE^wEEsv?7qezKV*<3#AHE)NRub#&y z4Md>`mPFE|!=z26Jfcd;j85^yp>4f{o|K8|=5%q6J(@CWziu-db~v(6-f~#zt=nqD zo2+iJX|g%5-_mc|(w%b%^rSgyn&y*gI~oJ4Ps}??;D*RQy2u{n3U!tJ+tu7y?Wl56 zEY<1+D>&KAH0j>k*Z2DnwjlKHolwUUs5iE&2$1NX?xNr+!st{sgY{v97Vcv1_D)+@ zqz0Hh)iju8c`){YIH_D6F8jZy#jSMNJ@=o*#L~A5uTtl9yx!)>K3APh*9+OHpKjyM zlrVddOn$R+W4p`z-5B-9jhS-0?yo1aEm=1G3yqLdm8F}qyN?=o;u$eoJ0YtH)uoBm|tg7IHLrz@{)$1 z*?4LXP2m!}UIV?!OJ9>I*tHbU*Bo0=k-=JewC|`_0V;Jkcp=qwn6;4BW-Hu#d~HCx zgOYAQDFae=KV}Ar>tH{DeM9m07~QaT{8slc(|}Ryu&DMKIQE_0kavPx_pIF@dIHj~ z@o@YLugTNEcKdm+fxF;+N7O&CdV-A^uzxc^STJ~4>Ju*sf-DIWx8-AIFqCNR(M%1V z=D_>fvih;SnT3ijYv921yy=YRIw0LFB`9glCwAp>_Qs zyk&3?;)!TCL!L+Ey&(p_#c;==8;~<%b`Ri@$RW}iQ*DU0B9}`v9m{1fbsv*mkag+j z-^VZ~o*cJn2)!VkO@K2X`82{`m-`VGo)!)!J7`K02O57e28<*vGzNXk&32Kmx}-yGw9~By?QskM&eK zAi*s`plV1oCa&-g?=9a8^9ztIWH?FqJ?Xd|cn`_wo-hX({vf#>j_iQgH6^DUT^!)1 z>XrKi;>G_r34gqALhd*{Nw8CbR8gc;qB75@AChFbz^{q|i;8^Bib9Wt*jEJ*Rz+Vd z0huLfT9!EQcor;aVR5eUynL3ZkVQx=i9$1cEE&Tx5w$sWEV0)_X)L)0vmech*fK>D zP4qG)lune|B~0a0_g!`3ZNlm_G>9os)WMy>Z$YdB9{c({oVf@s(3`=gLAV35`muF# zZaQuXZYpj{ZV*pro&${ghWkqU%==RNmit+DI?p*ia$mAvg1&kD3z7HauX3-EHdgyuxdnobHc9C|`b`f_`W5dLTpN5`>`6Kbe zMu*^sW3Q85)!#*U=yDMFQRN~^hZYY^4^9ry4=`_}ZfXzI4m55oZ?JE)Zfp-YZ@6!| z+yvis-aDV+Zy;{OZoxQ-bx?efz9>IwK8aigWX(#gNw6RLblfm^V_f=e+`v8|=?9Re zei;Ts4RGjpF=(U_ZIL8FA_q%c6Td{@A=!q4)ZxwS&v^Z~B7L!ArzCJ3=XKNB64T5J zq#-yNb2SKDqo70D7Aj{@Y12p}OA2?5m^O%PQ_rH_A)P@_4U-x+Hpp!I{hM$G8Gx=9 zQ90CkBhnL^I}RTKzn^`K*tK4Tyc*dW9vgu@EaRrgPxli69Qq4kjk^G=?+Jc5=paxh z*`-baLB%3kwe*P@;{!z6L`qNIPTWl0O~gn!_w;)yM#tB0Atim5nU%0XPs>~AYohw- z=g%4F0HK@e`Vr^l**~Wr{A*Aw3ddRaCc1Dm3YXnyU`QgdbW2&0>*3hkxK*0owJ$0{ z@?=UwrDol+!qVdW3>#}(X4+Bfc)Iri9>%8ATmQU(f5-mhpfiO=d!@x|ctXXsw&!H$ z?Mk=t(ck%;o#%OOFqWsoe$g=2*5_cnw8d(>MceH)o&WOs!fLnKb9Tp+`|;BEs_%{- zhv)u4ioW;u%(Axk=6Jt#$LD05SZu%h>Fd3}9-jYoax@o*|K{(w(&|;r|L*u&-=5cz z*PF$6@ZNC_bc8tiIW{wKksF`0LeEOtZmU=@oq>RlPCYF06dn-;2@eAc1qa=fCIkE_ z8knn7x?HAGAOKZ_HVjXn%G2TyYZi&m*?>cgp{QFdA+3mK>^zkUbyzCL75cCkj>*{z>XsVjP>OVxb0+rq-C_niJ~nz7PEkQ! zCI1iMkz%~*^HxPm!L1OEO=+%=`a45uk1o~4(i`DSKfJn~Gi*T`zSUnOaImiA`Dgv6=&`|2F*ki0N;u#Y!4F3#De4mIRO72hr{?6=~ zB_XmJJVlCI3BI}dl%YTB7Dd?DtJqL8T+A-1fpdzI!#1D%euY^-HIk#vSbaQUr#fCp zOCe3lV7R4)G{N-n`P)-}p`jUJsHbaztFyy~Q>x=HN|QseZ{=if9v(_FLP^?!)I#O% zJ@ZVYs>G*tdwLK$qSic3C$=imG;10~Q z9(dQ@ccFZrNbxD>mN8kcFnr-Vya&^#n8u3B29%^L3nydmWQS^rU>_>?2T6yFRv~vcJts7yR-YA!#tW<^L~M) z3H^wy0QVFh>os?oxEw}4j2xAtjgQ!vK;1->>nI>;g+DV_;*i^`Ln2I|!ih$4qXs`} z!qZo!V006q{wzHf)Lze@H&DLCB7%KNn-H6rRrI#H@Lek(^z?r?_~XF-OJ;U2iF$HX zTcJmYS2)uTI+9J3$J|0I^D~0&C#(B_0(5^^s03;K0t3ACNVfU#7gtv(IAfBi$Awvg zzBJIGT#aIc8l)(d7o|`A=|w6R4zjk^=Czx-o0fZK;ov40u@2L4>t!MReCozo&xj~r*V;s6K?jaGQ~`|ygDVpg+}%L4eDQ3?OfJn(#T527WF1A zOcm@VcvO)6m2V25!5&^nuh5_fKN#y2Ai?1Spe>3==+s?E*O>5mDZ>0#f&KTISw=j? z2`)t)P5g5_{<_x*{QQW?Nz^)R*Zw@75`(eH%N>Lp5)2aHADuP5XI2-NJA{MmGvzx_ z_#PqRkb6)@&d4m~dsY#-!kLn|*dO8&V!U5CB3WW)r=`y<^T<~GH{^$Rh)FqprQvv^ z#<-)VKp&v(a;M~F8@Zc~4$M#XWTe{q2363=Smg{UaZ_-wK5w($hFquUokdA$(O{VO zAlC#GvX8)UpTOc__F)bpyh71ip<~{V)nUtF(U9;$(cXMReE0^rgJG?R_(I?VF&Lq9 z3x$^)G7!8AHrQB72tvEXwZarw54f-3rM8Q!9Nv*V6Mr+s3*+%hYMaGI1^n?_Kpq(h zSqWuET~NL4)dEOIwDd|K=6M*OrN04<5S0*LiI)8p#9GzCH0&452^`cE7twQHa6H#WWQE}Haqr_zk&z4?MrH^V1%a#ushjhXHRmL02^P>E{d0enYzAJA zkDJxQJ|^{yGgiMzoC7a#`8V;2YGxNbxGZ#pH66#VXo*`wn@1B>&lfGrYUUCL&Y@Hf z^}IzG)bK_%;gMC(#;LNKH^-||o68VXBaPR)&aFKvXk?+-H&psvQWvelkN*_FHJmS~ z$N2uU_Y(f#4su{p$^|9iz-a&4O!Ux&^knj=sXyHrjQH>5+KsW(;G`WiVL_nI2!jVE zggYk0KbkP$IzTw!8HkiGbi$$XVfO|9)^&VrT8w%aN36V~^9(V?5+&8uUPLjqWUMji zE0fQ`HL{f9rMI*~3Cg1Q{uKohg3O4hXQ4Su)Dhj3R_^xfK_nfUYL3;qg_;X=N1R)+ZXXtm_Xh%bRlWFE^I!n*l1;)T#mb3AEf7SxOjKPiCINxNj>vkVcWv4c zYeBIch}(}~15F-#3d%>E3cmx)gp-4zXSjk<5M@;UBsZF2c^>mJQ+Y}I&=Kzb#6-o5 z`KZozBXYt@$@&%HR2KxE53aCO71|ghwPkOCF!!4YanVGyS{mBmy14ci{g)KNPNA5P*Hi=QV{CY#xtki zvLwtcBVoZZ8Rg>y9+oi>+agLcng*DToOd-+3BZT5lDd>cV z2qGdy;vwmnjvB-}h#IQIj4*R02vUs;j4%()bE9rpCW_umb!YM^DUDX0i%}QXJO$jn z;*k}uH27MnrWUz`6I@i2sOTYEZZT0+qPVhh7TiqpONn~qH{7?IcFl4|GHN5~x1bgy z$dWIzOkB}gbWM9}D0URoJ1mFVF0!bPHCLF`9@};?tZ=YtGN)OIPS`UX)$btL)Zj9y zL#J5>_yT6Zct%E(FO~-86uVp;IwD{HGU`kc8%l+Sq@9Ic?{(9$T|AC`5WV&D@plDd zUq{b`>B)RRzoOQWZ7Efa=XPIE%ILT1qGPw`S}9~mt6Rt5LsQC3UztEsJD^T{wco&0 zW@M!IADKqH>3ZvHA@;PQc4PuY@Qx!#^1isuZkNyNS#xS7nAX+K%+^BIW;z&Sslu5g z2h0(Vo!zQu%a%fZZhTEcD<{%P!KnaRug9N$u3FeqI zYn}8=X=aw}72Jmd1Jc^VJP0t|Bj^&L1A3d;;ah{f7*T?R@Bh(;pRz~EhH9D85BgqK z_vwRlwGI2X4r;bu<{EBd2eft22a-^Kr&gbk2JJJd8!Zs=AP;Cn5Vwki-6K;UD+>wW zhtq~=k)PA#A=Opc_2j`mi2S|28r%f)K>$X5b{~DqKBM13o`d(S@_(?4*s9zFamgx1 zCB+XBT_Z>eAL5>TgRww>p)Cu9EZoO+2rJn;m3@u~iU%8Dz&Wz79esnA{3VE~+z-Ac z2OfvWqRKkWz3~PYw1;zNxAZUv=XtXTJUd7}l-IGhebEiH9|v7%$J@ED?I3=7rMZ+n z`!ibg45oce5rV$&+%9P-5#2m{B5y)@U8U87-;dxFK2c$_wC+TAT<+fGod!h*In*;W zfPdo%&&1jdm5&&aBs+>!$xK&M3?C6oFp~LkRF1T5Zs*+YJ`sb}v)msK5f8mJGW(2g zyR`fB6cL*T5`HjpD2zHeEb~3yOS~s<@YLy*;Ny3j?iN)~t^Hnwtv{AJ)aE|90b$^n zn*(z>5zH8p1H6XZ4x!k%hNb}dJm-CvJShw0Y^3lFvw~BkYuS||ty=CS-Cv*Lx9+Rl zuJoNwJ)V(4q9q?#o&f4i2(dlG_pE@~@Q7zb-}gGet1Ovs-|VFkse6Z!`o!B`;d@A*n1$Z?++cY^S#d;gNOz%f`A|(-MAbvn z*z0r*LOjH7vlHoM&E>)h+KS7H)V1DH^UpA`*UeNsfaDd|I_$Po=c-GqE8x)wy#G{~ zhvpiU3clHO&y?-elU8&K?wOe}?~+KI2~)^MWKx>EQ351AG^H!iia^e8`%45aFPV?% zM8T9e`hcr+9@vQjH!@eNERk(*`&_3Z(?{GFktHA(fdCX{d;l_E+74P~z)Qb+4{-;L z50wv{4m$Pv5cR%Z;=atI?7ZU{idim4m70WwGr)U+bP=YZXvT@zEqJiEyZp`P7Th0q}CLF@f8gMXn zw*>CM&0EVRz`ftgE~A6WG=#EG`z*roi{*ffgYHa%f>SIPX+$4|haOHLS)l3~;a$Rv zptT=Q6EqL7RMW?SC)}d1lP1D6&JRYv<1wm-as;W{hszN92{yCOrvnWiCuxsX6+qJf z)Du8%uXeApT#>lk|3~HwY152g9aed3!Y4PK1VIoL)HQ)gKk$|310!QUtSmHfV6p52 z@s(V^$xV>eUwI#{4fZMk96*#-ONF>g-Y7tz)JK&wa?v6%hGG~`H&n$0>$st_wf)8j zj{;d(Pxh1eL^LXLJdKcT&lL9<3Xym$*z=ui$^bB6^FXoz+^GBs+6d?Otoj6+cIJ(= z;%O>a{iJgm`l1)^3LMi0))UmJ52La50Z}%}J{oIpyIY1O8(S9Q`8Ln?%9t&_s`FR3 zsUUApCbD&9b!2v9JWM~J{AV<;#nm{^#U5oC;G^h@-L zTUZ^dU#z}_EG#q$lHM1`_5Oy!mx!blNZT?cDI{1!^jGn;r}A*CCcw!mqnQCQ?^Ys%%6;;P0$euZr{#6%n}&G%B!lbAeaTLNzWtU zhzvisfo2#6R7o?KDrtG)D905W?P(c3KW=Y(!sGqj={e~E3DyUw?9yR8sW8#E2nq*TJjDuN*vQ30`y$buLBmg_jsvA!-WZ;9K`Y8sInDOacrW* zuPmiyXISBW&V=J+@ldr-si*L4o=VcqnlbMzy9$p5B5FnV%Z1oVZD4=?UZCC=OW1+` zWRcLq5Hb)l98gqDY;+vnT2eR|TcX@pNJ`gKoA^D5G?*EZ8_i#u=GgENZYC)=1~(lA z3!S;sT0?({oqUxG@NNAn{M7MXF8t(vpM8V*{8SJ!gV;8KhtfhuK`r(#5b-ui+FM=K zckp9n%MaYKVkh0`H))V)`MC*~IkCduI_Nouhm4z>9q&nzYBNRJp1)Gpb{`aYMr?sq zIHRWI{3Uq$5#q?7#hlDqq=)k!xB76DI<;qdJL{a?A0|G!0$xPJ!5}2WD3lk)k|Y?Y zQXLc92s?(y!4T>TZbV)Hvy?GJ$U@CbLc&7A!9hX7vO`wJ`iZh$B~3>1N98KB`>+r4 zX$;trFa1|TL<(Kj?kKXVr_=g^^`UgfTpNf&KC@1FW{SpPg~!y&NzC|QD?MJLnRHw) zBT;`v?WL=?xv=`+twHBuNRvA{7rHKgm{$ghu_RVDU&H%x{c39-MC5$n{As`5DG2#x zG9v=2G&ziM1=F?>e<_2QN~z(rqf;7^HHrmL8v>r=+ zS^?)*cO<7nj0bXb*~_i;HR|&ZP`I`jEXL~r)kc!1hn*5X5ySjRoO3Te*GVqdbbqaE zu~wZ)w4jH$#G>~IN{I?H{jN)Wdnpf4{HPLSe?g4q0HA-w_J5k+l!pr+gp%x}wQ)HcR+#8r^!2=y4}OG9?w2|-dS<89(;VpaZW4NagL7Ae7jVMns3?8hz;1vLhbflUA!R9eI~Y!VA2 zC4qS-tTAYYcCBp;>t0gBjLw*vV>qwl5;D(2I?oamVgPxUCL@Bq2)v%+uaTY)(l8~+ za|Plq06%6DlG8p&@2I|LEj9Q){(JTYrWsJpV|Si7bBL|`Z1R!VO1pMDRejfzBoftX zqAELmLqqPKTSII#+_56n_~mZ2<3jk-Eex^i^U|Ek_(TgVzd5&abG^Yg=l*3tf6Z*7 zHOzc~xow)SD_LP!5s3u5EP2yWCoVw_c{8tYwnxFw-yBkQNIW3(TBthUvl*``EKK}f zT{%6pB{=aIq@gC3ApYlKU!d`z`VU3iavh@C&p(6eJi!CVRk%hwHEW1`Z!8Chj^Kee zTkyu(h7no8zSd?#AmsG){QR~}f3yB(aV8!8zzZKd-u!?S^lu>uW>Fi6qlSJ zLd&PrIIi0BH*1NLe}rTk&1|%u(qjt!=^({s%*W|@1;W^h?_;!C^K34B2H4eBbU!TU zFR-3Y`^zl(2w2-p7RRs2;d7flu7NOn>vBJ2+gf~J6g4Hhx)W(iK^)ls<(dAg5S~3s zK6Xh~A=SdPTwBw+#k|hgZp^QaNU6S~bpnOUZjft>IRHd+zWm(-qh-y$% zyiWG}t~!mhqRhgsRT)|qRcX&g9(*(ss|ky(d_aixP9LbHOGt>{NKw4qT|+6>U_D#`{=7*cWU56H{Q%gI!G zyNQp$pW}eP)1;rR-2*e>qKd~b{_?Q=ICJ}v_}B*%z2B>D;ZPk{DfuR|%_RnhYo&1F z+ez$BK0WpnFMx(V`B|cv-Tv**) z)WH(Guu8lWQ*nuO0eogjc1dPQ?lJo`!!(B$w-&n=|E6t=ev5spwX19Z(IkNVWUsw{>{*|M5k5lhR~~%o7@KNe7jO}xN4wI zRGd|Rfv#pQWJ*?n!NA?%XowvAE}{>~Z(IaCWF2%Ls^3viz1&$L{9@#X>mW3t>0#OO z$efZC2*^9FYZ?eB3Dae~8@pLD;u}qT%idg(uoY@q%F8@`{S~`1l7K^bRp!ywqY`x8@&*eCBV`F04%whf&Osmh43-k(?wl z#0$^*KqUpN@I0gDqti&CLb<}dLOK+ zgIXl}GbHu^wM1@&N|VO+FrS%Y67K>bx?%q5XgmuDgmkfh2sKfnWFX;3e@v)e`AT zdLqGi+G#FIoS6hX7AR*5+NJh}^8iRl=@?qaFhF7uZ>@3%@D=i$|7y6cabwk|mV&?3 zds;vfAQ=q4Smxx>y)vsempNr}Z*oJ2s$K*Kpm>w^_UpZG&?ro1F^MGoRSczA&hqII z*l%$U&xG~SeC+3k(#;0h85p4`9V8g^HP$pjWC1b-?pwH1g$@=uuQzAGiX|DofrPLJ zfPc-4BqOF|U%}Rj4yX_0F&qq^&MVJwI+QT=Rz$AX}_r!t_)cbU5GX@z)gS)e66nCqlc`=@& z-T>dUQ^?JV8KgEOOY>+HojYME2Urq!s0Onvbf<-DVizzlwB+s*+hoPG8gju9d75N? z__?!4pY+7+DF;}PDgdnlo|XK(EBoAmf7Em!M4jLN&@y4fQ(O~BaR|&|K#}AdK{^t^~B4y_u9?f;mY6I3=k`*KJRBf9&Bo{+|~ljrDx~P zfqBy5_MgTMXI{Btd>|T)*Z5H&^*MYUmu(g0Q37+{2W*?c=K(7!infIFvVdok4q=!` z+)J^LU_oGkV?jLEvcHjPB>WHyNz#-AkqXpCQ_G`v>+z_u?|E6t(+Wh#-khSQq5hEy zR=sF@!>#8r)5U*^{h=nvkJTy758+-|z!>zCEJ}jh-f3*JzzOj`D&vxyI(Yfl1&Z zh#soOkqBPaFB1p{8CD3QnB8EP{E#{qKs9gbyL}Ao0Z->0|`CxcKGV*EFdTfOsS#M)S8u)RAdI~l$gx9+E%AYk1l)hWh5rQ z82Km{7Mm^#_=O(nhZD80mVY*w2b0J?<#VNM0DvBSr+};I+JGJtHK@Et$&K~7vJh1V zq_#{6RR@M``bY4pV5WWJ!1-|M^HkJRAm*nzw5?zIXC>-R*-mv&-t{NL^q0YG?z;vC z1F+ZE7lumT6SSP5A*H+<@B(c*H2q_Of`31g-~e`TdCDL(0N$JsP}h4;0bQ0!Sd@78{-uX}HtNn=2_OBZ zD`b`2L1*#>XXYFQf4u90I6c*yw-kpn-hi?k{IPO$q=;5r91{z4^R)|2+Yi#T5@k4df$ z6Jz%?C|z&J-FTL7*4+n3Z{p2{&a0*8wDGBm?RB*=($pxr6ZsS3Ivb_(> zk|1q?M0|oCMhaHEsT{;gXIoGZ>+%{uY`c&S;2A$AX{Rt96uG`Q<0~6UV!s!wz zuKB?v7yYk|h?a!OvjdtqUM0}RG!QFeyq3>lR$wY+-mv)d!L%~JJ0fO>t9FW}_KvBm z_PjIQJZbTR6-tzFSNn^rigRM8St|B4+h44zg2&YysLpq%OcS4fb~iiR++@~ur=)5< zuc=I2$o^`Hi6x!AtgJmYWT!+c?@O>suRO}IlO}JCfGOv3307R3qMi0~=9D~Z)5U84 zRq5_qLKzu@Rh()Y0=?r~!&E7yL1~#-?Y<7ptdI~*wyM)oDOGQi-GGFiYI|HfspEVX zR(-tj;UfL)64Pl)TU8wr>*^jQnq?YOP%T{V3l?iG5$>5gP4%+X=vKYe3#Rou0zO@u z^;IV(p37@yprt*c`3R8?U<~n;jDw5Rsvq3ogZ`w+np%GHCLJ@Pxuc{b0XTJe5J?HT zyhE8?va?pVn(%R?=5Fs0i^bCE!0DDzJHmcaIj@vIPU{ylJ5H(TB4W#)^LyZ%WV_p) zx~aA!i?t3P?c=f0Vvp@4yNcG})iLdue9OKpUdq}2qgrs`z?Yz9*(Qc#M-8C2oF1v( zt4<}B$>~nes>}f-yzo|DPfx$LUb{fFiWD+IYzjA^X1e+2j`Yl*yZ>Ci#yMN0y;O5| zzDCtCe6c}`^*!G1XZV#qn`)_1gA*hNm0(A594Oj@|Hpf&bydBPv4%eNbLg;3Vf~5K z>zxUyLrSz8q0+_3j#HA^NF{WQfiFT4RPP}T$h?Z$%b4g{~ z#DWp%ka&HM5rYvzZSbyMW&9C(cqgi(8^nryGyd2I*<^s|BYL+Xp$E!4N`lvXwHsz$ zUaGUA{`kB5U~|%QfAukbwu`GtkT^DmitbV?mj3STz#QB*JN9;(*HQKUkNmpJbQj=g z>qOZFVp2Z!t~rbRx1@~=b-N)N{_uLDXS%gi^13=&dgY%BOy1HQ9@E9%`Y8GqO^=2= zrS^O(@5P}welhK$RkW1kV(y1c^;+8sCO+q z$v>4H3yKcMU(U+!9wxeQ#_J3Hwk&X2&6@QsTH`8T7FR5IZ_VAeFYmf91La#L9srz| zBs;t4vv%Y4_M?Nh%O`I-%6j!p>d@C?1Hj-qrY z7GDp2%~=gbZSb2f+>;5w>R@s(mGoR*b%uF*>%i*0BIjYu<00k>i_~%QG1XP28wo#^ z`8}EO9-!*P$tQZ#{I%iO!;QzinKdCJuM_Q6TeYWC-8}&jt+xJlJ7(qW%yXwk6Jy5%Ut~Knp-?AazPB1R z-Von$#s%wjgAV|CvFqyGStKYW>Rw6;^YKV z6Rgu^k=;U4X6G0ucq?Vw`x`$fbBm(8hKG3>5OY&!VbN4BrX#kIKP^2<9o-+%$ro?d zhnPX{RJiy(OYSJ*0m$aO316MeP1oFhH0STS>BFljJ}NwqQM~3p-Hs)N1C-XuWGbA+ z^UlrAGU}wTm6lbtmQF0~XBNe;XAvyrbMt?mar2$n(pc(iI?|WhiWQpIYJF{NMESjp zN+iIRmlr@+t18G^S^EFFu+2ED!c|sUJFL_RVx36YK)4KetV)+J{B@bGkV%Hd(vGnx z|51!XXH`{h=_IN0y&Fz5wNs*^%z=fADx)s2b6jOVC8GkT8L4GD^~^+)E~~g_UgK|- zUB|X_w9UAJ7A2?M`*8%-c=6ApisXz8%WTES%5vyV?)sLRga;F^W{{JexIMK3I7g>V?Acb@EZRj3w{dV*{9n~r4n>-OH%g`&s^VJO zmOJc2S7t3gn8kCMr&!h`!r0Ms7g?}`Dj$k$E_YDLZz<>kM3`4pIqZ?vx9Ci0#yR29 zl@6{gQZSmvADo6zp(o|qA(aSB2AM3EhL5lVT9!O=hou~)Q-zbk>CB|AI+QoFMm|KNGB`L7Is4LR$J|MkMc-;l zWK(w~vc20TB{A~4?(}cy)lgXO{8o@~VTN-Xj3+I{{G}3fVDBgjZ?;B|q$BR~bPOg& zG90_83Sh^4;g2TX$(?s)A1mcdKka_p*24_+sEkrs4TJE z@A9gS_{|j4pvt5Qm*uv#06QeQRmD|~QCbMLSbZIh)d0ptp2Cfz8iTt%^@es!hBgIX zCzTH`2{^Z5AvM{dOGb<1qv86f)wv6&J97p#RSs;+WKUdn(vh>@sg1v6 z?Rz!(5{2cs7%i1`WFvVkMVj51YRoz=3|kmN3m&F+@lqZR3(Xmj(|GCy z&d5n&CuaUqAKl|G(GznM3zug8j!MS-cr!IUWG1H;o518Ca22Rg&|O|q=I7w;E-ogm z9yxiA=*Zad9H2yHad z(~Zj3Syooc)>%|OFS9RMZ7bzg&k;9F{?ZRcGi6=j^6iD8E~a+*v*3nVaah|A_eoxV za%Z{VWiT}8#g@{0tOl(6CgP)Eq(B$sK$W92n+3;Isc%$B^@RxKHgfD)zF= zfbImf@aAU7tiD&BCI+E))Raz=`93|J|--0wzv||F%h4JATq)kOAS>(`$rAN#vdwQ7Nz?Bor|HYn}~%U;IdUUP^Ek z&HGcbX%r0Vnq8V_$UFt-YU)M?(0C`lav(65Ve_cHn_IBmU}rtiq#!mbT0!rb7I^{% z`yxhNtu=dC1E@Hkf%8wFfN2SMb}yvy%+=HA?!n{pEfE{MnGd~KHUX*%Xcm|HMX7V0 zO*}$EYprv2cKr>}6Z#uY&Y&!gL-VCPd#7rl-3{&trs&duTP!)!xijAft9hI3R~z18 zSH_$YcTL3XG&qsYEHhuy4Z!!qu7A@dnCjK)$dyW9kXojIYxjwH`5a0=$@%I z*{&UV$%yfQ#s6bJCq4UB^ZaM-wboVb-wUUIY1ne4vi8`cU&rzaHk;P(uV9=Zj}g#@ zpz{J?+wMcaXI1zoWd)by!gqq3SO9m0W8IMN2RePjEwiY&M8VhsB^qjjeDiNP2<-hH zenyb*&G)KY%`TROCM%R28_n9g4zuOo4eh-{)qvT#(T^9as76?4IrUZN&)( zSUvb+nywuXEog3MJIMV&Qsb!I-8&4iW-H$dvHd*#AJ%^4C<)+cb(VQ?;Pa_!@xC928|%KNF4|!*XJK~q&VRht zzEw&hKEjDhf+BKnS3h4JFUWIUMV#kv8zmQt!$o)x^|2g!KPO zD%QG*OYUVhaQ*q z?Oz^LTzN9%m|aS*KudaOJMT?dyBDwH?JnPOz)VOO=?LY_6GL!Pq2`$<;G;~8fJaTx v=(G|BCN;O~#ykv6lNrS^Pvby4GzEAZMPgA&MNw)Rmyw|vm#V6(zZ(|-LGCl* literal 0 HcmV?d00001 From 98c658822cf6782ca0907ab7a68691922e701aa6 Mon Sep 17 00:00:00 2001 From: crawforc3 Date: Fri, 22 Feb 2019 13:06:39 -0800 Subject: [PATCH 4/7] Add unittest for pytesseract --- tests/test_pytesseract.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 tests/test_pytesseract.py diff --git a/tests/test_pytesseract.py b/tests/test_pytesseract.py new file mode 100644 index 00000000..ef6bba09 --- /dev/null +++ b/tests/test_pytesseract.py @@ -0,0 +1,14 @@ +import unittest +import io +import pytesseract +import numpy as np +from wand.image import Image as wandimage + +class TestPytesseract(unittest.TestCase): + def test_tesseract(self): + # Open pdf with Wand + with wandimage(filename='/input/tests/data/test.pdf') as wand_image: + img_buffer = np.asarray(bytearray(wand_image.make_blob(format='png')), dtype='uint8') + bytesio = io.BytesIO(img_buffer) + test_string = pytesseract.image_to_string(PILImage.open(bytesio)) + self.assertTrue(type(test_string) == str) From cefa1c25a2377b39d9ec4522b9268d6bed2c6c73 Mon Sep 17 00:00:00 2001 From: crawforc3 Date: Fri, 22 Feb 2019 13:08:33 -0800 Subject: [PATCH 5/7] Add wand, pdf2image, PyPDF, pyocr --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 0938e285..203d47fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -481,9 +481,13 @@ RUN pip install flashtext && \ pip install pykalman && \ /tmp/clean-layer.sh -# Tesseract and pytesseract +# Tesseract and some associated wrappers RUN apt-get install tesseract-ocr -y && \ pip install pytesseract && \ + pip install wand && \ + pip install pdf2image && \ + pip install PyPDF && \ + pip install pyocr && \ /tmp/clean-layer.sh ENV TESSERACT_PATH=/usr/bin/tesseract From cd4fcc4560e66dc31ed8c6b57c8ae72e951a7a29 Mon Sep 17 00:00:00 2001 From: crawforc3 Date: Fri, 22 Feb 2019 13:10:15 -0800 Subject: [PATCH 6/7] Change tesseract comment --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 203d47fc..00e1a451 100644 --- a/Dockerfile +++ b/Dockerfile @@ -481,7 +481,7 @@ RUN pip install flashtext && \ pip install pykalman && \ /tmp/clean-layer.sh -# Tesseract and some associated wrappers +# Tesseract and some associated utility packages RUN apt-get install tesseract-ocr -y && \ pip install pytesseract && \ pip install wand && \ From e4024b9224b5d5a472ac33dca9e9931921330fd1 Mon Sep 17 00:00:00 2001 From: Vincent Roseberry Date: Thu, 7 Mar 2019 21:51:14 +0000 Subject: [PATCH 7/7] Fix tesseract unit test --- tests/test_pytesseract.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_pytesseract.py b/tests/test_pytesseract.py index ef6bba09..353b8adb 100644 --- a/tests/test_pytesseract.py +++ b/tests/test_pytesseract.py @@ -2,6 +2,7 @@ import io import pytesseract import numpy as np +from PIL import Image from wand.image import Image as wandimage class TestPytesseract(unittest.TestCase): @@ -10,5 +11,5 @@ def test_tesseract(self): with wandimage(filename='/input/tests/data/test.pdf') as wand_image: img_buffer = np.asarray(bytearray(wand_image.make_blob(format='png')), dtype='uint8') bytesio = io.BytesIO(img_buffer) - test_string = pytesseract.image_to_string(PILImage.open(bytesio)) + test_string = pytesseract.image_to_string(Image.open(bytesio)) self.assertTrue(type(test_string) == str)