## Setup + Imports

In [6]:
import sys
import os
import re
import subprocess
import tempfile
from PIL import Image


def parse_captcha(filename):
    """Return the text for thie image using Tesseract
    """
    img = threshold(filename)
    return tesseract(img)


def threshold(filename, limit=100):
    """Make text more clear by thresholding all pixels above / below this limit to white / black
    """
    # read in colour channels
    img = Image.open(filename)
    # resize to make more clearer
    m = 1.5
    img = img.resize((int(img.size[0]*m), int(img.size[1]*m))).convert('RGBA')
    pixdata = img.load()

    for y in xrange(img.size[1]):
        for x in xrange(img.size[0]):
            if pixdata[x, y][0] < limit:
                # make dark color black
                pixdata[x, y] = (0, 0, 0, 255)
            else:
                # make light color white
                pixdata[x, y] = (255, 255, 255, 255)
    img.save('tmp/threshold_' + filename)
    return img.convert('L') # convert image to single channel greyscale



def call_command(*args):
    """call given command arguments, raise exception if error, and return output
    """
    c = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = c.communicate()
    if c.returncode != 0:
        if error:
            print error
        print "Error running `%s'" % ' '.join(args)
    return output


def tesseract(image):
    """Decode image with Tesseract  
    """
    # create temporary file for tiff image required as input to tesseract
    input_file = tempfile.NamedTemporaryFile(suffix='.tif')
    image.save(input_file.name)

    # perform OCR
    output_filename = input_file.name.replace('.tif', '.txt')
    call_command('tesseract', input_file.name, output_filename.replace('.txt', ''))
    
    # read in result from output file
    result = open(output_filename).read()
    os.remove(output_filename)
    return clean(result)


def gocr(image):
    """Decode image with gocr
    """
    input_file = tempfile.NamedTemporaryFile(suffix='.ppm')
    image.save(input_file.name)
    result = call_command('gocr', '-i', input_file.name)
    return clean(result)
     

def ocrad(image):
    """Decode image with ocrad
    """
    input_file = tempfile.NamedTemporaryFile(suffix='.ppm')
    image.save(input_file.name)
    result = call_command('ocrad', input_file.name)
    return clean(result)


def clean(s):
    """Standardize the OCR output
    """
    # remove non-alpha numeric text
    return re.sub('[\W]', '', s)




img = threshold('../capcha.jpg')
print filename
print 'Tesseract:', tesseract(img)
print 'Gocr:', gocr(img)
print 'Ocrad:', ocrad(img)



SyntaxError: Missing parentheses in call to 'print'. Did you mean print(error)? (<ipython-input-6-81eda0658e29>, line 46)

In [44]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import os
from tika import parser
import codecs
import re
import sys
import sys
# download pdf's urls
from datetime import datetime
import time
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
YEAR_LIST = [2018,2019]

In [45]:
#get list of url from maxim's github
import urllib, json
url = f"https://raw.githubusercontent.com/GraphtyLove/KPMG-Challenge/master/assets/link_each_entreprise_{YEAR}.json"
response = urlopen(url)
data = json.loads(response.read())
data = list(set(data))

## Download PDF

In [46]:

for YEAR in YEAR_LIST:
    for i, link in enumerate(data[:5]):
        try:
            time.sleep(0.1)
            reg_url = link
            req = Request(url=reg_url, headers=headers) 
            html = urlopen(req).read()
            soup = BeautifulSoup(html,'lxml')
            hrefs = soup.find_all('a',attrs={"target":"_blank"})
            pdf_url = hrefs[3].attrs['href']

            monitor_url = 'mopdf'
            if (monitor_url in pdf_url):
                print(f'ERROR, suspected monitor from url {i} {pdf_url}')
            else:
                myfile = requests.get(pdf_url)
                with open(f'../assets/pdf/{YEAR}/pdf-{i}.pdf', 'wb') as file:
                    file.write(myfile.content)

                print(f"PDF n°{i} OK from -> {link}")

        except:
            print(f'ERROR url: {i} {link}')
            print(sys.exc_info())


PDF n°0 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0716751509
ERROR, suspected monitor from url 1 http://www.ejustice.just.fgov.be/mopdf/2019/01/16_1.pdf#Page1647
PDF n°2 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0717552154
PDF n°3 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0717540474
PDF n°4 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0716768731
PDF n°0 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0716751509
ERROR, suspected monitor from url 1 http://www.ejustice.just.fgov.be/mopdf/2019/01/16_1.pdf#Page1647
PDF n°2 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0717552154
PDF n°3 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0717540474
PDF n°4 OK from -> https://www.staatsbladmonitor.be/bedrijfsfiche.html?ondernemingsnummer=0

## Check all the PDF and sort a list of there names

In [47]:
for YEAR in YEAR_LIST:

    file_number = []
    pdf_path = f'../assets/pdf/{YEAR}/'

    for r, d, f in os.walk(pdf_path):
        for file in f:
            if '.pdf' in file:
                file_number.append(re.findall('[0-9]+', file)[0])
    # Array of all PDF number as string            
    file_number.sort(key=int)
    for number in file_number:
        file_path = pdf_path + 'pdf-'+ number + '.pdf'
        try:
            raw = parser.from_file(file_path)
            len_raw = len(raw['content'])
    
            with codecs.open(f'../assets/txt/{YEAR}/txt-{number}.txt', 'w', 'utf-8') as file:
                    file.write(raw['content'])
                    text_len = len(raw['content'])
    
            print(f'file N° {number} -> len: {text_len}')
            if text_len < 1000:
                print(f'suspected scan: {number}')
        except:
            print(f'ERROR with: {number}')
            print(sys.exc_info())
        
        
# * ---------- Find the key of a value ---------- *

#for k,v in enumerate(file_number):
#    if v == '2000':
#        print(f'key: {k} value: {v}')

IndentationError: expected an indented block (<ipython-input-47-6de52ee627bd>, line 13)

## Re-process PDF -> TXT for ERRORS

In [None]:
# ####REWRITE THIS
# for number in error_files:
#     file_path = f'pdf/{number}.pdf'
#     while(True):
#         try:
#             raw = parser.from_file(file_path)
#             len_raw = len(raw['content'])
            
#             with codecs.open(f'txt/txt-{number}.txt', 'w', 'utf-8') as file:
#                     file.write(raw['content'])
#                     text_len = len(raw['content'])
                    
#             #with codecs.open(f'txt/txt-{number}.txt', 'r','utf-8') as file:
#                 #text_len = len(file.read())
#             print(f'file N° {number} -> len: {text_len}')
#             if text_len > 1000:
#                 break
#             else:
#                 error_files.append(number)
#                 break
#         except:
#             print(f'ERROR with: {number}')
#             print(sys.exc_info())
# print(error_files)

# ---------- DEBUG ----------

## PDF -> TXT on a specifique file

In [None]:
# file_to_convert = 23932

# raw = parser.from_file(f'pdf/{file_to_convert}.pdf')
# raw_len = len(raw['content'])
# print(raw_len)

# with open(f'{file_to_convert}.txt', 'w') as file:
#     file.write(raw['content'])

# with open(f'{file_to_convert}.txt', 'r') as file:
#     txt_len = len(file.read())
#     print(txt_len)


## Print the len of a file

In [None]:
# with open('txt/47.txt', 'r') as file:
#      print(len(file.read()))