In [None]:
import yaml
from collections import OrderedDict

def ordered_load(
        stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict
):
    """
    Parse a YAML document, building every mapping with ``object_pairs_hook``.

    :param stream: YAML source handed straight to ``yaml.load``.
    :param Loader: PyYAML loader class to derive the private loader from.
    :param object_pairs_hook: callable receiving the list of (key, value)
        pairs of each mapping; its return value replaces the mapping.
    :return: whatever ``yaml.load`` produces for the document.

    NOTE(review): the default ``yaml.Loader`` is PyYAML's full loader, which
    can construct arbitrary Python objects. Only feed it trusted files, or
    pass ``Loader=yaml.SafeLoader``.
    """
    def _pairs_to_mapping(loader, node):
        # Resolve YAML merge keys ("<<") before reading the pairs.
        loader.flatten_mapping(node)
        pairs = loader.construct_pairs(node)
        return object_pairs_hook(pairs)

    class OrderedLoader(Loader):
        # Private subclass: the constructor is registered on it rather than
        # on the Loader class that was passed in.
        pass

    mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG
    OrderedLoader.add_constructor(mapping_tag, _pairs_to_mapping)

    return yaml.load(stream, OrderedLoader)

In [None]:
"""
This module abstracts templates for invoice providers.

Templates are initially read from .yml files and then kept as class.
"""

import yaml
import os
import nbimporter
import re
import dateparser
from unidecode import unidecode
import logging as logger
from collections import OrderedDict

#import utils


# Baseline options; each template's own 'options' section is merged over a
# copy of this mapping.
# Options currently disabled (no default is provided for them):
#   'currency' ('EUR'), 'field_separator' (r'\s+'), 'line_separator' (r'\n')
OPTIONS_DEFAULT = dict(
    # Text normalisation switches applied to the extracted text.
    remove_whitespace=False,
    remove_accents=False,
    lowercase=False,
    # Hints passed to the date parser.
    date_formats=[],
    languages=[],
    # Character separating the integer and fractional part of amounts.
    decimal_separator='.',
    replace=[],
)

def read_templates(folder):
    """
    Load yaml templates from template folder. Return list of templates.

    Walks ``folder`` recursively, parses every ``*.yml`` file (in sorted
    order within each directory) and wraps it in an ``InvoiceTemplate``.

    :param folder: root directory to search for template files.
    :return: list of ``InvoiceTemplate`` objects; empty when nothing is found.
    :raises AssertionError: if a template lacks ``keywords`` or one of the
        required fields ``date``, ``amount``, ``invoice_number``.
    """
    output = []
    required_fields = ['date', 'amount', 'invoice_number']

    for path, subdirs, files in os.walk(folder):
        for name in sorted(files):
            print("Name of Template",name)
            if not name.endswith('.yml'):
                continue

            # Read through a context manager so the file handle is closed
            # even when parsing or validation fails.
            with open(os.path.join(path, name)) as template_file:
                tpl = ordered_load(template_file.read())
            tpl['template_name'] = name

            assert 'keywords' in tpl.keys(), 'Missing keywords field.'
            assert set(required_fields).issubset(tpl['fields'].keys()), \
                'Missing required key in template {} {}. Found {}'.format(name, path, tpl['fields'].keys())

            # A single keyword may be written as a scalar; normalise to list.
            if not isinstance(tpl['keywords'], list):
                tpl['keywords'] = [tpl['keywords']]
            keywd = tpl['keywords']
            print("Keywords",keywd)

            output.append(InvoiceTemplate(tpl))

    return output


class InvoiceTemplate(OrderedDict):
    """
    One invoice template: an ordered mapping holding the parsed template
    data, plus an ``options`` attribute with the defaults merged with the
    template's own ``options`` section.

    Keys read by the methods below: ``keywords``, ``fields``, ``issuer``
    (filled from the first keyword when absent), ``template_name`` and the
    optional ``options``.
    """

    def __init__(self, *args, **kwargs):
        super(InvoiceTemplate, self).__init__(*args, **kwargs)

        # Merge template-specific options with defaults
        self.options = OPTIONS_DEFAULT.copy()
        if 'options' in self:
            self.options.update(self['options'])

        # Validate after the merge so that the template's own language codes
        # are the ones being checked, not just the (empty) default list.
        for lang in self.options['languages']:
            assert len(lang) == 2, 'lang code must have 2 letters'

        # Set issuer, if it doesn't exist.
        if 'issuer' not in self:
            self['issuer'] = self['keywords'][0]

    def prepare_input(self, extracted_str):
        """
        Normalise extracted text according to this template's options.

        Depending on ``options`` this removes runs of spaces, strips accents
        and/or lower-cases the text. Returns the transformed string.
        """
        # Remove whitespace (only space characters, not tabs/newlines)
        if self.options['remove_whitespace']:
            optimized_str = re.sub(' +', '', extracted_str)
        else:
            optimized_str = extracted_str

        # Remove accents
        if self.options['remove_accents']:
            optimized_str = unidecode(optimized_str)

        # convert to lower case
        if self.options['lowercase']:
            optimized_str = optimized_str.lower()

        return optimized_str

    def matches_input(self, optimized_str):
        """Return True if every keyword of the template occurs in the text."""
        if all(keyword in optimized_str for keyword in self['keywords']):
            logger.debug('Matched template %s', self['template_name'])
            return True

        logger.debug('Not matched')
        # Explicit False (was an implicit None) so callers get a real bool.
        return False

    def parse_number(self, value):
        """
        Convert a formatted amount string to ``float``.

        The configured ``decimal_separator`` marks the fractional part; any
        remaining ``.``, ``,`` or whitespace is treated as a thousands
        separator and dropped.

        :raises AssertionError: if the decimal separator occurs more than once.
        :raises ValueError: if the cleaned string is not a valid float.
        """
        assert value.count(self.options['decimal_separator']) < 2,\
            'Decimal separator cannot be present several times'

        # Protect the decimal separator with a placeholder first
        amt_pipe = value.replace(self.options['decimal_separator'], '|')
        # remove all possible thousands separators
        amnt_pipe_no_thousand_sep = re.sub(
            r'[.,\s]', '', amt_pipe)
        # put dot as decimal sep
        return float(amnt_pipe_no_thousand_sep.replace('|', '.'))

    def coerce_type(self, value, target_type):
        """
        Convert ``value`` to ``'int'`` or ``'float'`` via ``parse_number``.

        Blank strings become ``0`` / ``0.0``. Any other ``target_type``
        raises ``AssertionError``.
        """
        if target_type == 'int':
            if not value.strip():
                return 0
            return int(self.parse_number(value))
        elif target_type == 'float':
            if not value.strip():
                return 0.0
            return float(self.parse_number(value))
        assert False, 'Unknown type'

    def extract(self, optimized_str):
        """
        Extract the template's fields from prepared invoice text.

        For each entry of ``self['fields']``:

        * ``static_<name>`` keys are copied verbatim to ``<name>``;
        * other keys are regular expressions (or a list of alternatives,
          tried in order); the first match is kept. Keys starting with
          ``date`` are parsed with ``dateparser``, keys starting with
          ``amount`` with ``parse_number``.

        :return: dict of extracted values plus ``issuer``, ``currency`` and
            ``desc``; ``None`` when a date cannot be parsed, when fewer than
            five values were collected, or when ``invoice_number`` is missing.
        """
        # Try to find data for each field.
        output = {}
        output['issuer'] = self['issuer']

        for k, v in self['fields'].items():
            if k.startswith('static_'):
                logger.debug("field=%s | static value=%s", k, v)
                output[k.replace('static_', '')] = v
                continue

            logger.debug("field=%s | regexp=%s", k, v)

            # Reset per field: an empty pattern list must not reuse the
            # previous field's matches (or hit an unbound name).
            res_find = []
            if isinstance(v, list):
                # NOTE: alternatives are matched without flags, single
                # patterns with re.I | re.DOTALL (kept as in the original).
                for v_option in v:
                    res_find = re.findall(v_option, optimized_str)
                    if res_find:
                        break
            else:
                res_find = re.findall(v, optimized_str, re.I | re.DOTALL)

            if not res_find:
                logger.warning("regexp for field %s not match", k)
                continue

            logger.debug("res_find=%s", res_find)
            if k.startswith('date'):
                raw_date = res_find[0]
                output[k] = dateparser.parse(
                    raw_date, date_formats=self.options['date_formats'],
                    languages=self.options['languages'])
                logger.debug("result of date parsing=%s", output[k])
                if not output[k]:
                    logger.error(
                        "Date parsing failed on date '%s'", raw_date)
                    return None
            elif k.startswith('amount'):
                output[k] = self.parse_number(res_find[0])
            else:
                output[k] = res_find[0]

        # No default currency is configured, so a template without one in
        # its options yields None here instead of raising KeyError.
        output['currency'] = self.options.get('currency')

        # Five values = issuer + currency + three extracted fields. The
        # invoice number is checked explicitly because the description
        # below needs it and extra fields could satisfy the count alone.
        if len(output) >= 5 and 'invoice_number' in output:
            output['desc'] = 'Invoice %s from %s' % (
                output['invoice_number'], self['issuer'])
            logger.debug(output)
            return output

        logger.error(output)
        return None


In [None]:
# Folder holding the .yml invoice templates; every template found below it
# is loaded once and reused by the processing cells.
template_folder = "C:\\Users\\Vraj Parikh\\Shipments assignment\\mruga_shipments\\templates"
templates = read_templates(template_folder)

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def to_text(path):
    """
    Extract the text of a PDF file with pdfminer.

    :param path: path of the PDF file to read.
    :return: the text pdfminer's ``TextConverter`` wrote for all processed
        pages, as one string.

    The file, the converter device and the text buffer are released even
    when page processing raises.
    """
    print("in text")
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    password = ""
    maxpages = 0        # presumably "no page limit" in pdfminer - confirm
    caching = True
    pagenos = set()     # empty set: no explicit page selection

    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(
                    fp, pagenos, maxpages=maxpages, password=password,
                    caching=caching, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, and before the
        # buffer itself is closed. The result is no longer stored in a
        # local named ``str``, which shadowed the builtin.
        return retstr.getvalue()
    finally:
        retstr.close()

In [None]:
import csv
def invoices_to_csv(data, path):
    """
    Write extracted invoices to a comma-separated file.

    :param data: iterable of dicts with the keys ``date`` (an object with
        ``strftime``), ``invoice_number`` and ``amount``. ``None`` entries
        (failed extractions) are skipped.
    :param path: destination file; it is overwritten.

    The header row is ``date,invoice_number,amount`` and dates are written
    as ``dd/mm/YYYY``.
    """
    # newline='' is required by the csv module; without it every row is
    # followed by an empty line on Windows.
    with open(path, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')

        writer.writerow(['date', 'invoice_number', 'amount'])
        for line in data:
            if line is None:
                continue
            writer.writerow([
                line['date'].strftime('%d/%m/%Y'),
                line['invoice_number'],
                line['amount']])

In [None]:
import os
import pytesseract
from PIL import Image
from os.path import splitext

# Run every file in the invoice folder through text extraction (pdfminer for
# PDFs, Tesseract OCR for anything else) and then through all templates.
directory = 'C:\\Users\\Vraj Parikh\\Shipments assignment\\mruga_shipments\\invoice'
extracted_str = ""
output = []

# Point pytesseract at the Tesseract binary once instead of on every image.
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'

for file in os.listdir(directory):
    f = os.path.join(directory, file)
    # Print the real location of the file; abspath(file) alone would resolve
    # the bare file name against the current working directory.
    print(os.path.abspath(f))
    file_name, extension = splitext(f)

    if extension.lower() == '.pdf':
        # Flatten the PDF text to a single line.
        extracted_str = to_text(f)
        extracted_str = str(extracted_str).split("\n")
        extracted_str = ' '.join(extracted_str)
    else:
        # Every non-PDF file is treated as an image and OCR'd.
        with Image.open(f) as im:
            extracted_str = pytesseract.image_to_string(im)

    # Same matching step for both kinds of input.
    for t in templates:
        optimized_str = t.prepare_input(extracted_str)
        if t.matches_input(optimized_str):
            output.append(t.extract(optimized_str))

    print("Final_Result::",output)

In [None]:
"""
Whether a PDF contains searchable text or only scanned images was determined with the command-line tool pdffonts.
When executed on a PDF, it displays a table of font names if there is text to search; otherwise it displays an empty table,
which means that the PDF contains scanned images.
"""

In [None]:
# Processing PDF with scanned images Converting scanned pdf pages to images

from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io
import os

# OCR every scanned PDF in the folder page by page, merge the recognised text
# into one string and run it through all templates.
directory = 'C:\\Users\\Vraj Parikh\\Shipments assignment\\mruga_shipments\\scanned_pdf images'
req_image = []
final_text = []
result_text = ""
tool = pyocr.get_available_tools()[0]
lang = tool.get_available_languages()[0]

for file in os.listdir(directory):
    f = os.path.join(directory, file)
    # Print the real location of the file, not the bare name resolved
    # against the current working directory.
    print(os.path.abspath(f))
    image_pdf = Image(filename=f, resolution=300)
    image_jpeg = image_pdf.convert('jpeg')

    # Pages of the current file only. OCR-ing the cumulative req_image list
    # here would re-process the pages of all previous files each time.
    page_blobs = []
    for img in image_jpeg.sequence:
        img_page = Image(image=img)
        page_blobs.append(img_page.make_blob('jpeg'))
    req_image.extend(page_blobs)

    for img in page_blobs:
        txt = tool.image_to_string(PI.open(io.BytesIO(img)),lang=lang,builder=pyocr.builders.TextBuilder())
        final_text.append(txt)

# Join the page texts and flatten line breaks to spaces. str(list) would
# produce the list's repr (brackets, quotes, escaped "\n") instead.
final_text = ' '.join(final_text).replace("\n", ' ')
output_scanned_pdf = []
for t in templates:
    opt_str = t.prepare_input(final_text)

    if t.matches_input(opt_str):
        output_scanned_pdf.append(t.extract(opt_str))
# Report the results of this cell, not the list filled by the previous cell.
print("Final_Result::",output_scanned_pdf)