In [1]:
import pdfplumber
import csv
from tkinter import *
from tkinter import filedialog
from tkinter import messagebox
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage

In [2]:
# PDFMiner is used here because it extracts the PDF text column by column, which makes it possible to locate the Invoice Number and Invoice Date correctly.
def get_pdf_file_content(path_to_pdf):
    """Return the text of every page of a PDF, extracted with PDFMiner.

    Args:
        path_to_pdf: Path of the PDF file to read.

    Returns:
        str: The text produced by PDFMiner's ``TextConverter`` for all pages,
        concatenated in page order.

    Raises:
        OSError: If the file cannot be opened.
        Whatever pdfminer raises for unreadable or non-extractable documents.
    """
    resource_manager = PDFResourceManager(caching=True)
    # The context managers guarantee the buffer and the file handle are
    # released even when a page fails to parse.
    with StringIO() as out_text, open(path_to_pdf, 'rb') as fp:
        text_converter = TextConverter(resource_manager, out_text, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, text_converter)
            for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the `with` block.
            return out_text.getvalue()
        finally:
            text_converter.close()

In [3]:
# PDFPlumber is used here to extract the rest of the data from the PDF; it extracts the text line by line.
def extract(path_pdf):
    """Return the text of the first page of a PDF, extracted with pdfplumber.

    Only the first page is read; later pages are ignored.

    Args:
        path_pdf: Path of the PDF file to read.

    Returns:
        str: The first page's text, or an empty string when the document has
        no pages or the page yields no text.
    """
    with pdfplumber.open(path_pdf) as pdf:
        if not pdf.pages:
            return ""
        # extract_text() may yield None for a page without a text layer
        # (behaviour depends on the pdfplumber version); normalise to "" so
        # callers can always call .split() on the result.
        return pdf.pages[0].extract_text() or ""

In [4]:
def findInvoiceNum(text):
    """Return the invoice number found two lines below the "Invoice Number" label.

    Args:
        text (str): Newline-separated text of the invoice.

    Returns:
        str | None: The line located two lines after the last line containing
        "Invoice Number", or None when the label is absent or that line does
        not exist.
    """
    lines = text.split("\n")
    label_idx = None
    for idx, row in enumerate(lines):
        if "Invoice Number" in row:
            # No break: when the label appears several times the last one wins.
            label_idx = idx

    if label_idx is None:
        return None

    value_idx = label_idx + 2
    if value_idx >= len(lines):
        return None
    return lines[value_idx]

In [5]:
def findInvoiceDate(text):
    """Return the invoice date found below the "Invoice Date" label.

    The date is expected two lines after the last line containing
    "Invoice Date"; when that line is empty (or missing) the line directly
    after the label is used instead.

    Args:
        text (str): Newline-separated text of the invoice.

    Returns:
        str | None: The selected line, or None when the label is absent or no
        line follows it.
    """
    lines = text.split("\n")
    label_idx = None
    for idx, row in enumerate(lines):
        if "Invoice Date" in row:
            # No break: when the label appears several times the last one wins.
            label_idx = idx

    if label_idx is None:
        return None

    value_idx = label_idx + 2
    if value_idx < len(lines) and lines[value_idx] != "":
        return lines[value_idx]

    # Fall back to the line immediately after the label.
    fallback_idx = value_idx - 1
    if fallback_idx < len(lines):
        return lines[fallback_idx]
    return None

In [6]:
def findTotal(text):
    """Return the last whitespace-separated token of the first 'Subtotal' line.

    Args:
        text (str): Newline-separated text of the invoice.

    Returns:
        str | None: The final token of the first line that starts with
        'Subtotal', or None when no such line exists.
    """
    subtotal_tokens = (
        line.split()[-1]
        for line in text.split('\n')
        if line.startswith('Subtotal')
    )
    return next(subtotal_tokens, None)

In [7]:
def findTax(text):
    """Return the last whitespace-separated token of the first 'TOTAL TAX' line.

    Args:
        text (str): Newline-separated text of the invoice.

    Returns:
        str | None: The final token of the first line that starts with
        'TOTAL TAX', or None when no such line exists.
    """
    tax_tokens = (
        line.split()[-1]
        for line in text.split('\n')
        if line.startswith('TOTAL TAX')
    )
    return next(tax_tokens, None)

        


In [8]:
def findTotalAfterTax(text):
    """Return the last token of the first line that carries the grand total.

    A line qualifies when it contains either "Including VAT" or "TOTALSAR".

    Args:
        text (str): Newline-separated text of the invoice.

    Returns:
        str | None: The final whitespace-separated token of the first
        qualifying line, or None when no line qualifies.
    """
    markers = ("Including VAT", "TOTALSAR")
    for line in text.split('\n'):
        if any(marker in line for marker in markers):
            return line.split()[-1]
    return None
                


In [9]:
# This function runs all the necessary processing steps.
def pdfProcessing():
    """Ask for PDF invoices, extract their key fields and save them to a CSV.

    Opens a file-selection dialog restricted to ``*.pdf``. For every selected
    file the invoice number and date are taken from the PDFMiner text, and the
    subtotal, tax and total-after-tax from the pdfplumber text. One row per
    invoice is written to "Invoices Data.csv" in the current working
    directory (the file is overwritten), then a completion dialog is shown.

    If the dialog is cancelled or no file is chosen, nothing is written and no
    dialog is shown.
    """
    #variables
    fields = ["Invoice Number", "Invoice Date", "Subtotal Amount", "Tax Amount", "Total After Tax"]
    filename = "Invoices Data.csv"
    files = list(filedialog.askopenfilenames(filetypes=[('PDFS','*.pdf')]))

    # `files` is always a list, so test for emptiness: a cancelled dialog must
    # not overwrite an existing CSV with a header-only file.
    if not files:
        return

    InvList = []
    #loop through set of files uploaded
    for file in files:
        #processPDF
        invTxt = get_pdf_file_content(file)
        text = extract(file)
        #extract info
        InvoiceNum = findInvoiceNum(invTxt)
        invoiceDate = findInvoiceDate(invTxt)
        total = findTotal(text)
        tax = findTax(text)
        TotalAfterTax = findTotalAfterTax(text)
        #collect one row per invoice
        InvList.append([InvoiceNum, invoiceDate, total, tax, TotalAfterTax])

    #creating and saving into a csv file
    with open(filename, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        # write the header
        writer.writerow(fields)
        # write multiple rows
        writer.writerows(InvList)
    messagebox.showinfo('Processing Complete', 'Invoice Extraction Complete!')


In [10]:
def main():
    """Build the "Invoice Recognition System" window and run its event loop.

    The window shows a welcome banner and a single "Upload Invoices" button
    that triggers ``pdfProcessing``. The call blocks until the window is
    closed.

    The window is held in a local variable rather than a global named
    ``main``, so this function is not overwritten by the Tk instance and can
    be called again in the same kernel. ``labelFrame`` is still published as a
    module-level global.
    """
    global labelFrame
    #creating a window
    root = Tk()
    root.title("Invoice Recognition System")
    root.minsize(width=400,height=400)
    root.geometry("800x500")

    #background colors
    Canvas1 = Canvas(root)
    Canvas1.config(bg="#4983C4")
    Canvas1.pack(expand=True,fill=BOTH)

    #frames and labels
    headingFrame1 = Frame(root,bg="#123A5F",bd=5)
    headingFrame1.place(relx=0.2,rely=0.09,relwidth=0.6,relheight=0.20)

    headingLabel = Label(headingFrame1, text="Welcome! \n Please upload your invoices for information detection! \n Please note only PDFs allowed!", bg='white',
                         fg='black', font=('Times',20))
    headingLabel.place(relx=0,rely=0, relwidth=1, relheight=1)

    labelFrame = Frame(root, bg='#4983C4')
    labelFrame.place(relx=0.1, rely=0.3, relwidth=0.8, relheight=0.5)

    #buttons
    btn1 = Button(labelFrame,text="Upload Invoices",bg='black', fg='gray', command=pdfProcessing)
    btn1.place(relx=0.5, rely=0.5, anchor=CENTER, relwidth =0.5, relheight = 0.2)

    # Run the event loop of this specific window until it is closed.
    root.mainloop()

main()
