In [1]:
import glob
import os
import re
import subprocess
import sys
import tkinter as tk
from tkinter.filedialog import askdirectory

import pandas as pd
import tabula
    

# Folders picked by the user; both stay empty strings until a folder is chosen.
input_directory = output_directory = ''

# Root window shared by every page of the application.
window = tk.Tk()
window.title("PDF Reader")
window.geometry("600x400")
window.configure(bg='light gray')
    
# Regular expressions applied to the text cells that tabula extracts.
_TOKEN_RE = re.compile(r'\d+[^ ]*')      # run of non-space characters that starts with digits
_ACCOUNT_RE = re.compile(r'(?<=\s)\d+')  # digits directly preceded by whitespace
# Any number of ",ddd" groups is accepted so totals of a million or more are
# captured in full instead of being cut after the first group.
_TOTAL_RE = re.compile(r'\$(\d+(?:,\d+)*(?:\.\d+)?)')

# Page region handed to tabula.read_pdf as ``area``.
_PDF_AREA = [0, 0, 1000, 1000]


def _first_label(df, mask):
    """Return the index label of the first row selected by the boolean *mask*."""
    return df[mask].index.values[0]


def _cell(df, row_label, column):
    """Return the cell at (*row_label*, *column*); the first one if the label repeats.

    The frame is built with ``pd.concat`` without ``ignore_index``, so row
    labels can occur more than once and ``.loc`` then yields a Series rather
    than a scalar.
    """
    value = df.loc[row_label, column]
    if isinstance(value, pd.Series):
        return value.values[0]
    return value


def _extract_record(df):
    """Build the output row (date, type, invoice, account, total) from *df*.

    *df* is the concatenation of the tables tabula found in one PDF. A frame
    containing the cell 'Invoice No. / Date' is handled as an invoice,
    anything else as a returns credit memo. Lookups are not guarded: a frame
    without the expected columns, marker rows or values raises (for example
    KeyError or IndexError).

    NOTE(review): ``str.contains`` is called with its default regex matching,
    so the '.' in the marker texts matches any character -- confirm that is
    acceptable for these documents.
    """
    marker_hits = df.isin(['Invoice No. / Date']).any()

    if marker_hits.any():
        doc_type = 'Invoice'
        marker_column = df.columns[marker_hits][0]
        date_row = _first_label(df, df[marker_column].str.contains('Invoice No. / Date'))
        invoice_date = _cell(df, date_row, 'Invoice')
        acc_row = _first_label(df, df[marker_column].str.contains('Purchase Order No.'))
        acc_num = _ACCOUNT_RE.findall(_cell(df, acc_row, 'Invoice'))[0]
        # The amount sits on the row after the 'Qty Total Curr.' header row.
        total_row = _first_label(df, df["Invoice"].str.contains('Qty Total Curr.')) + 1
        total = _TOTAL_RE.findall(_cell(df, total_row, 'Invoice'))[0]
    else:
        doc_type = 'Returns Credit Memo'
        date_row = _first_label(df, df['Unnamed: 1'].str.contains('Credit N. / Date'))
        invoice_date = _cell(df, date_row, 'Returns Credit Memo')
        # The account is read from the row after the 'Bill-To Address' row.
        acc_row = _first_label(df, df['Unnamed: 0'].str.contains('Bill-To Address')) + 1
        acc_num = _cell(df, acc_row, 'Unnamed: 0')
        total_row = _first_label(df, df["Unnamed: 0"].str.contains('Credit Note Summary')) + 1
        # Credit memo totals are stored with a leading minus sign.
        total = '-' + _TOTAL_RE.findall(_cell(df, total_row, 'Returns Credit Memo'))[0]

    # First digit-led token is the document number, the second one the date.
    tokens = _TOKEN_RE.findall(invoice_date)
    return {
        'Date': tokens[1],
        'Doc_type': doc_type,
        'Invoice': tokens[0],
        'Account': acc_num,
        'Total': total,
    }


def _save_record(output_dir, record):
    """Append *record* to ``<output_dir>/<Account>.xlsx``, creating the file if needed."""
    new_row = pd.DataFrame(record, index=['0'])
    target = os.path.join(output_dir, f"{record['Account']}.xlsx")
    try:
        existing = pd.read_excel(target)
    except FileNotFoundError:
        new_row.to_excel(target, index=False)
        print('created new file')
    else:
        pd.concat([existing, new_row], ignore_index=True).to_excel(target, index=False)
        print('found existing file')


def _open_folder(path):
    """Show *path* in the platform's file manager without going through a shell."""
    try:
        if sys.platform == 'darwin':
            subprocess.run(['open', path], check=False)
        elif sys.platform.startswith('win'):
            os.startfile(path)
        else:
            subprocess.run(['xdg-open', path], check=False)
    except OSError as exc:
        # Opening the folder is a convenience only; the run itself succeeded.
        print(f'could not open output folder: {exc}')


def main_screen(start_mainloop=True):
    """Build the main page of the PDF Reader inside the shared ``window``.

    All existing widgets are destroyed first, so the function also serves as
    the "back to main page" action of the directions page.

    Args:
        start_mainloop: When true (the default) the Tk event loop is entered
            after the page is built and the call only returns once that loop
            ends. Callbacks that rebuild the page while the loop is already
            running pass ``False`` so that no nested event loop is started.
    """

    def clear_screen():
        # Destroy all widgets in the root window
        for widget in window.winfo_children():
            widget.destroy()

    def show_warning(text):
        # Shared styling for every error message on the main page
        warning_label.config(bg='lightgray', font=('Impact', 20), fg='darkred', text=text)

    def remove_error():
        # removes the warning label
        warning_label.config(bg='lightgray', text='')

    def get_directions():
        # Brings user to the direction page
        clear_screen()
        label = tk.Label(window, font=("Bubblegum Sans", 45), bg='lightgray', fg='Black', text='Directions')
        label.place(x=200, y=10)
        back_label = tk.Label(window, text='Main Page', cursor="hand2", bg='lightgray', fg='blue')
        back_label.place(x=10, y=80)
        # The event loop is already running here, so only rebuild the page.
        back_label.bind("<Button-1>", lambda e: main_screen(start_mainloop=False))

        # (heading, body text, y position of the heading); the body text is
        # placed 5 px lower to line up with the larger heading font.
        steps = (
            ('Step one:', 'Choose to have the pdf reader to open the output directory once the \nrun is finished.', 110),
            ('Step two:', 'Select the input folder you want pdf reader to choose from.', 175),
            ('Step three:', ' Select the output folder you want pdf reader to choose from.', 240),
            ('Step four:', ' If everything is correct, click the run button.', 305),
        )
        for heading, body, y in steps:
            tk.Label(window, text=heading, font=('arial', 20), fg='black', bg='lightgray').place(x=10, y=y)
            tk.Label(window, text=body, font=('arial', 15), fg='black', bg='lightgray').place(x=110, y=y + 5)

    def get_output_directory():
        # displays os window for user to select output folder
        global output_directory
        remove_error()
        chosen = askdirectory()
        if chosen:
            # A cancelled dialog returns an empty value; keep the earlier choice then.
            output_directory = chosen
        output_entry.config(fg='black', bg='lightgray', text=output_directory)

    def get_input_directory():
        # displays os window for user to select input folder
        global input_directory
        remove_error()
        chosen = askdirectory()
        if chosen:
            input_directory = chosen
        input_entry.config(bg='lightgray', fg='black', text=input_directory)

    def on_run_clicked():
        # checks to make sure that input/output folders have been selected and then runs.
        if not input_directory or not output_directory:
            show_warning("ERROR\nYou must select input/output folders before clicking run")
        else:
            read_pdfs()

    def read_pdfs():
        # Reads every pdf in the input folder, scrapes the wanted fields and
        # stores them in an excel file named after the account number: rows
        # are appended if that file already exists, otherwise it is created.
        pattern = os.path.join(glob.escape(input_directory), '*.pdf')
        pdf_files = glob.glob(pattern)

        if not pdf_files:
            running_label.config(text='', fg='black')
            show_warning('Input folder does not have pdf files.\nPlease choose another folder and try again')
            return

        # NOTE(review): the button is not re-enabled after the run; a fresh
        # one is only created when the page is rebuilt. Confirm this is the
        # intended guard against processing the same folder twice.
        run_button.config(state='disabled')
        running_label.config(text='Running.....', fg='black')
        # The loop below blocks the event loop, so draw the status text now.
        window.update_idletasks()

        try:
            for pdf_file in pdf_files:
                print(pdf_file)

                tables = tabula.read_pdf(pdf_file, area=_PDF_AREA, pages='all')
                # NOTE(review): row labels restart for each table because
                # ignore_index is not used; _cell copes with repeated labels.
                df = pd.concat(tables).fillna('').drop_duplicates()

                _save_record(output_directory, _extract_record(df))
        except Exception:
            # Do not leave "Running....." on screen; the re-raise still lets
            # tkinter report the traceback.
            running_label.config(text='', fg='black')
            show_warning(f'Could not process\n{os.path.basename(pdf_file)}')
            raise

        running_label.config(text='DONE', fg='black')
        if var.get() == '1':
            _open_folder(output_directory)

    clear_screen()
    label = tk.Label(window, text="Welcome to PDF Reader!")
    label.config(font=("Bubblegum Sans", 45), bg='lightgray', fg='Black')
    label.place(x=60, y=10)

    input_button = tk.Button(window, text='Select input', cursor="hand2", command=get_input_directory, borderwidth=0, highlightthickness=0, width=8)
    input_button.place(x=10, y=200)

    output_button = tk.Button(window, text='Select output', cursor="hand2", command=get_output_directory, borderwidth=0, highlightthickness=0, width=8)
    output_button.place(x=10, y=240)

    directions_label = tk.Label(window, text='Directions', cursor="hand2", fg='blue', bg="lightgray")
    directions_label.place(x=10, y=80)
    directions_label.bind("<Button-1>", lambda e: get_directions())

    run_button = tk.Button(window, text="Run", command=on_run_clicked, cursor="hand2", borderwidth=0, highlightthickness=0)
    run_button.place(x=10, y=280)

    # Start at '0' rather than '': an empty value equals the Checkbutton's
    # default tristatevalue and would draw the box in its "mixed" state.
    var = tk.StringVar(window, value='0')
    open_when_done_button = tk.Checkbutton(window, variable=var, text=": Open output when done", bg='lightgray', fg='black', activebackground="blue")
    open_when_done_button.place(x=10, y=150)

    warning_label = tk.Label(window, text="")
    warning_label.config(bg='lightgray')
    warning_label.place(x=80, y=310)

    running_label = tk.Label(window, text="")
    running_label.config(bg='lightgray', fg='black')
    running_label.place(x=80, y=280)

    # Show folders chosen earlier again when the page is rebuilt.
    input_entry = tk.Label(window, bg='lightgray', fg='black', text=input_directory)
    input_entry.place(x=150, y=200)

    output_entry = tk.Label(window, bg='lightgray', fg='black', text=output_directory)
    output_entry.place(x=150, y=240)

    if start_mainloop:
        window.mainloop()
    
# Build the main page; main_screen() enters the Tk event loop itself, so this
# call only returns once that loop has ended.
main_screen()  


2023-08-21 10:18:13.794 python[9399:822327] +[CATransaction synchronize] called within transaction
2023-08-21 10:18:13.891 python[9399:822327] +[CATransaction synchronize] called within transaction
2023-08-21 10:18:20.148 python[9399:822327] +[CATransaction synchronize] called within transaction
2023-08-21 10:18:20.209 python[9399:822327] +[CATransaction synchronize] called within transaction


/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915801235_ZFU3.pdf
created new file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915719615_ZFU3.pdf
created new file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915605797_ZFU3.pdf
found existing file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915706115_ZFU3.pdf
created new file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915760507_ZFU3.pdf
created new file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915615395_ZFU3.pdf
found existing file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915726096_ZFU3.pdf
created new file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915676958_ZFU3.pdf
created new file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915725670_ZFU3.pdf
found existing file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915584514_ZFU3.pdf
found existing file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915679251_ZFU3.pdf
found existing file
/Users/joshburch/projects/pdf-reader/pdf/93pdf/6915771654_ZFU