### Extract claim value for each vendor

In [None]:
import os
import time
import camelot
import pandas as pd
import numpy as np
import re
import PyPDF2
from glob import glob

# Path of the claims CSV that is loaded with pd.read_csv further down.
# Kept as a raw string so that a Windows path can be pasted in unchanged.
claims_path = r''
# Folder that is searched for the '*.pdf' files to extract values from
input_path = r''
# Prefix of the path the final export CSV is written to
output_path = r''


In [None]:
# Load the claims table and force the claim numbers to text, because they are
# later searched for as substrings of file paths
claims = pd.read_csv(claims_path)
claims['DTT_Claim_Number_Standardised'] = claims['DTT_Claim_Number_Standardised'].map(str)
claims

In [None]:
# create a df that has all of the files that have been extracted from first layer extraction
# The pattern is a raw string: in a normal string literal '\*' is an invalid
# escape sequence (a warning today, an error in future Python versions).
file_list = glob(input_path + r'\*.pdf')
files = pd.DataFrame(file_list, columns=['filepath'])
# make sure the paths are plain strings so the .str accessor can be used on them
files['filepath'] = files['filepath'].astype(str)
files

In [None]:
# Match a filepath to its corresponding claim number
def _first_matching_filepath(claim):
    """Return the first path in `files` that contains `claim`, or False if none does.

    False is kept as the "no PDF found" marker because the following cells
    filter on `claims['filepath'] == False`.
    """
    # regex=False: the claim number is matched literally, so characters such as
    # '.', '(' or '+' inside a claim number cannot change or break the match.
    hits = files.loc[files['filepath'].str.contains(claim, regex=False), 'filepath']
    # Pick the first hit explicitly: reducing the matches with .any() yields a
    # plain bool on current pandas versions and the path itself would be lost.
    return hits.iloc[0] if len(hits) else False


claims['filepath'] = claims['DTT_Claim_Number_Standardised'].apply(_first_matching_filepath)
claims

In [None]:
# count the claim numbers for which no PDF path was found (marker value False)
no_pdf_mask = claims['filepath'].eq(False)
missing = claims.loc[no_pdf_mask]
missing.shape

In [None]:
# keep only the rows that do have a PDF file path
has_pdf_mask = claims['filepath'].ne(False)
claims = claims.loc[has_pdf_mask]
claims.shape

In [None]:
# rows whose file path already appeared for an earlier claim number
repeated_path_mask = claims.duplicated(subset=['filepath'])
duplicates = claims.loc[repeated_path_mask]
duplicates #no duplicates in this case

In [None]:
#extract the claim value from each PDF
t0 = time.time()

# Accepts only plain amounts of the form [-]digits.dd. Written as a raw string
# because '\.' in a normal string literal is an invalid escape sequence.
amount_pattern = re.compile(r'^-?[0-9]+(\.[0-9][0-9])$')

success = []
values = []
for file in claims['filepath']:
    # Start every file from "nothing extracted" so that a missing file or a
    # failed read is recorded as (False, 0) instead of silently re-using the
    # previous file's result (or raising NameError if the first file is missing).
    s = False
    v = 0
    if os.path.exists(file):
        try:
            tables = camelot.read_pdf(file, flavor='stream', pages='all',encoding = 'ISO-8859-1',table_areas=['0,800,800,00'], strip_text=',$') # ,table_areas=['0,842,595,0']
            if len(tables) == 0:
                print(file + ' no content read through, extraction not successful')
            else:
                # Use the first table that has a cell containing 'Tax 15.00%';
                # fall back to the last table when no table has one.
                i = 0
                found = False
                while i < len(tables) and not found:
                    df = tables[i].df
                    for col in df.columns:
                        if not (df[df[col].str.contains('Tax 15.00%')].empty):
                            found = True
                            print(file.split('\\')[-1], "page using:", i)
                    i += 1
                if not found:
                    df = tables[-1].df

                cells = df.stack().tolist() #convert to list
                # keep cells containing a decimal point and split them on
                # whitespace (numbers separated by a new line were meant to be
                # in separate cells); the result is one flat list of tokens
                tokens = [tok for cell in cells if "." in cell for tok in cell.split()]
                tokens = [tok for tok in tokens if "%" not in tok] #remove all percentages
                cleaned = [re.sub('[^0-9.-]', '', tok) for tok in tokens] #keep digits, '.' and '-' only
                amounts = [float(c) for c in cleaned if amount_pattern.match(c)] #keep [digits].[digit][digit] only
                amounts = [a for a in amounts if a != 0] #remove all zero elements
                last_three = amounts[-3:] #take the last 3 items
                last_three.sort(key=abs)
                # Internal check: the two smaller amounts must add up to the
                # largest one; the middle amount is then used as the claim
                # value without GST.
                if (len(last_three) > 2 and round(last_three[0]+last_three[1], 2) == round(last_three[2], 2)):
                    s = True
                    v = round(last_three[1], 2)
                else:
                    s = False
                    # Fallback: derive the value from the largest amount. An
                    # empty list raises ValueError here and is handled below.
                    # NOTE(review): 10/11 removes a 10% tax, while the table
                    # search above looks for 'Tax 15.00%' - confirm the rate.
                    v = round(max(last_three, key=abs)*10/11, 2)

        except ValueError:
            print(file, 'Extaction not successful')
    else:
        print(file, "Could not be opened")
    success.append(s)
    values.append(v)

# `claims` is the result of filtering an earlier frame; take an explicit copy
# so that adding columns does not write to a slice of that frame.
claims = claims.copy()
claims['w/o+GST=w'] = success
claims['claim w/o GST'] = values

t1 = time.time()
print("total time:", t1-t0)
claims

In [None]:
# rows where the document amount and the extracted value differ only in sign
value_differs = claims['AMOUNT_IN_DOC_CURR'].ne(claims['claim w/o GST'])
discrepency = claims.loc[value_differs]
same_magnitude = discrepency['AMOUNT_IN_DOC_CURR'].abs().eq(discrepency['claim w/o GST'].abs())
discrepency = discrepency.loc[same_magnitude]
discrepency.shape

In [None]:
# rows whose extracted value is 0, the value recorded when nothing was extracted
zero_value_mask = claims['claim w/o GST'].eq(0)
unopened = claims.loc[zero_value_mask]
unopened

In [None]:
# continue only with the files for which a non-zero value was extracted
nonzero_value_mask = claims['claim w/o GST'].ne(0)
claims = claims.loc[nonzero_value_mask]
claims

In [None]:
# rows for which the internal validity check succeeded
check_ok_mask = claims['w/o+GST=w'].eq(True)
passed = claims.loc[check_ok_mask]
passed

In [None]:
# rows that do not pass the internal check, i.e. the three numbers used are not
# the claim value without GST, the GST and the claim value with GST
check_failed_mask = claims['w/o+GST=w'].eq(False)
failed = claims.loc[check_failed_mask]
failed

### Check the new results against the old EDC ones

In [None]:
# collect the paths of the output files that are loaded into 1 dataframe below
edc_output_pattern = r''
file_list = glob(edc_output_pattern)
file_list

In [None]:
# read every collected file as text columns and stack them into one frame
check_lines = []
for csv_path in file_list:
    frame = pd.read_csv(csv_path, sep=',', dtype=str, encoding='windows-1252', low_memory=False)
    check_lines.append(frame)
check = pd.concat(check_lines, ignore_index=True)
check

In [None]:
# merge the claim value obtained from the EDC results with the data frame
def _edc_amount_for(claim):
    """Return the first non-null 'Claim Amount Excl GST' whose 'Reference No.'
    contains `claim`, or NaN when there is no such row."""
    # regex=False matches the claim number literally; na=False treats rows
    # without a reference number as "no match" instead of producing an invalid
    # boolean mask.
    matches = check['Reference No.'].str.contains(claim, regex=False, na=False)
    amounts = check.loc[matches, 'Claim Amount Excl GST'].dropna()
    # Pick the first hit explicitly: reducing with .any() yields a plain bool
    # on current pandas versions and the amount itself would be lost.
    return amounts.iloc[0] if len(amounts) else np.nan


claims['check'] = claims['DTT_Claim_Number_Standardised'].apply(_edc_amount_for)
# astype(str) keeps the .str accessor usable even when no claim matched at all;
# missing amounts become the text 'nan', which converts back to a float NaN.
claims['check'] = claims['check'].astype(str).str.replace(',', '', regex=False).astype(float)
claims

In [None]:
# a positive is a claim whose new value equals the old value
value_matches = claims['claim w/o GST'].eq(claims['check'])
positives = claims.loc[value_matches]
positives

In [None]:
# a negative is a claim whose new value differs from the old value
value_differs = claims['claim w/o GST'].ne(claims['check'])
negatives = claims.loc[value_differs]
negatives

In [None]:
# false positives: value differs from the old one although the internal check passed
value_differs = claims['claim w/o GST'].ne(claims['check'])
internal_check_passed = claims['w/o+GST=w'].eq(True)
fp = claims.loc[value_differs & internal_check_passed]
fp

In [None]:
# true negatives: value differs from the old one and the internal check failed
value_differs = claims['claim w/o GST'].ne(claims['check'])
internal_check_failed = claims['w/o+GST=w'].eq(False)
tn = claims.loc[value_differs & internal_check_failed]
tn

In [None]:
# true positives: value equals the old one and the internal check passed
value_matches = claims['claim w/o GST'].eq(claims['check'])
internal_check_passed = claims['w/o+GST=w'].eq(True)
tp = claims.loc[value_matches & internal_check_passed]
tp

In [None]:
# false negatives: value equals the old one although the internal check failed
value_matches = claims['claim w/o GST'].eq(claims['check'])
internal_check_failed = claims['w/o+GST=w'].eq(False)
fn = claims.loc[value_matches & internal_check_failed]
fn

### Export the data

In [None]:
# Add the bare file name (last component of the Windows path) as a COLUMN.
# `claims.loc['filename'] = ...` would instead write a row labelled 'filename',
# and the column selection in the export cell would fail with a KeyError.
# The explicit copy avoids writing to a filtered slice of an earlier frame.
claims = claims.copy()
claims['filename'] = claims['filepath'].str.split('\\').str[-1]
claims['Year'] = 'FY1920'
# turn the existing index into a regular column and renumber the rows
claims = claims.reset_index()
claims.columns

In [None]:
# Match the expected formatting: source column -> exported column name,
# listed in the order the columns appear in the export
export_column_names = {
    'filename': 'FileName',
    'DTT_Customer_Standardised': 'Vendor',
    'DTT_Claim_Number_Standardised': 'Reference No.',
    'claim w/o GST': 'Claim Amount',
    'filepath': 'FilePath',
    'Year': 'Year',
    'w/o+GST=w': 'Success',
}

export_df = claims[list(export_column_names)]
export_df = export_df.rename(columns=export_column_names)
export_df

In [None]:
# write the export table, including its index, to the output location
export_target = output_path+''
export_df.to_csv(export_target, index=True)