## Tree Stuff

In [1]:
### TODO: create a static class and add 'mode' and 'verbose' as class variables

In [4]:
import pandas as pd 
import os
import string
import subprocess
import time
import shutil
from fuzzywuzzy import fuzz
import pytesseract as ps

# Reference wording of the EONIA definition for each agreement type.
GMRA_subtext = 'EONIA in case of Cash Margin in euro'
GMSLA_subtext = 'euro: “EUR-EONIA” for a day “d” in the relevant Calculation Period, is a reference rate equal to the overnight rate as calculated by the European Central Bank on the first TARGET Settlement Day and reported on Bloomberg page “EONIA Index”'
ISDACSA_subtext = 'EONIA. “EONIA” means for any day the reference rate equal to the overnight rate as calculated by the European Central Bank and appearing on Reuters page EONIA on the first TARGET Settlement Day following that day. For the purposes of this Annex, TARGET Settlement Day means any day on which the Trans-European Automated Real-Time Gross Settlement Express Transfer (TARGET) System is open.'

# Document class -> normalised reference text (string.punctuation characters
# removed, then lowercased), i.e. the same clean-up ocr() applies to page text.
def_dict = {
    docclass: wording.translate(str.maketrans('', '', string.punctuation)).lower()
    for docclass, wording in (
        ('CSA', ISDACSA_subtext),
        ('GMRA', GMRA_subtext),
        ('GMSLA', GMSLA_subtext),
    )
}

# The module-level names end up holding the normalised text as well.
GMRA_subtext = def_dict['GMRA']
GMSLA_subtext = def_dict['GMSLA']
ISDACSA_subtext = def_dict['CSA']


def walk_through_files(path, file_extension=('.pdf', '.tiff')):
    """Yield every file below ``path`` whose name ends with ``file_extension``.

    ``file_extension`` may be a single suffix or a tuple of suffixes, because
    it is handed straight to ``str.endswith``. The yielded paths always use
    forward slashes, also on Windows.
    """
    for folder, _subfolders, names in os.walk(path):
        wanted = (name for name in names if name.endswith(file_extension))
        for name in wanted:
            joined = os.path.join(folder, name)
            yield joined.replace("\\", "/")       # normalise Windows separators

                
def generateDocList(directory):
    """Return a list with the paths of all documents found under ``directory``.

    The paths come from ``walk_through_files`` with its default extensions.
    """
    print("Generating List of documents in parent folder...")
    return list(walk_through_files(directory))


def convert2imageOrMove(pdf_file, fullfilename, newfoldername, poppler_path):
    """Fill the per-document image folder that the OCR step reads from.

    Nothing is done when ``newfoldername`` already exists. Otherwise the
    folder is created and then either

    * a source ending in ``.pdf`` is rendered to JPEG at 300 dpi by the
      executable at ``poppler_path``, or
    * any other source (TIFF) is copied into the folder under its own name.

    Args:
        pdf_file: Full path of the source document.
        fullfilename: Base name of the source document, including extension.
        newfoldername: Folder to create and fill.
        poppler_path: Path of the pdftoppm executable (used for PDFs only).
    """
    dpi = 300
    if os.path.exists(newfoldername):
        return
    os.mkdir(newfoldername)

    if not pdf_file.endswith(".pdf"):                      # tiffs are used as they are
        shutil.copy(pdf_file, newfoldername + '/' + fullfilename)
        return

    # Output name handed to pdftoppm (which presumably appends the page
    # number and image extension itself -- see the pdftoppm documentation).
    outputfile = newfoldername + '/' + os.path.splitext(fullfilename)[0] + '.jpeg'
    # An argument list works on every platform and leaves quoting of paths
    # with spaces to subprocess instead of a hand-built command string.
    command = [poppler_path, '-jpeg', '-r', str(dpi), pdf_file, outputfile]
    returncode = subprocess.call(command)                  # waits for the conversion
    if returncode != 0:
        # Keep going with the remaining documents, but do not fail silently.
        print("Warning: conversion of " + str(pdf_file) + " exited with code " + str(returncode))
            
def generateFolderList(documentsList, poppler_path=None):
    """Create one image folder per document and return the folder paths.

    For a document ``<dir>/<name>.<ext>`` the folder is ``<dir>/<name>``; it
    is filled by ``convert2imageOrMove``.

    Args:
        documentsList: Iterable of document paths.
        poppler_path: Path of the pdftoppm executable. When omitted, the
            original hard-coded installation path is used.

    Returns:
        List with one folder path per entry of ``documentsList``, in order.

    Raises:
        AssertionError: If ``poppler_path`` does not exist.
    """
    print("Creating document folders for OCR...")
    if poppler_path is None:
        # path to extracted Poppler
        poppler_path = r"C:/Users/george.gousios.SYNECHRON/Desktop/libs/poppler-0.68.0/bin/pdftoppm.exe"
    assert os.path.exists(poppler_path), "pdftoppm not found at " + str(poppler_path)

    foldersList = []                                       # list of created folders
    for pdf_file in documentsList:
        filefolder, fullfilename = os.path.split(pdf_file)
        filename = os.path.splitext(fullfilename)[0]
        newfoldername = filefolder + '/' + filename
        foldersList.append(newfoldername)
        convert2imageOrMove(pdf_file, fullfilename, newfoldername, poppler_path)
    return foldersList
            

def deleteFolders(foldersList):
    """Remove every folder in ``foldersList`` that still exists, with contents."""
    print("Cleaning up...")
    for generated in foldersList:
        if not os.path.exists(generated):
            continue                    # already gone (or listed twice)
        shutil.rmtree(generated)
    print("Done!")

    
def ocr(img):
    """Return the text tesseract reads from ``img``, normalised for matching.

    The raw OCR output has all ``string.punctuation`` characters removed, is
    lowercased and has trailing whitespace stripped.
    """
    # needed if tesseract not in PATH
    ps.pytesseract.tesseract_cmd = r'C:\Tesseract-OCR\tesseract'
    raw = ps.image_to_string(img)
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = raw.translate(drop_punctuation)
    return cleaned.lower().rstrip()


def getscore(pagetext, docclass):
    """Return the fuzzy partial-ratio score of ``pagetext`` against the
    reference definition stored in ``def_dict`` for ``docclass``.
    """
    # fuzz.token_set_ratio was the earlier scoring choice; partial_ratio is used now.
    reference = def_dict[docclass]
    return fuzz.partial_ratio(pagetext, reference)
    

def getexactmatch(pagetext, docclass):
    """Tell whether the reference definition for ``docclass`` occurs verbatim
    (as a substring) in ``pagetext``.
    """
    reference = def_dict[docclass]
    found = reference in pagetext
    return found


def inferTrueClass(fullpath):
    """Guess the document class from the file name of ``fullpath``.

    The lowercased base name (without extension) is searched for 'csa',
    'gmra' and 'gmsla', in that order; the first hit decides the class.
    Returns 'unknown' when none of them occurs.
    """
    stem = os.path.splitext(os.path.basename(fullpath).lower())[0]
    markers = (('csa', 'CSA'), ('gmra', 'GMRA'), ('gmsla', 'GMSLA'))
    for marker, label in markers:
        if marker in stem:
            return label
    return 'unknown'
    
# def trueclass(name):                                          # update to create inference based on parent folder
#     parts = name.split(' ')
#     docClass = parts[0]
#     return parts[0]

def trueclass(fullpath):
    """Return the class inferred by ``inferTrueClass`` for ``fullpath``.

    When the class cannot be inferred, a warning is printed and 'GMRA' is
    returned as the fallback.
    """
    inference = inferTrueClass(fullpath)
    if inference == 'unknown':
        print("Warning: Class for "+str(fullpath)+" cannot be infered! Defaulting to GMRA")
        return 'GMRA'
    return inference


def getContractsFromPath(path):
    """Return the final component (file name) of ``path``."""
    _parent, tail = os.path.split(path)
    return tail


def classifypdf(fullpath):
    """OCR the page images of one document and locate its EONIA definition.

    The images are read from the folder named after the document (same
    directory, file name without extension). Each page is OCR'd, scored
    against the reference definition of the document's class and checked for
    any mention of 'eonia'. Scanning stops early once the best page so far
    scores above 90 and mentions 'eonia'.

    Args:
        fullpath: Path of the original document, e.g. './parentdir/1/CSA example 1.pdf'.

    Returns:
        Tuple ``(docclass, exactmatch, maxscore, maxpage, eonialist)``:
        the class from ``trueclass``; whether the best-scoring page contains
        the reference text verbatim; the best score and its 1-based page
        number (both -1 when the folder has no images); and the 1-based
        numbers of the scanned pages mentioning 'eonia' (pages after an early
        stop are not scanned).
    """
    filename = os.path.basename(fullpath)                      # 'CSA example 1.pdf'
    stem = os.path.splitext(filename)[0]                       # 'CSA example 1'
    pdfimages = os.path.dirname(fullpath) + '/' + stem         # './parentdir/1/CSA example 1'
    docclass = trueclass(fullpath)

    eonialist = []
    maxscore = -1
    maxpage = -1
    exactmatch = False              # defined even when the folder holds no images

    # os.listdir returns names in arbitrary order; sort so that the counter
    # follows page order. Assumes the image names sort lexicographically in
    # page order (pdftoppm zero-pads page numbers) -- TODO confirm.
    for count, image in enumerate(sorted(os.listdir(pdfimages)), start=1):
        pagetext = ocr(pdfimages + '/' + image)                # lowercased, punctuation removed
        score = getscore(pagetext, docclass)
        has_eonia = 'eonia' in pagetext                        # any reference to 'eonia' on the page
        if has_eonia:
            eonialist.append(count)
        if score > maxscore:
            maxscore = score
            maxpage = count
            # Report the exact-match flag of the page that is returned as best.
            exactmatch = getexactmatch(pagetext, docclass)
            # Boolean 'and' is required here: '&' binds tighter than the
            # comparisons and would make this condition always false.
            if maxscore > 90 and has_eonia:
                break
    return docclass, exactmatch, maxscore, maxpage, eonialist


def setMatch(row):
    """Return True when the best page of ``row`` counts as a match.

    A match requires that ``row['PageNo']`` is one of ``row['EoniaRefPages']``
    and that ``row['MatchPercentage']`` (converted with ``int``) exceeds 90.
    """
    best_page = row['PageNo']
    eonia_pages = row['EoniaRefPages']
    percentage = row['MatchPercentage']
    return best_page in eonia_pages and int(percentage) > 90
    

def parseDocuments(documentsList):
    """Classify every document and collect the results in a DataFrame.

    Runs ``classifypdf`` on each path (single-threaded) and returns a frame
    with the columns Contracts, TrueClass, ExactMatch, MatchPercentage,
    PageNo, EoniaRefPages and Match (computed per row by ``setMatch``).
    An empty DataFrame is returned when ``documentsList`` is empty.
    """
    print("Parsing Documents...")
    if len(documentsList) == 0:
        print("No Documents Found")
        return pd.DataFrame()

    table = pd.DataFrame()
    table['Contracts'] = list(documentsList)
    # classifypdf returns one 5-tuple per document; transpose into columns.
    classes, exact, percentages, pages, eonia_pages = zip(*table['Contracts'].map(classifypdf))
    table['TrueClass'] = classes
    table['ExactMatch'] = exact
    table['MatchPercentage'] = percentages
    table['PageNo'] = pages
    table['EoniaRefPages'] = eonia_pages
    match_inputs = table[['PageNo', 'EoniaRefPages', 'MatchPercentage']]
    table['Match'] = match_inputs.apply(setMatch, axis=1)
    return table

    
def extractResultsToCsv(df, savepath='./results_tree_partialratio.csv'):
    """Write ``df`` to ``savepath`` as CSV (without the index) and report it."""
    df.to_csv(savepath, index=False)
    message = "Results exported to {}".format(str(savepath))
    print(message)
    
def inputcheck(directory, mode, verbose):
    """Assert that the arguments of ``folderDocOCR`` are usable.

    ``directory`` must exist, ``mode`` must be 'partial' or 'tokenset' and
    ``verbose`` must be 0 or 1. Raises AssertionError otherwise.
    """
    allowed_modes = ('partial', 'tokenset')
    assert os.path.exists(directory)
    assert mode in allowed_modes
    assert verbose in range(2)
    
    
def folderDocOCR(directory, mode='tokenset', verbose=0):
    """Run the whole pipeline on every document below ``directory``.

    Steps: validate the arguments, list the documents, create the per-document
    image folders, classify the documents, export the results to CSV and
    delete the generated folders. The elapsed time is printed at the end.

    Args:
        directory: Existing folder that is searched recursively for documents.
        mode: 'partial' or 'tokenset'. Only validated here; the scoring in
            ``getscore`` currently always uses the partial ratio.
        verbose: 0 or 1. Only validated here.

    Raises:
        AssertionError: If ``inputcheck`` rejects the arguments.
    """
    inputcheck(directory, mode, verbose)
    start = time.time()
    documentsList = generateDocList(directory)
    foldersList = generateFolderList(documentsList)
    try:
        # parseDocuments accepts only the document list.
        df = parseDocuments(documentsList)
        extractResultsToCsv(df)
    finally:
        # Remove the generated image folders even when parsing or export fails.
        deleteFolders(foldersList)
    end = time.time()
    print("Process took "+str(float(end - start)/60)+" minutes to complete")

In [4]:
# # Clean up generated folders left over from an interrupted run

# directory = './parentdir'
# documentsList = generateDocList(directory)
# foldersList = generateFolderList(documentsList)
# deleteFolders(foldersList)

In [25]:
# Run the whole process: list the documents below parentdir, build the page
# images, OCR and classify them, export the results to CSV and clean up.

# Root folder that is searched recursively for .pdf / .tiff documents.
parentdir = './documents'

# Uses the default mode ('tokenset') and verbose (0) arguments.
folderDocOCR(parentdir) 