# Name File PDFs
## Author: 2d Lt Marc Allerheiligen, USAFA DFVR

This program takes a folder of arbitrarily named PDFs,
reads the name on the orders at the end of each one,
and puts renamed copies of the PDFs into another folder.

It will send files to 2 output folders: 
    One for verified names and one for unverified names.
    If there is a folder for a person in the verified names folder,
    then the program knows that that is a correct name and adds the PDF to that folder.
        If you run the PDF splitter first, then these folders will already exist.
    Otherwise, PDFs go into the unverified names folder for manual verification.


## Library Installation

Library installation must be run once per machine this code is used on.  
There is no problem with running the whole file, but if the libraries are installed, then the program will run faster if you only run code after library installation.

easyocr is finicky, which is why all potential fixes are included below.

In [None]:
%%capture
!pip install torch torchvision torchaudio
!pip install -U numpy --user # fixes easyOCR
!pip install opencv-contrib-python
!pip install easyocr

In [None]:
%%capture
!pip install PyMuPDF

# Parameters

In [None]:
'''PARAMETERS'''
# Every folder path below ends with '/' because the code builds file paths by
# plain string concatenation (e.g. OUT_PATH_VERIFIED+year+'/').

# Folder that is searched recursively for the input PDFs.
IN_PATH = "PDFs in/"
# Cache folders: images extracted from the PDFs, their rotated copies,
# and the OCR text read from each rotated image.
RAW_IMG_PATH = "cache/images/raw/"
ROTATED_IMG_PATH = "cache/images/rotated/"
TXT_PATH = "cache/texts/"
# NOTE(review): NAMES_PATH is not referenced by the code visible in this notebook - confirm it is still needed.
NAMES_PATH = "cache/names/"
# Output folders: PDFs for which no name was found, PDFs named from a guess,
# and PDFs whose name matched an existing folder of verified names.
OUT_PATH_FAIL = "PDFs OCR failed to name/"
OUT_PATH_GUESSED_NAME = "PDFs with guessed names/"
OUT_PATH_VERIFIED = "PDFs named correctly/"
# Together with OUT_PATH_VERIFIED, these folders are searched for PDFs that were already processed.
SCANNED_WRONG_PATH = "scanned incorrectly/"
NEEDS_RESCAN_PATH = "needs to be rescanned/"
BATCH_SIZE = 100 # passed to OCR reader call
# Passed as the `shallow` argument of filecmp.cmp when looking for already-processed PDFs.
SHALLOW_FILE_COMPARE = False
# Appended to the person's name in output file names (before '.pdf');
# also used to filter which files count as already-verified PDFs.
SUFFIX = ' (F)'
MIN_IMG_DIMENSION = 5 # if either width or height is below this, the image is ignored.

#Replace a part of a name before attempting to tell what is a middle name 
    #and what is a 2-part last name:

# Imports

In [None]:
import easyocr
import os
import fitz # PyMuPDF
from tqdm import tqdm
from PIL import Image
from re import findall, sub
from shutil import copy, rmtree
from filecmp import cmp
from unidecode import unidecode
from time import time
import re
from string import punctuation

# Functions

In [None]:
# Running totals, in seconds, of the time spent in each stage of the pipeline.
# The functions below add to these entries as they run.
TIME_SPENT_ON = dict.fromkeys(
    ('READING TEXT', 'OCR', 'INTERPRETING TEXT', 'SANITIZING TEXT'),
    0,
)

## File Functions

In [None]:
def makeFolders(filePath):
    """Make sure the folder that should contain ``filePath`` exists.

    Everything up to the last '/' of ``filePath`` is treated as the folder;
    the final component (the file name) is never created.  A path with no
    '/' at all resolves to the folder '/'.

    Raises AssertionError if ``filePath`` contains '//'.
    """
    assert '//' not in filePath, filePath
    parentFolder = filePath.rpartition('/')[0] + '/'
    if os.path.isdir(parentFolder):
        return
    os.makedirs(parentFolder)

def filePathsIn(folderPath, substringNeeded = ''):
    """Recursively list the files under ``folderPath``.

    folderPath: a folder, or a single file (which is then the only candidate).
    substringNeeded: only paths containing this substring are returned;
        the default '' keeps every file.

    Returns a list of '/'-joined paths in os.listdir order.  A path that is
    neither a file nor a listable folder makes os.listdir raise.
    """
    if not os.path.isfile(folderPath):
        found = []
        for entry in os.listdir(folderPath):
            # folderPath may or may not end with '/', so collapse a doubled separator
            child = (folderPath + '/' + entry).replace('//', '/')
            found += filePathsIn(child, substringNeeded)
        return found
    return [folderPath] if substringNeeded in folderPath else []

def whichPage(imagePath):
    """Return the page-number field of a cached image path, as the zero-padded string stored in the name.

    Image files are named "page_XX_of_YY_image_ZZZ.png".  Only the file name
    (the part after the last '/') is parsed, so a folder or PDF name that
    happens to contain "page_" cannot be mistaken for the page field.
    """
    imageName = imagePath.rsplit('/', 1)[-1]
    return imageName.split('page_')[1].split('_of')[0]
def getNumPages(imagePath):
    """Return the total-page-count field of a cached image path, as the zero-padded string stored in the name.

    Image files are named "page_XX_of_YY_image_ZZZ.png".  Only the file name
    (the part after the last '/') is parsed, so a folder or PDF name that
    happens to contain "of_" cannot be mistaken for the page-count field.
    """
    imageName = imagePath.rsplit('/', 1)[-1]
    return imageName.split('of_')[1].split('_image')[0]

def smartCopy(f, t):
    """Copy the file at path ``f`` to path ``t``.

    The folder part of ``t`` is created first (via makeFolders) so the copy
    does not depend on the destination folder already existing.
    """
    makeFolders(t)
    copy(src=f, dst=t)

## Image Functions

In [None]:
# Lookup from an angle in degrees to the PIL transpose constant of the same name
# (Image.ROTATE_90, Image.ROTATE_180, Image.ROTATE_270).
ROTATE = {
    degrees: getattr(Image, 'ROTATE_{}'.format(degrees))
    for degrees in (90, 180, 270)
}

def makeImages(PdfPath):
    """Extract every embedded image of a PDF into the image caches.

    Each image is saved once as a PNG under RAW_IMG_PATH (in a folder that
    mirrors the PDF's path without its extension) and a copy rotated by the
    page's rotation is written under ROTATED_IMG_PATH.  Images that already
    exist in the cache are not extracted again.

    PdfPath: path of a PDF located under IN_PATH.

    Returns a dict mapping each rotated-image path to that image's area in
    pixels.  Images for which rotateImageAndGetArea returns a falsy value
    (too small) are left out.
    """
    # Strip only the real extension: a name such as "Lt. Smith.pdf" must keep
    # everything before ".pdf", otherwise the cache folder no longer maps back
    # to the source PDF.
    rawImgFolderPath = os.path.splitext(PdfPath.replace(IN_PATH, RAW_IMG_PATH))[0]+'/'
    rotatedImageFolderPath = rawImgFolderPath.replace(RAW_IMG_PATH,ROTATED_IMG_PATH)
    imageAreas = dict()
    doc = fitz.open(PdfPath)
    try:
        numPages = len(doc)
        for pageNum, page in enumerate(doc, start=1):
            image_list = page.get_images()
            if not image_list:
                print("No images found on page",pageNum,"of PDF at:",PdfPath)
            for image_index, img in enumerate(image_list, start=1): # enumerate the image list
                imageName = "page_{:02d}_of_{:02d}_image_{:03d}".format(pageNum,numPages,image_index)+".png"
                rawImgPath = rawImgFolderPath + imageName
                if not os.path.isfile(rawImgPath):
                    xref = img[0] # get the XREF of the image
                    pix = fitz.Pixmap(doc, xref) # create a Pixmap
                    if pix.n - pix.alpha > 3: # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    makeFolders(rawImgPath)
                    pix.save(rawImgPath) # save the image as png
                imageArea = rotateImageAndGetArea(rawImgPath,page.rotation)
                if imageArea:
                    rotatedImagePath = rotatedImageFolderPath + imageName
                    imageAreas[rotatedImagePath] = imageArea
    finally:
        # Release the PDF's file handle even if extraction fails part-way.
        doc.close()
    return imageAreas

def rotateImageAndGetArea(rawImgPath, rotation):
    """Write the rotated copy of a raw image (if missing) and return the image's area.

    rawImgPath: path of an image under RAW_IMG_PATH.
    rotation: the page rotation in degrees; 0 keeps the image as is,
        otherwise 360-rotation must be a key of ROTATE (90, 180 or 270).

    Returns width*height of the raw image in pixels, or False when either
    dimension is below MIN_IMG_DIMENSION (such images are ignored and no
    rotated copy is written).
    """
    # The context manager closes the image's file handle when we are done.
    with Image.open(rawImgPath) as originalImage:
        w, h = originalImage.width, originalImage.height
        if w < MIN_IMG_DIMENSION or h < MIN_IMG_DIMENSION:
            return False
        imageArea = (w * h)

        rotatedImagePath = rawImgPath.replace(RAW_IMG_PATH,ROTATED_IMG_PATH)
        if os.path.isfile(rotatedImagePath):
            # Rotated copy is already cached; only the area is needed.
            return imageArea
        if rotation == 0:
            rotatedImage = originalImage
        else:
            rotatedImage = originalImage.transpose(ROTATE[360-rotation])
        makeFolders(rotatedImagePath)
        rotatedImage.save(rotatedImagePath)
    return imageArea

def readImage(imgPath):
    """Return the text lines of an image, from the text cache if possible, else via OCR.

    imgPath: path of an image under ROTATED_IMG_PATH.  Its cached text lives
        at the same relative path under TXT_PATH with a '.txt' extension.

    Returns a list of strings.  A successful OCR result is written to the
    cache; a failed OCR returns [] and is not cached, so it is retried on
    the next run.  Time spent is added to TIME_SPENT_ON.

    Cache files are read and written as UTF-8 so that OCR output containing
    non-ASCII characters does not depend on the machine's default encoding.
    A cache file that cannot be read or decoded is reported and replaced by
    a fresh OCR pass.
    """
    textPath = imgPath.replace(ROTATED_IMG_PATH,TXT_PATH).replace('.png','.txt')
    # READ CACHED TEXT
    startTime = time()
    if os.path.isfile(textPath):
        try:
            with open(textPath, 'r', encoding='utf-8') as f:
                result = f.read().split('\n')
        except (OSError, UnicodeDecodeError) as err:
            print("An existing text file could not be read:", textPath, err)
        else:
            TIME_SPENT_ON['READING TEXT'] += (time()-startTime)
            return result
    # OCR
    startTime = time()
    try:
        result = OcrReader.readtext(imgPath, detail = 0, batch_size = BATCH_SIZE)
    except Exception as err:
        # Best effort: one unreadable image must not stop the whole batch.
        print("Could not read:",imgPath,err)
        result = []
    else:
        makeFolders(textPath)
        with open(textPath, 'w', encoding='utf-8') as f: f.write('\n'.join(result))
    TIME_SPENT_ON['OCR'] += (time() - startTime)
    return result

## String Functions

In [None]:
def getYearFromString(string):
    """Return, as a string, the smallest year from 2023 through 2098 whose digits occur in ``string``.

    Fails an assertion when none of those years occurs in ``string``.
    """
    candidates = (str(year) for year in range(2023, 2099))
    found = next((text for text in candidates if text in string), None)
    assert found is not None, "a string was passed to get the year from it, but no year was found:"+string
    return found

def sanitize(string):
    """Normalise text for loose comparison.

    A list is first concatenated (with no separator) and then treated like a
    string.  The text is passed through unidecode, lower-cased, and stripped
    of spaces, commas, semicolons, colons and apostrophes.  The time spent
    is added to TIME_SPENT_ON['SANITIZING TEXT'].
    """
    startedAt = time()
    if type(string) is list:
        return sanitize(''.join(string))
    cleaned = unidecode(string).lower().translate(str.maketrans('', '', " ,;:'"))
    TIME_SPENT_ON['SANITIZING TEXT'] += (time()-startedAt)
    return cleaned

def nameFormats(name):
    """List the spellings under which a "Last, First Middle" name may appear in text.

    name: a string containing ', ' between the last name(s) and the given names.

    Returns a list.  When there is at least one middle name it starts with
    "Last, First" and "First Last"; it always ends with the full name in
    "First Middle Last" order.
    """
    assert type(name) == str, name
    assert ', ' in name
    parts = name.split(', ')
    givenNames = parts[1].split(' ')
    formats = []
    if len(givenNames) >= 2: # there is a middle name
        lastNames, _ = parts
        firstName = givenNames[0]
        formats += [lastNames + ', ' + firstName, firstName + ' ' + lastNames]
    formats.append(' '.join(parts[::-1]))
    return formats

def splitName(LLFMM):
    """Split "Last Last, First Middle Middle" into (first + first middle name, last name).

    LLFMM: a string with exactly one ', ' separating last name(s) from given names.

    Returns the tuple (FM, LL):
      FM - the first two given names joined by a space (or one, if there is only one);
      LL - the last name(s), with a trailing suffix made up only of the
           letters 'I' and 'l' (e.g. "II", "III", or "Ill" as OCR may read it) removed.

    Only the suffix is dropped, so a multi-word last name such as
    "Van Dyke III" becomes "Van Dyke" rather than just "Van".
    """
    assert ', ' in LLFMM
    LL, FMM = LLFMM.split(', ')
    lastWords = LL.split()
    # A final word that is empty once 'l' and 'I' are stripped is a suffix, not a name.
    if len(lastWords) > 1 and not lastWords[-1].strip('lI'):
        LL = ' '.join(lastWords[:-1])
    FM = ' '.join(FMM.split(' ')[:2])
    return (FM,LL)

def nameInListInResult(verifiedNames, result):
    """Return the first verified name that appears in the OCR text, or False.

    verifiedNames: iterable of names in "Last, First Middle" form.  A name
        containing '.' fails an assertion, since that indicates a file
        rather than a person's folder.
    result: the OCR text as a list of strings.

    A name matches when any of its nameFormats() spellings occurs in the
    sanitized text, or when its first+middle name is followed by at most one
    extra word (a second middle name) and then its last name.
    """
    # Both views of the text are the same for every candidate name, so build them once.
    sanitizedResult = sanitize(result)
    # note that below variable names assume 2 middle names and 2 last names
    FMMLL = ' '.join(result)
    for maybeName in verifiedNames:
        assert not '.' in maybeName, """Please run the file organizer before this program.
            A file was found in a year folder, but this program expects only folders."""
        for maybeFormat in nameFormats(maybeName):
            if sanitize(maybeFormat) in sanitizedResult:
                return maybeName
        # check for second middle name in name with format First Middle Middle last Last
        (FM,LL) = splitName(maybeName)
        # An empty FM would match everywhere and str.split('') raises ValueError.
        if FM and FM in FMMLL and LL in FMMLL:
            MLL = FMMLL.split(FM)[1]
            if LL in MLL:
                M = MLL.split(LL)[0].strip()
                if not ' ' in M: # nothing, or a single word, between the names
                    return maybeName
    return False

def portionUpper(string):
    """Return the fraction of characters in ``string`` that are unchanged by upper().

    Characters without case (digits, spaces, punctuation) therefore count as
    upper case.  An empty string yields 0.0 instead of dividing by zero.
    """
    if not string:
        return 0.0
    return sum(letter.upper() == letter for letter in string) / len(string)

### String functions finding a name from a specific spot

In [None]:
def flipName(name):
    """Turn "First Middle Last" into "Last, First Middle".

    name: 2 to 5 whitespace-separated words with no comma (other word counts
        raise KeyError).

    With 2 or 3 words the last word is the last name, with 4 or 5 words the
    last two are (an assumption that counts "III" as a second last name).
    A 3-word name whose middle word has no upper-case letters, as in
    "First von Last", also gets a 2-word last name.
    """
    assert not ',' in name, name
    words = name.strip().split()
    wordCount = len(words)
    lastNameCount = {2: 1, 3: 1, 4: 2, 5: 2}[wordCount]
    if wordCount == 3 and words[1] == words[1].lower():
        lastNameCount = 2
    givenNames, lastNames = words[:-lastNameCount], words[-lastNameCount:]
    return '{}, {}'.format(' '.join(lastNames), ' '.join(givenNames))

def nameInOrders(result):
    """Placeholder for finding a name in the text of orders.

    result: the text read from an image (the main loop passes readImage's output).

    Currently always returns False, i.e. "no name found"; the main loop
    passes any truthy return value to flipName.
    """
    return False

def nameInTranscript(result):
    """Placeholder for finding a name in the text of a transcript.

    result: the text read from an image (the main loop passes readImage's output).

    Currently always returns False, i.e. "no name found"; the main loop then
    falls back to nameInOrders.
    """
    return False

## Misc Function

In [None]:
def isOdd(i):
    """Return True when ``i`` leaves a non-zero remainder on division by 2."""
    return i % 2 != 0

 # Main

Find PDFs

In [None]:
# Collect the input PDFs, then drop every one whose contents match a PDF that
# is already in one of the output folders.
print("Finding input PDFs...")
inPdfs = filePathsIn(IN_PATH)
print("Found",len(inPdfs),"input PDFs.")
print("Finding processed PDFs")
verifiedPdfPaths = [OUT_PATH_VERIFIED, NEEDS_RESCAN_PATH, SCANNED_WRONG_PATH]
verifiedPdfs = []
for folder in verifiedPdfPaths:
    if not os.path.exists(folder):
        continue
    # only files carrying SUFFIX count as verified output
    verifiedPdfs.extend(filePathsIn(folder, SUFFIX))
guessedPdfs = []
if os.path.exists(OUT_PATH_GUESSED_NAME):
    guessedPdfs.extend(filePathsIn(OUT_PATH_GUESSED_NAME))

print("Checking which PDFs from",IN_PATH,"are not also in any of the following folders:\n",verifiedPdfPaths)
print("And also not in:\n",OUT_PATH_GUESSED_NAME)
verifiedInPdfs = set()
guessedInPdfs = set()
for inPdf in tqdm(inPdfs):
    # any() stops comparing at the first output PDF with matching contents
    if any(cmp(inPdf, verifiedOutPdf, shallow=SHALLOW_FILE_COMPARE) for verifiedOutPdf in verifiedPdfs):
        verifiedInPdfs.add(inPdf)
    if any(cmp(inPdf, guessedOutPdf, shallow=SHALLOW_FILE_COMPARE) for guessedOutPdf in guessedPdfs):
        guessedInPdfs.add(inPdf)
print("Found",len(verifiedInPdfs),"PDFs which have already been processed.")
print("Found",len(guessedInPdfs) ,"PDFs which their name was guessed.")

alreadyHandled = verifiedInPdfs | guessedInPdfs
inPdfs[:] = [pdfPath for pdfPath in inPdfs if pdfPath not in alreadyHandled]

print("That leaves",len(inPdfs),"PDFs to try to name.")

Make all images on all pages and cache sizes
Then sort images by image size

In [None]:
# Extract the images of every remaining PDF and remember each rotated image's area.
allImageAreas = {}
for pdf in tqdm(inPdfs):
    allImageAreas.update(makeImages(pdf))

In [None]:
# Order the image paths by increasing area; equal areas keep their insertion order.
sortedListOfPaths = sorted(allImageAreas, key=allImageAreas.get)
sortedListOfTuples = [(path, allImageAreas[path]) for path in sortedListOfPaths]
imagePaths = sortedListOfPaths

In [None]:
len(imagePaths)

sort again to use last pages first since most names are on the last pages

In [None]:
# Re-order the images so that the ones read first are, in order: images whose
# text is already cached, last (or second-to-last) pages, pages three from the
# end, remaining odd pages, remaining even pages.  Order within a group is kept.
cachedAlready, lastPage, secondLastPageOfDoubleSided = [], [], []
otherOddPages, otherEvenPages = [], []
for path in imagePaths:
    pageNum = int(whichPage(path))
    numPages = int(getNumPages(path))
    pagesFromLast = numPages - pageNum
    cachedTextPath = path.replace(ROTATED_IMG_PATH,TXT_PATH).replace('.png','.txt')
    if os.path.isfile(cachedTextPath): # if text is cached:
        group = cachedAlready
    elif pagesFromLast <= 1:
        # includes last page of single sided and last page of double sided
        group = lastPage
    elif pagesFromLast == 3:
        group = secondLastPageOfDoubleSided
    elif isOdd(pageNum):
        group = otherOddPages
    else:
        group = otherEvenPages
    group.append(path)

imagePaths = cachedAlready + lastPage + secondLastPageOfDoubleSided + otherOddPages + otherEvenPages

In [None]:
len(imagePaths)

In [None]:
# The EasyOCR reader (English) that readImage calls for images with no cached text.
OcrReader = easyocr.Reader(['en'])

In [None]:
# Zero every timer in place so the totals cover only the run that follows.
TIME_SPENT_ON.update(dict.fromkeys(TIME_SPENT_ON, 0))

# Find names
for image
if not verified:
    look for verified name
    if not guessed name:
        look for guessed name:
        
        

In [None]:
# Main naming pass.  For every image (in priority order) of a PDF that is not
# yet verified: read its text, look for a verified name, and otherwise fall
# back to a guessed name or to the "failed" folder.
print("""Reading images (or reading cached texts). 
    Note that this is done in increasing order of size,
    so the given time estimate will often be too optimistic,
    especially while reading cached texts. :/""")
allVerifiedNames = dict() # year -> folder names found under OUT_PATH_VERIFIED/year/
for imagePath in tqdm(imagePaths):
    # The image's folder mirrors the source PDF's path without ".pdf".  Cut at
    # the last '/' (the image file name) so that a PDF whose own name starts
    # with "page" cannot be mistaken for the image file.
    inPdfPath = imagePath.replace(ROTATED_IMG_PATH,IN_PATH).rsplit('/', 1)[0]+'.pdf'
    if inPdfPath in verifiedInPdfs or not os.path.exists(inPdfPath): # ignore PDFs which no longer exist
        continue
    year = getYearFromString(inPdfPath)
    yearOutFolder = OUT_PATH_VERIFIED+year+'/'
    if year in allVerifiedNames:
        verifiedNames = allVerifiedNames[year]
    elif os.path.exists(yearOutFolder):
        verifiedNames = allVerifiedNames[year] = os.listdir(yearOutFolder)
    else:
        verifiedNames = []
    failPdfPath = inPdfPath.replace(IN_PATH,OUT_PATH_FAIL)
    result = readImage(imagePath) # reads cached text or runs OCR on image
    # Reset for every image: without this the check further down would read
    # an undefined name, or the outcome of a previous image.
    verifiedName = False
    if verifiedNames:
        startTime = time()
        verifiedName = nameInListInResult(verifiedNames, result)
        TIME_SPENT_ON['INTERPRETING TEXT'] += (time()-startTime)
        if verifiedName:
            outPath = standardPath = OUT_PATH_VERIFIED+year+'/'+verifiedName+'/'+verifiedName+SUFFIX+'.pdf'
            whichScan = 1
            while os.path.isfile(outPath):
                same = cmp(inPdfPath, outPath, shallow = False)
                assert not same, "Please delete duplicates from:"+IN_PATH
                whichScan += 1
                # Use this loop's own counter so each retry tries a new file name.
                outPath = standardPath.replace(SUFFIX+'.pdf', SUFFIX+' Scan {}.pdf'.format(whichScan))
            verifiedInPdfs.add(inPdfPath)
            smartCopy(inPdfPath, outPath)
            if os.path.isfile(failPdfPath):
                os.remove(failPdfPath)
            # A verified copy supersedes any copy filed earlier under a guessed name.
            if os.path.exists(OUT_PATH_GUESSED_NAME):
                for unverifiedFile in filePathsIn(OUT_PATH_GUESSED_NAME):
                    if cmp(outPath, unverifiedFile, SHALLOW_FILE_COMPARE):
                        os.remove(unverifiedFile)
    # if name not verified, try guessing name for now
    if not verifiedName and inPdfPath not in guessedInPdfs:
        startTime = time()
        guessedName = nameInTranscript(result)
        if not guessedName:
            guessedName = nameInOrders(result)
            if guessedName: guessedName = flipName(guessedName)
        TIME_SPENT_ON['INTERPRETING TEXT'] += (time()-startTime)
        if not guessedName:
            smartCopy(inPdfPath, failPdfPath)
        else: # if guessed name
            outPath = standardPath = OUT_PATH_GUESSED_NAME+year+'/'+guessedName+'/'+guessedName+SUFFIX+'.pdf'
            i = 0
            while os.path.isfile(outPath):
                same = cmp(inPdfPath, outPath, shallow = False)
                assert not same, "Please delete duplicates from:"+IN_PATH
                i += 1
                outPath = standardPath.replace(SUFFIX+'.pdf', SUFFIX+' Scan {}.pdf'.format(i))
            guessedInPdfs.add(inPdfPath)
            smartCopy(inPdfPath, outPath)
            if os.path.isfile(failPdfPath): os.remove(failPdfPath)

def _countFilesIn(folderPath, substringNeeded = ''):
    """Count the files under folderPath; a folder that was never created counts as empty."""
    if not os.path.exists(folderPath):
        return 0
    return len(filePathsIn(folderPath, substringNeeded))

print(_countFilesIn(OUT_PATH_VERIFIED,SUFFIX),"files have been named and verified.")
print(_countFilesIn(OUT_PATH_GUESSED_NAME),"files have been named but not verified.")
print(_countFilesIn(OUT_PATH_FAIL),"files could not be read","\n")

In [None]:
inPdfPath in guessedInPdfs

In [None]:
TIME_SPENT_ON