In [21]:
from google.cloud import vision
from google.cloud import storage
from PIL import Image
import numpy as np

# Verbosity switch read by RemoveSpecialCharacters, ReplaceNumber and
# ReplaceLetter: when True they print extra per-character diagnostic messages.
diagnostic = False

In [22]:
def CroppedImage(file):
    """Crop a photograph down to the region holding the label and save it.

    The kept region is the middle half of the width (from 1/4 to 3/4 of the
    width) and the top third of the height. The result is written to
    '<file minus its last four characters>_Cropped.JPG'.

    Args:
        file: Path to the source image. The last four characters are dropped
            to build the output name, so a path ending in a dot plus a
            three-letter extension (e.g. '.JPG') is expected.

    Returns:
        None. The cropped image is written to disk as a side effect.
    """
    # The context manager closes the source file handle once the crop has
    # been saved, instead of leaving it open for the life of the kernel.
    with Image.open(file) as img:
        width, height = img.size

        left = 1 * width / 4
        right = 3 * width / 4
        top = 0
        bottom = height / 3

        cropped_image = img.crop((left, top, right, bottom))
        # Keep this naming in step with ReadImage, which rebuilds the same path.
        cropped_image.save('%s_Cropped.JPG' %file[:-4])

In [23]:
def ReadImage(path):
    """Crop the image at ``path``, OCR the cropped label and return its text.

    The image is cropped with CroppedImage, the cropped file is sent to Google
    Cloud Vision document text detection with the language hint "ko", and the
    detected words are joined with '-' and upper-cased. The text and the
    product of the word confidences are also printed.

    Relies on the module-level ``vision`` and ``np`` imports.

    Args:
        path: Path to the source image. The cropped file is read from
            '<path minus its last four characters>_Cropped.JPG', which is the
            name CroppedImage writes.

    Returns:
        A tuple ``(text, word_confidences)``: the upper-cased words joined by
        '-', and the list of per-word confidence values in the order read.

    Raises:
        Exception: If the Vision response carries an error message.
    """
    CroppedImage(path)
    # Build the cropped file name from the argument rather than from a
    # notebook-level global, so the function works for any path and on a
    # fresh kernel.
    cropped_path = '%s_Cropped.JPG' %path[:-4]

    client = vision.ImageAnnotatorClient()

    with open(cropped_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    response = client.document_text_detection(image=image, image_context={"language_hints": ["ko"]})

    # Check for an API error before walking the annotation.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    words = []
    word_confidences = []

    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    # A word is delivered as a sequence of symbols; glue them together.
                    words.append("".join(symbol.text for symbol in word.symbols))
                    word_confidences.append(word.confidence)

    output_string = '-'.join(words).upper()

    print('Output:', output_string, "\nConfidence:", np.prod(word_confidences))
    return output_string, word_confidences

In [24]:
def FindErrors(output_string):
    """Clean a raw OCR string and assemble the final 'title-identity-date' label.

    The string is stripped of special characters, split on '-', filtered to
    paragraphs of a plausible length, and the three remaining paragraphs are
    checked by TitleErrors, IdentityErrors and DateErrors respectively.

    Args:
        output_string: The '-'-joined text returned by ReadImage.

    Returns:
        The label string '<title>-<identity>-<date>', or the integer 1 when the
        text does not yield exactly three paragraphs or when one of the
        per-field checks reports failure.
    """
    output_string = RemoveSpecialCharacters(output_string)

    output_split = RemoveDeadElements(output_string.split('-'))

    if len(output_split) != 3:
        print("Incorrect number of paragraphs. There are %i, there should be three (3)" %len(output_split))
        return 1

    print("\nCorrect number of paragraphs.")

    title, identity, whole_date = output_split

    # Run all three checks even if an early one fails so every problem is reported.
    title = TitleErrors(title)
    identity = IdentityErrors(identity)
    whole_date = DateErrors(whole_date)

    # The field checks signal failure by returning the integer 1. Propagate
    # that instead of formatting the sentinel into the label
    # (which would yield e.g. '1-1R7V-01/21/13').
    if any(part == 1 for part in (title, identity, whole_date)):
        print("\nNo label produced: at least one field failed its checks.")
        return 1

    label = "%s-%s-%s" %(str(title), str(identity), str(whole_date))
    print("\nFinal label is:", label)

    return label

In [25]:
def RemoveSpecialCharacters(output_string):
    """Strip every character that the label format does not use.

    Only capital letters A-Z, digits 0-9, '/' and '-' are kept. Each dropped
    character is reported on stdout; kept characters are reported only when
    the module-level ``diagnostic`` flag equals True.

    Returns the filtered string.
    """
    print("\nRemoving special characters from the output string (e.g. '.', '|').")

    permitted = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789/-")
    kept = []

    for character in output_string:
        if character not in permitted:
            print("Character %s has been removed." %character)
            continue

        if diagnostic == True:
            print("Character %s is fine." %character)
        kept.append(character)

    return ''.join(kept)

def RemoveDeadElements(output_split):
    """Drop paragraphs whose length cannot belong to a label field.

    Paragraphs of length three, four, five or eight are kept, in their
    original order; everything else (typically left-overs from special
    characters the OCR read in the image) is discarded and reported on stdout.

    Returns a new list with the surviving paragraphs.
    """
    print("\nRemoving paragraphs without three (3), four (4), five (5) or eight (8) elements.")

    valid_lengths = (3, 4, 5, 8)
    survivors = []

    for paragraph in output_split:
        if len(paragraph) in valid_lengths:
            survivors.append(paragraph)
            continue

        if paragraph == "":
            print("Removed empty paragraph")
            continue

        print("Paragraph removed:", paragraph)
        print("Length of removed paragraph:", len(paragraph))

    print("Label after removing dead paragraphs:", survivors)

    return survivors


In [26]:
def TitleErrors(title):
    """Check the title field of a label.

    A title is expected to be three characters long and made only of English
    capital letters. A wrong length is reported but does not stop the check;
    the first character outside A-Z is reported and aborts it.

    Returns the title unchanged, or the integer 1 if it contains a character
    outside A-Z.
    """
    print("\nLooking for errors in the title (%s)." %title)

    title_length = len(title)
    if title_length != 3:
        print("Incorrect title length. Title has %i elements, three (3) are required." %title_length)

    # Only the first offending character is reported.
    first_bad = next((c for c in title if c < 'A' or c > 'Z'), None)
    if first_bad is not None:
        print("Incorrect character (%s) in the title" %first_bad)
        return 1

    print("Final title:", title)

    return title

def IdentityErrors(identity):
    """Check the identity field of a label and repair it where possible.

    The identity is expected to be four or five characters long and to follow
    the pattern [number, letter, number, letter, number*], the last number
    being optional. Accepted numbers are 1-8; accepted letters are
    B, F, G, K, N, O, P, R, S, V, W and Y. A wrong length is only reported.
    A character that does not fit its position is handed to ReplaceLetter
    (number positions) or ReplaceNumber (letter positions).

    Returns the identity after those substitution attempts.
    """
    print("\nLooking for errors in the identity (%s)." %identity)

    valid_digits = tuple('12345678')
    valid_letters = tuple('BFGKNOPRSVWY')
    digit_slots = (0, 2, 4)
    letter_slots = (1, 3)

    if len(identity) not in (4, 5):
        print("Incorrect identity length. Identity has %i elements, four (4) or five (5) are required." %len(identity))

    # Walk the string as it was received; `identity` is rebound on each repair.
    received = identity
    for i in range(len(received)):
        character = received[i]

        if i in digit_slots:
            if character in valid_digits:
                continue
            identity = ReplaceLetter(identity, i)
        elif i in letter_slots:
            if character in valid_letters:
                continue
            identity = ReplaceNumber(identity, i)
        else:
            # Positions past index 4 are not checked.
            continue

        print("Character %s at index %s has been replace with %s" %(character, i, identity[i]))

    print("Final identity:", identity)

    return identity

def DateErrors(whole_date):
    """Check the date field of a label against the layout mm/dd/yy.

    The date must be exactly eight characters long. Positions 2 and 5 are
    expected to hold '/', every other position a digit 0-9. Characters that
    break this layout are reported on stdout but do not change the result.

    Returns the date unchanged, or the integer 1 if its length is not eight.
    """
    print("\nLooking for errors in the date (%s)." %whole_date)

    date_length = len(whole_date)
    if date_length != 8:
        print("length of the date is incorrect. Current length is %i, required length is eight (8)." %date_length)
        return 1

    separator_slots = (2, 5)
    for i in range(8):
        character = whole_date[i]

        if i in separator_slots:
            fits = character == '/'
        else:
            fits = '0' <= character <= '9'

        if not fits:
            print("Invalid date. Date contains %s at index %i" %(character, i))

    print("Final date:", whole_date)
    return whole_date
    

In [27]:
# Look-up tables for characters the OCR tends to confuse. Each entry is a
# two-item list whose second item is the character that ReplaceNumber
# (numbers_to_letters) or ReplaceLetter (letters_to_numbers) writes back.
numbers_to_letters = [
    ['0', 'O'],
    ['1', 'I'],
    ['2', 'S'],
    ['3', 'B'],
    ['4', 'Y'],
    ['5', 'S'],
    ['6', 'G'],
    ['7', 'Y'],
]
letters_to_numbers = [
    ['G', '6'],
    ['B', '3'],
    ['S', '5'],
    ['Y', '7'],
    ['T', '1'],
    ['A', '7'],
    ['Z', '2'],
]

def ReplaceNumber(identity, i):
    """Swap the character at index ``i`` for its look-alike from numbers_to_letters.

    Used where a letter is expected but something else was read
    (e.g. '0' becomes 'O', '2' becomes 'S'). Every table entry that contains
    the character is considered and the last one wins; if no entry contains
    it, the identity is returned unchanged.
    """
    number_error = identity[i]
    print("Replacing erroneous number (%s) with a matched alternative." %number_error)

    # Second item of every table entry that mentions the character.
    replacements = [pair[1] for pair in numbers_to_letters if number_error in pair]

    for new_character in replacements:
        if diagnostic == True:
            print("Replaced %s with %s at index %s" %(number_error, new_character, i))

    if replacements:
        characters = list(identity)
        characters[i] = replacements[-1]
        identity = ''.join(characters)

    return identity

def ReplaceLetter(identity, i):
    """Swap the character at index ``i`` for its look-alike from letters_to_numbers.

    Counterpart of ReplaceNumber for positions where a number is expected
    (e.g. 'T' becomes '1', 'A' becomes '7'). Every table entry that contains
    the character is considered and the last one wins; if no entry contains
    it, the identity is returned unchanged.
    """
    letter_error = identity[i]
    print("Replacing erroneous letter (%s) at index %s with a matched alternative." %(letter_error, i))

    # Second item of every table entry that mentions the character.
    replacements = [pair[1] for pair in letters_to_numbers if letter_error in pair]

    for new_character in replacements:
        if diagnostic == True:
            print("Replacing '%s' with '%s' at index %s" %(letter_error, new_character, i))

    if replacements:
        characters = list(identity)
        characters[i] = replacements[-1]
        identity = ''.join(characters)

    return identity

In [28]:
# Run the whole pipeline on a single photograph: OCR the cropped label with
# ReadImage, then clean the text into a label with FindErrors.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file = '/Users/jordan/Desktop/Guppies_Home/Data/DSC_1219.JPG'
output_string, word_confidences = ReadImage(file)
# FindErrors returns the integer 1 rather than a label string when the
# paragraph count is wrong, so `label` is not guaranteed to be a string.
label = FindErrors(output_string)

Output: MCA-✩-TRAV-01/21/13 
Confidence: 0.17012289877775902

Removing special characters from the output string (e.g. '.', '|').
Character ✩ has been removed.

Removing paragraphs without three (3), four (4), five (5) or eight (8) elements.
Removed empty paragraph
Label after removing dead paragraphs: ['MCA', 'TRAV', '01/21/13']

Correct number of paragraphs.

Looking for errors in the title (MCA).
Final title: MCA

Looking for errors in the identity (TRAV).
Replacing erroneous letter (T) at index 0 with a matched alternative.
Character T at index 0 has been replace with 1
Replacing erroneous letter (A) at index 2 with a matched alternative.
Character A at index 2 has been replace with 7
Final identity: 1R7V

Looking for errors in the date (01/21/13).
Final date: 01/21/13

Final label is: MCA-1R7V-01/21/13


In [20]:
label

'MCA-1R7V-01/21/13'