# Import

In [6]:
import os

# import the necessary packages
import cv2
from google.cloud import documentai_v1beta2 as documentai
from google.oauth2 import service_account #Control API Keys
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFont

# API Setup

In [7]:
# Path of the service-account JSON key used to authenticate against Google Cloud.
# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable to point at your
# own key file; the original local path is kept as the fallback so an existing
# setup keeps working without any change.
keyDIR = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/kunal/Documents/VdartResumeProject/APIKEYSGOOGLE/resumeMatcher-documentAI.json",
)

In [8]:
# Build explicit credentials from the service-account key file at keyDIR.
credentials = service_account.Credentials.from_service_account_file(keyDIR) # authenticate to Google with the service account
# Document Understanding client (documentai_v1beta2) bound to those credentials.
client = documentai.DocumentUnderstandingServiceClient(credentials=credentials)
# The document to process is read from this Cloud Storage object.
gcs_source = documentai.types.GcsSource(uri="gs://document_ai_resume/Document_44.pdf")

In [9]:
# Describe the request input: where the file lives (gcs_source) and its format.
# mime_type can be application/pdf, image/tiff,
# image/gif, or application/json
input_config = documentai.types.InputConfig(gcs_source=gcs_source, mime_type='application/pdf')

In [10]:
def _get_text(el, document):
    """Doc AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.
    """
    response = ''
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in el.text_anchor.text_segments:
        start_index = segment.start_index
        end_index = segment.end_index
        response += document.text[start_index:end_index]
    return response

# Setting up for snipping

In [11]:
# Module-level state shared with the mouse callback below:
#   refPt    - corner points of the current selection (start, then end)
#   cropping - True while the left mouse button is held down
refPt = []
cropping = False
def click_and_crop(event, x, y, flags, param):
    """Mouse callback that records both ends of a left-button drag.

    Pressing the left button starts a new selection at (x, y);
    releasing it appends (x, y) as the end point. Every other event is
    ignored. ``flags`` and ``param`` are accepted but not used.
    """
    global refPt, cropping

    if event == cv2.EVENT_LBUTTONDOWN:
        # Button pressed: begin a fresh selection at this position.
        cropping = True
        refPt = [(x, y)]
        return

    if event != cv2.EVENT_LBUTTONUP:
        return

    # Button released: the drag is over, store where it ended.
    cropping = False
    refPt.append((x, y))

# Interactive region selection

In [12]:
def calcMatrixFromPic(pathImg, sizePortion, disply):
    """Let the user drag a rectangle on an image and return it in full-size pixels.

    The image is shown in an OpenCV window, shrunk by ``sizePortion``. The
    user drags with the left mouse button (handled by ``click_and_crop``,
    which fills the module-level ``refPt``), presses "r" to discard the
    current selection and "c" to confirm it. The confirmed region is then
    previewed in a second window until any key is pressed.

    Parameters:
        pathImg: path of the image file to open.
        sizePortion: factor by which the image is shrunk for display; the
            selected coordinates are multiplied by it before being returned.
        disply: when true, the selected region cropped from the full-size
            image is passed to ``display`` (expected to be the notebook's
            built-in display helper; it is not imported in this file).

    Returns:
        ``(left, top, right, bottom)`` of the selection in pixels of the
        original image, with left <= right and top <= bottom regardless of
        the direction in which the rectangle was dragged.

    Raises:
        ValueError: if "c" is pressed before a rectangle with non-zero area
            has been dragged.
    """
    print("New Window Opened")
    print("Just click and hold where you want the top left of the picture selected and  drag to the bottom right end of " +
          "where you want the rectangual to end.\n Once you let go of holding the click, double click \"C\" and done")

    # Forget any selection left over from an earlier call, so that pressing
    # "c" straight away cannot silently reuse stale coordinates.
    refPt.clear()

    fullImg = Image.open(pathImg)
    fullWidth, fullHeight = fullImg.size

    # Load the image and shrink it for display. The clone is taken AFTER the
    # resize so that it shares the coordinate system of the mouse events.
    image = cv2.imread(pathImg)
    image = cv2.resize(image, (int(fullWidth/sizePortion), int(fullHeight/sizePortion)))
    clone = image.copy()
    shownHeight, shownWidth = clone.shape[:2]

    cv2.namedWindow("image")
    cv2.setMouseCallback("image", click_and_crop)
    try:
        # Keep looping until the 'c' key is pressed.
        while True:
            cv2.imshow("image", image)
            key = cv2.waitKey(1) & 0xFF
            if key == ord("r"):
                # Reset: restore the clean image and drop the selection.
                image = clone.copy()
                refPt.clear()
            elif key == ord("c"):
                break

        if len(refPt) != 2:
            raise ValueError("No rectangle was selected: drag with the left mouse button before pressing \"c\"")

        # Clamp to the displayed image (a drag may end outside the window)
        # and order the corners so a reversed drag still yields a valid box.
        left, right = sorted(min(max(pt[0], 0), shownWidth) for pt in refPt)
        top, bottom = sorted(min(max(pt[1], 0), shownHeight) for pt in refPt)
        if left == right or top == bottom:
            raise ValueError("The selected rectangle has no area: drag from one corner to the opposite one")

        # Preview the selected region until any key is pressed.
        cv2.imshow("ROI", clone[top:bottom, left:right])
        cv2.waitKey(0)
    finally:
        # Close all OpenCV windows, also when the selection was invalid.
        cv2.destroyAllWindows()

    # Scale the selection back up to the coordinates of the original image.
    box = (left*sizePortion, top*sizePortion, right*sizePortion, bottom*sizePortion)
    if disply:
        display(fullImg.convert("RGB").crop(box))
    return box

In [13]:
def convert2CornersTuple2Matrix4Corner(cornersXY):
    """Expand a two-corner box into its four corner points.

    Parameters:
        cornersXY: indexable ``(x0, y0, x1, y1)`` where (x0, y0) and
            (x1, y1) are opposite corners.

    Returns:
        A tuple of four (x, y) tuples in the order
        (x0, y0), (x1, y0), (x1, y1), (x0, y1).
    """
    x0, y0 = cornersXY[0], cornersXY[1]
    x1, y1 = cornersXY[2], cornersXY[3]
    first = (x0, y0)
    second = (x1, y0)
    third = (x1, y1)
    fourth = (x0, y1)
    return (first, second, third, fourth)

In [14]:
def convertMatrix2NormalizedVertices(matrix, width, height, page_number=1):
    """Build a Document AI table bound hint from four pixel corners.

    Providing bounding boxes that say where a table appears in the
    document is optional, but improves table parsing results.

    Parameters:
        matrix: indexable of at least four (x, y) corner points in pixels,
            in the order top left, top right, bottom right, bottom left.
        width: width in pixels of the image the corners were measured on.
        height: height in pixels of that image.
        page_number: page of the document the hint applies to. Defaults
            to 1, which is what this function always used before the
            parameter existed.

    Returns:
        A ``documentai.types.TableBoundHint`` whose bounding polygon holds
        the four corners as normalized vertices.
    """
    # Each vertex coordinate must be a number between 0 and 1, hence the
    # division by the image dimensions.
    normalized_vertices = [
        documentai.types.geometry.NormalizedVertex(
            x=matrix[corner][0]/width,
            y=matrix[corner][1]/height
        )
        for corner in range(4)
    ]
    return documentai.types.TableBoundHint(
        page_number=page_number,
        # Polygon around the table to detect.
        bounding_box=documentai.types.BoundingPoly(
            normalized_vertices=normalized_vertices
        )
    )

# Manual bound selection (optional)

In [15]:
#XY2MATRIX = calcMatrixFromPic(imgPath, 2, True)
#matrix = convert2CornersTuple2Matrix4Corner(XY2MATRIX)
#bound = convertMatrix2NormalizedVertices(matrix)
#print(matrix)
#table_bound_hints.append(bound)

In [16]:
table_bound_hints = []

# Automatic

In [17]:
# Image on which the table bounds are drawn by hand.
# NOTE(review): presumably a JPEG rendering of page 1 of Document_44.pdf - confirm.
imgPath = "/Users/kunal/Documents/VdartResumeProject/VisionAPi/Document_44_1.jpg"
# Start with no table bound hints.
table_bound_hints = []

In [18]:
# Ask repeatedly whether another table bound should be drawn.
# An affirmative answer ("", "yes" or "Yes") has to be given twice: the first
# one only arms doubleCheck, the second one opens the selection window.
# "no" / "No" ends the loop; any other answer is ignored and asked again.
doubleCheck = False
while True:
    newBound = input("Do you want a new bound")
    if newBound in ("no", "No"):
        break
    if newBound not in ("", "yes", "Yes"):
        # Unrecognised answer: leave doubleCheck as it is and ask again.
        continue
    if not doubleCheck:
        doubleCheck = True
        continue
    # Confirmed: let the user draw the rectangle and turn it into a hint.
    XY2MATRIX = calcMatrixFromPic(imgPath, 2, True)
    matrix = convert2CornersTuple2Matrix4Corner(XY2MATRIX)
    img = Image.open(imgPath)
    x, y = img.size
    bound = convertMatrix2NormalizedVertices(matrix, x, y)
    print(matrix)
    table_bound_hints.append(bound)
    doubleCheck = False

Do you want a new boundno


# Run table extraction and collect the results

In [19]:
# enabled=True switches table extraction on; the bound hints collected so far
# (possibly none) are passed along with it.
table_extraction_params = documentai.types.TableExtractionParams(
    enabled=True,
    table_bound_hints=table_bound_hints,
)
# The location segment of the parent can be 'us' or 'eu'.
parent = 'projects/{}/locations/us'.format("resumematcher")
request = documentai.types.ProcessDocumentRequest(
    table_extraction_params=table_extraction_params,
    input_config=input_config,
    parent=parent,
)
# Both names refer to the same processed document.
documentFormParser = document = client.process_document(request=request)

In [35]:
# Walk every table of every page and collect its rows in two shapes:
#   dfArray     - flat list of printable lines (page, table, header and body rows)
#   tableGroups - one [tablesPerPage, page_number] entry per page, where
#                 tablesPerPage holds [singleTable, table_num] entries and
#                 singleTable holds [kind, row_num, text] entries
dfArray = []
tableGroups = []
for page in document.pages:
    tablesPerPage = []
    dfArray.append('Page number: {}'.format(page.page_number))
    for table_num, table in enumerate(page.tables):
        dfArray.append('Table {}: '.format(table_num))
        singleTable = []
        # Header rows first, then body rows; each kind has its own label.
        for kind, lineFormat, rows in (
            ("Header", 'Header Row {}: {}', table.header_rows),
            ("Row", 'Row {}: {}', table.body_rows),
        ):
            for row_num, row in enumerate(rows):
                # The text of all cells of a row is concatenated without a separator.
                cells = ''.join(
                    _get_text(cell.layout, documentFormParser) for cell in row.cells
                )
                dfArray.append(lineFormat.format(row_num, cells))
                singleTable.append([kind, row_num, cells])
        tablesPerPage.append([singleTable, table_num])
    tableGroups.append([tablesPerPage, page.page_number])

In [37]:
import pandas as pd

In [39]:
dfTables = pd.DataFrame(dfArray)

In [40]:
dfTables

Unnamed: 0,0
0,Page number: 1
1,Table 0:
2,"Header Row 0: Finance Consultant, Consulting E..."
3,Row 0: • Interim o o o Director Of Finance & I...
4,Row 1: 95% decrease in quarterly commissions c...
5,"Row 2: • IT Finance Consultant, McKesson (Enga..."
6,Row 3: o o o Issue & Scope: IT Business Unit h...
7,"Row 4: • IT Finance Manager, Floor & Décor (En..."
8,Row 5: o Issue & Scope: IT Business Unit had l...
9,"Row 6: o o Tools & Analysis: Excel, with integ..."


In [33]:
# Print every table row of the document and collect the text of the header
# rows (only the header rows) in workExperience.
workExperience = []
for page in document.pages:
    print('Page number: {}'.format(page.page_number))
    for table_num, table in enumerate(page.tables):
        print('Table {}: '.format(table_num))

        headerTexts = [
            ''.join(_get_text(cell.layout, documentFormParser) for cell in row.cells)
            for row in table.header_rows
        ]
        for row_num, cells in enumerate(headerTexts):
            print('Header Row {}: {}'.format(row_num, cells))
        workExperience.extend(headerTexts)

        bodyTexts = [
            ''.join(_get_text(cell.layout, documentFormParser) for cell in row.cells)
            for row in table.body_rows
        ]
        for row_num, cells in enumerate(bodyTexts):
            print('Row {}: {}'.format(row_num, cells))

Page number: 1
Table 0: 
Header Row 0: Finance Consultant, Consulting Engagements (3 Firms), 2018-2020

Row 0: • Interim o o o Director Of Finance & Interim Controller, Aptitude Health (Engaged by RGP), 2019-2020
Issue & Scope: Director Of Finance for $17 Million multi-entity organization resigned with short notice
Tools & Analysis: Intacct & Excel
Findings & Results: Fulfilled month-end & year-end responsibilities with minimal training & guidance,
enabling organization to continue financial operations without disruption; Performed analysis that yielded

Row 1: 95% decrease in quarterly commissions calculations time; Improved financial processes

Row 2: • IT Finance Consultant, McKesson (Engaged by Strive Consulting), 2019

Row 3: o o o Issue & Scope: IT Business Unit had limited visibility into financial results for $140 Million budget
Tools & Analysis: Excel, with integrated PowerPoint refresh, of financial & project/program data
Findings & Results: Built streamlined processes to inc

In [34]:
workExperience

['Finance Consultant, Consulting Engagements (3 Firms), 2018-2020\n',
 '• Ally Financial (Engaged by Darton Group): Achieved Basel II compliance by completing 1,000-item data lineage\n',
 'Excel: Nested conditionals (IF, SUMIF, SUMIFS, etc.); Referencing (VLOOKUP, HLOOKUP); Pivots; Light Macros\n',
 'MBA in Management, University of North Carolina at Charlotte, 2006\nM.S. Economics, University of North Carolina at Charlotte, 2005\n']

In [None]:
workExperience 

In [23]:
len(tableGroups)

2

In [24]:
len(tableGroups[0])

2

In [29]:
print(tableGroups[0][0][0])

[[['Header', 0, 'Finance Consultant, Consulting Engagements (3 Firms), 2018-2020\n'], ['Row', 0, '• Interim o o o Director Of Finance & Interim Controller, Aptitude Health (Engaged by RGP), 2019-2020\nIssue & Scope: Director Of Finance for $17 Million multi-entity organization resigned with short notice\nTools & Analysis: Intacct & Excel\nFindings & Results: Fulfilled month-end & year-end responsibilities with minimal training & guidance,\nenabling organization to continue financial operations without disruption; Performed analysis that yielded\n'], ['Row', 1, '95% decrease in quarterly commissions calculations time; Improved financial processes\n'], ['Row', 2, '• IT Finance Consultant, McKesson (Engaged by Strive Consulting), 2019\n'], ['Row', 3, 'o o o Issue & Scope: IT Business Unit had limited visibility into financial results for $140 Million budget\nTools & Analysis: Excel, with integrated PowerPoint refresh, of financial & project/program data\nFindings & Results: Built streamli

In [27]:
for i in tableGroups[0]:
    print(i)

[[[['Header', 0, 'Finance Consultant, Consulting Engagements (3 Firms), 2018-2020\n'], ['Row', 0, '• Interim o o o Director Of Finance & Interim Controller, Aptitude Health (Engaged by RGP), 2019-2020\nIssue & Scope: Director Of Finance for $17 Million multi-entity organization resigned with short notice\nTools & Analysis: Intacct & Excel\nFindings & Results: Fulfilled month-end & year-end responsibilities with minimal training & guidance,\nenabling organization to continue financial operations without disruption; Performed analysis that yielded\n'], ['Row', 1, '95% decrease in quarterly commissions calculations time; Improved financial processes\n'], ['Row', 2, '• IT Finance Consultant, McKesson (Engaged by Strive Consulting), 2019\n'], ['Row', 3, 'o o o Issue & Scope: IT Business Unit had limited visibility into financial results for $140 Million budget\nTools & Analysis: Excel, with integrated PowerPoint refresh, of financial & project/program data\nFindings & Results: Built streaml

In [31]:
# Each tableGroups entry is a two-element [tables, page_number] list:
# print the page number, then every table entry of that page.
for tablesOnPage, pageNumber in tableGroups:
    print("Page: " + str(pageNumber))
    for tableEntry in tablesOnPage:
        print(tableEntry)

Page: 1
[[['Header', 0, 'Finance Consultant, Consulting Engagements (3 Firms), 2018-2020\n'], ['Row', 0, '• Interim o o o Director Of Finance & Interim Controller, Aptitude Health (Engaged by RGP), 2019-2020\nIssue & Scope: Director Of Finance for $17 Million multi-entity organization resigned with short notice\nTools & Analysis: Intacct & Excel\nFindings & Results: Fulfilled month-end & year-end responsibilities with minimal training & guidance,\nenabling organization to continue financial operations without disruption; Performed analysis that yielded\n'], ['Row', 1, '95% decrease in quarterly commissions calculations time; Improved financial processes\n'], ['Row', 2, '• IT Finance Consultant, McKesson (Engaged by Strive Consulting), 2019\n'], ['Row', 3, 'o o o Issue & Scope: IT Business Unit had limited visibility into financial results for $140 Million budget\nTools & Analysis: Excel, with integrated PowerPoint refresh, of financial & project/program data\nFindings & Results: Built 

In [21]:
tableGroups

[[[[[['Header',
      0,
      'Finance Consultant, Consulting Engagements (3 Firms), 2018-2020\n'],
     ['Row',
      0,
      '• Interim o o o Director Of Finance & Interim Controller, Aptitude Health (Engaged by RGP), 2019-2020\nIssue & Scope: Director Of Finance for $17 Million multi-entity organization resigned with short notice\nTools & Analysis: Intacct & Excel\nFindings & Results: Fulfilled month-end & year-end responsibilities with minimal training & guidance,\nenabling organization to continue financial operations without disruption; Performed analysis that yielded\n'],
     ['Row',
      1,
      '95% decrease in quarterly commissions calculations time; Improved financial processes\n'],
     ['Row',
      2,
      '• IT Finance Consultant, McKesson (Engaged by Strive Consulting), 2019\n'],
     ['Row',
      3,
      'o o o Issue & Scope: IT Business Unit had limited visibility into financial results for $140 Million budget\nTools & Analysis: Excel, with integrated PowerPoi