In [1]:
import os

from google.cloud import documentai_v1beta2 as documentai
from google.oauth2 import service_account #Control API Keys

In [2]:
# Path to the service-account JSON key file.
# The GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so
# the notebook is not tied to one machine's directory layout; the original
# local path is kept as the fallback when the variable is not set.
keyDIR = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/kunal/Documents/VdartResumeProject/APIKEYSGOOGLE/resumeMatcher-documentAI.json",
)

In [4]:
# Load the service-account credentials from the key file at keyDIR and hand
# them to the Document AI client.
credentials = service_account.Credentials.from_service_account_file(
    keyDIR
)
client = documentai.DocumentUnderstandingServiceClient(
    credentials=credentials
)
# Cloud Storage URI of the PDF to process.
gcs_source = documentai.types.GcsSource(
    uri="gs://document_ai_resume/Document_619.pdf"
)

In [5]:
# Input description for the request: the Cloud Storage source plus its type.
# mime_type can be application/pdf, image/tiff,
# and image/gif, or application/json
input_config = documentai.types.InputConfig(
    mime_type='application/pdf',
    gcs_source=gcs_source,
)

In [6]:
def _get_text(el, document):
    """Doc AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.
    """
    response = ''
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in el.text_anchor.text_segments:
        start_index = segment.start_index
        end_index = segment.end_index
        response += document.text[start_index:end_index]
    return response

In [7]:
# Key-value pair hints can improve form parsing results.
# Each hint's key is text likely to appear in the document as a form field
# name (i.e. "DOB"). Value types are optional, but can be one or more of:
# ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
# NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
# The example below is disabled by wrapping it in a string literal.
"""key_value_pair_hints = [
    documentai.types.KeyValuePairHint(key='Emergency Contact',value_types=['NAME']),
    documentai.types.KeyValuePairHint(key='Referred By')
]"""

# enabled=True switches form extraction on; no hints are passed.
form_extraction_params = documentai.types.FormExtractionParams(
    enabled=True,  # key_value_pair_hints=key_value_pair_hints
)

# Location can be 'us' or 'eu'
parent = 'projects/{}/locations/us'.format("resumematcher")
request = documentai.types.ProcessDocumentRequest(
    form_extraction_params=form_extraction_params,
    input_config=input_config,
    parent=parent,
)

# Run the form parser; the response is kept for the cells that follow.
documentFormParser = client.process_document(request=request)

In [8]:
# Print every detected entity and collect
# [mention_text, confidence, [[n, start_index, end_index], ...]] per entity.
entitiesDetectedGoogleDocumentAI = []
print("URI INFO: {}".format(documentFormParser.uri))
print("Text detected (first 100): {}".format(documentFormParser.text[:100]))
for ent in documentFormParser.entities:
    print("Entity {}: {} \n\tConifd: {}".format(ent.mention_id, ent.mention_text, ent.confidence))
    tempPositions = []
    # Segments are numbered from 1 in both the printout and the stored list.
    for ctDetected, position in enumerate(ent.text_anchor.text_segments, start=1):
        print("\tDetected {}: {} -- {}".format(ctDetected, position.start_index, position.end_index))
        tempPositions.append([ctDetected, position.start_index, position.end_index])
    entitiesDetectedGoogleDocumentAI.append([ent.mention_text, ent.confidence, tempPositions])

URI INFO: 
Text detected (first 100): Serhii Kalinin
1180 St Clair Ave West,
Toronto, ON M6E 1B4
E-mail: kalininsergg92@gmail.com
Phone: 4
Entity 0: kalininsergg92@gmail.com 
	Conifd: 0.966052770614624
	Detected 1: 59 -- 91
Entity 1: 437-231-58-07 
	Conifd: 0.9390579462051392
	Detected 1: 92 -- 112
Entity 2: 05/27/1992 
	Conifd: 0.48646771907806396
	Detected 1: 113 -- 137
Entity 3: Jan 2020 — Dec 2020 
	Conifd: 0.18704494833946228
	Detected 1: 3714 -- 3752
Entity 4: 6, 2018 — March 1, 2018 
	Conifd: 0.23073223233222961
	Detected 1: 3762 -- 3790
Entity 5: - fluent 
	Conifd: 0.2284661829471588
	Detected 1: 4758 -- 4774
Entity 6: - with a dictionary 
	Conifd: 0.11633188277482986
	Detected 1: 4775 -- 4801


In [9]:
def findEntitiesDocumentAIG(documentFormParser):
    """Summarise the entities of a processed document as plain lists.

    Args:
        documentFormParser: object exposing ``entities``; each entity
            provides ``mention_text``, ``confidence`` and
            ``text_anchor.text_segments``.

    Returns:
        One ``[mention_text, confidence, positions]`` entry per entity, where
        ``positions`` holds a ``[n, start_index, end_index]`` triple for each
        text segment of the entity and ``n`` counts from 1.
    """
    return [
        [
            entity.mention_text,
            entity.confidence,
            [
                [ordinal, segment.start_index, segment.end_index]
                for ordinal, segment in enumerate(
                    entity.text_anchor.text_segments, start=1
                )
            ],
        ]
        for entity in documentFormParser.entities
    ]

In [10]:
# Display the notebook-level entity list filled by the top-level loop in cell In [8].
entitiesDetectedGoogleDocumentAI

[['kalininsergg92@gmail.com', 0.966052770614624, [[1, 59, 91]]],
 ['437-231-58-07', 0.9390579462051392, [[1, 92, 112]]],
 ['05/27/1992', 0.48646771907806396, [[1, 113, 137]]],
 ['Jan 2020 — Dec 2020', 0.18704494833946228, [[1, 3714, 3752]]],
 ['6, 2018 — March 1, 2018', 0.23073223233222961, [[1, 3762, 3790]]],
 ['- fluent', 0.2284661829471588, [[1, 4758, 4774]]],
 ['- with a dictionary', 0.11633188277482986, [[1, 4775, 4801]]]]

In [11]:
# Print each page's form fields and collect, per page,
# [[field name text, field value text, field-name confidence], ...] with the page number.
formParserTotal = []
for page in documentFormParser.pages:
    print('Page number: {}'.format(page.page_number))
    temp = []
    for form_field in page.form_fields:
        # Resolve each text once and reuse it for both printing and storage.
        name_text = _get_text(form_field.field_name, documentFormParser)
        value_text = _get_text(form_field.field_value, documentFormParser)
        name_confidence = form_field.field_name.confidence
        print('Field Name: {}\tConfidence: {}'.format(name_text, name_confidence))
        print('Field Value: {}\tConfidence: {}'.format(
            value_text, form_field.field_value.confidence))
        # Only the field-name confidence is stored alongside the texts.
        temp.append([name_text, value_text, name_confidence])
    formParserTotal.append([temp, page.page_number])

Page number: 1
Field Name: E-mail: 	Confidence: 0.966052770614624
Field Value: kalininsergg92@gmail.com
	Confidence: 0.966052770614624
Field Name: Phone: 	Confidence: 0.9390579462051392
Field Value: 437-231-58-07
	Confidence: 0.9390579462051392
Field Name: DATE OF BIRTH
	Confidence: 0.48646771907806396
Field Value: 05/27/1992
	Confidence: 0.48646771907806396
Page number: 2
Page number: 3
Field Name: Febr 	Confidence: 0.23073223233222961
Field Value: 6, 2018 — March 1, 2018
	Confidence: 0.23073223233222961
Field Name: Health Informatics 	Confidence: 0.18704494833946228
Field Value: Jan 2020 — Dec 2020
	Confidence: 0.18704494833946228
Page number: 4
Field Name: Russian 	Confidence: 0.2284661829471588
Field Value: - fluent
	Confidence: 0.2284661829471588
Field Name: Polish 	Confidence: 0.11633188277482986
Field Value: - with a dictionary
	Confidence: 0.11633188277482986
Page number: 5


In [12]:
# Display the per-page form-field list built in the previous cell.
formParserTotal

[[[['E-mail: ', 'kalininsergg92@gmail.com\n', 0.966052770614624],
   ['Phone: ', '437-231-58-07\n', 0.9390579462051392],
   ['DATE OF BIRTH\n', '05/27/1992\n', 0.48646771907806396]],
  1],
 [[], 2],
 [[['Febr ', '6, 2018 — March 1, 2018\n', 0.23073223233222961],
   ['Health Informatics ', 'Jan 2020 — Dec 2020\n', 0.18704494833946228]],
  3],
 [[['Russian ', '- fluent\n', 0.2284661829471588],
   ['Polish ', '- with a dictionary\n', 0.11633188277482986]],
  4],
 [[], 5]]

In [13]:
# Input configuration for the table-extraction request.
# The alternative source below is commented out, so gcs_source keeps whatever
# value an earlier cell assigned to it.
#gcs_source = documentai.types.GcsSource(uri="gs://document_ai_resume/Document_11.pdf")
input_config = documentai.types.InputConfig(gcs_source=gcs_source, mime_type='application/pdf')

In [14]:
# enabled=True switches table extraction on; no table bound hints are passed.
table_extraction_params = documentai.types.TableExtractionParams(
    enabled=True,  # table_bound_hints=table_bound_hints
)
# Location can be 'us' or 'eu'
parent = 'projects/{}/locations/us'.format("resumematcher")
request = documentai.types.ProcessDocumentRequest(
    table_extraction_params=table_extraction_params,
    input_config=input_config,
    parent=parent,
)
# Run the table parser; the response is kept in `document`.
document = client.process_document(request=request)

In [15]:
# Print every table found by the table-extraction request and collect the rows.
# Resulting shape of tableGroups:
#   [[[[["Header" | "Row", row_num, row_text], ...], table_num], ...], page_number], ...]
tableGroups = []
for page in document.pages:
    tablesPerPage = []
    print('Page number: {}'.format(page.page_number))
    for table_num, table in enumerate(page.tables):
        print('Table {}: '.format(table_num))
        singleTable = []
        for row_num, row in enumerate(table.header_rows):
            # Cell offsets belong to `document` (the table-extraction response),
            # so they must be resolved against its text rather than against the
            # form-parser response from the earlier request.
            cells = ''.join([_get_text(cell.layout, document) for cell in row.cells])
            print('Header Row {}: {}'.format(row_num, cells))
            singleTable.append(["Header",row_num,cells])
        for row_num, row in enumerate(table.body_rows):
            cells = ''.join([_get_text(cell.layout, document) for cell in row.cells])
            print('Row {}: {}'.format(row_num, cells))
            singleTable.append(["Row", row_num, cells])
        tablesPerPage.append([singleTable, table_num])
    tableGroups.append([tablesPerPage, page.page_number])

Page number: 1
Page number: 2
Table 0: 
Header Row 0:   Travel 100%.
Installation of voting equipment prior to election in accordance with the required operating

Row 0:  systems.
Performs service set-up and maintenance procedures in accordance with the appropriate codes and standards.
service

Row 1:  Repairs and maintains voting equipment; troubleshoots and identifies voting equipment

Row 2: problems such as but not limited to computer failure, printer jam and low battery voltage;

Row 3:  replaces damaged or malfunctioning
Delivers voting equipment to designated polling places and explains and/or demonstrates voting

Row 4:  equipment operation to election officials.
Performs test election procedures after election.

Page number: 3


In [16]:
# Display the nested per-page table structure built in the previous cell.
tableGroups

[[[], 1],
 [[[[['Header',
      0,
      '\uf0b7 \uf0b7 Travel 100%.\nInstallation of voting equipment prior to election in accordance with the required operating\n'],
     ['Row',
      0,
      '\uf0b7 systems.\nPerforms service set-up and maintenance procedures in accordance with the appropriate codes and standards.\nservice\n'],
     ['Row',
      1,
      '\uf0b7 Repairs and maintains voting equipment; troubleshoots and identifies voting equipment\n'],
     ['Row',
      2,
      'problems such as but not limited to computer failure, printer jam and low battery voltage;\n'],
     ['Row',
      3,
      '\uf0b7 replaces damaged or malfunctioning\nDelivers voting equipment to designated polling places and explains and/or demonstrates voting\n'],
     ['Row',
      4,
      '\uf0b7 equipment operation to election officials.\nPerforms test election procedures after election.\n']],
    0]],
  2],
 [[], 3]]

In [17]:
# NOTE(review): bare tuple literal that is only echoed back as the cell output;
# presumably a pair of (x, y) points — confirm what it represents or remove it.
((0, 56), (1653, 174))

((0, 56), (1653, 174))

In [18]:
# Divisors that convertMatrix2NormalizedVertices uses to scale x (width) and
# y (height) coordinates into the 0..1 range.
# NOTE(review): presumably the page size in pixels — confirm.
width = 1654 
height =  2339

In [19]:
# NOTE(review): as saved, this call raised NameError because
# convertMatrix2NormalizedVertices is only defined in a later cell (In [20]);
# the definition must be run before this cell.
# The four points are passed in the order top-left, top-right, bottom-right, bottom-left.
table_bound_hints = convertMatrix2NormalizedVertices(((75,1025), (1650, 1025), (1650,2300), (75,2300)))

NameError: name 'convertMatrix2NormalizedVertices' is not defined

In [20]:
def convertMatrix2NormalizedVertices(matrix, page_number=1, page_width=None, page_height=None):
    """Build table bound hints from a four-point bounding box.

    Table parsing results can be improved by providing bounding boxes that
    specify where a table appears in the document (optional).

    Args:
        matrix: sequence of at least four ``(x, y)`` points, read in the order
            top-left, top-right, bottom-right, bottom-left. Only the first
            four points are used.
        page_number: page the hint applies to (defaults to 1, the value that
            was previously hard-coded).
        page_width: divisor for the x coordinates. When omitted, the
            notebook-level ``width`` is used, as before.
        page_height: divisor for the y coordinates. When omitted, the
            notebook-level ``height`` is used, as before.

    Returns:
        A one-element list containing a ``TableBoundHint`` whose bounding
        polygon holds the four normalized vertices.

    Raises:
        IndexError: if ``matrix`` has fewer than four points.
        NameError: if a dimension is omitted and the corresponding
            notebook-level ``width``/``height`` has not been defined.
    """
    # Fall back to the notebook-level dimensions to stay compatible with
    # existing one-argument calls.
    if page_width is None:
        page_width = width
    if page_height is None:
        page_height = height

    # Each vertex coordinate must be a number between 0 and 1, hence the
    # division by the page dimensions.
    normalized_vertices = [
        documentai.types.geometry.NormalizedVertex(
            x=matrix[corner][0] / page_width,
            y=matrix[corner][1] / page_height,
        )
        for corner in range(4)
    ]
    return [
        documentai.types.TableBoundHint(
            page_number=page_number,
            bounding_box=documentai.types.BoundingPoly(
                normalized_vertices=normalized_vertices
            ),
        )
    ]

In [24]:
from google.cloud import documentai_v1beta2 as documentai
from google.oauth2 import service_account #Control API Keys

# Path to the service-account JSON key file.
# The GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so
# the notebook is not tied to one machine's directory layout; the original
# local path is kept as the fallback when the variable is not set.
keyDIR = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/kunal/Documents/VdartResumeProject/APIKEYSGOOGLE/resumeMatcher-documentAI.json",
)

# Load the service-account credentials and build the Document AI client.
credentials = service_account.Credentials.from_service_account_file(keyDIR) #using service account to go through google
client = documentai.DocumentUnderstandingServiceClient(credentials=credentials)

# Cloud Storage URI of the PDF to process.
gcs_source = documentai.types.GcsSource(uri="gs://document_ai_resume/Document_1197.pdf")

# Input description for the request: the Cloud Storage source plus its MIME type.
input_config = documentai.types.InputConfig(
    gcs_source=gcs_source, mime_type='application/pdf')

In [25]:
#table_bound_hints = convertMatrix2NormalizedVertices(((75,1025), (1650, 1025), (1650,2300), (75,2300)))

# enabled=True switches table extraction on; no table bound hints are passed.
table_extraction_params = documentai.types.TableExtractionParams(enabled=True) #table_bound_hints=table_bound_hints

# Location can be 'us' or 'eu'
parent = 'projects/{}/locations/us'.format("resumematcher")
request = documentai.types.ProcessDocumentRequest(
    table_extraction_params=table_extraction_params,
    input_config=input_config,
    parent=parent,
)

# Run the table parser; the response is kept in `document`.
document = client.process_document(request=request)

def _get_text(el):
    """Convert text offset indexes into text snippets.
    """
    response = ''
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in el.text_anchor.text_segments:
        start_index = segment.start_index
        end_index = segment.end_index
        response += document.text[start_index:end_index]
    return response

# Print every table of the table-extraction response, one line per row with
# the cell texts separated by tabs; header rows are listed before body rows.
for page in document.pages:
    print('Page number: {}'.format(page.page_number))
    for table_num, table in enumerate(page.tables):
        print('Table {}: '.format(table_num))
        row_groups = (
            ('Header Row {}: {}', table.header_rows),
            ('Row {}: {}', table.body_rows),
        )
        for line_template, rows in row_groups:
            for row_num, row in enumerate(rows):
                cells = '\t'.join(_get_text(cell.layout) for cell in row.cells)
                print(line_template.format(row_num, cells))

Page number: 1
Page number: 2
Table 0: 
Header Row 0:   	Travel 100%.
Installation of voting equipment prior to election in accordance with the required operating
	
Row 0:  	systems.
Performs service set-up and maintenance procedures in accordance with the appropriate codes and standards.
	service

Row 1:  	Repairs and maintains voting equipment; troubleshoots and identifies voting equipment
	
Row 2: 	problems such as but not limited to computer failure, printer jam and low battery voltage;
	
Row 3:  	replaces damaged or malfunctioning
Delivers voting equipment to designated polling places and explains and/or demonstrates 	voting

Row 4:  	equipment operation to election officials.
Performs test election procedures after election.
	
Page number: 3
