In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| | |
|-|-|
|Author(s) | [Valentin Huerta](https://github.com/valentinhuerta1996) |
||  [Ulises Jimenez](https://github.com/ulises-jimenez07) |

In [1]:
from entity_processor import DocumentAIEntityExtractor, ModelBasedEntityExtractor
from extractor import OnlineDocumentExtractor
from prompts_module import get_compare_entities_prompt, get_extract_entities_prompt
from temp_file_uploader import TempFileUploader
from vertexai.generative_models import GenerativeModel

In [2]:
# --- Document AI processor settings ---
project_id = "project-id"
location = "us"  # Or other supported locations like 'eu'
processor_id = "processor-id"
# Optional for batch processing
processor_version_id = "processor-version-id"

# --- Input document ---
file_path = "test_file.pdf"
mime_type = "application/pdf"

# --- Cloud Storage locations ---
gcs_output_uri = "gs://bucket-output"  # GCS URI for output
gcs_temp_uri = "gs://bucket-temp"  # GCS URI handed to TempFileUploader for temporary uploads

In [3]:
# Build the online extractor for the configured project, location and processor.
online_extractor = OnlineDocumentExtractor(
    processor_id=processor_id,
    location=location,
    project_id=project_id,
    # Optionally also pass: processor_version_id=processor_version_id
)

# Run the local file through the processor.
online_document = online_extractor.process_document(
    file_path,
    mime_type,
)

# Pull the entities out of the processed document.
docai_entity_extractor = DocumentAIEntityExtractor(online_document)
docai_entities = docai_entity_extractor.extract_entities()

In [7]:
# Show the entities returned by the Document AI extractor as the cell output.
docai_entities

In [5]:
# Stage the local file under the temporary GCS location; the resulting URI is
# what the model-based extractor is given as its input.
temp_file_uploader = TempFileUploader(gcs_temp_uri)
gcs_input_uri = temp_file_uploader.upload_file(file_path)

# Everything after a successful upload runs inside try/finally so the staged
# file is removed even if prompt construction or the model call raises.
try:
    prompt_extract = get_extract_entities_prompt()
    model_extractor = ModelBasedEntityExtractor(
        "gemini-1.5-flash-001", prompt_extract, gcs_input_uri
    )
    gemini_entities = model_extractor.extract_entities()
finally:
    # Always clean up the temporary upload, on success and on failure.
    temp_file_uploader.delete_file()

In [8]:
# Fill the comparison prompt template with the string form of both entity sets.
compare_prompt = get_compare_entities_prompt().format(
    docai_output=str(docai_entities),
    gemini_output=str(gemini_entities),
)

# Ask the model to analyse the two extraction results and print its text reply.
model = GenerativeModel("gemini-1.5-flash-001")
docai_gemini_response_analysis = model.generate_content(compare_prompt)
print(docai_gemini_response_analysis.text)