In [2]:
import os
import time
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from google.api_core.client_options import ClientOptions

In [4]:
# --------------- CONFIGURATION -------------------
# Google Cloud project that owns the Document AI resources.
PROJECT_ID = 'prj-app-wh-dev'
# Document AI region: "us" or "eu", depending on where the processor lives.
LOCATION = 'us'

# ID of the Document AI processor this notebook works against.
PROCESSOR_ID = "56e60b554136ec3d"

# Local input folders (placeholder-looking values — TODO point at real paths).
LOCAL_DOCUMENTS_DIR = 'path_to_documents'
# Folder holding the JSON annotation files.
LOCAL_ANNOTATIONS_DIR = 'path_to_annotations'


**Create Processor**

In [15]:
def create_processor(client, display_name, project_id=None, location=None):
    """Create a custom-extraction processor and return its resource name.

    Args:
        client: Object exposing ``create_processor(parent=..., processor=...)``;
            in this notebook a ``DocumentProcessorServiceClient``.
        display_name: Human-readable name given to the new processor.
        project_id: Project to create the processor in. Defaults to the
            notebook-level ``PROJECT_ID``.
        location: Region to create the processor in. Defaults to the
            notebook-level ``LOCATION``.

    Returns:
        The ``name`` attribute of the API response, i.e. the full resource
        name of the created processor.
    """
    # Resolve defaults at call time so edits to the config cell are picked up.
    if project_id is None:
        project_id = PROJECT_ID
    if location is None:
        location = LOCATION

    parent = f"projects/{project_id}/locations/{location}"
    processor = documentai.Processor(
        # The processor type is fixed here; the rest of the notebook builds a
        # custom-extraction schema for it.
        type_="CUSTOM_EXTRACTION_PROCESSOR",
        display_name=display_name,
    )
    response = client.create_processor(parent=parent, processor=processor)
    print(f"Processor created: {response.name}")
    return response.name


# Build the client against the regional endpoint so a non-default LOCATION
# (e.g. "eu") reaches the matching service host.
client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
)
# NOTE: each run of this cell issues a create_processor call.
processor_name = create_processor(client, display_name="automation_test_processor")
# The resource name ends in ".../processors/<id>"; keep only the trailing ID.
processor_id = processor_name.split("/")[-1]





Processor created: projects/905161924890/locations/us/processors/56e60b554136ec3d


In [7]:
# Cloud Storage (gs://) URIs of two sample PDF documents.
URIS = [
    "gs://prj-app-wh-dev-backend-data/docai/1736724814.pdf",
    "gs://prj-app-wh-dev-backend-data/docai/Sabbagh, Dalal.pdf",
]

In [1]:
# Bucket-relative path (no "gs://" scheme). Presumably the folder holding
# sample documents for the dataset — TODO confirm where this is consumed.
dataset_path = 'prj-app-wh-dev-backend-data/docai_samples'

In [None]:
# Same two URIs as the earlier URIS cell; running this cell rebinds URIS to an
# identical list.
URIS = [
    "gs://prj-app-wh-dev-backend-data/docai/1736724814.pdf",
    "gs://prj-app-wh-dev-backend-data/docai/Sabbagh, Dalal.pdf",
]

**Initialize Dataset**

In [17]:
# from google.cloud import documentai_v1beta3 as documentai
# from google.api_core.client_options import ClientOptions

# Your settings
project_id = "prj-app-wh-dev"
location = "us"  # based on your endpoint
processor_id = "56e60b554136ec3d"
gcs_uri_prefix = "gs://prj-app-wh-dev-backend-data/docai_dataset/"

# Setup the endpoint correctly
client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

# Initialize the Document AI client
client = documentai.DocumentServiceAsyncClient(client_options=client_options)

# Build the full dataset resource name
dataset_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}/dataset"

# Prepare the dataset configuration
dataset = documentai.Dataset(
    name=dataset_name,
    gcs_managed_config=documentai.Dataset.GCSManagedConfig(
        gcs_prefix=documentai.GcsPrefix(
            gcs_uri_prefix=gcs_uri_prefix
        )
    ),
    spanner_indexing_config=documentai.Dataset.SpannerIndexingConfig()
)

# Prepare the update request
update_request = documentai.UpdateDatasetRequest(
    dataset=dataset
)

# Call the update_dataset API
operation = client.update_dataset(request=update_request)

response = (await operation).result()





**Create Schema**

In [11]:
from google.cloud import documentai_v1beta3 as documentai
from google.protobuf import field_mask_pb2

# Use the synchronous DocumentServiceClient, pinned to the regional endpoint so
# it matches the endpoint used when the dataset was initialised and keeps
# working if LOCATION is changed to a non-default region.
client = documentai.DocumentServiceClient(
    client_options=ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
)

# Resource names of the processor's dataset and of its schema.
dataset_name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}/dataset"
dataset_schema_name = f"{dataset_name}/datasetSchema"

# Short aliases for the deeply nested schema types.
_Property = documentai.DocumentSchema.EntityType.Property
_Occurrence = _Property.OccurrenceType

# One row per schema field: (name, display_name, value_type, occurrence_type).
# The rows cover one field per value type used by this notebook.
_FIELD_SPECS = [
    ("invoice_number", "Invoice Number", "string", _Occurrence.OPTIONAL_ONCE),  # Plain text
    ("item_count", "Item Count", "number", _Occurrence.OPTIONAL_ONCE),  # Number
    ("total_amount", "Total Amount", "currency", _Occurrence.OPTIONAL_ONCE),  # Currency
    ("line_item_amount", "Line Item Amount", "money", _Occurrence.OPTIONAL_MULTIPLE),  # Money
    ("invoice_date", "Invoice Date", "datetime", _Occurrence.REQUIRED_ONCE),  # Datetime
    ("billing_address", "Billing Address", "address", _Occurrence.OPTIONAL_ONCE),  # Address
    ("is_paid", "Is Paid", "boolean", _Occurrence.OPTIONAL_ONCE),  # Checkbox
]

# Build the Property messages in the same order as the spec table.
properties_list = [
    _Property(
        name=field_name,
        display_name=field_label,
        value_type=field_type,
        occurrence_type=field_occurrence,
    )
    for field_name, field_label, field_type, field_occurrence in _FIELD_SPECS
]

# Create the DocumentSchema - the root entity type's name must be
# 'custom_extraction_document_type'.
document_schema = documentai.DocumentSchema(
    display_name="Invoice Schema (All Data Types)",
    description="Schema demonstrating all supported data types",
    entity_types=[
        documentai.DocumentSchema.EntityType(
            display_name="Invoice",
            name="custom_extraction_document_type",
            description="Root type for invoice fields",
            base_types=["document"],
            properties=properties_list
        )
    ]
)

# Wrap the document schema in the dataset-schema resource being updated.
dataset_schema = documentai.DatasetSchema(
    name=dataset_schema_name,
    document_schema=document_schema
)

# Only the document_schema field is sent for update (via the field mask).
update_schema_request = documentai.UpdateDatasetSchemaRequest(
    dataset_schema=dataset_schema,
    update_mask=field_mask_pb2.FieldMask(paths=["document_schema"])
)

# Update the dataset schema and show the response returned by the API.
response = client.update_dataset_schema(request=update_schema_request)
print("✅ Dataset schema updated with all data types:", response)




✅ Dataset schema updated with all data types: name: "projects/prj-app-wh-dev/locations/us/processors/56e60b554136ec3d/dataset/datasetSchema"
document_schema {
  display_name: "Invoice Schema (All Data Types)"
  description: "Schema demonstrating all supported data types"
  entity_types {
    name: "custom_extraction_document_type"
    base_types: "document"
    properties {
      name: "invoice_number"
      value_type: "string"
      occurrence_type: OPTIONAL_ONCE
      property_metadata {
      }
      display_name: "Invoice Number"
    }
    properties {
      name: "item_count"
      value_type: "number"
      occurrence_type: OPTIONAL_ONCE
      property_metadata {
      }
      display_name: "Item Count"
    }
    properties {
      name: "total_amount"
      value_type: "currency"
      occurrence_type: OPTIONAL_ONCE
      property_metadata {
      }
      display_name: "Total Amount"
    }
    properties {
      name: "line_item_amount"
      value_type: "money"
      occurrenc