# Export and Import Document schema using Gemini

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.


## Objective

This document explains how to extract a schema from a sample document using Gemini and export it to a spreadsheet (`.xlsx`), and how to import a schema from a spreadsheet into a processor. This approach also takes up to 3 levels of nesting into account.

## Prerequisites
* Vertex AI Notebook or Colab (if using Colab, authenticate first)
* Vertex AI API enabled for Gemini API calls
* Processor details to import to the processor
* Permissions for Google Cloud Storage, Vertex AI and Vertex AI Notebook.


## Step by Step procedure 

## Exporting Document schema to a spreadsheet

### 1.Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
import base64
import json
import vertexai
from google.cloud import storage
import pandas as pd
import os
from vertexai.generative_models import GenerativeModel, Part, SafetySetting

### 2.Setup the inputs
* `project_id` : The GCP project ID (or project number)
* `location` : The region where the resources or services are hosted
* `mime_type` : The format type of the file
* `input_uri` : The URI of the input file stored in Google Cloud Storage (GCS).
* `output_gcs_bucket` : The GCS bucket where the output will be stored.
* `output_gcs_folder` : The folder path within the GCS bucket for storing output files.

In [None]:
# GCP project that hosts the Gemini model
project_id = "xxxxxxxxxxxxxx"
# Region used for the Gemini calls
location = "us-central1"
# MIME type of the input document
mime_type = "application/pdf"
# GCS URI of the input document
input_uri = "gs://your-bucket/your-folder/your-document.pdf"
# Bucket that receives the exported schema spreadsheet
output_gcs_bucket = "destination-bucket"
# Folder path inside the bucket for the exported schema spreadsheet
output_gcs_folder = "destination-folder/sub-folder"

### 3.Run the required functions

In [None]:
def upload_to_gcs(
    bucket_name: str, destination_blob_name: str, source_file_path: str
) -> None:
    """
    Copy a local file into a Google Cloud Storage (GCS) bucket.

    Args:
        bucket_name (str): Name of the GCS bucket that receives the file.
        destination_blob_name (str): Object (blob) name to write inside the bucket.
        source_file_path (str): Path of the local file to upload.

    Returns:
        None: Nothing is returned; a confirmation line is printed after the upload call.
    """
    # Resolve client -> bucket -> blob in one chain, then push the file contents
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)

    print(f"File {source_file_path} uploaded to {destination_blob_name}")


def remove_local_file(file_path: str) -> None:
    """
    Delete a file from local storage, reporting the outcome on stdout.

    Args:
        file_path (str): Path of the file to delete.

    Returns:
        None: Nothing is returned. One message is printed describing either the
              successful deletion or the reason it failed.

    Exceptions:
        No exception is propagated from the deletion itself: FileNotFoundError,
        PermissionError and any other Exception raised by os.remove are caught
        and reported with a printed message.
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except PermissionError:
        print(f"Permission denied: Unable to delete {file_path}.")
    except Exception as err:
        print(f"Error occurred while trying to delete {file_path}: {err}")
    else:
        # Only reached when os.remove completed without raising
        print(f"File {file_path} has been removed from local storage.")


def export_schema(
    document_schema: dict, output_bucket: str, output_folder: str, source_filename: str
) -> None:
    """
    Exports a document schema to an Excel file and uploads it to a Google Cloud Storage (GCS) bucket.

    Args:
        document_schema (dict): The document schema to export. It is passed straight to
                                pandas.DataFrame, so it must be in a form DataFrame accepts.
        output_bucket (str): The name of the GCS bucket where the exported schema will be uploaded.
        output_folder (str): The folder path within the GCS bucket where the schema file will be stored.
                             An empty value places the file at the bucket root.
        source_filename (str): The name of the source file used to name the exported schema file.

    Returns:
        None: This function does not return a value but performs the following tasks:
            1. Exports the schema to "<source_filename>_schema_exported.xlsx" in the working directory.
            2. Uploads the Excel file to the specified GCS bucket.
            3. Removes the local copy of the schema file, whether or not the upload succeeded.

    Exceptions:
        Exception: Any exception raised by the upload is caught and printed (best effort).
                   Errors raised while building or writing the spreadsheet are not caught.
    """
    schema_filename = f"{source_filename}_schema_exported.xlsx"

    df = pd.DataFrame(document_schema)
    df.to_excel(schema_filename, index=False)

    # Avoid a blob name with a leading "/" when no folder is given
    folder = output_folder.rstrip("/")
    blob_name = f"{folder}/{schema_filename}" if folder else schema_filename

    try:
        upload_to_gcs(output_bucket, blob_name, schema_filename)
    except Exception as e:
        # Best effort: report the failure instead of aborting the notebook run
        print("Error occured while uploading schema to bucket")
        print(e)
    finally:
        # Always clean up the temporary spreadsheet, even if the upload failed
        remove_local_file(schema_filename)


# Prompt used when the caller does not supply one.
_DEFAULT_SCHEMA_PROMPT = """Please analyze the structure of the attached document and provide me with a schema definition as an object array. Return object array in a single line.

    The schema should include:
    *Fields and their data types (e.g., text, number, date)
    *Any relationships between fields (e.g., nested objects, arrays)
    *Each object in the object array will have key names: name, value_type, occurrence_type and display_name
    *All key names should be less than 64 characters and should be snake cased. If there are multiple words in key name, replace space with underscore and join the words.
    *The \\\"name\\\" key is the title of the field. This should be a semantically named field. 
    *The \\\"value_type\\\" describe the type of field and is either string, number, currency, money, datetime, address or checkbox. If the field is a parent entity, then the value_type should be equal to the \\\"name\\\"
    *The \\\"occurrence_type\\\" describes the number of times the field is expected and can either be REQUIRED_ONCE, REQUIRED_MULTIPLE, OPTIONAL_ONCE or OPTIONAL_MULTIPLE. In most of the cases, it will either be OPTIONAL_ONCE or OPTIONAL_MULTIPLE. When the field is an identifier or you feel is a mandatory field, use REQUIRED_ONCE or REQUIRED_MULTIPLE.
    *The \\\"display_name\\\" is blank unless the entity is a child entity. In that case, the display_name should be equal to the parent entity\\\'s name.

    Make sure the schema you provide is in accordance with the Document AI schema format. Retain the language of label names as in original documents"""


def generate(
    project_id: str,
    location: str,
    mime_type: str,
    input_uri: str,
    prompt=None,
    generation_config=None,
    safety_settings=None,
    model_name: str = "gemini-1.5-pro-001",
) -> dict:
    """
    Generates a schema from a document using a generative AI model and returns the parsed JSON.

    The request (prompt, document part, generation settings) is built inside this
    function from its own arguments, so it does not depend on names defined elsewhere.

    Args:
        project_id (str): The Google Cloud project used to initialise Vertex AI.
        location (str): The region used to initialise Vertex AI.
        mime_type (str): The MIME type of the input document (e.g., 'application/pdf').
        input_uri (str): The URI of the input document that is attached to the request.
        prompt (str, optional): Instruction text sent with the document. Defaults to the
            built-in schema-extraction prompt.
        generation_config (dict, optional): Generation settings. Defaults to
            max_output_tokens=8192, temperature=1, top_p=0.95.
        safety_settings (list, optional): SafetySetting objects. Defaults to
            BLOCK_MEDIUM_AND_ABOVE for hate speech, dangerous content, sexually
            explicit content and harassment.
        model_name (str, optional): Model identifier passed to GenerativeModel.
            NOTE(review): confirm the default model version is still available.

    Returns:
        dict: The value produced by json.loads on the generated text. The default prompt
              asks for an object array, so this is presumably a list of dicts in practice.

    Raises:
        json.JSONDecodeError: If the generated text is not valid JSON.
        Exception: Any other error raised while calling the model; it is printed and re-raised.
    """
    if prompt is None:
        prompt = _DEFAULT_SCHEMA_PROMPT

    if generation_config is None:
        generation_config = {
            "max_output_tokens": 8192,
            "temperature": 1,
            "top_p": 0.95,
        }

    if safety_settings is None:
        safety_settings = [
            SafetySetting(
                category=category,
                threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            )
            for category in (
                SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
            )
        ]

    try:
        # Initialize Vertex AI project and location
        vertexai.init(project=project_id, location=location)

        # Load the generative model
        model = GenerativeModel(model_name)

        # Attach the input document by reference (it is not downloaded locally)
        document = Part.from_uri(mime_type=mime_type, uri=input_uri)

        print(f"Generating schema using the document {input_uri}...")

        # Generate schema content
        responses = model.generate_content(
            [prompt, document],
            generation_config=generation_config,
            safety_settings=safety_settings,
            stream=True,
        )

        # Streaming yields the answer in chunks; stitch them back together
        schema = "".join(response.text for response in responses)

        # Convert generated text to JSON
        schema_json = json.loads(schema)

    except json.JSONDecodeError as e:
        print(f"Error parsing generated schema into JSON: {e}")
        raise

    except Exception as e:
        print(f"An error occurred while generating the schema: {e}")
        raise

    return schema_json

### 4.Run the code

In [None]:
def main():
    """
    Runs the export flow: builds the Gemini request inputs, generates the schema
    for `input_uri` and exports it as a spreadsheet to the output bucket/folder.

    Reads the module-level settings `project_id`, `location`, `mime_type`,
    `input_uri`, `output_gcs_bucket` and `output_gcs_folder`.
    """
    # generate() is called with only the project/location/document arguments.
    # The request inputs are therefore bound at module scope, where functions
    # defined in other notebook cells can resolve them; as plain locals of
    # main() they would be invisible outside this function.
    global text1, document1, generation_config, safety_settings

    text1 = """Please analyze the structure of the attached document and provide me with a schema definition as an object array. Return object array in a single line.

    The schema should include:
    *Fields and their data types (e.g., text, number, date)
    *Any relationships between fields (e.g., nested objects, arrays)
    *Each object in the object array will have key names: name, value_type, occurrence_type and display_name
    *All key names should be less than 64 characters and should be snake cased. If there are multiple words in key name, replace space with underscore and join the words.
    *The \\\"name\\\" key is the title of the field. This should be a semantically named field. 
    *The \\\"value_type\\\" describe the type of field and is either string, number, currency, money, datetime, address or checkbox. If the field is a parent entity, then the value_type should be equal to the \\\"name\\\"
    *The \\\"occurrence_type\\\" describes the number of times the field is expected and can either be REQUIRED_ONCE, REQUIRED_MULTIPLE, OPTIONAL_ONCE or OPTIONAL_MULTIPLE. In most of the cases, it will either be OPTIONAL_ONCE or OPTIONAL_MULTIPLE. When the field is an identifier or you feel is a mandatory field, use REQUIRED_ONCE or REQUIRED_MULTIPLE.
    *The \\\"display_name\\\" is blank unless the entity is a child entity. In that case, the display_name should be equal to the parent entity\\\'s name.

    Make sure the schema you provide is in accordance with the Document AI schema format. Retain the language of label names as in original documents"""

    document1 = Part.from_uri(
        mime_type=mime_type,
        uri=input_uri,
    )

    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 1,
        "top_p": 0.95,
    }

    safety_settings = [
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        ),
    ]

    # Generating the schema
    schema_json = generate(project_id, location, mime_type, input_uri)

    # Name the export after the input file, dropping only the final extension
    # so that names containing dots (e.g. "invoice.v2.pdf") stay distinguishable.
    source_stem = input_uri.rsplit("/", 1)[-1].rsplit(".", 1)[0]

    # Exporting the Schema to the GCS Destination Bucket Folder
    export_schema(
        schema_json,
        output_gcs_bucket,
        output_gcs_folder,
        source_stem,
    )


main()

### 5.Output

The output is the schema saved in the "<input_document_name>_schema_exported.xlsx" file, as shown below.

#### Exported Schema in the SpreadSheet
<img src="./Images/Exported_Schema.png" width=800 height=400 ></img>

## Importing Document schema from a spreadsheet

### 1.Importing Required Modules

In [None]:
import numpy as np
import math
import pandas as pd
from google.cloud import documentai_v1beta3
from google.cloud import storage

### 2.Setup the inputs
* `project_id` : The GCP project ID (or project number)
* `new_location` : The region where the new resources or services are hosted.
* `new_processor_id` : The unique identifier of the newly created processor.
* `schema_bucket_name` : The name of the Google Cloud Storage bucket where the schema file is stored.
* `schema_file_path` : The file path within the GCS bucket for the schema file.

#### Note: The schema can be imported either into a new processor or into a processor with an empty schema

In [None]:
# GCP project that owns the target processor
project_id = "xxxxxxxxxxxxxxxx"
# Location of the target processor
new_location = "us"
# ID of the processor that receives the imported schema
new_processor_id = "xxxxxxxxxxxxx"
# Bucket that holds the exported schema spreadsheet
schema_bucket_name = "your-bucket"
# Path of the exported schema spreadsheet inside that bucket
schema_file_path = "your-folder/<document_name>_schema_exported.xlsx"

### 3.Run the required functions

In [None]:
def download_from_gcs(
    bucket_name: str, source_blob_name: str, destination_file_path: str
) -> None:
    """
    Fetch an object from a Google Cloud Storage (GCS) bucket into a local file.

    Args:
        bucket_name (str): Name of the GCS bucket to read from.
        source_blob_name (str): Object (blob) name inside the bucket.
        destination_file_path (str): Local path the object is written to.

    Returns:
        None: Nothing is returned; a confirmation line is printed after the download call.
    """
    # Resolve client -> bucket -> blob in one chain, then write it to disk
    source_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_path)

    print(f"File {source_blob_name} downloaded to {destination_file_path}.")


def get_dataset_schema(processor_name: str) -> documentai_v1beta3.DatasetSchema:
    """
    Fetch the dataset schema attached to a Document AI processor.

    Args:
        processor_name (str): Full processor resource name, typically
                              'projects/{project_id}/locations/{location}/processors/{processor_id}'.
                              The suffix '/dataset/datasetSchema' is appended to it.

    Returns:
        documentai_v1beta3.DatasetSchema: The response of the get_dataset_schema call.

    NOTE(review): the client is created with default options; processors outside
    the default region may need an explicit regional api_endpoint - confirm.
    """
    schema_resource = "/".join((processor_name, "dataset", "datasetSchema"))
    lookup = documentai_v1beta3.GetDatasetSchemaRequest(name=schema_resource)

    service = documentai_v1beta3.DocumentServiceClient()
    return service.get_dataset_schema(request=lookup)


def update_dataset_schema(
    schema: documentai_v1beta3.DatasetSchema,
) -> documentai_v1beta3.DatasetSchema:
    """
    Push a dataset schema back to Document AI.

    Args:
        schema (documentai_v1beta3.DatasetSchema): Schema object to send. Only its
                                                   `name` and `document_schema` are
                                                   copied into the update request.

    Returns:
        documentai_v1beta3.DatasetSchema: The response of the update_dataset_schema call.
    """
    from google.cloud import documentai_v1beta3

    # Only these two fields of the incoming schema are forwarded
    payload = {"name": schema.name, "document_schema": schema.document_schema}
    update_request = documentai_v1beta3.UpdateDatasetSchemaRequest(
        dataset_schema=payload
    )

    service = documentai_v1beta3.DocumentServiceClient()
    return service.update_dataset_schema(request=update_request)

### 4.Run the code

In [None]:
def main():
    """
    Runs the import flow: downloads the schema spreadsheet from GCS, converts its
    rows into Document AI schema entries and updates the target processor's
    dataset schema.

    Reads the module-level settings `schema_bucket_name`, `schema_file_path`,
    `project_id`, `new_location` and `new_processor_id`.

    Row handling:
        * A row whose `display_name` cell is empty is treated as a top-level field
          and appended to the properties of every entity type already present in
          the processor's schema.
        * Any other row is treated as a child field; rows are grouped by
          `display_name`, and each group becomes a new entity type named after it.
    """
    # Download the schema excel file from GCS
    local_excel_path = schema_file_path.split("/")[-1]
    download_from_gcs(schema_bucket_name, schema_file_path, local_excel_path)

    # Import the Excel file back into a data frame
    imported_df = pd.read_excel(local_excel_path, engine="openpyxl")

    # Convert the data frame back to a list of dictionaries
    imported_data = imported_df.to_dict(orient="records")

    parent_entities = []
    nested_entities = {}
    for row in imported_data:
        # display_name only selects the parent; it is not part of the property itself
        field = {key: value for key, value in row.items() if key != "display_name"}
        parent_name = row["display_name"]
        # Empty spreadsheet cells are read back as missing values (NaN/None)
        if pd.isna(parent_name):
            parent_entities.append(field)
        else:
            nested_entities.setdefault(parent_name, []).append(field)

    schema_line = []
    for parent_name, properties in nested_entities.items():
        entity_type = documentai_v1beta3.types.DocumentSchema.EntityType()
        entity_type.name = parent_name
        entity_type.base_types = ["object"]
        entity_type.properties = properties
        entity_type.display_name = parent_name
        schema_line.append(entity_type)

    new_processor_name = (
        f"projects/{project_id}/locations/{new_location}/processors/{new_processor_id}"
    )

    response_newprocessor = get_dataset_schema(new_processor_name)
    existing_entity_types = response_newprocessor.document_schema.entity_types

    # Top-level fields can only be attached to entity types that already exist;
    # make it visible when there is nothing to attach them to.
    if parent_entities and not existing_entity_types:
        print(
            "Warning: the processor schema has no entity types; "
            f"{len(parent_entities)} top-level field(s) were not imported."
        )

    # updating into the processor
    for entity_type in existing_entity_types:
        for field in parent_entities:
            entity_type.properties.append(field)

    for nested_type in schema_line:
        existing_entity_types.append(nested_type)

    update_dataset_schema(response_newprocessor)
    print("Schema Imported Successfully...")


main()

### 5.Output

The schema will be saved in the new processor (the one identified by `new_processor_id`).

#### Exported Schema in the DocAI Processor UI

<img src="./Images/DocAI_Processor_Schema.png" width=800 height=400 ></img>