# Export and Import Document schema from a processor (using spreadsheet).



## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied. 


## Objective

This document explains how to export a schema from a processor to a spreadsheet (`.xlsx` extension) and how to import a schema from a spreadsheet into a processor. This approach also handles up to 3 levels of nesting.



## Prerequisites

* Vertex AI Notebook or Colab (if using Colab, authenticate first)
* Details of the processors to export the schema from and import the schema into
* Permissions for Google Cloud Storage and Vertex AI Notebook.


## 1. Exporting Document schema to a spreadsheet


#### Input
* `project_id`="xxxxxxxxxx" # Project ID of the project
* `location`="us" # location of the processor 
* `processor_id`="xxxxxxxxxxxxxxx" #Processor id of processor from which the schema has to be exported to spreadsheet

In [None]:
# Fully-qualified resource name of the processor whose schema is exported.
# project_id, location and processor_id must already be defined (see "Input" above).
processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
# get document schema
from google.cloud import documentai_v1beta3


def get_dataset_schema(processor_name):
    """Fetch the dataset schema of a Document AI processor.

    Args:
        processor_name: Processor resource name, in the form
            ``projects/<project>/locations/<location>/processors/<processor>``.

    Returns:
        The response of ``DocumentServiceClient.get_dataset_schema`` for the
        processor's ``/dataset/datasetSchema`` resource.
    """
    service = documentai_v1beta3.DocumentServiceClient()

    # The dataset schema lives at a fixed sub-path under the processor.
    schema_resource = processor_name + "/dataset/datasetSchema"

    return service.get_dataset_schema(
        request=documentai_v1beta3.GetDatasetSchemaRequest(name=schema_resource)
    )


# Fetch the processor's schema and flatten it into one spreadsheet row per
# property of every entity type.
response_document_schema = get_dataset_schema(processor_name)
dataset_schema = []
for schema_metadata in response_document_schema.document_schema.entity_types:
    # Entity types with an empty display name yield rows with a blank
    # display_name cell. The key is always present so the column exists even
    # when no entity type has a display name (the import step reads it).
    parent_display_name = schema_metadata.display_name or None
    for schema_property in schema_metadata.properties:
        dataset_schema.append(
            {
                "name": schema_property.name,
                "value_type": schema_property.value_type,
                "occurrence_type": schema_property.occurrence_type.name,
                "display_name": parent_display_name,
            }
        )

import pandas as pd

# Explicit columns keep the header row stable, even for an empty schema.
df = pd.DataFrame(
    dataset_schema,
    columns=["name", "value_type", "occurrence_type", "display_name"],
)
df.to_excel("Document_Schema_exported.xlsx", index=False)

### Output 
* The output will be the schema saved in "Document_Schema_exported.xlsx" file as shown below
<img src="./Images/Exported_schema.png" width=800 height=400></img>

#### * `Columns`
#### Name:
Entity type which can be parent entity or child entities

#### Value_type:

* The value type is the data type of the entity. If the entity is a parent item, the value type is the same as the entity type; if it is a final child (leaf) entity, the value type is its data type.

#### Occurrence_type:

* The occurrence type is the occurrence type of the respective entity (for example `OPTIONAL_ONCE` or `REQUIRED_MULTIPLE`).

#### display_name:

* For child entities, the display name is the name of the parent entity. If the entity itself is a parent entity, `display_name` will be empty.

## 2. Importing Document schema from a spreadsheet

#### Input
* `project_id`="xxxxxxxxxx" # Project ID of the project
* `new_location`="us" # location of the processor 
* `new_processor_id`="xxxxxxxxxxxxxxx" #Processor id of processor to which the schema has to be imported
* `schema_xlsx_path`="Document_Schema_exported.xlsx"

* Add to the xlsx file any entities that should be added to the new processor.

## Note

* Make sure the entities in the spreadsheet are not already in the schema of the processor to avoid issues


In [None]:
import numpy as np
import math
import pandas as pd
from google.cloud import documentai_v1beta3

# Import the Excel file back into a data frame
imported_df = pd.read_excel(schema_xlsx_path)

# Convert the data frame back to a list of dictionaries
imported_data = imported_df.to_dict(orient="records")

parent_entities = []
nested_entities = {}
for data in imported_data:
    temp_data = {key: value for key, value in data.items() if key != "display_name"}
    if isinstance(data["display_name"], float) and math.isnan(data["display_name"]):
        parent_entities.append(temp_data)
    else:
        if data["display_name"] in nested_entities.keys():
            nested_entities[data["display_name"]].append(temp_data)
        else:
            nested_entities[data["display_name"]] = [temp_data]

schema_line = []

for line, properties in nested_entities.items():
    client = documentai_v1beta3.types.DocumentSchema.EntityType()
    client.name = line
    client.base_types = ["object"]
    client.properties = properties
    client.display_name = line
    schema_line.append(client)

new_processor_name = (
    f"projects/{project_id}/locations/{new_location}/processors/{new_processor_id}"
)

response_newprocessor = get_dataset_schema(new_processor_name)
# updating into the processor
for i in response_newprocessor.document_schema.entity_types:
    for e3 in parent_entities:
        i.properties.append(e3)

for e4 in schema_line:
    response_newprocessor.document_schema.entity_types.append(e4)


def update_dataset_schema(schema):
    """Write a dataset schema back to its Document AI processor.

    Args:
        schema: Object exposing ``name`` and ``document_schema`` attributes,
            e.g. the response returned by ``get_dataset_schema``.

    Returns:
        The response of ``DocumentServiceClient.update_dataset_schema``.
    """
    from google.cloud import documentai_v1beta3

    service = documentai_v1beta3.DocumentServiceClient()

    # Only the resource name and the document schema are sent in the update.
    payload = {"name": schema.name, "document_schema": schema.document_schema}
    update_request = documentai_v1beta3.UpdateDatasetSchemaRequest(
        dataset_schema=payload
    )

    return service.update_dataset_schema(request=update_request)


# Push the merged schema (the target's existing entity types plus the
# spreadsheet entries) to the new processor.
response_update = update_dataset_schema(response_newprocessor)

### Output 
* The schema of the new processor will be updated according to the given spreadsheet.