# Entity data extraction from DocAI Parsed JSON

* Author: docai-incubator@google.com


## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.

## Purpose and Description
This tool allows you to extract entities and their confidence scores from the Input DocAI parsed JSON files, saving the results in either JSON or CSV format.


## Prerequisites

1. Vertex AI Notebook or Google Colab
2. A GCS bucket that holds the input JSON files and receives the output JSON files


## Step by Step procedure 

### 1. Install the required libraries

In [None]:
%pip install google-cloud-storage
%pip install google-cloud-documentai

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

### 2. Import the required libraries/Packages

In [None]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from pathlib import Path
from tqdm import tqdm
import json
import pandas as pd
from utilities import (
    file_names,
    documentai_json_proto_downloader,
    store_document_as_json,
)

### 3. Input Details

<ul>
    <li><b>gcs_input_path :</b> GCS path (for example <code>gs://bucket_name/path_to_docs/</code>). It should contain the DocAI processed output JSON files; the files under this path are used as the input for processing.</li>
    <li><b>gcs_output_path:</b> GCS path where the extracted entities and their associated confidence scores are stored, as one JSON file per input document, in the output bucket.</li>
    <li><b>local_output_csv_path:</b> This is the path where the csv file will be stored </li>
    
</ul>

In [None]:
# Replace the placeholder values below before running the notebook.
# Both GCS paths are later split on "/" and the third component is used as the
# bucket name, so they must have the form "gs://<bucket>/<prefix>/".
gcs_input_path = "gs://Bucket_name/path_to_docs/"
gcs_output_path = "gs://Bucket_name/path_to_docs/"
# Local file that the flattened entity table is written to as CSV.
local_output_csv_path = "output.csv"

### 4. Execute the code

#### Storing the entities data as Json File


In [None]:
def get_hierarchical_data(json_dict):
    """Convert a document's entities into a nested list of plain dicts.

    Args:
        json_dict: Object exposing an ``entities`` iterable. Despite the name
            it is read through attributes, not dict keys. Each entity must
            provide ``type``, ``mention_text``, ``confidence`` and
            ``properties`` (its child entities).

    Returns:
        One dict per top-level entity with the keys ``"Entity Type"``,
        ``"Entity Mentiontext"`` and ``"Confidence Score"``. Entities that
        have child properties additionally carry a ``"Children"`` list built
        the same way, recursively.
    """

    def _as_node(item):
        node = {}
        node["Entity Type"] = item.type
        node["Entity Mentiontext"] = item.mention_text
        node["Confidence Score"] = item.confidence
        nested = item.properties
        # Leaf entities get no "Children" key at all (rather than an empty list).
        if nested:
            node["Children"] = list(map(_as_node, nested))
        return node

    return list(map(_as_node, json_dict.entities))


# Function to process each file and generate the structured output
def process_files(file_dict):
    """Download each document and collect its entities as a nested structure.

    Args:
        file_dict: Mapping whose keys are file names and whose values are the
            paths handed to ``documentai_json_proto_downloader``.

    Returns:
        A list with one ``{"File Name": ..., "Entities": ...}`` dict per entry
        of ``file_dict``, in iteration order.

    Note:
        The input bucket name is taken from the module-level
        ``gcs_input_path``, not from an argument.
    """
    results = []
    progress = tqdm(file_dict.items(), desc="Progress")
    for name, blob_path in progress:
        print("Processing >>>>>>>>>", name)
        # "gs://bucket/prefix" splits into ["gs:", "", "bucket", ...].
        bucket = gcs_input_path.split("/")[2]
        document = documentai_json_proto_downloader(bucket, blob_path)
        record = {"File Name": name, "Entities": get_hierarchical_data(document)}
        results.append(record)
    return results


def main(gcs_input_path, gcs_output_path):
    """Export the entities of every input document as one JSON file each.

    Args:
        gcs_input_path: GCS path passed to ``file_names`` to list the input
            files.
        gcs_output_path: GCS destination of the form
            ``gs://<bucket>[/<prefix>][/]``. A trailing slash is optional.

    Each entry produced by ``process_files`` is serialised with ``json.dumps``
    and uploaded via ``store_document_as_json`` under
    ``<prefix>/<file name>`` (or just ``<file name>`` when no prefix is given).
    """
    # Only the name -> path mapping is needed; the plain name list is unused.
    _, file_dict = file_names(gcs_input_path)
    data = process_files(file_dict)

    # "gs://bucket/a/b/" -> ["gs:", "", "bucket", "a/b/"]
    parts = gcs_output_path.split("/", 3)
    output_bucket_name = parts[2]
    # Strip surrounding slashes so that joining with "/" below never yields
    # "prefix//name" (trailing slash in the input) or "/name" (no prefix).
    output_prefix = parts[3].strip("/") if len(parts) > 3 else ""

    for entry in data:
        filename = entry["File Name"]
        blob_name = f"{output_prefix}/{filename}" if output_prefix else filename
        store_document_as_json(json.dumps(entry), output_bucket_name, blob_name)


# Run the JSON export with the paths configured in the "Input Details" cell.
main(gcs_input_path, gcs_output_path)

#### Storing the entities data in a CSV file

In [None]:
def flatten_hierarchical_data(entity, parent_type="", parent_text=""):
    """Flatten an entity and all of its descendants into a list of row dicts.

    Args:
        entity: Object providing ``type``, ``mention_text``, ``confidence``
            and ``properties`` (its child entities).
        parent_type: ``type`` of the direct parent; empty for a top-level
            entity.
        parent_text: ``mention_text`` of the direct parent; empty for a
            top-level entity.

    Returns:
        Rows in pre-order: the entity itself first, then each child followed
        by that child's own descendants. Every row records only its immediate
        parent, not the full ancestry.
    """
    own_type = entity.type
    own_text = entity.mention_text
    rows = [
        {
            "Parent Entity Type": parent_type,
            "Parent Entity Mentiontext": parent_text,
            "Entity Type": own_type,
            "Entity Mentiontext": own_text,
            "Confidence Score": entity.confidence,
        }
    ]

    children = entity.properties
    if not children:
        return rows

    for child in children:
        rows += flatten_hierarchical_data(child, own_type, own_text)
    return rows


def get_flat_data(json_dict):
    """Flatten every top-level entity of a document into one list of rows.

    Args:
        json_dict: Object exposing an ``entities`` iterable (read through
            attribute access, not dict keys).

    Returns:
        The concatenation, in entity order, of the rows that
        ``flatten_hierarchical_data`` yields for each top-level entity.
    """
    return [
        row
        for top_level in json_dict.entities
        for row in flatten_hierarchical_data(top_level)
    ]


# Function to process each file and generate the structured output
def process_files(file_dict):
    """Download each document and collect its entities as flat rows.

    Args:
        file_dict: Mapping whose keys are file names and whose values are the
            paths handed to ``documentai_json_proto_downloader``.

    Returns:
        A single list holding the rows from ``get_flat_data`` for every
        document, each row tagged with a ``"File Name"`` key.

    Note:
        The input bucket name is taken from the module-level
        ``gcs_input_path``, not from an argument.
    """
    rows = []
    progress = tqdm(file_dict.items(), desc="Progress")
    for name, blob_path in progress:
        print("Processing >>>>>>>>>", name)
        # "gs://bucket/prefix" splits into ["gs:", "", "bucket", ...].
        bucket = gcs_input_path.split("/")[2]
        document = documentai_json_proto_downloader(bucket, blob_path)
        for row in get_flat_data(document):
            # Tag the row with its source file before adding it to the result.
            row["File Name"] = name
            rows.append(row)
    return rows


def main(Gcs_input_path, local_output_csv_path):
    """Flatten the entities of every input document into a single CSV file.

    Args:
        Gcs_input_path: GCS path passed to ``file_names`` to list the input
            files. The capitalised spelling is kept so existing callers that
            pass it by keyword keep working.
        local_output_csv_path: Destination handed to ``DataFrame.to_csv``.

    Note:
        ``process_files`` derives the bucket name from the module-level
        ``gcs_input_path``, so that variable must point at the same bucket as
        ``Gcs_input_path``.
    """
    # Use the argument itself: previously the body read the module-level
    # ``gcs_input_path`` here, so whatever the caller passed was ignored.
    _, file_dict = file_names(Gcs_input_path)
    rows = process_files(file_dict)

    # index=False keeps the DataFrame's positional index out of the CSV.
    pd.DataFrame(rows).to_csv(local_output_csv_path, index=False)
    print(f"Data has been successfully saved to {local_output_csv_path}")


# Run the CSV export. This calls the `main` defined in this cell, which
# replaces the earlier JSON-export `main` of the same name.
main(gcs_input_path, local_output_csv_path)

### 5. Output

The post-processed JSON files can be found in the storage path provided by the user during script execution, which is <code>gcs_output_path</code>. The CSV file is written to the location given by <code>local_output_csv_path</code>. <br><hr>
<b>Sample json And CSV file </b><br><br>
<i><h4>Post processing results</h4></i><br>
    
<b>json  file </b>
    
<img src= "./images/image1.png" width=800 height=400>
<br><br>   
<b> CSV  file </b>
<br>
<img src= "./images/image2.png" width=800 height=400>