# PII Data Redaction Tool



* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied. 


## Objective

This tool takes Document AI parsed JSON files and a list of entity types that contain PII data, blacks out those entities on the page images, and saves each redacted document as a PDF.


## Prerequisites

* Vertex AI Notebook or Colab (if using Colab, authenticate first)
* Storage bucket for the input JSON files and the output PDF files
* Permissions for Google Cloud Storage and Vertex AI Notebook
* List of entities to be redacted


## Step by Step procedure

### 1. Importing Required Modules

In [None]:
!pip install pandas numpy google-cloud-storage google-cloud-documentai==2.16.0 PyPDF2 configparser
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
import io
from PIL import Image
from google.cloud import documentai_v1beta3 as documentai
from typing import (
    Container,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)
import json
from tqdm import tqdm
import numpy
import cv2
from utilities import *
from typing import Tuple, List, Dict, Union

### 2. Input and Output Paths
* In the list of entities, provide the entity types that have to be redacted. To redact a child item, specify it as `parent_item_type/child_item_type`, even if the child item's own type does not include the parent name. Example: `line_item/amount`

In [None]:
# Input details: set these values before running the cells below.
# GCS folder that holds the Document AI parsed JSON files to redact.
gcs_input_path = "gs://xxxx/xxxx/xx"  # GCS path where doc ai parsed files saved
# GCS folder for the redacted PDFs; end this path with "/" so that each PDF
# is written inside the folder.
gcs_output_path = "gs://xxx/xxx/xx/"  # GCS path to save the redacted pdfs
# Entity types to black out. Use "parent_type/child_type" for a child entity
# nested under a parent entity (for example "line_item/quantity").
pii_entities = [
    "customer_account_name",
    "supplier_name",
    "line_item/quantity",
]  # List of entities to be redacted (sample values given)

**`gcs_input_path`** : GCS input path. It should contain the DocAI processed output JSON files.  
**`gcs_output_path`** : GCS output path. The redacted PDF files are saved here.  
**`pii_entities`** : Entity types whose mention text has to be redacted (blacked out) in the output PDF.

### 3. Run the Code

In [None]:
# functions


def get_page_bbox(entity: documentai.Document.Entity) -> Tuple[str, List[float]]:
    """
    Extract the page number and bounding box from a given Document AI entity.

    Only the first page reference of the entity's page anchor is used.

    Args:
        entity (documentai.Document.Entity): The Document AI entity.

    Returns:
        Tuple[str, List[float]]: A tuple containing the zero-based page number
        (as a string) and the normalized bounding box
        [min_x, min_y, max_x, max_y].

    Raises:
        IndexError: If the entity has no page references.
        ValueError: If the bounding polygon has no normalized vertices.
    """
    page_ref = entity.page_anchor.page_refs[0]
    vertices = page_ref.bounding_poly.normalized_vertices
    x_values = [vertex.x for vertex in vertices]
    y_values = [vertex.y for vertex in vertices]
    bbox = [min(x_values), min(y_values), max(x_values), max(y_values)]

    # The page index is a field of the page reference itself, not of its
    # bounding polygon. Reading it from the polygon always failed and made
    # every entity fall back to page "0", so multi-page documents were
    # redacted on the wrong page. A missing/unset page still means page 0.
    page = str(getattr(page_ref, "page", 0) or 0)

    return page, bbox


def get_bbox_page_wise(
    json_data: documentai.Document, pii_entities: List[str]
) -> Dict[str, List[List[Union[float, str]]]]:
    """
    Collect the bounding boxes of the requested PII entities, grouped by page.

    A plain entry in ``pii_entities`` is matched against top-level entity
    types. An entry of the form ``parent/child`` is matched against the
    properties of entities whose type equals ``parent``; a property matches
    when its type is either ``child`` or the full ``parent/child`` string.

    Args:
        json_data (documentai.Document): The Document AI output.
        pii_entities (List[str]): Entity types to collect boxes for.

    Returns:
        Dict[str, List[List[Union[float, str]]]]: Mapping of page number to
        the list of bounding boxes found on that page.
    """
    boxes_by_page = {}

    def record(target):
        # Append the target's box to the list of its page, creating it on demand.
        page_key, box = get_page_bbox(target)
        boxes_by_page.setdefault(page_key, []).append(box)

    for wanted in pii_entities:
        is_nested = "/" in wanted
        for entity in json_data.entities:
            if not is_nested:
                if entity.type_ == wanted:
                    record(entity)
                continue

            parent_type = wanted.split("/")[0]
            if not entity.properties or entity.type_ != parent_type:
                continue

            child_type = wanted.split("/")[-1]
            for child in entity.properties:
                if child.type_ in (child_type, wanted):
                    record(child)

    return boxes_by_page


def get_synthesized_images(json_data: documentai.Document) -> List[Image.Image]:
    """
    Decode the image of every page in the document into a PIL Image.

    Args:
        json_data (documentai.Document): The Document AI output whose pages
            carry their rendered image bytes in ``page.image.content``.

    Returns:
        List[Image.Image]: One fully loaded PIL Image per page, in page order.
    """

    def _to_image(raw: bytes) -> Image.Image:
        buffer = io.BytesIO(raw)
        try:
            picture = Image.open(buffer)
            # Force the pixel data to be read before the buffer is closed.
            picture.load()
        finally:
            buffer.close()
        return picture

    return [_to_image(page.image.content) for page in json_data.pages]


def draw_black_box(
    synthesized_images: List[Image.Image],
    page_wise_bbox: Dict[str, List[List[Union[float, str]]]],
) -> io.BytesIO:
    """
    Draw black boxes on images for specified PII entity bounding boxes and compile into a PDF.

    Args:
        synthesized_images (List[Image.Image]): List of PIL Image objects, one
            per page, in page order.
        page_wise_bbox (Dict[str, List[List[Union[float, str]]]]): Page-wise
            bounding boxes for PII entities. Keys are zero-based page numbers
            (strings or integers); each box is normalized
            [min_x, min_y, max_x, max_y].

    Returns:
        io.BytesIO: PDF stream containing images with black boxes drawn.

    Raises:
        ValueError: If ``synthesized_images`` is empty, since a PDF cannot be
            built without at least one page.
    """
    if not synthesized_images:
        raise ValueError("No page images available to build the redacted PDF.")

    # Index the boxes by stringified page number so that both "1" and 1 match
    # page index 1; a key type mismatch would otherwise silently skip redaction.
    boxes_by_page = {}
    for page, bbox_list in page_wise_bbox.items():
        boxes_by_page.setdefault(str(page), []).extend(bbox_list)

    redacted_pages = []
    for index, image in enumerate(synthesized_images):
        pixels = numpy.array(image.convert("RGB"))
        height, width = pixels.shape[:2]
        for bbox in boxes_by_page.get(str(index), []):
            min_x, min_y, max_x, max_y = bbox[:4]
            # Scale the normalized coordinates to pixel coordinates.
            cv2.rectangle(
                pixels,
                (int(min_x * width), int(min_y * height)),
                (int(max_x * width), int(max_y * height)),
                (0, 0, 0),
                thickness=cv2.FILLED,
            )
        redacted_pages.append(Image.fromarray(pixels))

    pdf_stream = io.BytesIO()
    # The first page drives the save; the remaining pages are appended to it.
    redacted_pages[0].save(
        pdf_stream,
        save_all=True,
        append_images=redacted_pages[1:],
        resolution=100.0,
        quality=95,
        optimize=True,
        format="PDF",
    )

    return pdf_stream


def store_blob(pdf_stream: io.BytesIO, output_path: str, file_name: str) -> None:
    """
    Upload a PDF stream to Cloud Storage under the given output path.

    The object is named after ``file_name`` with its extension replaced by
    ``.pdf`` and is placed inside the folder given by ``output_path``.

    Args:
        pdf_stream (io.BytesIO): In-memory PDF content to upload.
        output_path (str): Destination in the form ``gs://bucket/prefix/``;
            the trailing "/" is optional.
        file_name (str): Name of the source file, e.g. ``invoice.json``.
    """
    import os

    from google.cloud import storage

    storage_client = storage.Client()

    # "gs://bucket/a/b/".split("/") -> ["gs:", "", "bucket", "a", "b", ""]
    path_parts = output_path.split("/")
    result_bucket = storage_client.bucket(path_parts[2])

    output_prefix = "/".join(path_parts[3:])
    # Without a trailing "/" the folder name would be fused with the file
    # name ("out" + "a.pdf" -> "outa.pdf"), so add the separator when missing.
    if output_prefix and not output_prefix.endswith("/"):
        output_prefix += "/"

    # Strip only the last extension so that names containing dots
    # ("invoice.v2.json") are not truncated to their first segment.
    pdf_name = os.path.splitext(file_name)[0] + ".pdf"

    blob = result_bucket.blob(f"{output_prefix}{pdf_name}")
    blob.upload_from_string(pdf_stream.getvalue(), content_type="application/pdf")


def main():
    """
    Redact every JSON file found under ``gcs_input_path``.

    For each file whose name ends with ".json", the document is downloaded,
    the bounding boxes of the entities listed in ``pii_entities`` are
    collected, black boxes are drawn over them on the page images, and the
    resulting PDF is uploaded to ``gcs_output_path``. A failure on one file
    is reported and does not stop the remaining files.

    ``file_names`` and ``documentai_json_proto_downloader`` are expected to
    come from the ``utilities`` module imported at the top of the notebook.
    """
    _, file_names_dict = file_names(gcs_input_path)
    input_bucket = gcs_input_path.split("/")[2]
    for filename, filepath in tqdm(file_names_dict.items(), desc="Progress"):
        # Only real JSON files are processed; names that merely contain
        # ".json" (e.g. "a.json.bak") are skipped.
        if not filename.endswith(".json"):
            continue
        print(filename)
        try:
            json_data = documentai_json_proto_downloader(input_bucket, filepath)
            page_wise_bbox = get_bbox_page_wise(json_data, pii_entities)
            synthesized_images = get_synthesized_images(json_data)
            pdf_stream = draw_black_box(synthesized_images, page_wise_bbox)
            store_blob(pdf_stream, gcs_output_path, filename)
        except Exception as e:
            # Best-effort batch run: report the reason and move on.
            print(f"unable to redact the file: {filename} ({type(e).__name__}: {e})")
            continue


# Run the redaction over the configured input path when this cell executes.
main()

### 4. Output

* The new PDF documents with the redacted entities will be saved in `gcs_output_path`.

* The listed entities will be blacked out, and one PDF per input JSON file will be saved in the GCS output folder provided.

<img src="./Images/Output.png" width=800 height=400></img>