# PII Redaction with Synthetic Data


* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied. 


## Objective

This notebook replaces PII in Document AI parsed documents with synthetic data. It takes the parsed JSON files and the list of entity types to redact as input, and produces a PDF document in which those entities are replaced with synthetic values.


## Prerequisites

* Vertex AI Notebook Or Colab (If using Colab, use authentication)
* Storage bucket holding the input Document AI JSON files and receiving the output PDF files
* Permission For Google Storage and Vertex AI Notebook.
* Excel file which contains Synthetic data


## Step by Step procedure

### 1. Importing Required Modules

In [None]:
!pip install pandas numpy google-cloud-storage google-cloud-documentai==2.16.0 PyPDF2 configparser
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [1]:
import io
from PIL import Image
from google.cloud import documentai_v1beta3 as documentai
from typing import (
    Container,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)
import json
from tqdm import tqdm
import numpy
import cv2
from utilities import *
from typing import Tuple, List, Dict, Union, Any
import pandas as pd

### 2. Input and Output Paths

In [7]:
# Input details
# GCS folder containing the Document AI parsed JSON files
gcs_input_path = "gs://test_vb1/translation_14th_sep/english_jsons_1/"
gcs_output_path = (
    "gs://test_vb1/synthetic_data_feb20/"  # GCS folder where the redacted PDFs are saved
)
# Entity types to redact (sample list; adjust it to your own documents)
pii_entities = [
    "receiver_address",
    "receiver_email",
    "receiver_name",
    "receiver_phone",
    "receiver_tax_id",
    "receiver_website",
    "remit_to_address",
    "remit_to_name",
    "ship_from_address",
    "ship_from_name",
    "ship_to_address",
    "ship_to_name",
    "supplier_address",
    "supplier_email",
    "supplier_iban",
    "supplier_name",
    "supplier_payment_ref",
    "supplier_phone",
    "supplier_registration",
    "supplier_tax_id",
    "supplier_website",
]
# Literal text to blank out; it is only redacted and is never replaced with
# synthetic data (sample value, change it to your own text)
redact_text = [
    "Machine translated by google"
]

# Excel workbook (.xlsx) that holds the synthetic data
synthetic_data_path = "ACN Synthetic Data .xlsx"
# Name of the sheet in that workbook where the synthetic data is present
sheet_name = "data"

# Set to `True` to only cover the entities with black boxes, without writing
# any synthetic values
redact_only = False

**`gcs_input_path`** : GCS Input Path. It should contain DocAI processed output json files.        
**`gcs_output_path`** : GCS Output Path. The updated synthesized data in the pdf.         
**`project_id`** : It should contains the project id of your current project.         
**`pii_entities`** : Entity types whose mention text is redacted and replaced with synthetic data taken from the Excel file.  
**`redact_text`** : Literal text strings to locate in the document and blank out; these are only redacted, never replaced with synthetic data.  
**`synthetic_data_path`** : xlsx file which has synthetic data , column names matching entity type and corresponding values having synthetic data like below.          
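**`sheet_name`** : Sheet name where the synthetic data is present.  
**`redact_only`** : Set to `True` to only cover the entities with black boxes, without writing any synthetic values.  
**Note:** the synthetic value written for an entity is chosen at random from the column matching its entity type.  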


<img src="./Images/synthetic_data.png" width=800 height=400></img>

### 3. Run the Code

In [None]:
# functions


def get_page_bbox(entity: documentai.Document.Entity) -> Tuple[str, List[float]]:
    """
    Extract the page number and bounding box from a given Document AI entity.

    Only the first page reference of the entity's page anchor is considered.

    Args:
        entity (documentai.Document.Entity): The Document AI entity.

    Returns:
        Tuple[str, List[float]]: The page index as a string and the bounding
        box [min_x, min_y, max_x, max_y] built from the normalized vertices.

    Raises:
        IndexError: If the entity has no page reference.
        ValueError: If the bounding polygon has no normalized vertices.
    """
    page_ref = entity.page_anchor.page_refs[0]
    vertices = page_ref.bounding_poly.normalized_vertices
    xs = [vertex.x for vertex in vertices]
    ys = [vertex.y for vertex in vertices]
    bbox = [min(xs), min(ys), max(xs), max(ys)]

    # The page index is a field of the page reference itself, not of its
    # bounding polygon. Reading it from the polygon always failed and silently
    # mapped every entity to page "0". A missing/empty value still means the
    # first page.
    page = str(getattr(page_ref, "page", 0) or 0)

    return page, bbox


def get_bbox_page_wise(
    json_data: documentai.Document, pii_entities: List[str]
) -> Tuple[Dict[str, List[List[float]]], Dict[str, List[Dict[str, Any]]]]:
    """
    Extract page-wise and entity-wise bounding boxes of specified PII entities.

    A name without "/" is matched against the type of top-level entities. A
    name of the form "parent/child" is matched against the properties of
    top-level entities whose type is "parent"; a property matches when its
    type equals either the last path component or the full name.

    Args:
        json_data (documentai.Document): The Document AI output.
        pii_entities (List[str]): List of PII entity types to extract.

    Returns:
        Tuple of two dictionaries:
            - page -> list of bounding boxes [min_x, min_y, max_x, max_y];
            - entity type -> list of {"page", "bbox", "old_text"} records.
    """
    page_wise_bbox: Dict[str, List[List[float]]] = {}
    entity_wise_bbox: Dict[str, List[Dict[str, Any]]] = {}

    def _record(matched) -> None:
        """Register one matched entity (or property) in both dictionaries."""
        page, bbox = get_page_bbox(matched)
        page_wise_bbox.setdefault(page, []).append(bbox)
        # `type_` is used consistently: it is the attribute the matching below
        # already relies on.
        entity_wise_bbox.setdefault(matched.type_, []).append(
            {"page": page, "bbox": bbox, "old_text": matched.mention_text}
        )

    for pii_ent in pii_entities:
        is_nested = "/" in pii_ent
        name_parts = pii_ent.split("/")
        parent_name, child_name = name_parts[0], name_parts[-1]

        for entity in json_data.entities:
            if not is_nested:
                if entity.type_ == pii_ent:
                    _record(entity)
                continue

            if entity.type_ != parent_name:
                continue
            # An entity without properties simply contributes nothing here.
            for sub_ent in entity.properties:
                if sub_ent.type_ == child_name or sub_ent.type_ == pii_ent:
                    _record(sub_ent)

    return page_wise_bbox, entity_wise_bbox


def get_synthesized_images(json_data: documentai.Document) -> List[Image.Image]:
    """
    Build one PIL image per page from the image bytes stored in the document.

    Args:
        json_data (documentai.Document): The Document AI output whose pages
            carry their rendered image in `page.image.content`.

    Returns:
        List[Image.Image]: The decoded page images, in page order.
    """

    def _to_pil(raw: bytes) -> Image.Image:
        # Force the pixel data to be read while the in-memory buffer is open.
        with io.BytesIO(raw) as buffer:
            picture = Image.open(buffer)
            picture.load()
        return picture

    return [_to_pil(page.image.content) for page in json_data.pages]


def add_synthetic_data(
    entity_bbox: Dict[str, List[Dict[str, Any]]],
    synthesize_data: Dict[str, List[Any]],
    open_cv_image: numpy.ndarray,
    bbox_synthesize: Dict[str, Any],
) -> numpy.ndarray:
    """
    Write a randomly chosen synthetic value into one bounding box of an image.

    Every record of `entity_bbox` whose page and bbox equal those of
    `bbox_synthesize` gets a value picked at random from the list stored in
    `synthesize_data` under the same entity name. If the original text spanned
    several lines, the synthetic value is split into at most that many chunks
    and each chunk is drawn in its own horizontal band of the box.

    Args:
        entity_bbox: Entity name -> list of {"page", "bbox", "old_text"}
            records, with "bbox" in pixel coordinates.
        synthesize_data: Entity name -> list of candidate synthetic values.
        open_cv_image: Image array that is drawn on in place.
        bbox_synthesize: {"page", "bbox"} of the box being filled, with "bbox"
            as [x1, y1, x2, y2] in pixel coordinates.

    Returns:
        The same image array, after drawing.
    """
    import random

    min_font_scale = 0.5  # text is never drawn smaller than this
    font_thickness = 1
    font_color = (0, 0, 0)

    for en_name, coords in entity_bbox.items():
        if en_name not in synthesize_data:
            continue
        candidates = synthesize_data[en_name]
        if len(candidates) == 0:
            # Nothing to choose from; leave the box blank instead of failing.
            continue

        for bb2 in coords:
            if (
                bbox_synthesize["page"] != bb2["page"]
                or bbox_synthesize["bbox"] != bb2["bbox"]
            ):
                continue

            x1, y1, x2, y2 = bbox_synthesize["bbox"]
            bbox_width = x2 - x1
            bbox_height = y2 - y1
            font = cv2.FONT_HERSHEY_TRIPLEX

            # Values read from a spreadsheet may be numbers; draw them as text.
            synthesize_text = str(random.choice(candidates))

            # Ceil-divide so that no trailing characters are lost when the
            # length is not a multiple of the number of lines.
            num_lines = str(bb2["old_text"]).count("\n") + 1
            part_size = max(1, -(-len(synthesize_text) // num_lines))
            parts = [
                synthesize_text[offset : offset + part_size]
                for offset in range(0, len(synthesize_text), part_size)
            ]
            if not parts:
                continue  # empty synthetic value: nothing to draw

            # Each line gets an equal horizontal band of the box so that all
            # lines stay inside the area that was blanked out.
            band_height = bbox_height / len(parts)

            for i, part in enumerate(parts):
                font_scale = max(
                    min(bbox_width, band_height) / len(part), min_font_scale
                )
                text_size, _ = cv2.getTextSize(
                    part, font, font_scale, font_thickness
                )

                # Shrink until the text fits, but never below the minimum
                # scale; this also guarantees termination for tiny boxes.
                while font_scale > min_font_scale and (
                    text_size[0] > bbox_width or text_size[1] > band_height
                ):
                    font_scale = max(font_scale - 0.01, min_font_scale)
                    text_size, _ = cv2.getTextSize(
                        part, font, font_scale, font_thickness
                    )

                # Center the text inside its band; the y passed to putText is
                # used as the bottom of the text.
                text_x = x1 + (bbox_width - text_size[0]) // 2
                text_y = int(y1 + i * band_height + (band_height + text_size[1]) / 2)

                cv2.putText(
                    open_cv_image,
                    part,
                    (text_x, text_y),
                    font,
                    font_scale,
                    font_color,
                    font_thickness,
                    cv2.LINE_AA,
                )

    return open_cv_image


def draw_black_box(
    synthesized_images: List[Image.Image],
    page_wise_bbox: Dict[str, List[List[float]]],
    entity_wise_bbox: Dict[str, List[Dict[str, Any]]],
    synthesize_data: Dict[str, List[str]],
    redact_only: bool = False,  # New parameter to control behavior
) -> io.BytesIO:
    """
    Cover the given boxes on every page and assemble the pages into a PDF.

    In redact-only mode each box is filled with black. Otherwise each box is
    filled with white and `add_synthetic_data` is called to write a synthetic
    value into it.

    Parameters:
        synthesized_images (List[Image.Image]): Page images, in page order.
        page_wise_bbox (Dict[str, List[List[float]]]): Page index (as string)
            -> normalized boxes [min_x, min_y, max_x, max_y] to cover.
        entity_wise_bbox (Dict[str, List[Dict[str, Any]]]): Entity-wise
            records passed through to `add_synthetic_data`.
        synthesize_data (Dict[str, List[str]]): Synthetic values per entity.
        redact_only (bool): If True, only draw black boxes.

    Returns:
        io.BytesIO: In-memory PDF with one page per input image.
    """
    rgb_pages = [numpy.array(picture.convert("RGB")) for picture in synthesized_images]

    rendered_pages = []
    for page_index, canvas in enumerate(rgb_pages):
        height, width = canvas.shape[0], canvas.shape[1]
        for page, bbox_list in page_wise_bbox.items():
            if page != str(page_index):
                continue
            for bbox in bbox_list:
                # Scale the normalized box to pixel coordinates of this page.
                left = int(bbox[0] * width)
                top = int(bbox[1] * height)
                right = int(bbox[2] * width)
                bottom = int(bbox[3] * height)

                fill_color = (0, 0, 0) if redact_only else (255, 255, 255)
                cv2.rectangle(
                    canvas,
                    (left, top),
                    (right, bottom),
                    fill_color,
                    thickness=cv2.FILLED,
                )
                if redact_only:
                    continue

                target = {"page": page, "bbox": [left, top, right, bottom]}
                canvas = add_synthetic_data(
                    entity_wise_bbox, synthesize_data, canvas, target
                )
        rendered_pages.append(Image.fromarray(canvas))

    pdf_stream = io.BytesIO()
    cover_page = rendered_pages[0]
    cover_page.save(
        pdf_stream,
        format="PDF",
        save_all=True,
        append_images=rendered_pages[1:],
        resolution=100.0,
        quality=95,
        optimize=True,
    )

    return pdf_stream


def store_blob(pdf_stream: io.BytesIO, output_path: str, file_name: str) -> None:
    """
    Upload an in-memory PDF to Cloud Storage.

    The blob is named after `file_name` with its last extension replaced by
    ".pdf" and is placed under the folder given by `output_path`.

    Args:
        pdf_stream (io.BytesIO): Stream holding the PDF bytes.
        output_path (str): Destination of the form "gs://bucket/folder/".
        file_name (str): Name of the source file, e.g. "invoice.json".
    """
    from google.cloud import storage

    storage_client = storage.Client()

    path_parts = output_path.split("/")
    result_bucket = storage_client.bucket(path_parts[2])

    output_prefix = "/".join(path_parts[3:])
    # Without a trailing slash the folder name would be glued to the file name.
    if output_prefix and not output_prefix.endswith("/"):
        output_prefix += "/"

    # Strip only the last extension so that dotted names such as
    # "invoice.v2.json" keep their full stem and do not collide.
    base_name = file_name.rsplit(".", 1)[0]

    blob = result_bucket.blob(f"{output_prefix}{base_name}.pdf")
    blob.upload_from_string(pdf_stream.getvalue(), content_type="application/pdf")


def get_redact_bbox_from_text(
    text_redact: str, full_text: str, json_data: dict
) -> Dict[str, list]:
    """
    Get the bounding box coordinates for redacting specified text in a document.

    The text is located by searching `full_text` (case-insensitively) for its
    first word followed later by its last word; the end of the span is the
    first occurrence of the last word within a limited window after the start.
    All tokens whose text segment lies inside that span are merged into one
    box per page.

    Args:
        text_redact (str): The text to be redacted.
        full_text (str): The full text of the document.
        json_data (dict): The parsed document. Despite the annotation it is
            read through attributes (`.pages`, `.tokens`, `.layout`).

    Returns:
        Dict[str, list]: Page index (as string) -> list with one box
        [min_x, min_y, max_x, max_y] in normalized coordinates. Empty when the
        text is blank, is not found, or no token falls inside the span.
    """
    import re

    if not text_redact.strip():
        return {}

    words = text_redact.split(" ")
    first_word, last_word = words[0], words[-1]

    pattern = r"{}.*{}".format(re.escape(first_word), re.escape(last_word))
    match = re.search(pattern, full_text, flags=re.DOTALL | re.IGNORECASE)
    if match is None:
        return {}
    start = match.start()

    # Look for the last word close to the start. The window is at least the
    # historical 50 characters and grows with longer redaction texts.
    search_window = max(50, 2 * len(text_redact))
    last_word_match = re.search(
        re.escape(last_word),
        full_text[start : start + search_window],
        flags=re.IGNORECASE,
    )
    if last_word_match is None:
        return {}
    end = start + last_word_match.end()

    vertices_by_page: Dict[str, Dict[str, list]] = {}
    for page_num, page in enumerate(json_data.pages):
        for token in page.tokens:
            for segment in token.layout.text_anchor.text_segments:
                # A slack of 2 characters is allowed past the end of the span,
                # presumably for trailing whitespace kept in the token segment.
                if (
                    int(segment.start_index) >= start
                    and int(segment.end_index) <= end + 2
                ):
                    coords = vertices_by_page.setdefault(
                        str(page_num), {"x": [], "y": []}
                    )
                    for vertex in token.layout.bounding_poly.normalized_vertices:
                        coords["x"].append(vertex.x)
                        coords["y"].append(vertex.y)

    return {
        page_key: [
            [min(coords["x"]), min(coords["y"]), max(coords["x"]), max(coords["y"])]
        ]
        for page_key, coords in vertices_by_page.items()
        if coords["x"]
    }


def read_excel_to_dict(file_path: str, sheet_name: str = "data") -> dict:
    """
    Read the specified sheet from the Excel file into a dictionary.

    Args:
        file_path (str): Path to the Excel file.
        sheet_name (str, optional): Name of the sheet to read. Defaults to 'data'.

    Returns:
        dict: Column name -> list of the non-empty values of that column.
        Columns without any value are left out.
    """
    # Read the specified sheet from the Excel file into a pandas DataFrame
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    data_dict = {}
    for column in df.columns:
        # Columns shorter than the longest one are padded with NaN by pandas;
        # drop the padding so it can never be picked as a synthetic value.
        values = df[column].dropna().tolist()
        if values:
            data_dict[column] = values

    return data_dict


def de_normalize_bbox(entity_bbox: dict, synthesized_images: list) -> dict:
    """
    De-normalize bounding box coordinates based on the original image size.

    Each record's "bbox" is scaled by the width and height of the image whose
    index (as a string) equals the record's "page". Records whose page has no
    image are left untouched.

    Args:
        entity_bbox (dict): Entity name -> list of {"page", "bbox", ...}
            records with normalized "bbox". Modified in place.
        synthesized_images (list): Page images exposing a `size` attribute of
            (width, height), in page order.

    Returns:
        dict: The same `entity_bbox` object, now holding integer pixel boxes.
    """
    # Only the dimensions are needed, so read them from the image instead of
    # converting every page into a full RGB array.
    page_sizes = {
        str(index): image.size for index, image in enumerate(synthesized_images)
    }

    for records in entity_bbox.values():
        for record in records:
            size = page_sizes.get(record["page"])
            if size is None:
                continue
            width, height = size
            x_min, y_min, x_max, y_max = record["bbox"][:4]
            record["bbox"] = [
                int(x_min * width),
                int(y_min * height),
                int(x_max * width),
                int(y_max * height),
            ]

    return entity_bbox


def main():
    """
    Redact every parsed JSON file of the input folder and upload the PDFs.

    Reads the notebook-level settings (`gcs_input_path`, `gcs_output_path`,
    `pii_entities`, `redact_text`, `synthetic_data_path`, `sheet_name`,
    `redact_only`). For each ".json" file it collects the boxes of the PII
    entities and of the literal redaction texts, covers them (optionally
    writing synthetic values) and stores the resulting PDF.

    Entity extraction and text lookup are best-effort: a failure is reported
    and the file is still processed with whatever boxes were collected.
    """
    # `file_names` and `documentai_json_proto_downloader` are not defined in
    # this notebook; presumably they come from the `utilities` star-import.
    _, file_names_dict = file_names(gcs_input_path)

    synthesize_data = read_excel_to_dict(synthetic_data_path, sheet_name=sheet_name)
    input_bucket = gcs_input_path.split("/")[2]

    for filename, filepath in tqdm(file_names_dict.items(), desc="Progress"):
        if not filename.endswith(".json"):
            continue
        print(filename)
        json_data = documentai_json_proto_downloader(input_bucket, filepath)

        redact_bbox = {}
        # Defined up front so that a failed extraction cannot leave it unbound.
        entity_wise_bbox = {}
        try:
            page_wise_bbox, entity_wise_bbox = get_bbox_page_wise(
                json_data, pii_entities
            )
        except Exception as exc:
            print(f"Could not extract entity boxes from {filename}: {exc!r}")
        else:
            for page, boxes in page_wise_bbox.items():
                redact_bbox.setdefault(page, []).extend(boxes)

        for text in redact_text:
            # Handle each text separately so one failure does not skip the rest.
            try:
                page_wise_bbox_text = get_redact_bbox_from_text(
                    text, json_data.text, json_data
                )
            except Exception as exc:
                print(f"Could not locate {text!r} in {filename}: {exc!r}")
                continue
            for page, boxes in page_wise_bbox_text.items():
                redact_bbox.setdefault(page, []).extend(boxes)

        synthesized_images = get_synthesized_images(json_data)
        updated_entity_wise_bbox = de_normalize_bbox(
            entity_wise_bbox, synthesized_images
        )

        pdf_stream = draw_black_box(
            synthesized_images,
            redact_bbox,
            updated_entity_wise_bbox,
            synthesize_data,
            redact_only,
        )

        store_blob(pdf_stream, gcs_output_path, filename)


main()

### 4.Output

The new PDF documents with synthesized data are saved to `gcs_output_path`.