# Watermarks and Line Removal

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.

# Objective
This Python script removes half-tone (grey) watermarks and straight lines from documents and images (PDFs, JPGs, JPEGs, etc.) using image-processing techniques. Its purpose is to automate the pre-processing step of eliminating visible watermarks and lines present in the images. By default only PDF and TIFF inputs are processed (see `expected_document_type` in the Input Details section).

# Prerequisites
* Vertex AI Notebook or Google Colab
* A GCS bucket that holds the input files and receives the output files

# Step-by-Step Procedure

## 1. Import Modules/Packages

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-storage
!pip install google-cloud-documentai

In [None]:
from mimetypes import MimeTypes
import io
from typing import ByteString
import cv2
import numpy as np
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from PIL import Image
import fitz
from utilities import file_names, store_document_as_json

## 2. Input Details

* **INPUT_PATH** : GCS folder path that contains the input files (PDF or TIFF documents) to be processed
* **OUTPUT_PATH** : GCS folder path where the processed page images are stored
* **expected_document_type** : List of MIME types of the documents to process; files of any other type are skipped

In [None]:
# MIME types of the documents that will be processed.
expected_document_type = ["application/pdf", "image/tiff"]
# GCS folder that holds the input files.
INPUT_PATH = "gs://{bucket-name}/{sub-folders-path}/"
# GCS folder where the output files will be saved.
OUTPUT_PATH = "gs://{bucket-name}/{sub-folders-path}"

## 3. Run the Code Cells Below

In [None]:
# Shared MIME-type guesser used to classify each input file by its name.
mime = MimeTypes()
# In "gs://<bucket>/<path>", the bucket is the third "/"-separated component.
input_bucket_name = INPUT_PATH.split("/", 3)[2]
output_bucket_name = OUTPUT_PATH.split("/", 3)[2]


def read_image_from_bytecode(image_bytecode: bytes) -> np.ndarray:
    """
    Decode encoded image bytes into an OpenCV image array.

    Parameters:
    image_bytecode (bytes): The encoded content of an image file.

    Returns:
    numpy.ndarray: The decoded image in BGR format.

    Raises:
    ValueError: If the input is empty or cannot be decoded as an image.
    """
    if not image_bytecode:
        raise ValueError("Cannot decode an image from empty input")

    # View the raw bytes as a flat uint8 buffer for the decoder.
    nparr = np.frombuffer(image_bytecode, np.uint8)
    # Decode image from numpy array
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    # imdecode reports an undecodable buffer by returning None, not by raising;
    # fail here so callers get a clear error instead of a later cryptic one.
    if image is None:
        raise ValueError("Failed to decode image")
    return image


def store_image_to_bytecode(image: np.ndarray, file_format: str = ".png") -> bytes:
    """
    Serialize an image array into the bytes of an encoded image file.

    Parameters:
    image (numpy.ndarray): The image to be encoded.
    file_format (str): File extension selecting the encoder. Default is '.png'.

    Returns:
    bytes: The encoded image content.

    Raises:
    ValueError: If OpenCV reports that the encoding failed.
    """
    encoded_ok, encoded = cv2.imencode(file_format, image)
    if encoded_ok:
        return encoded.tobytes()
    raise ValueError("Failed to encode image")


def remove_watermark_image(doc_byte: ByteString, kernel_size: int = 2) -> bytes:
    """
    Remove watermark from an image given as byte data.

    A background estimate is built from the grayscale image with a
    morphological close followed by an open. Pixels that differ from that
    estimate are isolated with an Otsu threshold and masked out of the
    original image.

    Parameters:
    - doc_byte (bytes): Byte data of the image file.
    - kernel_size (int): Side length, in pixels, of the elliptical structuring
      element used for the morphological operations (default is 2).

    Returns:
    - bytes: PNG-encoded byte data of the image with watermark removed.

    Raises:
    - ValueError: If kernel_size is not positive, or if the image cannot be
      decoded or encoded.
    """
    if kernel_size < 1:
        raise ValueError("kernel_size must be a positive integer")

    # Read the image from bytecode
    image = read_image_from_bytecode(doc_byte)
    if image is None:
        raise ValueError("Failed to decode image")

    # Convert the image to grayscale
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Define a kernel for morphology operations
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))

    # Estimate the background; morphologyEx returns a new array, so the
    # grayscale image is left untouched for the subtraction below.
    background = cv2.morphologyEx(grayscale, cv2.MORPH_CLOSE, kernel)
    background = cv2.morphologyEx(background, cv2.MORPH_OPEN, kernel)

    # Calculate the difference between the background and the grayscale image
    difference = cv2.subtract(background, grayscale)

    # Apply inverted Otsu thresholding to create a binary mask of the pixels to keep
    _, binary = cv2.threshold(
        difference, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )

    # Keep only the pixels of the original image where the mask is non-zero
    result = cv2.bitwise_and(image, image, mask=binary)

    # Encode through the shared helper so an encoding failure is reported
    # instead of being silently ignored.
    return store_image_to_bytecode(result, ".png")


def remove_straight_lines(doc_byte: ByteString) -> ByteString:
    """
    Remove straight horizontal lines from an image given as byte data.

    Line segments are detected with a probabilistic Hough transform on the
    Canny edges of the blurred grayscale image. Every segment whose end points
    differ by less than 10 pixels vertically is painted over in white.

    Parameters:
    - doc_byte (bytes): Byte data of the image file.

    Returns:
    - bytes: PNG-encoded byte data of the image with straight horizontal lines
      painted over, or the unchanged input bytes when no line segment is
      detected at all.

    Raises:
    - ValueError: If the image cannot be decoded or encoded.
    """
    # Read the image from bytecode
    image = read_image_from_bytecode(doc_byte)
    if image is None:
        raise ValueError("Failed to decode image")
    # Convert image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Apply Gaussian blur to reduce noise
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # Apply Canny edge detection
    edged = cv2.Canny(blurred, 50, 150, apertureSize=3)
    # Detect lines using Hough transform
    lines = cv2.HoughLinesP(
        edged, 1, np.pi / 180, threshold=100, minLineLength=200, maxLineGap=10
    )
    if lines is None:
        # Nothing detected: hand back the input untouched.
        return doc_byte

    # Each detected segment is stored as (x1, y1, x2, y2).
    for segment in lines.reshape(-1, 4):
        x1, y1, x2, y2 = (int(value) for value in segment)
        # Check if the line is nearly horizontal (thresholded by y-axis difference)
        if abs(y2 - y1) < 10:
            # Draw a white line over the detected straight line
            cv2.line(image, (x1, y1), (x2, y2), (255, 255, 255), 2)

    # The image is still in BGR order, which is what the OpenCV encoder
    # expects. Converting to RGB first would swap red and blue in the output.
    return store_image_to_bytecode(image)


def document_downloader(bucket_name: str, blob_name_with_prefix_path: str) -> bytes:
    """
    Download a file from a Google Cloud Storage bucket and return its raw content.

    Args:
        bucket_name (str): The name of the GCS bucket from which to download the file.
        blob_name_with_prefix_path (str): The full object name (including any
            folder prefix) of the blob inside the bucket.

    Returns:
        bytes: The content of the blob exactly as stored; no parsing is performed.
    """

    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name_with_prefix_path)
    return blob.download_as_bytes()


def convert_pdf_to_bytecode(pdf_bytes: bytes, dpi: int = 400) -> list:
    """
    Convert each page of a PDF document to byte data in JPEG format.

    Parameters:
    - pdf_bytes (bytes): Byte data of the PDF document.
    - dpi (int): Dots per inch for rendering PDF pages (default is 400).

    Returns:
    - list: List of byte data for each page of the PDF as JPEG images,
      in page order.
    """
    # The zoom factor is the requested DPI relative to a 72 DPI baseline.
    # It is the same for every page, so the matrix is built once.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    bytecode_list = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=mat)
            # Get the image bytes directly from the Pixmap
            img_bytes = pix.tobytes("jpeg")
            # NOTE(review): the page is JPEG-encoded a second time by Pillow
            # below. This is kept so the produced bytes stay as before;
            # consider returning img_bytes directly to avoid the extra lossy pass.
            byte_array_converted = io.BytesIO()
            with Image.open(io.BytesIO(img_bytes)) as image:
                image.save(byte_array_converted, format="JPEG")
            bytecode_list.append(byte_array_converted.getvalue())
    return bytecode_list


def convert_docs_to_bytecode(doc: bytes, dpi=(400, 400)) -> list:
    """
    Split a (possibly multi-frame) image document, such as a multi-page TIFF,
    into a list of single-frame TIFF byte strings.

    Parameters:
    - doc (bytes): Byte data of the document.
    - dpi (tuple): DPI settings in the form of (dpi_x, dpi_y) passed to the
      TIFF writer for every frame (default is (400, 400)).

    Returns:
    - list: List of TIFF byte data, one entry per frame, in frame order.
    """
    bytecode_list = []
    with Image.open(io.BytesIO(doc)) as img:
        # The image plugin may not define n_frames for a single-frame input;
        # treat such an image as having exactly one frame.
        frame_count = getattr(img, "n_frames", 1)
        for frame_index in range(frame_count):
            img.seek(frame_index)  # Go to the requested frame
            # Save the current frame to a BytesIO object with specified DPI and format
            byte_array = io.BytesIO()
            img.save(byte_array, format="TIFF", dpi=dpi)
            bytecode_list.append(byte_array.getvalue())
    return bytecode_list


def reduce_image_size(image_bytes, output_format="JPEG"):
    """
    Optimize the size of an image without significantly affecting its quality.

    The pixels are copied into a fresh image so that metadata attached to the
    source (e.g. EXIF) is not carried over, then saved with optimization
    enabled.

    :param image_bytes: Bytes of the input image.
    :param output_format: Desired output format (e.g., 'JPEG', 'PNG'). Default is 'JPEG'.
    :return: Bytes of the optimized image.
    """
    # Image modes that can be written as JPEG without conversion.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    is_jpeg = output_format.upper() == "JPEG"

    output_buffer = io.BytesIO()
    with Image.open(io.BytesIO(image_bytes)) as img:
        # Modes outside jpeg_modes (e.g. RGBA, LA, P) would make the JPEG save
        # fail, so convert those to RGB first.
        if is_jpeg and img.mode not in jpeg_modes:
            pixels = img.convert("RGB")
        else:
            pixels = img
        # Copy the raw pixel buffer in one step. Going through a Python list of
        # per-pixel values is extremely slow and memory hungry for the
        # multi-megapixel pages rendered at 400 DPI.
        img_without_metadata = Image.frombytes(
            pixels.mode, pixels.size, pixels.tobytes()
        )
        # Optimize and save the image
        if is_jpeg:
            img_without_metadata.save(
                output_buffer, format=output_format, quality=95, optimize=True
            )
        else:
            img_without_metadata.save(
                output_buffer, format=output_format, optimize=True
            )
    return output_buffer.getvalue()


# Driver: for every supported file under INPUT_PATH, split it into page
# images, remove watermarks and horizontal lines from each page, and upload
# the result as "<OUTPUT_PATH>/<file stem>/<file stem>_<page index>.jpeg".
document_paths = list(file_names(INPUT_PATH)[1].values())

# Object-name prefix of the output folder (everything after gs://<bucket>/).
# Empty segments are dropped so a trailing "/" in OUTPUT_PATH cannot produce
# "//" and a bucket-only OUTPUT_PATH cannot produce a leading "/".
output_prefix_parts = [part for part in OUTPUT_PATH.split("/")[3:] if part]

for document_path in document_paths:
    file_name = document_path.split("/")[-1]
    print(file_name)
    # Strip only the final extension so that e.g. "a.v1.pdf" and "a.v2.pdf"
    # do not collapse into the same output folder.
    folder_name = file_name.rsplit(".", 1)[0]
    output_full_path = "/".join(output_prefix_parts + [folder_name])
    doc_mime_type = mime.guess_type(document_path)
    if doc_mime_type[0] not in expected_document_type:
        print("[x] Not an expected document type : ", document_path)
        continue
    document_bytes = document_downloader(input_bucket_name, document_path)
    print(doc_mime_type)

    # Only the page-splitting step depends on the document type.
    if doc_mime_type[0] == "application/pdf":
        document_byte_list = convert_pdf_to_bytecode(document_bytes)
    else:
        document_byte_list = convert_docs_to_bytecode(document_bytes)

    for page_index, document_byte in enumerate(document_byte_list):
        removed_watermark_bytes = remove_watermark_image(document_byte)
        removed_straight_bytes = remove_straight_lines(removed_watermark_bytes)

        # The cleaning steps return PNG-encoded bytes. Re-encode to JPEG for
        # every document type so the content matches the ".jpeg" object name.
        reduced_image_bytes = reduce_image_size(removed_straight_bytes)

        # NOTE(review): store_document_as_json is reused here as a generic
        # uploader for image bytes; its name suggests it targets JSON, so
        # confirm the content type it sets is acceptable for images.
        store_document_as_json(
            reduced_image_bytes,
            output_bucket_name,
            f"{output_full_path}/{folder_name}_{page_index}.jpeg",
        )

## 4. Output Details

Refer to the images below for sample input and output files.

<table>
    <tr>
        <td>
            <b>Input Image</b>
        </td>
        <td>
            <b>Output Image</b>
        </td>
    </tr>
    <tr>
        <td>
            <img src='./Images/INPUT_1.png' width=400 height=600 alt='input_1'></img>
        </td>
        <td>
            <img src='./Images/OUTPUT_1.png' width=400 height=600 alt='output_1'></img>
        </td>
    </tr>
    <tr>
        <td>
            <img src='./Images/INPUT_2.png' width=400 height=600 alt='input_2'></img>
        </td>
        <td>
            <img src='./Images/OUTPUT_2.png' width=400 height=600 alt='output_2'></img>
        </td>
    </tr>
</table>
    