# Amount rectification from words to numbers


* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied. 


## Objective

This document provides instructions for rectifying the amount-in-figures entity of cheques using the amount written in words. It takes the relevant entity types and the Document AI parsed JSON files as input.

<b>Note: This tool was developed to address output produced by a processor trained on a dataset of cheques.</b>


## Prerequisites

* Vertex AI Notebook or Colab (if using Colab, authenticate first)
* Storage bucket for storing the input and output JSON files
* Permissions for Google Cloud Storage and Vertex AI Notebook



## Step by Step procedure

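### 1. Importing Required Modules

<b>Note:</b> the code below also imports `word2number`, `fuzzywuzzy`, `indian_word2number` and `google.cloud.translate_v2`, which are not listed in the install command that follows. Install the packages that provide them if they are missing from your environment.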

In [None]:
!pip install pandas numpy google-cloud-storage google-cloud-documentai==2.16.0 PyPDF2 configparser
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [1]:
from word2number import w2n
import difflib
from fuzzywuzzy import fuzz
import re
from indian_word2number import indian_w2n
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from pathlib import Path
from utilities import *
from typing import Any, List, Dict

### 2. Input and Output Paths

<b>Modify the `amount_in_words_entity_name` and `amount_in_figures_entity_name` lists to fit your use case.</b>

In [3]:
# Run configuration: replace every "xxxx" placeholder before executing the notebook.
input_path = "gs://xxxx/xxxx/xxx/"  # GCS folder holding the DocAI parsed JSON files
output_path = "gs://xxxx/xxxx/xxx/"  # GCS folder where the updated documents are saved
project_id = "xxxx-xxxx-xxx"  # Google Cloud project ID
amount_in_words_entity_name = [
    "amount_in_words",
    "amountInWords1",
    "amountInWords2",
    "amountInWords",
]  # entity types that carry the amount written in words
amount_in_figures_entity_name = [
    "amount_in_figures",
    "amount",
]  # entity types that carry the amount written in figures

* `input_path` : GCS Input Path. It should contain DocAI processed output json files. 
* `output_path` : GCS Output Path. The updated jsons will be saved in output path. 
* `project_id` : It should contain the project ID of your current project.
* `amount_in_words_entity_name` :  entity name for letters
* `amount_in_figures_entity_name` : entity name for figures


### 3. Run the Code

In [None]:
# functions


def translate_text(target: str, text: str) -> dict:
    """
    Translate ``text`` into the ``target`` language with the Cloud Translation v2 client.

    Args:
        target (str): ISO 639-1 code of the language to translate into.
                      See https://g.co/cloud/translate/v2/translate-reference#supported_languages
        text (str): The text to translate; ``bytes`` input is decoded as UTF-8 first.

    Returns:
        dict: The result returned by the client's ``translate`` call. The batch
        loop in this notebook reads its ``"translatedText"`` entry.
    """
    # Function-scope import: the translate client is only referenced here.
    from google.cloud import translate_v2 as translate

    client = translate.Client()
    payload = text.decode("utf-8") if isinstance(text, bytes) else text
    return client.translate(payload, target_language=target)


def convert_amount_in_words_to_numbers(text: str) -> int:
    """
    Convert an amount written in words to its numerical value.

    The text is routed to ``indian_w2n`` when it mentions "lakh" or "crore"
    and to ``w2n`` otherwise. The check is case-insensitive so that inputs
    such as "Five Lakh" are also recognised as Indian-system amounts.

    Args:
        text (str): The input text containing the amount written in words.

    Returns:
        int: The numerical value produced by the selected ``word_to_num`` parser.

    Raises:
        Whatever the selected parser raises for text it cannot interpret
        (NOTE(review): word2number documents ``ValueError`` — confirm for
        ``indian_w2n``).
    """
    lowered = text.lower()
    uses_indian_scale = any(word in lowered for word in ("lakh", "crore"))
    parser = indian_w2n if uses_indian_scale else w2n
    # The original (non-lowered) text is handed to the parser unchanged.
    return parser.word_to_num(text)


def remove_repetitive_words(text: str) -> str:
    """
    Collapse runs of the same consecutive word into a single occurrence.

    The comparison is case-insensitive and the first occurrence (with its
    original casing) is kept. A run of any length is collapsed in one pass,
    e.g. "one one one hundred" becomes "one hundred".

    Args:
        text (str): The input text containing potentially repeated consecutive words.

    Returns:
        str: The text with consecutive duplicate words removed.
    """
    # "(?:\s+\1\b)+" consumes every further repetition; matching only a single
    # pair would leave one duplicate behind for runs of three or more words,
    # because re.sub does not rescan text it has already replaced.
    return re.sub(r"\b(\w+)(?:\s+\1\b)+", r"\1", text, flags=re.IGNORECASE)


def words_nearest_match(text: str) -> str:
    """
    Map every word of an amount-in-words string to its nearest number word.

    Punctuation (``, . - _``) is replaced by spaces, filler words ("only",
    "Rupees", "and" and close misspellings of them) are dropped, and each
    remaining word is replaced by the closest entry of a fixed vocabulary of
    English / Indian number words. Words without a close match are discarded.

    When the final word is a unit ("one" .. "ten"), the word before it is
    re-matched against the tens / scale words ("twenty", "hundred", "lakh",
    ...). If that re-match finds nothing, the word is left as it was.

    Args:
        text (str): The input text containing words to find matches for.

    Returns:
        str: The matched number words joined by single spaces.

    Raises:
        IndexError: If no word of ``text`` matches a number word. The batch
            loop in this notebook catches ``IndexError`` to skip such files.
    """
    unit_words = (
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
    )
    teen_words = (
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen",
        "eighteen",
        "nineteen",
    )
    tens_and_scale_words = (
        "twenty",
        "thirty",
        "forty",
        "fifty",
        "sixty",
        "seventy",
        "eighty",
        "ninety",
        "lakh",
        "crore",
        "hundred",
        "thousand",
        "million",
        "billion",
    )
    number_words = unit_words + teen_words + tens_and_scale_words

    # Tokens are lower-cased before comparison, so "Rupees" is never hit by the
    # exact membership test; it is filtered by the fuzzy comparison instead.
    unwanted_list = ["only", "Rupees", "and"]
    similarity_threshold = 75

    def find_nearest_match(word: str, word_list) -> str:
        """
        Return the closest entry of ``word_list`` for ``word``, or None.

        Args:
            word (str): The word for which to find the nearest match.
            word_list: The candidate words to search.

        Returns:
            str: The nearest match, or None if no candidate is close enough.
        """
        lowered = word.lower()
        candidates = difflib.get_close_matches(lowered, word_list, n=1, cutoff=0.6)
        if not candidates:
            return None
        best = candidates[0]
        # "one" is only accepted when the word literally contains "one".
        if best == "one" and "one" not in lowered:
            return None
        return best

    def is_filler(word: str) -> bool:
        """Return True when ``word`` equals or closely resembles a filler word."""
        if word in unwanted_list:
            return True
        return any(
            fuzz.ratio(word, unwanted) >= similarity_threshold
            for unwanted in unwanted_list
        )

    cleaned = re.sub(r"[,.\-_]", " ", text)

    matched_words = []
    for raw_word in cleaned.split():
        word = raw_word.lower()
        if is_filler(word):
            continue
        match = find_nearest_match(word, number_words)
        if match is not None:
            matched_words.append(match)

    if not matched_words:
        # Same exception type the function has always raised for this case.
        raise IndexError("no number words recognised in the amount text")

    # A word directly before a trailing unit is re-matched against the tens /
    # scale vocabulary. Skip this for a single-word amount, and keep the
    # current word when the re-match fails: a None entry would break the join.
    if len(matched_words) >= 2 and matched_words[-1] in unit_words:
        replacement = find_nearest_match(matched_words[-2], tens_and_scale_words)
        if replacement is not None:
            matched_words[-2] = replacement

    return " ".join(matched_words)


def get_amount_in_text(json_data: Any, amount_in_words_entity_name: List[str]) -> str:
    """
    Collect the amount-in-words entities of a document and merge them in reading order.

    Every text segment of every entity whose type is listed in
    ``amount_in_words_entity_name`` is passed to ``text_sequenced``. The
    resulting fragments are ordered by position and joined with single spaces:
    fragments on the topmost line (within 0.05 of the smallest ``ent_y``) come
    first, left to right, followed by lower fragments ordered top to bottom
    and then left to right.

    Args:
        json_data (Any): Document object exposing ``entities``; each entity is
            read for ``type``, ``mention_text`` and ``text_anchor.text_segments``.
        amount_in_words_entity_name (List[str]): Entity types that hold the amount in words.

    Returns:
        str: The merged text, or an empty string when no matching entity exists.
    """
    line_tolerance = 0.05

    fragments = []
    for entity in json_data.entities:
        if entity.type not in amount_in_words_entity_name:
            continue
        for segment in entity.text_anchor.text_segments:
            fragments.append(
                text_sequenced(
                    json_data,
                    entity.mention_text,
                    segment.start_index,
                    segment.end_index,
                )
            )

    if not fragments:
        return ""

    # Computed once instead of once per sort-key evaluation.
    top_y = min(fragment["ent_y"] for fragment in fragments)

    def reading_order(fragment: Dict[str, Any]) -> tuple:
        # The leading 0/1 keeps the two groups apart so that an x coordinate
        # is never compared against a y coordinate.
        if fragment["ent_y"] - top_y > line_tolerance:
            return (1, fragment["ent_y"], fragment["ent_x"])
        return (0, fragment["ent_x"], fragment["ent_y"])

    ordered = sorted(fragments, key=reading_order)
    return " ".join(fragment["text"] for fragment in ordered)


def get_min_max_xy(
    json_temp: Any, start_index_1: str, end_index_1: str
) -> Dict[str, float]:
    """
    Return the bounding extremes of the token that corresponds to a text range.

    A token is selected when the start and end of its first text segment are
    each within 2 characters of the requested indices, or when that segment
    lies entirely inside the requested range. If several tokens qualify, the
    last one encountered (pages, then tokens, in document order) is used.

    Args:
        json_temp (Any): Document object exposing ``pages`` with ``tokens``.
        start_index_1 (str): Start index of the text range (anything ``int()`` accepts).
        end_index_1 (str): End index of the text range (anything ``int()`` accepts).

    Returns:
        Dict[str, float]: ``min_x``, ``min_y``, ``max_x`` and ``max_y`` of the
        selected token's normalized vertices.

    Raises:
        ValueError: If no token qualifies or the selected token has no vertices.
        IndexError: If a token has no text segments.
    """
    tolerance = 2
    range_start = int(start_index_1)
    range_end = int(end_index_1)

    xs: List[float] = []
    ys: List[float] = []
    for page in json_temp.pages:
        for token in page.tokens:
            segment = token.layout.text_anchor.text_segments[0]
            token_start = int(segment.start_index)
            token_end = int(segment.end_index)

            is_near = (
                abs(range_start - token_start) <= tolerance
                and abs(range_end - token_end) <= tolerance
            )
            is_inside = token_start >= range_start and token_end <= range_end
            if is_near or is_inside:
                vertices = token.layout.bounding_poly.normalized_vertices
                xs = [vertex.x for vertex in vertices]
                ys = [vertex.y for vertex in vertices]

    if not xs or not ys:
        raise ValueError(
            f"no token with coordinates found for text range {range_start}-{range_end}"
        )

    return {"min_x": min(xs), "min_y": min(ys), "max_x": max(xs), "max_y": max(ys)}


def text_sequenced(
    json_temp: Any, temp_text: str, start_index_temp: int, end_index_temp: int
) -> Dict[str, Any]:
    """
    Re-order the words of an entity's text by their position on the page.

    Each whitespace-separated word of ``temp_text`` is searched (literally and
    case-insensitively) in ``json_temp.text[start_index_temp:end_index_temp]``.
    Every occurrence is located on the page with ``get_min_max_xy``. When
    several words resolve to the same ``(min_x, min_y)`` position, the longest
    word is kept. The located words are then ordered: words on the line of the
    first located word (within 0.05 in y) come first, left to right, followed
    by lower words ordered top to bottom and then left to right.

    Args:
        json_temp (Any): Document object exposing ``text`` and the page tokens
            read by ``get_min_max_xy``.
        temp_text (str): The text to process and sequence.
        start_index_temp (int): The start index of the text segment.
        end_index_temp (int): The end index of the text segment.

    Returns:
        Dict[str, Any]: ``{"text", "ent_y", "ent_x"}`` where ``text`` is the
        re-ordered words joined by spaces and ``ent_x`` / ``ent_y`` are the
        smallest ``min_x`` / ``min_y`` of the located words. When no word can
        be located, the words of ``temp_text`` are returned in their given
        order with both coordinates set to 0.0.
    """
    line_tolerance = 0.05
    words = temp_text.split()
    segment_text = json_temp.text[start_index_temp:end_index_temp].lower()

    # Keyed by page position so that the longest word wins when several words
    # resolve to the same token (e.g. "one" found inside a longer word).
    located: Dict[tuple, Dict[str, Any]] = {}
    for word in words:
        # The word is data, not a pattern: escape it so that characters such
        # as "(" or "*" neither raise re.error nor act as wildcards.
        pattern = re.escape(word.lower())
        try:
            # finditer yields matches left to right, which keeps the insertion
            # order (and so the reference line below) deterministic.
            for match in re.finditer(pattern, segment_text):
                start = match.start() + start_index_temp
                bounds = get_min_max_xy(json_temp, start, start + len(word))
                position = (bounds["min_x"], bounds["min_y"])
                current = located.get(position)
                if current is None or len(word) > len(current["mt"]):
                    located[position] = {
                        "mt": word,
                        "min_x": bounds["min_x"],
                        "min_y": bounds["min_y"],
                    }
        except (ValueError, IndexError):
            # Best effort: a word whose token cannot be located is skipped,
            # together with its remaining occurrences.
            continue

    records = list(located.values())
    if not records:
        # Nothing could be placed on the page; fall back to the given order.
        return {"text": " ".join(words), "ent_y": 0.0, "ent_x": 0.0}

    reference_y = records[0]["min_y"]

    def reading_order(record: Dict[str, Any]) -> tuple:
        # The leading 0/1 keeps the two groups apart so that an x coordinate
        # is never compared against a y coordinate.
        if record["min_y"] - reference_y > line_tolerance:
            return (1, record["min_y"], record["min_x"])
        return (0, record["min_x"], record["min_y"])

    ordered = sorted(records, key=reading_order)

    return {
        "text": " ".join(record["mt"] for record in ordered),
        "ent_y": min(record["min_y"] for record in ordered),
        "ent_x": min(record["min_x"] for record in ordered),
    }


def change_mt_amount(
    json_data: Any,
    amount_in_figures_entity_name: List[str],
    cheque_amount_predicted: float,
) -> Any:
    """
    Overwrite the mention text of the amount-in-figures entity with the predicted amount.

    Only the first entity whose type is listed in
    ``amount_in_figures_entity_name`` is updated. The document is modified in
    place and is always returned, including when no entity matches or when the
    predicted amount converts to an empty string.

    Args:
        json_data (Any): Document object exposing ``entities``.
        amount_in_figures_entity_name (List[str]): Entity types that hold the amount in figures.
        cheque_amount_predicted (float): The predicted cheque amount; stored as ``str(value)``.

    Returns:
        Any: The same ``json_data`` object, updated when a matching entity exists.
    """
    amount_text = str(cheque_amount_predicted)
    if not amount_text:
        return json_data

    for entity in json_data.entities:
        if entity.type in amount_in_figures_entity_name:
            entity.mention_text = amount_text
            break

    # Returned unconditionally so callers that reassign the result never
    # receive None.
    return json_data


# Batch driver: for every parsed JSON under input_path, derive the amount from
# the amount-in-words entities, write it into the amount-in-figures entity and
# store the document under output_path.
file_name_list, file_path_dict = file_names(input_path)
import pandas as pd

input_bucket = input_path.split("/")[2]
output_bucket = output_path.split("/")[2]
# Slashes are stripped so that an output_path ending in "/" (as in the sample
# configuration) does not produce object names containing "//".
output_prefix = "/".join(output_path.split("/")[3:]).strip("/")

for file_name in file_name_list:
    blob_name = file_path_dict[file_name]
    file_path = "gs://" + input_bucket + "/" + blob_name
    json_data = documentai_json_proto_downloader(input_bucket, blob_name)
    print(file_path)

    # Make sure there is an amount-in-figures entity to receive the value.
    has_figures_entity = any(
        en.type in amount_in_figures_entity_name for en in json_data.entities
    )
    if not has_figures_entity:
        new_entity = documentai.Document.Entity()
        new_entity.type_ = amount_in_figures_entity_name[0]
        new_entity.mention_text = "None"
        json_data.entities.append(new_entity)

    try:
        amount_in_words = get_amount_in_text(json_data, amount_in_words_entity_name)
        if amount_in_words.strip():
            amount_pred_letters = translate_text("en-US", amount_in_words)[
                "translatedText"
            ]
            nearest_match = words_nearest_match(amount_pred_letters)
            clean_text = remove_repetitive_words(nearest_match)
            cheque_amount_predicted = convert_amount_in_words_to_numbers(clean_text)
            updated = change_mt_amount(
                json_data, amount_in_figures_entity_name, cheque_amount_predicted
            )
            # The entity is modified in place; only rebind when a document
            # object is actually returned.
            if updated is not None:
                json_data = updated
        else:
            print("No amount in words found; document left unchanged")
    except (re.error, IndexError, ValueError) as e:
        # Best effort per file: a document whose amount cannot be derived is
        # still written to the output location, just without the correction.
        # ValueError is included so that text the number parser cannot
        # interpret does not abort the whole batch.
        print("Exception:  ", e)

    output_blob = f"{output_prefix}/{file_name}" if output_prefix else file_name
    store_document_as_json(
        documentai.Document.to_json(json_data),
        output_bucket,
        output_blob,
    )

### 4. Output

The corrected JSON files will be saved to the output path.

Issue file

<img src="./Images/issue.png" width=800 height=400></img>

Corrected file will be as below

<img src="./Images/corrected.png" width=800 height=400></img>