# Test Harness Tool V2

* Author: docai-incubator@google.com

## Disclaimer

This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the **DocAI Incubator Team**. No guarantees of performance are implied.


## Objective

This tool automates the evaluation of Google Cloud Document AI (DocAI) Custom Document Extractor (CDE) processors by fetching a sample of documents from Google Cloud Storage (GCS), processing them through multiple iterations of a Generative AI-based CDE model, and comparing the extraction results for accuracy and consistency. It measures entity matching percentage, fuzzy ratios, document-level and entity-level consistency, and latency metrics. The final output is a set of Excel (`.xlsx`) reports, uploaded to GCS, that summarize extraction accuracy, consistency, and latency.

## Prerequisites
* Vertex AI JupyterLab Environment
* Google Cloud Storage Bucket
* CDE Processor

## Step-by-Step Procedure

### 1. Importing Required Modules

In [None]:
!wget https://raw.githubusercontent.com/GoogleCloudPlatform/document-ai-samples/main/incubator-tools/best-practices/utilities/utilities.py

In [None]:
!pip install google-cloud-documentai google-api-core google-cloud-storage tqdm pillow pypdf2 openpyxl

In [None]:
from google.api_core.operation import Operation
from google.longrunning import operations_pb2
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions
import json
from pathlib import Path
from tqdm import tqdm
from google.cloud import storage
from typing import (
    Container,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
    Any,
    Dict,
)
from io import BytesIO
from pprint import pprint
import copy
from PIL import Image
from PyPDF2 import PdfFileReader
import io
from tqdm.notebook import tqdm
import time
import concurrent.futures
from datetime import datetime
import warnings
import json_folder_comparison
import openpyxl
from openpyxl import Workbook
from openpyxl.styles import Alignment
from datetime import datetime, timedelta, timezone

warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font
from utilities import file_names, store_document_as_json

### 2. Set Up the Inputs

* `project_id` : This is the unique identifier for the Google Cloud project.
* `location` : This specifies the location or region where the resources are located.
* `processor_id` : This is the unique CDE identifier for a processor in Google Cloud.
* `processor_version_id` : This identifies the specific version of the processor or model you are using.
* `gcs_input_path` : This is the path to a Google Cloud Storage (GCS) bucket and folder where input documents are stored.
* `mime_type` : This specifies the MIME type of the files being processed.
* `gcs_sync_output_path` : This is the GCS path where output from synchronous processing will be stored.
* `gcs_async_output_path` : This is the GCS path where output from asynchronous (batch) processing will be stored.
* `gcs_result_path` : This is the GCS path where the result Excel workbooks (`.xlsx`) are stored. They summarize the output of the synchronous and/or asynchronous processing runs.
* `process_type` : This is a list specifying which types of processes to use for the task. "async" refers to asynchronous processing, "sync" refers to synchronous processing, and both can be chosen if you want to use both types.
* `timeout` : This defines the maximum time (in seconds) to wait for each batch (asynchronous) operation to complete. If the operation takes longer than this, the wait fails with a timeout error.
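* `iteration` : This specifies the number of iterations, i.e. how many times the same set of input documents is processed so that the results can be compared across runs.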

In [None]:
# --- User inputs -----------------------------------------------------------
# NOTE: every GCS path below must end with "/": the code appends
# "iteration_<n>/" (and other sub-folders) to these strings directly.
project_id = "project_id"
location = "us"
processor_id = "processor_id"
processor_version_id = "pretrained-foundation-model-v1.2-2024-05-10"
gcs_input_path = "gs://bucket_name/pdf_files_path/"  # Contains the input PDF files
mime_type = "application/pdf"
gcs_sync_output_path = "gs://bucket_name/output_sync_process_files_path/"  # Stores the sync (online) output files
gcs_async_output_path = "gs://bucket_name/output_async_process_files_path/"  # Stores the async (batch) output files

gcs_result_path = "gs://bucket_name/sheets_output/"  # Stores the resulting Excel sheets

process_type = ["async", "sync"]  # the values are sync or async or both.

# Seconds to wait for each batch operation (passed to `operation.result`).
timeout = 6000
iteration = 2  # Number of iterations required
# Filled by `process_file_wrapper`: "<file stem>.json" -> list with one
# latency (seconds) per iteration.
sync_runtime = {}
# Filled by `batch_process_file_wrapper`: "Iteration <n>" -> [latency in seconds].
# The main cell additionally stores the input file names under "Filename".
batch_runtime = {}

### 3. Run the Required Functions

In [None]:
def get_current_ist_time() -> str:
    """
    Return the current wall-clock time in Indian Standard Time (UTC+5:30).

    Returns:
        str: The timestamp formatted as 'YYYY-MM-DD HH:MM:SS IST'.
    """
    utc_now = datetime.now(timezone.utc)
    # Shift the UTC clock by the fixed IST offset; only the shifted wall-clock
    # fields are printed, so the tzinfo attached to the value is irrelevant.
    shifted = utc_now + timedelta(hours=5, minutes=30)
    return f"{shifted:%Y-%m-%d %H:%M:%S} IST"


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_bucket_name: str,
    gcs_blob_name: str,
    mime_type: str,
) -> Any:
    """
    Run one synchronous (online) Document AI request for a file stored in GCS.

    The blob is downloaded into memory and sent inline as a raw document to the
    given processor version.

    Args:
        project_id (str): The Google Cloud project ID.
        location (str): The processor location (e.g., "us"); also used to build
            the regional API endpoint.
        processor_id (str): The ID of the Document AI processor.
        processor_version_id (str): The processor version to call.
        gcs_bucket_name (str): The bucket that holds the input file.
        gcs_blob_name (str): The object name of the input file inside the bucket.
        mime_type (str): The MIME type of the document (e.g., "application/pdf").

    Returns:
        Any: The `document` field of the Document AI process response.
    """
    # Document AI client bound to the regional endpoint of `location`.
    docai = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Fetch the raw file bytes from GCS.
    payload = (
        storage.Client()
        .bucket(gcs_bucket_name)
        .blob(gcs_blob_name)
        .download_as_bytes()
    )

    # Fully qualified resource name of the processor version.
    version_name = docai.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    response = docai.process_document(
        request=documentai.ProcessRequest(
            name=version_name,
            raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
        )
    )
    return response.document


def batch_process_documents(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_path: str,
    gcs_async_output_path: str,
    gcs_async_output_path_prefix: str,
    timeout: int = 6000,
) -> operations_pb2.Operation:
    """
    Batch process documents using Document AI's asynchronous processing API.

    Every file under `gcs_input_path` is submitted in a single batch request and
    the call blocks until the long-running operation completes.

    Args:
        project_id (str): The Google Cloud project ID.
        location (str): The location of the Document AI processor (e.g., "us" or "eu").
        processor_id (str): The ID of the Document AI processor.
        processor_version_id (str): The version of the Document AI processor to use.
        gcs_input_path (str): The GCS URI prefix of the input files (e.g., "gs://bucket/path/").
        gcs_async_output_path (str): The GCS URI where the output should be stored.
            Must end with "/" because the prefix is appended to it directly.
        gcs_async_output_path_prefix (str): Sub-folder (e.g. "iteration_1") created
            under `gcs_async_output_path` for this run's output.
        timeout (int, optional): The maximum wait time (in seconds) for the batch
            processing to finish. Default is 6000 seconds (100 minutes).

    Returns:
        The long-running operation object returned by the client, after it has
        completed.

    Raises:
        Exception: Whatever `operation.result` raises when the operation fails or
            does not finish within `timeout` seconds.
    """
    # Use the regional endpoint that matches `location`, exactly as
    # `online_process` does. Previously only "us" and "eu" were mapped, so any
    # other region silently fell back to the default endpoint.
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Each run writes into its own sub-folder of the async output path.
    destination_uri = f"{gcs_async_output_path}{gcs_async_output_path_prefix}/"

    # Process everything found under the input prefix.
    input_config = documentai.BatchDocumentsInputConfig(
        gcs_prefix=documentai.GcsPrefix(gcs_uri_prefix=gcs_input_path)
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config={"gcs_uri": destination_uri}
    )

    # Fully qualified resource name of the processor version.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )
    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    operation = client.batch_process_documents(request=request)

    # Wait for the operation to finish
    operation.result(timeout=timeout)
    return operation


def process_file_wrapper(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_bucket_name: str,
    gcs_blob_name: str,
    mime_type: str,
    run_id: int,
    gcs_sync_output_path: str,
) -> Any:
    """
    Run `online_process` for one file, record its latency and store the result.

    The elapsed time of the call is written into the module-level `sync_runtime`
    dictionary (key "<file stem>.json", slot `run_id - 1`), and the returned
    document is uploaded as JSON to
    "<gcs_sync_output_path>iteration_<run_id>/<file stem>.json".

    Args:
        project_id (str): The ID of the Google Cloud project.
        location (str): The location (region) where the processor is hosted.
        processor_id (str): The ID of the document processor.
        processor_version_id (str): The version ID of the processor.
        gcs_bucket_name (str): The name of the GCS bucket containing the input file.
        gcs_blob_name (str): The name of the input file in the GCS bucket.
        mime_type (str): The MIME type of the document (e.g., "application/pdf").
        run_id (int): 1-based iteration number; must not exceed the global `iteration`.
        gcs_sync_output_path (str): The GCS path (ending with "/") to store the output JSON file.

    Returns:
        Any: The processed document returned by `online_process`.
    """
    # A monotonic clock is immune to system clock adjustments during the call.
    started = time.perf_counter()

    document = online_process(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version_id=processor_version_id,
        gcs_bucket_name=gcs_bucket_name,
        gcs_blob_name=gcs_blob_name,
        mime_type=mime_type,
    )

    elapsed_seconds = time.perf_counter() - started

    # Output name: base name of the blob up to its first ".", plus ".json".
    file = gcs_blob_name.split("/")[-1].split(".")[0] + ".json"

    # All iterations of the same file run in parallel threads and share this
    # key. A separate "check, then assign" can let a late thread replace the
    # list and wipe a latency that another thread already stored;
    # `setdefault` creates the list at most once.
    sync_runtime.setdefault(file, [0] * iteration)[int(run_id) - 1] = elapsed_seconds

    json_data = documentai.Document.to_dict(document)

    # Split "gs://<bucket>/<prefix...>" into the bucket and the object prefix.
    path_parts = gcs_sync_output_path.split("/")
    file_name = "/".join(path_parts[3:]) + "iteration_" + str(run_id) + "/" + file

    store_document_as_json(json.dumps(json_data), path_parts[2], file_name)
    return document


def run_parallel_processing() -> None:
    """
    Process every input file `iteration` times through the online (sync) API.

    Files are handled one after another; for each file the `iteration` runs are
    submitted together to a thread pool, each calling `process_file_wrapper`.
    A status line is printed for every finished run (success or failure).

    Reads the following module-level variables:
        gcs_input_path (str): The GCS URI where the input files are stored.
        iteration (int): Number of parallel runs for each file.
        project_id (str): The Google Cloud project ID.
        location (str): The location of the Document AI processor (e.g., 'us').
        processor_id (str): The processor ID to be used for processing.
        processor_version_id (str): The processor version ID to be used.
        mime_type (str): The MIME type of the input file (e.g., 'application/pdf').
        gcs_sync_output_path (str): The GCS URI to store the processed output.

    Returns:
        None
    """
    # "gs://<bucket>/..." -> "<bucket>"
    bucket_name = gcs_input_path.split("/")[2]
    run_ids = range(1, iteration + 1)

    # The second element returned by `file_names` is used as a mapping whose
    # values are the blob names to process.
    for blob_name in file_names(gcs_input_path)[1].values():
        with concurrent.futures.ThreadPoolExecutor() as pool:
            # One task per iteration of the current file.
            pending = []
            for run_id in run_ids:
                pending.append(
                    pool.submit(
                        process_file_wrapper,
                        project_id,
                        location,
                        processor_id,
                        processor_version_id,
                        bucket_name,
                        blob_name,
                        mime_type,
                        run_id,
                        gcs_sync_output_path,
                    )
                )

            # Report each run as soon as it finishes.
            for finished in concurrent.futures.as_completed(pending):
                try:
                    finished.result()
                except Exception as e:
                    print(f"Task failed with error: {e}")
                else:
                    print("\t\tSuccessfully processed document.")


def batch_process_file_wrapper(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_path: str,
    gcs_async_output_path: str,
    gcs_async_output_path_prefix: str,
    timeout: int,
) -> Any:
    """
    Run `batch_process_documents` once and record how long the run took.

    The elapsed wall-clock time (seconds) is stored in the module-level
    `batch_runtime` dictionary as a one-element list under a key derived from
    the output prefix ("iteration_<n>" becomes "Iteration <n>"). An entry that
    already exists for that key is left untouched.

    Args:
        project_id (str): The ID of the Google Cloud project.
        location (str): The location (region) where the processor is hosted.
        processor_id (str): The ID of the document processor.
        processor_version_id (str): The version ID of the processor.
        gcs_input_path (str): The GCS path containing the input documents for batch processing.
        gcs_async_output_path (str): The GCS path to store the output files.
        gcs_async_output_path_prefix (str): The prefix for the output files in the GCS path.
        timeout (int): The timeout duration for the operation in seconds.

    Returns:
        Any: The operation object returned by `batch_process_documents`.
    """
    started = datetime.now()

    operation = batch_process_documents(
        project_id,
        location,
        processor_id,
        processor_version_id,
        gcs_input_path,
        gcs_async_output_path,
        gcs_async_output_path_prefix,
        timeout,
    )

    elapsed_seconds = (datetime.now() - started).total_seconds()

    # "iteration_1" -> "Iteration 1"; keep whatever is already stored for it.
    label = gcs_async_output_path_prefix.replace("iteration_", "Iteration ")
    batch_runtime.setdefault(label, [elapsed_seconds])

    return operation


def run_parallel_batch_processing() -> None:
    """
    Launch `iteration` batch (async) runs over the input folder in parallel.

    Each run calls `batch_process_file_wrapper` from a thread pool and writes
    into its own "iteration_<n>" output prefix. A status line is printed for
    every finished run (success or failure).

    Reads the module-level variables `iteration`, `project_id`, `location`,
    `processor_id`, `processor_version_id`, `gcs_input_path`,
    `gcs_async_output_path` and `timeout`.
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # One task per iteration.
        pending = []
        for run_id in range(1, iteration + 1):
            pending.append(
                pool.submit(
                    batch_process_file_wrapper,
                    project_id,
                    location,
                    processor_id,
                    processor_version_id,
                    gcs_input_path,
                    gcs_async_output_path,
                    f"iteration_{run_id}",
                    timeout,
                )
            )

        # Report each run as soon as it finishes.
        for finished in concurrent.futures.as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                print(f"Task failed with error: {e}")
            else:
                print("\t\tSuccessfully processed document.")


def get_xlsx_from_gcp(file_path: str) -> pd.ExcelFile:
    """
    Download an Excel workbook from GCS and open it with pandas.

    Args:
    - file_path (str): The GCS URI of the workbook
      (e.g., 'gs://bucket_name/path/to/file.xlsx').

    Returns:
    - pd.ExcelFile: The workbook, loaded from an in-memory copy of the blob.

    Raises:
    - IndexError: If `file_path` has no bucket component.
    - Errors raised by the storage client (for example when the object does not
      exist) are propagated unchanged.
    """
    # "gs://<bucket>/<object path>" -> bucket name and object path.
    parts = file_path.split("/")
    bucket_name, blob_name = parts[2], "/".join(parts[3:])

    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    return pd.ExcelFile(BytesIO(blob.download_as_bytes()))


def compare_latencies(
    previous_file: Any,
    previous_file_name: str,
    new_file: Any,
    new_file_name: str,
    gcs_result_path: str,
    date: str,
    process_type: Optional[List[str]] = None,
) -> None:
    """
    Compare the latencies of two runtime workbooks and publish a comparison report.

    For "sync", the per-file "Average" of the "Sync_RunTime" sheets is joined on
    "FileName". For "async", the single batch "Average" of the "Batch_RunTime"
    sheets is compared. The report is written to "latency_comparison.xlsx" in
    the current directory and uploaded to
    "<gcs_result_path>latency_comparison/latency_comparison_<date>.xlsx".

    Args:
        previous_file: The previous workbook, in any form `pandas.read_excel`
            accepts (the caller in this file passes a `pandas.ExcelFile`).
        previous_file_name (str): Display name/path of the previous workbook,
            recorded on the "Input_Details" sheet.
        new_file: The new workbook as an object with a `save(buffer)` method
            (the caller in this file passes an openpyxl `Workbook`).
        new_file_name (str): Object path of the new workbook inside the result
            bucket, recorded on the "Input_Details" sheet.
        gcs_result_path (str): The GCS path (ending with "/") where the report is uploaded.
        date (str): Timestamp used in the uploaded file name.
        process_type (List[str], optional): Process types to compare ("sync"
            and/or "async"). Defaults to both.

    Returns:
        None: Saves the comparison file locally and uploads it to GCS.

    Raises:
        ValueError: If `process_type` is empty or contains anything other than
            "sync" / "async".
    """
    if process_type is None:
        process_type = ["sync", "async"]

    # Validate before doing any work; an empty list would otherwise fail later
    # with an obscure error when a workbook without sheets is written.
    valid_types = {"sync", "async"}
    if not process_type or not set(process_type).issubset(valid_types):
        raise ValueError(
            "process_type must be a list containing any of 'sync' and 'async'"
        )

    # Serialize the new workbook into memory so pandas can read it back.
    excel_buffer = BytesIO()
    new_file.save(excel_buffer)
    output_file = "latency_comparison.xlsx"

    def read_new_sheet(sheet_name: str) -> pd.DataFrame:
        """Read one sheet of the new workbook, rewinding the shared buffer first."""
        excel_buffer.seek(0)
        return pd.read_excel(excel_buffer, sheet_name=sheet_name)

    def compare_values(prev: Any, new: Any) -> Tuple[str, Optional[float]]:
        """
        Classify the change between two latency values.

        Args:
            prev: The previous latency value, or None/NaN when missing.
            new: The new latency value, or None/NaN when missing.

        Returns:
            Tuple[str, Optional[float]]: ("Increased" | "Decreased" | "No Change",
            new - prev), or ("Missing Data", None) when either value is missing.
        """
        # The outer merge marks files present on one side only with NaN (not
        # None). NaN compares false both ways, so an `is None` test alone
        # would report such files as "No Change".
        if pd.isna(prev) or pd.isna(new):
            return "Missing Data", None
        if new > prev:
            label = "Increased"
        elif new < prev:
            label = "Decreased"
        else:
            label = "No Change"
        return label, new - prev

    comparisons = {}
    # Named `compared_files` so the imported `file_names` helper is not shadowed.
    compared_files = []

    if "sync" in process_type:
        previous_sync = pd.read_excel(previous_file, sheet_name="Sync_RunTime")
        new_sync = read_new_sheet("Sync_RunTime")
        sync_comparison = pd.merge(
            previous_sync[["FileName", "Average"]],
            new_sync[["FileName", "Average"]],
            on="FileName",
            suffixes=("_Previous", "_New"),
            how="outer",
        )

        # Built row by row (instead of DataFrame.apply) so an empty comparison
        # still gets both result columns.
        outcomes = [
            compare_values(prev, new)
            for prev, new in zip(
                sync_comparison["Average_Previous"], sync_comparison["Average_New"]
            )
        ]
        sync_comparison["Sync_Comparison"] = [label for label, _ in outcomes]
        sync_comparison["Sync_Difference"] = [delta for _, delta in outcomes]

        comparisons["Sync_Comparison"] = sync_comparison
        compared_files = sync_comparison["FileName"].tolist()

    if "async" in process_type:
        previous_async = pd.read_excel(previous_file, sheet_name="Batch_RunTime")
        new_async = read_new_sheet("Batch_RunTime")

        # The batch sheet carries one average for the whole run; it is read
        # from the first data row.
        previous_batch_latency = (
            previous_async["Average"].values[0] if not previous_async.empty else None
        )
        new_batch_latency = (
            new_async["Average"].values[0] if not new_async.empty else None
        )

        # Without a sync comparison, take the file list from the batch sheets
        # (previous first, then new).
        if not compared_files:
            if "Filename" in previous_async:
                compared_files = previous_async["Filename"].tolist()
            if not compared_files and "Filename" in new_async:
                compared_files = new_async["Filename"].tolist()

        batch_label, batch_delta = compare_values(
            previous_batch_latency, new_batch_latency
        )
        row_count = len(compared_files)
        comparisons["Batch_Comparison"] = pd.DataFrame(
            {
                "Filename": compared_files,
                "Previous_Batch_Average": [previous_batch_latency] * row_count,
                "New_Batch_Average": [new_batch_latency] * row_count,
                "Batch_Comparison": [batch_label] * row_count,
                "Batch_Difference": [batch_delta] * row_count,
            }
        )

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in comparisons.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    wb = load_workbook(output_file)

    # The batch values are identical on every row, so show them as one merged
    # cell per column (columns B-E, data rows only).
    if "Batch_Comparison" in comparisons and len(compared_files) > 1:
        batch_sheet = wb["Batch_Comparison"]
        last_row = len(compared_files) + 1
        for col in (2, 3, 4, 5):
            col_letter = get_column_letter(col)
            batch_sheet.merge_cells(f"{col_letter}2:{col_letter}{last_row}")

    # First sheet: which two files were compared and for which process types.
    input_sheet = wb.create_sheet(title="Input_Details", index=0)
    input_details = {
        "Previous File": previous_file_name,
        "New File": f"{'/'.join(gcs_result_path.split('/')[:3])}/{new_file_name}",
        "Process Type": ", ".join(process_type),
    }
    for row_index, (key, value) in enumerate(input_details.items(), start=1):
        input_sheet.cell(row=row_index, column=1, value=key).font = Font(bold=True)
        input_sheet.cell(row=row_index, column=2, value=value)

    wb.save(output_file)
    json_folder_comparison.upload_xlsx_to_gcs(
        gcs_result_path.split("/")[2],
        output_file,
        "/".join(gcs_result_path.split("/")[3:])
        + f"latency_comparison/{output_file.split('.')[0]}_{date}.xlsx",
    )

    print(f"Comparison file saved as {output_file} with merged batch columns!")

### 4. Run the Code

In [None]:
if __name__ == "__main__":
    # End-to-end driver:
    #   1. run the requested sync and/or async processing `iteration` times,
    #   2. write the measured latencies to "runtime_data.xlsx" and upload it,
    #   3. compare against the most recent earlier latency workbook, if any.

    # Fail fast on an empty or unknown process_type. An empty list used to get
    # past this check and crash much later with an undefined output path.
    valid_types = {"sync", "async"}
    if not process_type or not set(process_type).issubset(valid_types):
        raise ValueError(
            "process_type must be a list containing any of 'sync' and 'async'"
        )

    # Create a new workbook
    workbook = openpyxl.Workbook()

    # Add Input_Details Sheet as the first sheet
    input_details_sheet = workbook.active
    input_details_sheet.title = "Input_Details"

    # Input details dictionary with current date and time
    input_details = {
        "Processor ID": processor_id,
        "Processor Version ID": processor_version_id,
        "Date": get_current_ist_time(),
    }

    # Write the input details to the sheet
    for row_index, (key, value) in enumerate(input_details.items(), start=1):
        input_details_sheet.cell(row=row_index, column=1, value=key)
        input_details_sheet.cell(row=row_index, column=2, value=value)

    print("Done Input Details Processing")

    if "sync" in process_type:
        print("Running Online or Sync Processing")
        run_parallel_processing()

        # Add Sync Runtime Sheet
        sync_sheet = workbook.create_sheet(title="Sync_RunTime")

        # Header: FileName, Iteration 1..N, Average. `default=0` keeps the
        # sheet writable when no document was processed at all.
        num_iters = max((len(values) for values in sync_runtime.values()), default=0)
        sync_headers = ["FileName"] + [f"Iteration {i + 1}" for i in range(num_iters)]
        sync_average_column = len(sync_headers) + 1

        for col_index, header in enumerate(sync_headers, start=1):
            sync_sheet.cell(row=1, column=col_index, value=header)
        sync_sheet.cell(row=1, column=sync_average_column, value="Average")

        # One row per file: name, per-iteration latency, then the average.
        for row_index, (filename, values) in enumerate(sync_runtime.items(), start=2):
            sync_sheet.cell(row=row_index, column=1, value=filename)
            for col_index, value in enumerate(values, start=2):
                sync_sheet.cell(row=row_index, column=col_index, value=value)
            if values:
                sync_sheet.cell(
                    row=row_index,
                    column=sync_average_column,
                    value=sum(values) / len(values),
                )

        print("Done Online or Sync Processing")
        json_folder_comparison.main(
            gcs_sync_output_path,
            iteration,
            "sync",
            gcs_result_path,
            input_details["Date"],
        )

    if "async" in process_type:
        batch_runtime["Filename"] = file_names(gcs_input_path)[0]
        print("Running Batch or Async Processing")
        run_parallel_batch_processing()

        # Add Batch Runtime Sheet
        batch_sheet = workbook.create_sheet(title="Batch_RunTime")

        # Every key except "Filename" is one iteration ("Iteration <n>").
        iteration_keys = [key for key in batch_runtime if key != "Filename"]
        batch_headers = ["Filename"] + iteration_keys
        batch_average_column = len(batch_headers) + 1

        for col_index, header in enumerate(batch_headers, start=1):
            batch_sheet.cell(row=1, column=col_index, value=header)
        batch_sheet.cell(row=1, column=batch_average_column, value="Average")

        # Write filenames
        for row_index, filename in enumerate(batch_runtime["Filename"], start=2):
            batch_sheet.cell(row=row_index, column=1, value=filename)

        # A batch run has one latency for the whole folder: write it once and
        # merge the cell over all file rows.
        start_row = 2
        end_row = start_row + len(batch_runtime["Filename"]) - 1
        for col_index, key in enumerate(iteration_keys, start=2):
            if end_row > start_row:
                batch_sheet.merge_cells(
                    start_row=start_row,
                    start_column=col_index,
                    end_row=end_row,
                    end_column=col_index,
                )
            merged_cell = batch_sheet.cell(row=start_row, column=col_index)
            merged_cell.value = batch_runtime[key][0]
            merged_cell.alignment = Alignment(horizontal="center", vertical="center")

        # Average over the iterations that completed. When every batch run
        # failed there is nothing to average (this used to divide by zero).
        batch_latencies = [batch_runtime[key][0] for key in iteration_keys]
        if batch_latencies:
            batch_sheet.cell(
                row=start_row,
                column=batch_average_column,
                value=sum(batch_latencies) / len(batch_latencies),
            )
        if batch_sheet.max_row > start_row:
            batch_sheet.merge_cells(
                start_row=start_row,
                start_column=batch_average_column,
                end_row=batch_sheet.max_row,
                end_column=batch_average_column,
            )

        print("Done Batch or Async Processing")
        json_folder_comparison.main(
            gcs_async_output_path,
            iteration,
            "async",
            gcs_result_path,
            input_details["Date"],
        )

    # Save the workbook
    output_file = "runtime_data.xlsx"
    workbook.save(output_file)

    # "gs://<bucket>/<prefix...>" -> bucket root URI and object prefix.
    result_path_parts = gcs_result_path.split("/")
    result_root = "/".join(result_path_parts[:3])
    result_prefix = "/".join(result_path_parts[3:])

    # Latency workbooks are grouped by what was measured: "both", "sync" or "async".
    latency_kind = "both" if len(set(process_type)) == 2 else process_type[0]
    latency_file_path = f"{result_prefix}latency/{latency_kind}/"
    latency_file_name = f"{latency_file_path}latency_data_{input_details['Date']}.xlsx"

    json_folder_comparison.upload_xlsx_to_gcs(
        result_path_parts[2], output_file, latency_file_name
    )
    print(f"Excel file '{output_file}' created successfully")

    # Earlier latency workbooks in the same folder, excluding the one just uploaded.
    current_report = latency_file_name.split("/")[-1]
    old_files_to_compare = [
        name
        for name in file_names(f"{result_root}/{latency_file_path}")[0]
        if name != current_report
    ]

    # Parse the timestamp embedded in "latency_data_<YYYY-MM-DD HH:MM:SS> IST.xlsx";
    # files that do not follow this pattern are ignored instead of aborting the run.
    dated_reports = []
    for name in old_files_to_compare:
        stamp = "".join(name.split("_")[2:]).replace(" IST.xlsx", "")
        try:
            dated_reports.append((datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S"), name))
        except ValueError:
            continue

    if dated_reports:
        # Compare against the most recent earlier report.
        _, latest_report = max(dated_reports, key=lambda item: item[0])
        previous_sheet_name = f"{result_root}/{latency_file_path}{latest_report}"
        previous_sheet = get_xlsx_from_gcp(previous_sheet_name)
        compare_latencies(
            previous_sheet,
            previous_sheet_name,
            workbook,
            latency_file_name,
            gcs_result_path,
            input_details["Date"],
            process_type,
        )
    else:
        print(
            "Only One Latency File Generated Present Now, So we can't compare with the previous sheet."
        )

### 5. Output

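The Document AI output JSONs of every iteration are saved under `gcs_sync_output_path` and/or `gcs_async_output_path` (one `iteration_<n>/` folder per run). The latency workbook is uploaded to `<gcs_result_path>latency/<both|sync|async>/latency_data_<date>.xlsx`, and, when an earlier latency workbook exists in that folder, a comparison report is uploaded to `<gcs_result_path>latency_comparison/latency_comparison_<date>.xlsx`.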