# In [None]:
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
import pprint
from PIL import Image, ImageDraw
import numpy as np

import os
import json

# import text analysis service
from text_analysis import generate_analysis

# async imports
import asyncio


async def main():
    """OCR a PDF, highlight the words of its topic sentences, and export them.

    Steps:
      1. Render every page of ``example1.pdf`` and OCR it with Tesseract, or
         reload the result of a previous run from the on-disk cache.
      2. Send the concatenated page text to ``generate_analysis`` and read the
         ``"sentences"`` entry of its result.
      3. Highlight, on every page, each OCR word that occurs in one of the
         multi-word sentences and save the page as
         ``highlighted_page_<n>.jpg``.
      4. Write all OCR rows to ``word_locations.csv``.

    Cache layout (all in the current working directory):
      * ``page_ref.json``      -- ``{page number: image file name}``
      * ``page_ref_<n>.png``   -- the rendered page images
      * ``word_data.json``     -- one ``orient="split"`` table per page
    """

    def normalize_word(word):
        """Return *word* lower-cased with every non-alphanumeric char removed."""
        return "".join(filter(str.isalnum, str(word).lower()))

    def get_page_ref_data(pdf_path):
        """Render and OCR every page of the PDF at *pdf_path*.

        Returns:
            ``(page_ref, word_data)`` where ``page_ref`` maps the 1-based page
            number to ``(page image, OCR DataFrame)`` and ``word_data`` is the
            list of those DataFrames in page order.
        """
        # Convert PDF to images (one PIL image per page)
        images = convert_from_path(pdf_path)

        word_data = []
        page_ref = {}

        for page_num, image in enumerate(images, start=1):
            print(f"Processing page {page_num}...")
            # Get OCR data
            ocr_data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DATAFRAME
            )

            # Add the page number to the data
            ocr_data["page"] = page_num

            # Filter out rows without text (confidence = -1)
            ocr_data = ocr_data[ocr_data.conf != -1]

            word_data.append(ocr_data)
            page_ref[page_num] = (image, ocr_data)

        return page_ref, word_data

    def create_highlighted_page(image, data, words_to_highlight, opacity=128):
        """Return an RGB copy of *image* with the selected words highlighted.

        Args:
            image: PIL image of the page.
            data: OCR DataFrame with ``text``, ``left``, ``top``, ``width``
                and ``height`` columns.
            words_to_highlight: iterable of already-normalized words.
            opacity: alpha of the highlight colour on Pillow's 0-255 scale.
        """
        targets = set(words_to_highlight)

        # All boxes are drawn on one transparent layer that is composited a
        # single time, so every box gets the same opacity and the page is not
        # re-blended once per word.
        overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
        overlay_draw = ImageDraw.Draw(overlay)

        for _, row in data.iterrows():
            raw_text = row["text"]
            # Missing text would otherwise be stringified ("nan"/"none") and
            # could match a real keyword.
            if pd.isna(raw_text):
                continue

            token = normalize_word(raw_text)
            if not token or token not in targets:
                continue

            left = int(row["left"])
            top = int(row["top"])
            right = left + int(row["width"])
            bottom = top + int(row["height"])
            overlay_draw.rectangle(
                [left, top, right, bottom],
                fill=(229, 235, 52, opacity),
            )

        highlighted = Image.alpha_composite(image.convert("RGBA"), overlay)
        return highlighted.convert("RGB")

    def save_cache(page_ref):
        """Persist page images and OCR tables so the next run can skip OCR."""
        image_paths = {}
        tables = []
        for page_num, (image, ocr_data) in page_ref.items():
            # Images are not JSON-serializable: store them as PNG files and
            # keep only the file name in the JSON manifest.
            image_path = f"page_ref_{page_num}.png"
            image.save(image_path)
            image_paths[str(page_num)] = image_path
            # DataFrame.to_json copes with numpy scalars and NaN, which the
            # plain json module does not.
            tables.append(json.loads(ocr_data.to_json(orient="split")))

        with open("page_ref.json", "w") as f:
            json.dump(image_paths, f)

        with open("word_data.json", "w") as f:
            json.dump(tables, f)

    def load_cache():
        """Return ``(page_ref, word_data)`` from disk, or ``None`` if unusable."""
        if not (os.path.exists("page_ref.json") and os.path.exists("word_data.json")):
            return None

        try:
            with open("page_ref.json") as f:
                image_paths = json.load(f)
            with open("word_data.json") as f:
                tables = json.load(f)

            if len(image_paths) != len(tables):
                return None

            page_ref = {}
            word_data = []
            for (page_key, image_path), table in zip(image_paths.items(), tables):
                ocr_data = pd.DataFrame(
                    table["data"], index=table["index"], columns=table["columns"]
                )
                # copy() forces the pixel data to be read so the file handle
                # can be closed immediately.
                with Image.open(image_path) as stored:
                    image = stored.copy()
                page_ref[int(page_key)] = (image, ocr_data)
                word_data.append(ocr_data)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as err:
            # A truncated or foreign cache must not abort the run; OCR again.
            print(f"Ignoring unreadable cache: {err}")
            return None

        return page_ref, word_data

    # Path to the PDF file
    pdf_path = "example1.pdf"

    cached = load_cache()
    if cached is None:
        page_ref, word_data = get_page_ref_data(pdf_path)
        save_cache(page_ref)
    else:
        page_ref, word_data = cached

    if not word_data:
        # pd.concat raises on an empty list; there is nothing to analyse.
        print(f"No pages found in '{pdf_path}'")
        return

    # One table with the OCR rows of every page
    final_data = pd.concat(word_data, ignore_index=True)

    full_text = final_data["text"].dropna().astype(str).str.cat(sep=" ")

    # Get analysis
    analysis = await generate_analysis(full_text)

    # get topic sentences
    sentences = analysis["sentences"]

    # Split every multi-word sentence into its individual words and normalize
    # them exactly like the OCR tokens are normalized before comparison.
    words_to_highlight = {
        normalize_word(word)
        for sentence in sentences
        if len(sentence.split()) > 1
        for word in sentence.split()
    }
    # A word made only of punctuation normalizes to "" and must not match.
    words_to_highlight.discard("")

    print(sorted(words_to_highlight))

    for page_num, (image, ocr_data) in page_ref.items():
        width, height = image.size
        print(f"Page {page_num} dimensions: {width} x {height}")

        # Create a highlighted page
        highlighted_page = create_highlighted_page(image, ocr_data, words_to_highlight)
        # Save the image
        highlighted_page.save(f"highlighted_page_{page_num}.jpg")

    # Save to CSV or print
    final_data.to_csv("word_locations.csv", index=False)
    print("Word locations saved to 'word_locations.csv'")


# Run the main coroutine to completion.
# NOTE: this call is unconditional (there is no `if __name__ == "__main__":`
# guard), so importing this file also starts the whole pipeline.
asyncio.run(main())
