In [None]:
import io
import os

import fitz
import PIL.Image as Image
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Only fall back to the placeholder when no key is configured, so a real
# OPENAI_API_KEY already exported in the environment is not overwritten.
os.environ.setdefault('OPENAI_API_KEY', "[YOUR-API-KEY]")

def save_image(block, page_num, base_path):
    """Save the image carried by an image block and return the file path.

    Args:
        block: Image block dict. The keys "number", "ext" and "image" are
            read; "image" must hold the encoded image bytes.
        page_num: Zero-based page index; the file name uses ``page_num + 1``.
        base_path: Directory the image is written into. It is created when it
            does not exist yet.

    Returns:
        The path of the written image file, or ``None`` when anything went
        wrong (the error is printed rather than raised).
    """
    try:
        image_filename = f"image_page{page_num+1}_{block['number']}.{block['ext']}"
        image_path = os.path.join(base_path, image_filename)
        # Saving into a missing directory fails, so create it up front.
        # An empty base_path means "current directory" and needs no creation.
        if base_path:
            os.makedirs(base_path, exist_ok=True)
        # The context manager releases the decoder's resources after saving.
        with Image.open(io.BytesIO(block["image"])) as image:
            image.save(image_path)
        print("Extracted images from pdf")
        return image_path
    except Exception as exe:
        print("Error extracting images: "+ str(exe))
        # Explicit failure marker so callers can skip this image.
        return None


def process_text_block(block, page_num, line_number):
    """Turn one text block into a list of line records.

    Each non-blank line becomes a tuple
    ``(text, line_number, page_num + 1, block["bbox"])`` where ``text`` is the
    line's span texts joined by single spaces and stripped. Blank lines are
    skipped and do not consume a line number.

    Returns:
        ``(records, next_line_number)`` - the records for this block and the
        line number the following block should start from.
    """
    records = []
    next_number = line_number
    for line in block["lines"]:
        joined = " ".join(span["text"] for span in line["spans"]).strip()
        if not joined:
            # Whitespace-only lines are dropped entirely.
            continue
        records.append((joined, next_number, page_num + 1, block["bbox"]))
        next_number += 1
    return records, next_number


def extract_data_from_pdf(pdf_path, path):
    """Extract text lines and images from a PDF as a flat list of records.

    Args:
        pdf_path: Path of the PDF file to open.
        path: Directory that extracted images are saved into.

    Returns:
        A list of ``(content, line_number, page, bbox)`` tuples. For text,
        ``content`` is the line text; for images it is
        ``"<img>" + saved_path + "</img>"`` and ``line_number`` is ``None``.
        ``page`` is one-based. Returns ``None`` when extraction fails (the
        error is printed rather than raised).
    """
    try:
        extracted_data = []

        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                blocks = page.get_text("dict")["blocks"]
                line_number = 1  # line numbering restarts on every page
                for block in blocks:
                    if block["type"] == 0:  # Text block
                        text_content, line_number = process_text_block(block, page_num, line_number)
                        extracted_data.extend(text_content)

                    elif block["type"] == 1:  # Image block
                        # Save into the caller-supplied directory.
                        image_path = save_image(block, page_num, path)
                        if image_path is None:
                            # save_image reports its own failure; skip the
                            # image instead of aborting the whole document.
                            continue
                        extracted_data.append(("<img>" + image_path + "</img>", None, page_num + 1, block["bbox"]))

        print("Extracted data from pdf")
        return extracted_data

    except Exception as exe:
        print("Error extracting data from pdf: "+ str(exe))
        return None

def process_extracted_data(data):
    """Order the extracted records and split them into texts and metadata.

    ``data`` is sorted in place by page, then by the bbox's second and first
    coordinates (``bbox[1]``, ``bbox[0]``).

    Returns:
        ``(texts, metadata)`` where ``texts`` holds each record's content and
        ``metadata`` holds a ``{"page", "line_number"}`` dict per record, in
        the same order.
    """
    def reading_order(entry):
        bbox = entry[3]
        return entry[2], bbox[1], bbox[0]

    # In-place sort: the caller's list is reordered as well.
    data.sort(key=reading_order)

    texts = []
    metadata = []
    for entry in data:
        texts.append(entry[0])
        metadata.append({"page": entry[2], "line_number": entry[1]})
    return texts, metadata


def pdf_2_txt(document, metadata, text_file_name="1.txt"):
    """Join the extracted items into one text and write it to a file.

    Formatting rules, applied per item:
      * an item directly followed by an ``<img>`` item has leading/trailing
        periods stripped and is followed by a space;
      * an ``<img>`` item is followed by a newline;
      * any other item is followed by a space.

    Every item is written, including the last one.

    Args:
        document: List of text / ``<img>...</img>`` strings in reading order.
        metadata: Per-item metadata; returned unchanged.
        text_file_name: Output file path (defaults to ``"1.txt"``).

    Returns:
        ``metadata`` on success, ``None`` on failure (the error is printed
        rather than raised).
    """
    try:
        pieces = []
        last_index = len(document) - 1
        for i, item in enumerate(document):
            # Guard the look-ahead so the final item is still emitted.
            next_is_image = i < last_index and document[i + 1].startswith("<img>")
            if next_is_image:
                pieces.append(item.strip(".") + " ")
            elif item.startswith("<img>"):
                pieces.append(item + "\n")
            else:
                pieces.append(item + " ")

        with open(text_file_name, 'w', encoding="utf-8") as text_file:
            text_file.write("".join(pieces))
        print("Converted pdf to text")
        return metadata

    except Exception as exe:
        print("Error converting pdf to text: "+ str(exe))
        return None



def process_text_file(text_file_name="1.txt"):
    """Load a UTF-8 text file and split it into overlapping chunks.

    The file is read with ``TextLoader`` and split with
    ``RecursiveCharacterTextSplitter`` (chunk size 1000, overlap 200).

    Args:
        text_file_name: Path of the text file to load.

    Returns:
        The list of split documents, or an empty list when loading or
        splitting fails (the error is printed rather than raised).
    """
    try:
        loader = TextLoader(text_file_name, encoding='utf8')
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return text_splitter.split_documents(documents)
    except Exception as exe:
        # Same report-and-continue style as the other pipeline steps.
        print("Error processing text file: " + str(exe))
        return []
    
def final(user_input, persist_directory="db"):
    """Answer a question about the converted text with a retrieval-QA chain.

    Builds a Chroma vector store from the chunks returned by
    ``process_text_file()``, retrieves the 3 closest chunks for the query and
    passes them to the chat model via a "stuff" chain.

    Args:
        user_input: The user's question; sent as ``"###Prompt <user_input>"``.
        persist_directory: Directory the Chroma store is persisted in.

    Returns:
        The chain's ``"result"`` text, or the error message as a string when
        the query itself fails. Errors while building the store or the chain
        are not caught here.
    """
    # OpenAI Embeddings
    embeddings = OpenAIEmbeddings()
    # Chroma Vector DB
    vectordb = Chroma.from_documents(documents=process_text_file(),
                                     embedding=embeddings,
                                     persist_directory=persist_directory)
    vectordb.persist()

    retriever = vectordb.as_retriever(search_kwargs={"k": 3})

    # Chat Model
    llm = ChatOpenAI(model_name='gpt-3.5-turbo-1106')

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    query = f"###Prompt {user_input}"
    try:
        llm_response = qa(query)
        return llm_response["result"]
    except Exception as err:
        # Return the message of the exception that was actually caught.
        return str(err)



def main(pdf_path, ID):
    """Run the pipeline: extract the PDF, convert it to text, answer a query.

    Args:
        pdf_path: Path of the PDF to process.
        ID: Accepted for interface compatibility; not used by the pipeline.

    Returns:
        The answer returned by ``final``, or ``None`` when extraction or text
        conversion failed.
    """
    # Extracted images need a real directory to be written into.
    image_dir = os.path.join(os.getcwd(), "extracted_images")
    os.makedirs(image_dir, exist_ok=True)

    extracted_data = extract_data_from_pdf(pdf_path, image_dir)
    if not extracted_data:
        # Extraction failed (None) or found nothing; there is nothing to query.
        print("No data extracted from pdf; nothing to query")
        return None

    document, metadata = process_extracted_data(extracted_data)
    try:
        if pdf_2_txt(document, metadata) is None:
            # Conversion failed and has already reported its error.
            return None
        answer = final(input("Enter your query"))
        print(answer)
        return answer
    finally:
        # Always clean up the intermediate text file, even on errors.
        if os.path.exists("1.txt"):
            os.remove("1.txt")


# Script entry point: runs the pipeline only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Hard-coded sample input; adjust to the PDF that should be processed.
    pdf_path = "C:/Users/Raja/Downloads/sdgp.pdf"
    # Identifier passed through to main().
    ID = "999-999-999"
    main(pdf_path, ID)

