# In [11]:
import os
import PyPDF2
import pdfplumber
import pytesseract
from PIL import Image
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from a local .env file, if one exists.
load_dotenv()

# The OpenAI API key is read from the process environment.
openai_api_key = os.getenv('OPENAI_API_KEY')

if not openai_api_key:
    st.error("API key is not found. Please set it in the .env file.")
    # Halt the script here: without a key the client constructed below is
    # unusable, and letting execution continue would replace the message
    # above with an unrelated exception.
    st.stop()

# Module-level OpenAI client used by query_gpt4.
client = OpenAI(api_key=openai_api_key)

# Function to extract text, images, and tables from PDF
def extract_data_from_pdf(pdf_file, image_dir="images"):
    """Extract text, tables, and embedded images from a PDF.

    Args:
        pdf_file: Path or file-like object accepted by ``pdfplumber.open``.
        image_dir: Directory that receives one PNG per embedded image.
            It is created when it does not exist yet.

    Returns:
        A ``(text, tables, images)`` tuple where ``text`` is the text of all
        pages joined with newlines, ``tables`` is the flat list of tables
        returned by ``page.extract_tables()`` for every page, and ``images``
        is the list of PNG paths written to ``image_dir``.
    """
    # Saving fails if the target directory is missing, so create it up front.
    os.makedirs(image_dir, exist_ok=True)

    page_texts = []
    tables = []
    images = []

    # Using pdfplumber for better text and table extraction
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if page_text:
                page_texts.append(page_text)
            tables.extend(page.extract_tables())

            page_x0, page_top, page_x1, page_bottom = page.bbox
            for i, image in enumerate(page.images):
                # Render only the region occupied by this image rather than
                # the whole page. The box is clamped to the page because
                # cropping outside the page bounds is rejected by pdfplumber.
                bbox = (
                    max(image["x0"], page_x0),
                    max(image["top"], page_top),
                    min(image["x1"], page_x1),
                    min(image["bottom"], page_bottom),
                )
                if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                    # Image lies entirely off-page; nothing to render.
                    continue

                img = page.crop(bbox).to_image()
                img_path = f"{image_dir}/page_{page.page_number}_img_{i}.png"
                img.save(img_path)
                images.append(img_path)

    # Join pages with a newline so the last word of one page does not fuse
    # with the first word of the next.
    return "\n".join(page_texts), tables, images

# Function to analyze and extract text from images
def analyze_images(image_paths):
    """Run OCR over image files and collect the recognized text.

    Args:
        image_paths: Iterable of paths to image files.

    Returns:
        A list with one ``{"path": ..., "text": ...}`` dict per input path,
        in input order. ``text`` is whatever
        ``pytesseract.image_to_string`` returns for that image.
    """
    image_descriptions = []
    for img_path in image_paths:
        # The context manager closes the underlying file as soon as OCR is
        # done, so processing many images does not accumulate open handles.
        with Image.open(img_path) as img:
            text = pytesseract.image_to_string(img)
        image_descriptions.append({"path": img_path, "text": text})
    return image_descriptions

# Function to create a vector store from extracted data
class _EmptyVectorStore:
    """Stand-in used when there is nothing to index; every search is empty."""

    def similarity_search(self, query, k=4, **kwargs):
        """Return no documents, whatever the query."""
        return []


def _build_store(texts, embeddings):
    """Index the non-blank entries of *texts*, or return an empty stand-in."""
    usable = [t for t in texts if t and t.strip()]
    if not usable:
        # Building a FAISS index from zero texts fails, so hand back an
        # object that still answers similarity_search().
        return _EmptyVectorStore()
    return FAISS.from_texts(usable, embeddings)


def create_vector_store(text, tables, images):
    """Build separate searchable stores for body text, tables, and image OCR.

    Args:
        text: Full document text; it is split into overlapping chunks.
        tables: List of tables; each one is indexed via its ``str()`` form.
        images: List of dicts carrying the OCR result under the ``'text'`` key.

    Returns:
        A ``(vector_store, table_embeddings, image_embeddings)`` tuple. Each
        element exposes ``similarity_search(query, k=...)``. A source with no
        non-blank content yields a store whose searches return ``[]``.
    """
    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text or "")

    # Create embeddings
    embeddings = OpenAIEmbeddings()
    vector_store = _build_store(texts, embeddings)

    # Create embeddings for tables and images
    table_texts = [str(table) for table in tables]
    table_embeddings = _build_store(table_texts, embeddings)

    image_texts = [desc['text'] for desc in images]
    image_embeddings = _build_store(image_texts, embeddings)

    return vector_store, table_embeddings, image_embeddings

# Function to query GPT-4
def query_gpt4(query, context):
    """Ask the "gpt-4" chat model to answer *query* using *context*.

    Args:
        query: The user's question.
        context: Text placed ahead of the question in the user message.

    Returns:
        The message content of the first completion choice, or ``None`` when
        any exception is raised (the error is shown through ``st.error``).
    """
    try:
        system_message = {
            "role": "system",
            "content": "You are a helpful assistant that answers questions based on the provided context.",
        }
        user_message = {
            "role": "user",
            "content": f"Context: {context}\n\nQuestion: {query}",
        }
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=[system_message, user_message],
        )
        answer = completion.choices[0].message.content
    except Exception as exc:
        # Report the failure in the UI and signal it to the caller with None.
        st.error(f"Error querying GPT-4: {str(exc)}")
        return None
    return answer

# Main function for the Streamlit app
def main():
    """Render the app: upload a PDF, index its contents, and answer questions.

    The indexed stores are kept in ``st.session_state`` under
    ``'vector_store'``, ``'table_embeddings'`` and ``'image_embeddings'``.
    """
    st.title("RAG System for Unstructured Data")

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return

    # Streamlit re-executes the script on every interaction, including each
    # submitted question. Process the PDF only when a different file shows up
    # so extraction, OCR and embedding are not repeated per question.
    # NOTE: two different files with the same name and size share a key.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get('processed_file_key') != file_key:
        # Extract data from PDF
        text, tables, images = extract_data_from_pdf(uploaded_file)

        # Extracting image documents (list of {"path", "text"} dicts)
        image_docs = analyze_images(images)

        # Create vector store
        vector_store, table_embeddings, image_embeddings = create_vector_store(text, tables, image_docs)

        st.session_state['vector_store'] = vector_store
        st.session_state['table_embeddings'] = table_embeddings
        st.session_state['image_embeddings'] = image_embeddings
        # Recorded last so a failure above triggers a retry on the next run.
        st.session_state['processed_file_key'] = file_key

    vector_store = st.session_state['vector_store']
    table_embeddings = st.session_state['table_embeddings']
    image_embeddings = st.session_state['image_embeddings']
    st.success("PDF processed successfully!")

    # Chat interface
    query = st.text_input("Your question about the PDF:")
    if not query:
        return

    # Retrieve the closest matches from the text, table and image stores.
    # similarity_search returns document objects, so read their page_content
    # rather than subscripting them or using their repr.
    context_parts = []
    for store in (vector_store, table_embeddings, image_embeddings):
        matches = store.similarity_search(query, k=3)
        context_parts.append("\n".join(doc.page_content for doc in matches))

    # Compile all contexts
    final_context = "\n".join(context_parts)

    # Query GPT-4
    response = query_gpt4(query, final_context)
    if response:
        st.markdown(f"**Answer:** {response}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


