# In [None]:
import os
import warnings
import streamlit as st
import PyPDF2
import fitz  # PyMuPDF

# Streamlit page configuration (page title, page icon, wide layout).
# NOTE(review): this call sits ahead of the remaining imports, presumably because
# Streamlit expects set_page_config to run before any other Streamlit command —
# confirm against the Streamlit docs before moving it.
st.set_page_config(
    page_title="Document Q&A",
    page_icon="📚",
    layout="wide"
)

# Import necessary libraries
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

# Suppress warnings: the "ignore" action is installed without a category or
# module filter, so it applies to every warning raised after this point.
warnings.filterwarnings("ignore")

# Load environment variables
# NOTE(review): presumably this reads the .env file mentioned by the missing-key
# error in DocumentQAApp.setup_llm, so that os.getenv('GROQ_API_KEY') can find
# the key — confirm.
load_dotenv()

class DocumentAnalyzer:
    """Collect summary statistics (pages, words, images, headings) for a PDF."""

    @staticmethod
    def _extract_headings(page_dict, min_font_size=12):
        """
        Return the text lines of one page that are set in a large font.

        Args:
            page_dict (dict): Page layout in the structure produced by
                PyMuPDF's ``page.get_text("dict")``: a ``"blocks"`` list whose
                text blocks contain ``"lines"``, each holding ``"spans"`` with
                ``"size"`` (font size) and ``"text"`` entries.
            min_font_size (float): A line counts as a heading only when every
                non-blank span in it is strictly larger than this size.

        Returns:
            list[str]: Stripped heading lines in reading order. Duplicates are
            kept; the caller decides how to de-duplicate.
        """
        headings = []
        for block in page_dict.get("blocks", []):
            # Image blocks carry no "lines" entry, so they contribute nothing.
            for line in block.get("lines", []):
                spans = line.get("spans", [])
                visible = [s for s in spans if s.get("text", "").strip()]
                if not visible:
                    continue
                # Requiring *all* visible spans to be large keeps body lines
                # that merely start with a drop cap from being reported.
                if all(s.get("size", 0) > min_font_size for s in visible):
                    headings.append(
                        "".join(s.get("text", "") for s in spans).strip()
                    )
        return headings

    @staticmethod
    def analyze_pdf(pdf_path, heading_font_size=12):
        """
        Comprehensive PDF analysis

        Args:
            pdf_path (str): Path to the PDF file
            heading_font_size (float): Lines whose font size is strictly
                greater than this value are reported as headings.

        Returns:
            dict: Analysis results with the keys ``filename``, ``total_pages``,
            ``total_words``, ``file_size`` (in KB), ``images`` and
            ``headings``; ``None`` if the file could not be analyzed (the
            error is reported through ``st.error``).
        """
        try:
            # Open the PDF
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                # Basic document info
                analysis = {
                    'filename': os.path.basename(pdf_path),
                    'total_pages': len(pdf_reader.pages),
                    'total_words': 0,
                    'file_size': os.path.getsize(pdf_path) / 1024  # size in KB
                }

            # Additional analysis
            analysis['images'] = 0
            analysis['headings'] = []
            seen_headings = set()  # O(1) duplicate check across pages

            # Use PyMuPDF for more detailed analysis. The context manager
            # closes the document even if a page fails to parse.
            with fitz.open(pdf_path) as pdf_document:
                for page in pdf_document:
                    # Count images
                    analysis['images'] += len(page.get_images())

                    # Extract text and count words
                    analysis['total_words'] += len(page.get_text().split())

                    # Heading extraction needs per-span font sizes, which only
                    # the "dict" output exposes; the tuples returned by
                    # get_text("blocks") end with the block type, not a size.
                    page_headings = DocumentAnalyzer._extract_headings(
                        page.get_text("dict"), heading_font_size
                    )
                    for heading in page_headings:
                        if heading not in seen_headings:
                            seen_headings.add(heading)
                            analysis['headings'].append(heading)

            return analysis

        except Exception as e:
            st.error(f"Error analyzing PDF {pdf_path}: {e}")
            return None

class DocumentQAApp:
    """
    Question answering over a directory of PDFs.

    The FAISS vector store and the per-document analysis are kept in
    ``st.session_state`` (keys ``vectors`` and ``document_analysis``); the
    language model and prompt template are stored on the instance.
    """

    def __init__(self):
        """Seed the session-state keys, then build the LLM and the prompt."""
        # Initialize session state
        if 'vectors' not in st.session_state:
            st.session_state.vectors = None
        if 'document_analysis' not in st.session_state:
            st.session_state.document_analysis = []

        # Initialize LLM
        self.setup_llm()

        # Setup prompt template
        self.setup_prompt()

    def setup_llm(self):
        """
        Initialize Language Model

        Reads ``GROQ_API_KEY`` from the environment and stores a ``ChatGroq``
        client in ``self.llm``. If the key is missing or the client cannot be
        created, an error is shown and ``st.stop()`` is called.
        """
        try:
            # Retrieve API key from environment variable
            groq_api_key = os.getenv('GROQ_API_KEY')

            if not groq_api_key:
                st.error("GROQ API Key not found. Please set it in your .env file.")
                st.stop()

            self.llm = ChatGroq(
                groq_api_key=groq_api_key,
                model_name="Llama3-8b-8192"
            )
        except Exception as e:
            st.error(f"Error initializing LLM: {e}")
            st.stop()

    def setup_prompt(self):
        """Create prompt template with ``context`` and ``input`` placeholders."""
        self.prompt = ChatPromptTemplate.from_template(
            """Answer the questions based only on the provided context.
            Provide the most accurate response possible.

            Context:
            {context}

            Question: {input}
            """
        )

    @staticmethod
    def _list_pdf_files(pdf_directory):
        """
        Return the names of the PDF files directly inside ``pdf_directory``.

        Args:
            pdf_directory (str): Directory to scan (not recursive).

        Returns:
            list[str]: Sorted file names whose extension is ``.pdf`` in any
            letter case. Sub-directories are skipped even if their name ends
            in ``.pdf``.
        """
        return sorted(
            name
            for name in os.listdir(pdf_directory)
            if name.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(pdf_directory, name))
        )

    def vector_embedding(self, pdf_directory):
        """
        Create vector embeddings from PDF documents

        Loads every PDF in ``pdf_directory``, records its analysis in
        ``st.session_state.document_analysis``, splits the pages into
        overlapping chunks and stores a FAISS index built from them in
        ``st.session_state.vectors``.

        Args:
            pdf_directory (str): Directory containing the PDF files.

        Returns:
            bool: ``True`` when the vector store was created, ``False``
            otherwise (the reason is reported through Streamlit messages).
        """
        # Validate directory. isdir (rather than exists) also rejects a path
        # that points at a regular file, which os.listdir could not handle.
        if not os.path.isdir(pdf_directory):
            st.error(f"Directory {pdf_directory} does not exist!")
            return False

        try:
            # Find PDF files
            pdf_files = self._list_pdf_files(pdf_directory)

            if not pdf_files:
                st.warning("No PDF files found in the directory.")
                return False

            # Process documents
            documents = []
            st.session_state.document_analysis = []

            for pdf_file in pdf_files:
                # A failure in one file is reported and the remaining files
                # are still processed.
                try:
                    pdf_path = os.path.join(pdf_directory, pdf_file)

                    # Load PDF
                    loader = PyPDFLoader(pdf_path)
                    pages = loader.load()
                    documents.extend(pages)

                    # Analyze PDF
                    analysis = DocumentAnalyzer.analyze_pdf(pdf_path)
                    if analysis:
                        st.session_state.document_analysis.append(analysis)

                except Exception as file_error:
                    st.error(f"Error processing {pdf_file}: {file_error}")

            # Check if any documents were loaded
            if not documents:
                st.error("No documents could be loaded.")
                return False

            # Text Splitting
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            split_documents = text_splitter.split_documents(documents)

            # Embedding
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2"
            )

            # Create Vector Store
            st.session_state.vectors = FAISS.from_documents(
                split_documents,
                embeddings
            )

            st.success(f"Vector Store created with {len(split_documents)} document chunks.")
            return True

        except Exception as e:
            st.error(f"Comprehensive Error during vector embedding: {e}")
            import traceback
            st.error(traceback.format_exc())
            return False

    def process_query(self, query):
        """
        Process user query using the vector store and language model

        Args:
            query (str): The user's question.

        Returns:
            str: The model's answer, or an explanatory message when no
            documents have been processed or an error occurred.
        """
        try:
            # Check if vectors are initialized
            if st.session_state.vectors is None:
                st.warning("Please analyze documents first!")
                return "No documents have been processed. Click 'Analyze Documents' first."

            # Perform similarity search
            retriever = st.session_state.vectors.as_retriever(
                search_kwargs={'k': 3}  # Retrieve top 3 most relevant documents
            )
            # invoke() is the Runnable entry point for retrievers and replaces
            # the deprecated get_relevant_documents().
            relevant_docs = retriever.invoke(query)

            # Prepare context
            context = "\n\n".join(doc.page_content for doc in relevant_docs)

            # Generate response
            response = self.llm.invoke(
                self.prompt.format_messages(
                    context=context,
                    input=query
                )
            )

            return response.content

        except Exception as e:
            st.error(f"Error processing query: {e}")
            return f"An error occurred: {e}"

def _render_document_analysis(document_analysis):
    """Show the per-document statistics (pages, words, size, images)."""
    st.subheader("📊 Document Analysis")
    for doc_info in document_analysis:
        st.write(f"📄 **{doc_info['filename']}**")
        col1, col2 = st.columns(2)
        with col1:
            st.write(f"Total Pages: {doc_info['total_pages']}")
            st.write(f"Total Words: {doc_info['total_words']}")
        with col2:
            st.write(f"File Size: {doc_info['file_size']:.2f} KB")
            st.write(f"Images: {doc_info.get('images', 0)}")


def main():
    """
    Build the Streamlit page: directory input, document analysis and Q&A.

    The analysis panel is drawn from ``st.session_state`` on every script
    run, so it stays visible after later interactions such as typing a
    question, instead of only on the run triggered by the button click.
    """
    # Page title
    st.title("📚 Document Insight AI")
    st.write("Smart Document Analysis and Q&A")

    # Initialize the app
    app = DocumentQAApp()

    # PDF Directory Input
    pdf_directory = st.text_input(
        "Enter PDF Directory Path",
        placeholder="C:/path/to/your/pdf/folder"
    )

    # Analyze Documents Button
    if st.button("🔍 Analyze Documents"):
        if not pdf_directory:
            st.warning("Please enter a directory path.")
        else:
            with st.spinner('Processing Documents...'):
                app.vector_embedding(pdf_directory)

    # Display document analysis. st.button is only True on the run that
    # follows the click, so rendering here (rather than inside the branch
    # above) keeps the panel from vanishing on the next rerun.
    document_analysis = st.session_state.get('document_analysis')
    if st.session_state.get('vectors') is not None and document_analysis:
        _render_document_analysis(document_analysis)

    # Query Section
    st.subheader("💬 Ask Your Question")
    query = st.text_input("Enter your question")

    # Query Processing (a whitespace-only question is ignored)
    if query and query.strip():
        with st.spinner('Generating Response...'):
            answer = app.process_query(query)
            st.write("🤖 **AI Response:**")
            st.write(answer)


if __name__ == "__main__":
    main()

# Notebook cell output: ModuleNotFoundError: No module named 'streamlit'