In [16]:
# --- Cell 1: Imports and Environment Setup ---
import os
import re
import base64
import io
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
# ⬇️ ADDED: NougatParser for advanced PDF processing
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, CSVLoader, UnstructuredExcelLoader, UnstructuredPowerPointLoader, NougatParser
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_experimental.text_splitter import SemanticChunker
# ⬇️ MODIFIED: Reverted to Google Gemini for the LLM
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.documents import Document
from typing import List
import fitz  # PyMuPDF
from PIL import Image
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from sentence_transformers.cross_encoder import CrossEncoder

ImportError: cannot import name 'NougatParser' from 'langchain_community.document_loaders' (c:\Users\aryan\anaconda3\Lib\site-packages\langchain_community\document_loaders\__init__.py)

In [None]:
# Load environment variables from .env file
load_dotenv()

True

In [None]:
# --- Cell 2: Configuration ---
DOCS_PATH = "./documents"
CHROMA_PERSIST_PATH = "./chroma_db"
EMBEDDING_MODEL = 'BAAI/bge-base-en-v1.5'
CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
# ⬇️ MODIFIED: Reverted to Google Gemini models
LLM_MODEL = "gemini-1.5-flash-latest" 
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


In [None]:
# --- Cell 3: Helper Functions ---

# ⬇️ RE-ENABLED: Image summarization function using Gemini Vision
def get_image_summary(image_bytes: bytes, llm: ChatGoogleGenerativeAI) -> str:
    """Generates a summary for an image using a multi-modal LLM."""
    print("Generating image summary...")
    prompt_messages = [
        HumanMessage(
            content=[
                {"type": "text", "text": "You are an expert at analyzing academic images, diagrams, and charts. Describe this image in detail. What is its main purpose? What key information does it convey? If it's a chart or graph, describe the data, axes, and trend. This summary will be used for a Retrieval-Augmented Generation (RAG) system, so be comprehensive."},
                {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64.b64encode(image_bytes).decode()}"}
            ]
        )
    ]
    try:
        response = llm.invoke(prompt_messages)
        return response.content
    except Exception as e:
        print(f"❌ Error generating image summary: {e}")
        return "Could not generate summary for this image."

class SmartPDFProcessor:
    """Advanced PDF processing using Nougat for text and PyMuPDF for images."""
    def __init__(self, embeddings, llm=None):
        self.text_splitter = SemanticChunker(embeddings)
        self.llm = llm

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Process PDF, extracting text with Nougat and images for summarization."""
        print(f"Processing PDF with Nougat: {pdf_path}")
        all_docs = []
        try:
            # --- Text Extraction with Nougat ---
            # Nougat is specialized for academic papers and handles complex layouts.
            nougat_parser = NougatParser(file_path=pdf_path)
            nougat_docs = nougat_parser.load()
            
            full_text = "\n\n".join([self._clean_text(doc.page_content) for doc in nougat_docs])
            chunks = self.text_splitter.create_documents([full_text])
            for chunk in chunks:
                chunk.metadata['source'] = pdf_path
            all_docs.extend(chunks)

            # --- Image Extraction and Summarization with Gemini Vision ---
            if self.llm:
                print(f"Extracting images from: {pdf_path}")
                pdf_document = fitz.open(pdf_path)
                for page_num in range(len(pdf_document)):
                    for img_index, img in enumerate(pdf_document.get_page_images(page_num)):
                        xref = img[0]
                        base_image = pdf_document.extract_image(xref)
                        image_bytes = base_image["image"]
                        
                        summary = get_image_summary(image_bytes, self.llm)
                        image_doc = Document(
                            page_content=summary,
                            metadata={ "source": pdf_path, "page": page_num + 1, "chunk_method": "nougat_pdf_image_summary", "image_index": img_index }
                        )
                        all_docs.append(image_doc)

            print(f"✅ Successfully processed {len(all_docs)} text chunks and image summaries from {pdf_path}")
            return all_docs
        except Exception as e:
            print(f"❌ Error processing {pdf_path} with Nougat: {e}")
            return []

    def _clean_text(self, text: str) -> str:
        text = re.sub(r'\s+', ' ', text).strip()
        return text.replace("ﬁ", "fi").replace("ﬂ", "fl")

class SmartDocProcessor:
    """Handles .txt and .docx files using Semantic Chunking."""
    def __init__(self, embeddings):
        self.text_splitter = SemanticChunker(embeddings)
    def process_document(self, doc_path: str) -> List[Document]:
        print(f"Processing document: {doc_path}")
        try:
            if doc_path.lower().endswith(".docx"): loader = Docx2txtLoader(doc_path)
            elif doc_path.lower().endswith(".txt"): loader = TextLoader(doc_path, encoding='utf-8')
            else: return []
            documents = loader.load()
            full_text = "\n\n".join([self._clean_text(doc.page_content) for doc in documents if len(self._clean_text(doc.page_content).strip()) >= 50])
            if not full_text: return []
            splits = self.text_splitter.create_documents([full_text])
            for split in splits:
                split.metadata.update({ "source": doc_path, "chunk_method": "semantic_chunker_text", "char_count": len(split.page_content)})
            print(f"✅ Successfully processed {len(splits)} chunks from {doc_path}")
            return splits
        except Exception as e:
            print(f"❌ Error processing {doc_path}: {e}")
            return []
    def _clean_text(self, text: str) -> str: return re.sub(r'\s+', ' ', text).strip()

class SmartLatexProcessor:
    """Handles .tex files, cleans LaTeX commands, and uses Semantic Chunking."""
    def __init__(self, embeddings):
        self.text_splitter = SemanticChunker(embeddings)
    def process_latex(self, tex_path: str) -> List[Document]:
        print(f"Processing LaTeX file: {tex_path}")
        try:
            loader = TextLoader(tex_path, encoding='utf-8')
            documents = loader.load()
            full_text = "\n".join([doc.page_content for doc in documents])
            cleaned_text = self._clean_latex(full_text)
            if len(cleaned_text.strip()) < 100: return []
            splits = self.text_splitter.create_documents([cleaned_text])
            for split in splits:
                split.metadata.update({ "source": tex_path, "chunk_method": "semantic_chunker_latex", "char_count": len(split.page_content)})
            print(f"✅ Successfully processed {len(splits)} semantic chunks from {tex_path}")
            return splits
        except Exception as e:
            print(f"❌ Error processing {tex_path}: {e}")
            return []
    def _clean_latex(self, text: str) -> str:
        if "\\begin{document}" in text: text = text.split("\\begin{document}")[1]
        text = re.sub(r"%.*?\n", "\n", text)
        text = re.sub(r"\\begin\{(?:figure|table|tabular|verbatim|lstlisting)\*?\}[\s\S]*?\\end\{(?:figure|table|tabular|verbatim|lstlisting)\*?\}", "", text, flags=re.MULTILINE)
        text = re.sub(r"\\documentclass(?:\[.*?\])?\{.*?\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\\usepackage(?:\[.*?\])?\{.*?\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\\(title|author|date|thanks)\{.*?\}", "", text, flags=re.DOTALL)
        text = re.sub(r"\\(maketitle|tableofcontents|listoffigures|listoftables|centering|newpage|section\*|subsection\*|subsubsection\*)\b", "", text)
        text = re.sub(r"\\(begin|end)\{.*?\}", "", text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

class SmartSheetProcessor:
    def process_sheet(self, sheet_path: str) -> List[Document]:
        print(f"Processing sheet: {sheet_path}")
        try:
            if sheet_path.lower().endswith(".csv"): loader = CSVLoader(file_path=sheet_path, encoding='utf-8')
            elif sheet_path.lower().endswith(".xlsx"): loader = UnstructuredExcelLoader(sheet_path, mode="elements")
            else: return []
            return loader.load()
        except Exception as e:
            print(f"❌ Error processing {sheet_path}: {e}")
            return []

class SmartPPTProcessor:
    def process_ppt(self, ppt_path: str) -> List[Document]:
        print(f"Processing PowerPoint: {ppt_path}")
        try:
            loader = UnstructuredPowerPointLoader(ppt_path, mode="elements")
            return loader.load()
        except Exception as e:
            print(f"❌ Error processing {ppt_path}: {e}")
            return []

In [None]:
def process_all_documents(embedding_function, llm_for_summaries) -> List[Document]:
    """Loads and processes all documents from the source folder."""
    print("--- 📂 Processing all source documents... ---")
    all_splits = []
    processors = {
        ".pdf": SmartPDFProcessor(embeddings=embedding_function, llm=llm_for_summaries),
        ".txt": SmartDocProcessor(embeddings=embedding_function),
        ".docx": SmartDocProcessor(embeddings=embedding_function),
        ".tex": SmartLatexProcessor(embeddings=embedding_function),
        ".csv": SmartSheetProcessor(),
        ".xlsx": SmartSheetProcessor(),
        ".pptx": SmartPPTProcessor(),
        ".ppt": SmartPPTProcessor()
    }
    for filename in os.listdir(DOCS_PATH):
        file_path = os.path.join(DOCS_PATH, filename)
        file_ext = os.path.splitext(filename)[1].lower()
        if file_ext in processors:
            processor = processors[file_ext]
            if hasattr(processor, 'process_pdf'): all_splits.extend(processor.process_pdf(file_path))
            elif hasattr(processor, 'process_document'): all_splits.extend(processor.process_document(file_path))
            elif hasattr(processor, 'process_latex'): all_splits.extend(processor.process_latex(file_path))
            elif hasattr(processor, 'process_sheet'): all_splits.extend(processor.process_sheet(file_path))
            elif hasattr(processor, 'process_ppt'): all_splits.extend(processor.process_ppt(file_path))
    return all_splits

def setup_hybrid_retriever(embedding_function, llm_for_summaries):
    """Initializes and returns a hybrid retriever (BM25 keyword search + Chroma vector search)."""
    
    all_documents = process_all_documents(embedding_function, llm_for_summaries)
    if not all_documents:
        raise ValueError(f"No processable documents found in '{DOCS_PATH}'. Processing halted.")

    if os.path.exists(CHROMA_PERSIST_PATH) and os.listdir(CHROMA_PERSIST_PATH):
        print("✅ Loading existing vector store...")
        vectorstore = Chroma(persist_directory=CHROMA_PERSIST_PATH, embedding_function=embedding_function)
    else:
        print("⚠️ No existing vector store found. Creating a new one...")
        vectorstore = Chroma.from_documents(documents=all_documents, embedding=embedding_function, persist_directory=CHROMA_PERSIST_PATH)
        print("✅ Vector store created successfully.")
    
    vector_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    bm25_retriever = BM25Retriever.from_documents(all_documents, k=10)
    
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever],
        weights=[0.5, 0.5]
    )
    return ensemble_retriever

def create_conversational_rag_chain(retriever, llm):
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages([("system", contextualize_q_system_prompt), MessagesPlaceholder("chat_history"), ("human", "{input}")])
    history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
    
    qa_system_prompt = (
        "You are an expert AI Curriculum Assistant. Your task is to answer user questions accurately and concisely based ONLY on the provided context. This context contains text excerpts (extracted by Nougat) and detailed summaries of images, charts, or diagrams. When referencing visual content, explicitly mention it (e.g., 'As seen in the diagram on page 5...'). If the context does not contain the answer, state that you cannot find the information in the provided materials. Do not use any external knowledge.\n\nContext:\n{context}"
    )
    qa_prompt = ChatPromptTemplate.from_messages([("system", qa_system_prompt), MessagesPlaceholder("chat_history"), ("human", "{input}")])
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
    return rag_chain


In [None]:
# --- Cell 4: Initialization ---
print("--- 🚀 AI Curriculum Assistant Initializing (with Nougat & Gemini) 🚀 ---")
rag_chain = None
if not GOOGLE_API_KEY:
    print("❌ Error: GOOGLE_API_KEY not found. Please set it in your .env file.")
else:
    try:
        print(f"Initializing embedding model for retrieval: '{EMBEDDING_MODEL}'")
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
        
        print(f"Initializing Gemini LLM: '{LLM_MODEL}'")
        llm = ChatGoogleGenerativeAI(model=LLM_MODEL, google_api_key=GOOGLE_API_KEY, temperature=0.5)
        
        # ⬇️ MODIFIED: Pass the Gemini LLM to the retriever setup for image summarization
        base_retriever = setup_hybrid_retriever(embeddings, llm)

        print(f"Initializing cross-encoder for reranking: '{CROSS_ENCODER_MODEL}'")
        cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
        compressor = CrossEncoderReranker(model=cross_encoder, top_n=3)
        
        retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=base_retriever
        )
        
        rag_chain = create_conversational_rag_chain(retriever, llm)
        chat_history = []
        
        print("\n✅ Assistant is ready with Nougat PDF processing, Hybrid Search, and Reranking!")
        
    except ValueError as e:
        print(f"❌ Error: {e}")
    except Exception as e:
        print(f"❌ An unexpected error occurred during initialization: {e}")

In [None]:
# --- Cell 5: Interactive Q&A (with Source Citation) ---
user_input = "What is the main topic of the documents?"
if rag_chain:
    response = rag_chain.invoke({"input": user_input, "chat_history": chat_history})
    answer = response["answer"]
    print(f"🤖 Assistant: {answer}")
    
    source_docs = response.get("context", [])
    if source_docs:
        print("\n--- 📚 Sources ---")
        unique_sources = set()
        for doc in source_docs:
            source = doc.metadata.get('source', 'Unknown')
            page = doc.metadata.get('page', 'N/A')
            source_info = f"📄 File: {os.path.basename(source)}, Page: {page}"
            # ⬇️ RE-ENABLED: Check for image summary metadata
            if "image_summary" in doc.metadata.get("chunk_method", ""):
                source_info += " (Image Summary)"
            unique_sources.add(source_info)
        for src in sorted(list(unique_sources)):
            print(src)
    
    chat_history.append(HumanMessage(content=user_input))
    chat_history.append(AIMessage(content=answer))
else:
    print("The RAG chain is not initialized. Please run the previous cells successfully.")

In [None]:
# --- Cell 6: Additional Assistant Functions (Unchanged) ---
def find_connections(topic: str, retriever):
    pass
def summarize_document(doc_source_name: str, vectorstore, llm):
    pass
def generate_quiz(topic: str, retriever, llm, num_questions=3):
    pass