In [5]:
import fitz  # PyMuPDF
import pdfplumber
import warnings
import os
import time

from langchain.schema import Document 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from qdrant_client import QdrantClient as RawQdrantClient 
from qdrant_client.http import models 
from langchain.embeddings import HuggingFaceEmbeddings 
from langchain_core.prompts import ChatPromptTemplate 
from langchain_groq import ChatGroq 
from docx import Document as DocxDocument 

# Suppress specific pdfplumber warning
warnings.filterwarnings("ignore", message="CropBox missing from /Page, defaulting to MediaBox")
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast, with a readable message, when the Qdrant settings are absent.
# Writing os.getenv(...) back into os.environ is a no-op when the variable is
# set and raises an opaque "TypeError: str expected, not NoneType" when it is
# not, so the presence check is made explicit instead.
_missing_env_vars = [
    name for name in ("QDRANT_API_KEY", "QDRANT_URL") if name not in os.environ
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in a .env file before running this notebook."
    )


In [7]:

# --- Configuration ---

# Input document. A bare file name is resolved against the working directory,
# so keep the PDF next to the notebook or replace this with a full path.
PDF_PATH = "Principles-of-Data-Science-WEB.pdf"

# Model identifiers.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
GROQ_MODEL_NAME = "gemma2-9b-it"

# Dimension of "all-MiniLM-L6-v2"; must stay in sync with EMBEDDING_MODEL_NAME.
VECTOR_SIZE = 384

# Qdrant settings: prefix used when naming collections, and the payload key
# under which each point stores its document text.
COLLECTION_NAME_PREFIX = "data_science_demo"
CONTENT_KEY_IN_PAYLOAD = "text_content_for_langchain"

In [8]:

# --- PDF Data Extraction Function ---
def extract_pdf_with_sources(pdf_path):
    """Extract page text and tables from a PDF as LangChain ``Document`` objects.

    Two independent, best-effort passes are made over the file: PyMuPDF
    (``fitz``) collects the text of every page, then pdfplumber collects every
    table. An exception in either pass is printed and does not prevent the
    other pass from running; whatever was gathered before the failure is kept.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A list of ``Document`` objects, text pages first and tables after.
        Text documents carry ``source``, ``page`` (1-based) and ``type="text"``
        metadata. Table documents carry ``source``, ``page``, ``table_num``
        (1-based within the page) and ``type="table"``; their content is a
        ``"Table N on page P:"`` header followed by tab-separated rows.
        Pages without text and tables without any cell content are omitted.
    """
    source_name = os.path.basename(pdf_path)
    documents = []

    print(f"Extracting text content from '{source_name}'...")
    # Pass 1: page text with PyMuPDF.
    try:
        with fitz.open(pdf_path) as pdf_doc:
            for page_index in range(len(pdf_doc)):
                text = pdf_doc.load_page(page_index).get_text()
                if not text.strip():
                    continue  # blank page: nothing to index
                documents.append(
                    Document(
                        page_content=text,
                        metadata={
                            "source": source_name,
                            "page": page_index + 1,
                            "type": "text",
                        },
                    )
                )
    except Exception as e:
        # Best effort: report and still attempt table extraction below.
        print(f"Error extracting text from {pdf_path}: {e}")

    print(f"Extracting table content from '{source_name}'...")
    # Pass 2: tables with pdfplumber.
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_index, page in enumerate(pdf.pages):
                for table_index, table_data in enumerate(page.extract_tables()):
                    if not table_data:
                        continue
                    # Empty cells arrive as None; render them as empty strings
                    # so the literal text "None" never ends up in the content.
                    table_content = "\n".join(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                        for row in table_data
                        if row
                    )
                    if not table_content.strip():
                        continue  # table had no cell content at all
                    documents.append(
                        Document(
                            # Header line distinguishes table content from page text.
                            page_content=f"Table {table_index + 1} on page {page_index + 1}:\n{table_content}",
                            metadata={
                                "source": source_name,
                                "page": page_index + 1,
                                "table_num": table_index + 1,
                                "type": "table",
                            },
                        )
                    )
    except Exception as e:
        # Best effort: keep whatever was extracted so far.
        print(f"Error extracting tables from {pdf_path}: {e}")

    print(f"Extracted {len(documents)} raw documents (pages/tables) from '{source_name}'.")
    return documents

In [9]:

# --- Document Formatting Function for LLM Context ---
def format_docs_for_context(docs):
    """
    Render retrieved Documents as one context string for the LLM.

    Each document becomes a section made of a header line
    ``--- Document N (Source: ..., Page: ...[, Table: ...]) ---`` followed by
    the document's stripped content. Sections are separated by a blank line.
    Missing ``source`` / ``page`` metadata is shown as ``N/A``; the table number
    is added only for documents typed ``table`` that carry a ``table_num``.
    Returns an empty string when ``docs`` is empty.
    """
    sections = []
    for position, doc in enumerate(docs, start=1):
        meta = doc.metadata
        labels = [
            f"Source: {meta.get('source', 'N/A')}",
            f"Page: {meta.get('page', 'N/A')}",
        ]
        # Chunk offsets (start_index) are deliberately left out: the page
        # number is enough provenance for the LLM.
        is_numbered_table = meta.get('type') == 'table' and meta.get('table_num') is not None
        if is_numbered_table:
            labels.append(f"Table: {meta.get('table_num')}")

        header = f"--- Document {position} ({', '.join(labels)}) ---"
        sections.append(f"{header}\n{doc.page_content.strip()}")

    # The outer strip drops the dangling newline left by an empty final section.
    return "\n\n".join(sections).strip()

In [None]:
def retrieve_documents_manually(query: str, collection_name: str, embeddings_model, k: int = 3):
    """
    Run a vector search through the raw Qdrant client and rebuild LangChain
    Documents from the returned payloads.

    Relies on the module-level ``qdrant_api_client`` and
    ``CONTENT_KEY_IN_PAYLOAD``; both must be defined before this is called.

    Args:
        query: The user's search query.
        collection_name: Name of the Qdrant collection to search.
        embeddings_model: Object whose ``embed_query`` turns the query into a vector.
        k: Maximum number of hits to request.

    Returns:
        A list of Documents, one per hit whose payload contains
        ``CONTENT_KEY_IN_PAYLOAD``. That entry becomes ``page_content``; every
        other payload entry is copied into ``metadata`` and the hit's
        similarity is stored under ``"score"``. Hits lacking the content key
        are skipped. Any exception is printed and an empty list is returned.
    """
    try:
        # Embed first, then search; payloads are required, vectors are not.
        query_vector = embeddings_model.embed_query(query)
        hits = qdrant_api_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=k,
            with_payload=True,
            with_vectors=False,
        )

        results = []
        for hit in hits or []:
            payload = hit.payload
            if not payload or CONTENT_KEY_IN_PAYLOAD not in payload:
                continue  # nothing usable to turn into a Document

            metadata = {
                key: value
                for key, value in payload.items()
                if key != CONTENT_KEY_IN_PAYLOAD
            }
            metadata["score"] = hit.score
            results.append(
                Document(page_content=payload[CONTENT_KEY_IN_PAYLOAD], metadata=metadata)
            )
        return results

    except Exception as e:
        print(f"Error during manual search or document construction for '{collection_name}': {e}")
        return []