In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the API key / base URL that ChatOpenAI
# picks up later in the notebook — confirm against the .env file.
load_dotenv()

True

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from openai import OpenAI
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import PromptTemplate

In [3]:
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character for a single space in each document's text.

    The documents are modified in place; the same list object is handed back
    so the call can be chained.

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The same list, whose documents no longer contain tab characters.
    """
    tab_to_space = str.maketrans("\t", " ")

    for document in list_of_documents:
        original_text = document.page_content
        document.page_content = original_text.translate(tab_to_space)

    return list_of_documents

def encode_pdf(file_path, chunk_size=1000, chunk_overlap=200, embedding_model="nomic-embed-text"):
    """
    Encodes a PDF book into a FAISS vector store using Ollama embeddings.

    The PDF is loaded with PyPDFLoader, split into overlapping chunks with
    RecursiveCharacterTextSplitter, cleaned of tab characters, and then
    embedded and indexed with FAISS.

    Args:
        file_path: The path to the PDF file.
        chunk_size: The desired size of each text chunk.
        chunk_overlap: The amount of overlap between consecutive chunks.
        embedding_model: Name of the Ollama embedding model to use. Defaults to
            "nomic-embed-text", the model this function previously hard-coded.

    Returns:
        A FAISS vector store containing the encoded book content.

    Raises:
        ValueError: If no text chunks could be produced from the PDF.
    """

    loader = PyPDFLoader(file_path)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunks = splitter.split_documents(documents)
    if not chunks:
        # With nothing to index the vector store cannot be built; fail early
        # with a message that names the offending file.
        raise ValueError(f"No text chunks could be extracted from {file_path!r}")

    cleaned_texts = replace_t_with_space(chunks)

    embeddings = OllamaEmbeddings(model=embedding_model)
    vectorstore = FAISS.from_documents(cleaned_texts, embeddings)

    return vectorstore

In [22]:
class HyDERetriever:
    """
    Retriever that searches a PDF-backed vector store with a hypothetical document.

    Instead of embedding the user's question directly, the question is first
    expanded by an LLM into a synthetic passage; that passage is then used as
    the similarity-search query against the vector store built from the PDF.
    """

    def __init__(self, file_path, chunk_size=1000, chunk_overlap=200):
        """
        Build the LLM, the vector store for the PDF, and the prompt chain.

        Args:
            file_path: Path to the PDF passed to encode_pdf.
            chunk_size: Chunk size forwarded to encode_pdf.
            chunk_overlap: Chunk overlap forwarded to encode_pdf.
        """
        # NOTE(review): the model id is not an OpenAI model name; presumably an
        # OpenAI-compatible endpoint and key are configured via the environment — confirm.
        self.llm = ChatOpenAI(temperature=0, model="meta-llama/llama-3.3-70b-instruct", max_tokens=4000)

        # Not used by any method of this class; kept so existing code that
        # reads `retriever.embeddings` keeps working.
        self.embeddings = OllamaEmbeddings(model="nomic-embed-text")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Loads, splits and embeds the whole PDF, so construction can be slow.
        self.vectorstore = encode_pdf(file_path, chunk_size, chunk_overlap)

        # The only placeholder in the template is {query}.
        self.hyde_prompt = PromptTemplate(
            input_variables=["query"],
            template="""
                You are summarizing how a 400-page AI engineering textbook explains a concept.

                Based on the question below, write a synthetic passage that reflects
                how the book would discuss this topic across multiple chapters.

                Focus on:
                - practical framing
                - systems and agents
                - engineering perspective
                - how the concept is used, not just defined

                Question:
                {query}
                """
            )

        self.hyde_chain = self.hyde_prompt | self.llm

    def generate_hypothetical_document(self, query):
        """
        Ask the LLM for a synthetic passage answering `query`.

        Args:
            query: The user's question.

        Returns:
            The `content` of the chain's response.
        """
        # The prompt declares `query` as its only input variable, so that is
        # the only key sent to the chain.
        return self.hyde_chain.invoke({"query": query}).content

    def retrieve(self, query, k=8):
        """
        Retrieve the chunks most similar to a hypothetical answer for `query`.

        Args:
            query: The user's question.
            k: Number of results requested from the vector store.

        Returns:
            A tuple `(similar_docs, hypothetical_doc)`: the search results and
            the generated passage that was used as the search text.
        """
        hypothetical_doc = self.generate_hypothetical_document(query)
        similar_docs = self.vectorstore.similarity_search(hypothetical_doc, k=k)
        return similar_docs, hypothetical_doc
                   

In [23]:
# PDF to index. The path is relative, so it is resolved against the kernel's
# current working directory.
path = "AI_Engineer_Book.pdf"

# Constructing the retriever builds the vector store for the whole PDF, using
# 500/100 chunking instead of the constructor's 1000/200 defaults.
retriever = HyDERetriever(path, chunk_size=500, chunk_overlap=100)

In [24]:
# Sample question used to exercise the retriever.
test_query = "What is LLM?"
# retrieve() returns (matching documents, generated hypothetical passage);
# k=10 overrides its default of 8 results.
results, hypothetical_doc = retriever.retrieve(test_query, k=10)

In [25]:
import textwrap

def text_wrap(text, width=120):
    """
    Re-flow a string so that no output line is longer than `width` characters.

    Args:
        text (str): The text to re-flow.
        width (int): Maximum line length of the result.

    Returns:
        str: The wrapped lines joined with newline characters.
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    return "\n".join(wrapped_lines)

def show_context(context):
    """
    Print every item of a context list under a numbered heading.

    Args:
        context (list): The context items to print.

    Each item is preceded by a "Context N:" line (N counts from 1) and
    followed by two empty lines.
    """
    for position, item in enumerate(context, start=1):
        # One print call: heading, item, then a newline string, each on its own line.
        print(f"Context {position}:", item, "\n", sep="\n")

In [26]:
# Keep only the text of each retrieved document for display.
docs_content = [doc.page_content for doc in results]

# Display hypothetical document, re-flowed by text_wrap (120 columns by default)
print("hypothetical_doc:\n")
print(text_wrap(hypothetical_doc))
print()

# Display retrieved contexts, one numbered block per retrieved chunk
show_context(docs_content)

hypothetical_doc:

Large Language Models (LLMs) are a class of artificial intelligence (AI) systems that have revolutionized the field of
natural language processing (NLP). From an engineering perspective, LLMs can be viewed as complex software systems that
leverage deep learning techniques to process and generate human-like language. These models are designed to learn
patterns and relationships within vast amounts of text data, enabling them to perform a wide range of tasks, such as
language translation, text summarization, and conversation generation.  In the context of systems and agents, LLMs can
be seen as autonomous agents that interact with their environment through text-based interfaces. They receive input in
the form of text, process it, and generate output that is often indistinguishable from human-generated text. This
interaction can be viewed as a feedback loop, where the LLM adapts to the user's input and adjusts its output
accordingly. For instance, in a conversational AI