In [89]:
import pymupdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.memory import ConversationBufferMemory

from dotenv import load_dotenv

load_dotenv()

True

# Extract text from book

In [90]:
doc = pymupdf.open("zero.pdf")
toc = doc.get_toc()

chapters_info = {}
chapter_pages = []

print("PDF Table of Contents:")
for item in toc:
    level, title, page_num = item
    print(f"Level {level}: '{title}' - Page {page_num}")

    if level <= 2:
        chapters_info[len(chapter_pages)] = {
            'title': title,
            'page': page_num - 1,
            'level': level
        }
        chapter_pages.append(page_num-1)

PDF Table of Contents:
Level 1: 'Preface: Zero to One' - Page 5
Level 2: '1€€€The Challenge of the Future' - Page 7
Level 2: '2   Party Like It’s 1999' - Page 13
Level 2: '3€€€All Happy Companies Are Different' - Page 21
Level 2: '4€€€The Ideology of Competition' - Page 30
Level 2: '5€€€Last Mover Advantage' - Page 36
Level 2: '6€€€You Are Not a Lottery Ticket' - Page 46
Level 2: '7€€€Follow the Money' - Page 61
Level 2: '8€€€Secrets' - Page 69
Level 2: '9€€€Foundations' - Page 78
Level 2: '10€€€The Mechanics of Mafia' - Page 87
Level 2: '11€€€If You Build It, Will They Come?' - Page 94
Level 2: '12€€€Man and Machine' - Page 104
Level 2: '13€€€Seeing Green' - Page 112
Level 2: '14   The Founder’s Paradox' - Page 127
Level 1: 'Conclusion: Stagnation or Singularity?' - Page 142
Level 1: 'Acknowledgments' - Page 146
Level 1: 'Illustration Credits' - Page 147
Level 1: 'Index' - Page 148
Level 1: 'About the Authors' - Page 160


### First three chapters of the book

In [91]:
# start_page = chapter_pages[0]
# end_page = chapter_pages[4]

first_three_chapters = []

for start_page, end_page in ((chapter_pages[1],chapter_pages[2]),(chapter_pages[2],chapter_pages[3]),(chapter_pages[3],chapter_pages[4])):

    txt = ""
    for page_num in range(start_page,min(end_page,doc.page_count)):
        page = doc[page_num]
        txt+=page.get_text()
    
    first_three_chapters.append(txt)

doc.close()

# Initialize LLM and Embeddings

In [92]:
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1000
)

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings":True}
)

# Create Document Chunks and Vector Store

In [93]:
print(chapters_info[1]['title'].strip('123€'))

The Challenge of the Future


In [94]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n","\n",".","!","?",","," ",""]
)

chapters = [text_splitter.split_text(text) for text in first_three_chapters]

documents = []

for i,chunks in enumerate(chapters):
    for txt in chunks:
        # print(chapters_info[i+1]['title'].strip('123€'))
        doc = Document(
            page_content=txt,
            metadata={
                "chunk id": i,
                "source": "Zero to One - Chapters 1-3",
                "chunk_size": len(txt),
                "chapter": i+1,
                "chapter_title": chapters_info[i+1]['title'].strip('123€')
            }
        )
        documents.append(doc)

### Vector Store

In [95]:
vector_store = FAISS.from_documents(
    documents=documents,
    embedding=embeddings
)

# RAG Pipeline

In [96]:
# Create retriever with improved search
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # Retrieve top 5 most relevant chunks
)

# Create enhanced prompt template that uses chapter metadata
prompt = PromptTemplate(
    template="""
        You are a helpful assistant.
        Answer ONLY from the provided context or memory.
        If the context is insufficient, just say you don't know.

        {context}
        Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [97]:
question          = "what is the capital of india"
retrieved_docs    = retriever.invoke(question)
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
final_prompt = prompt.invoke({"context": context_text, "question": question})
answer = llm.invoke(final_prompt)
print(answer.content)

I don't know.


In [98]:
def format_docs(retrieved_docs):
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    return context_text

parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [99]:
parser = StrOutputParser()
main_chain = parallel_chain | prompt | llm | parser


In [100]:
# Initialize memory
memory = ConversationBufferMemory(return_messages=True)

# Modify main_chain to include memory
main_chain_with_memory = main_chain.with_config({"memory": memory})

# Example usage
main_chain_with_memory.invoke('Can you summarize the book')

'The book discusses the principles of startups and successful businesses. It emphasizes the importance of new thinking, small size, and the ability to create a monopoly by solving a unique problem. The author argues that horizontal progress, or doing something that others have already done, is not enough to achieve success, and that vertical progress, or doing something new and innovative, is necessary for growth and sustainability.\n\nThe book also touches on the concept of globalization and the importance of new technology in creating wealth and avoiding devastation. It suggests that simply copying old ways of creating wealth will not lead to riches, but rather to scarcity and devastation.\n\nThe author also highlights the difference between happy and unhappy companies, with happy companies being unique and solving a unique problem, and unhappy companies failing to escape competition.\n\nOverall, the book is about the questions and answers needed to succeed in the business of doing n

In [101]:
main_chain_with_memory.invoke('Hi my name is Harshil')

"Hello Harshil. It seems like we're in the middle of a conversation about business and economics, with references to a book or article about the challenges of the future and the nature of competition."

In [102]:
main_chain_with_memory.invoke('What is my name?')

"I don't know."