In [3]:
#!pip install langchain-community pypdf

In [4]:
import os

from dotenv import load_dotenv

# Read a local .env file into the process environment
# (presumably this supplies the API key for the chat model — confirm).
load_dotenv()

In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that generates the final answer in the RAG section.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [6]:
# Question-answering prompt with two placeholders: {question} and {context}.
# NOTE: `prompt_template` is reassigned in the RAG section before it is first
# formatted, so this version only matters if that later cell is skipped.
prompt_template = """You are an assistant for question answering tasks. Use the following pieces of retrieved context to answer the question.
If you don't know the answer just say you don't know. Keep it brief.
Question: {question}
Context: {context}
"""

In [7]:
# Read the whole book into a single string; bytes that are not valid UTF-8
# are substituted (errors='replace') instead of raising.
# NOTE(review): `text` is not referenced again in the visible cells, and the
# retrieval pipeline below loads war.txt rather than MobyDick.txt — confirm
# this cell is still needed.
with open('MobyDick.txt', encoding='utf-8', errors='replace') as book:
    text = book.read()

# Show a 100-character slice as a quick sanity check of the load.
text[1000:1100]

'Inn.\n\nCHAPTER 4. The Counterpane.\n\nCHAPTER 5. Breakfast.\n\nCHAPTER 6. The Street.\n\nCHAPTER 7. The Cha'

In [8]:
# Better printing

from IPython.display import Markdown

In [9]:
from langchain_community.document_loaders import TextLoader
# Load war.txt (decoded as UTF-8) through LangChain's text loader; `docs` is
# what the splitter cell below consumes.
loader = TextLoader("war.txt", encoding='utf-8')
docs = loader.load()

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded book into overlapping chunks that will be embedded individually.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,  # keep each chunk's index in the original document
    chunk_overlap=100,     # chunk overlap, in characters
    chunk_size=500,        # chunk size, in characters
)
all_splits = text_splitter.split_documents(docs)

print(f"Split given book into {len(all_splits)} sub-documents.")

Split given book into 1035 sub-documents.


EMBEDDINGS

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Small, fast embedding model used as a free alternative to Gemini embeddings.
# NOTE(review): the saved cell output echoes the constructor line below, which
# looks like a warning raised for it — confirm whether this import path is
# deprecated before relying on it.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Vector store that embeds documents with the model above.
vector_store = InMemoryVectorStore(embeddings)

# Add the chunks one slice at a time instead of all at once (to avoid memory spikes).
batch_size = 50  # tunable: raise or lower as memory allows
batched_ids = []
for start in range(0, len(all_splits), batch_size):
    batch = all_splits[start:start + batch_size]
    ids = vector_store.add_documents(documents=batch)
    batched_ids += ids

print(f"Added {len(batched_ids)} documents to vector store using HuggingFace embeddings.")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Added 1035 documents to vector store using HuggingFace embeddings.


In [12]:
#ids

In [13]:
# Query the vector store for 10 results; each result is a (document, score) pair.
search_results = vector_store.similarity_search_with_score("What is the story about?", k=10)

search_results

[(Document(id='7b969d56-32cf-4f59-93f6-748f8d2093ac', metadata={'source': 'war.txt', 'start_index': 172155}, page_content='That was the story they told my brother in fragments when presently\nthey stopped again, nearer to New Barnet. He promised to stay with\nthem, at least until they could determine what to do, or until the\nmissing man arrived, and professed to be an expert shot with the\nrevolver—a weapon strange to him—in order to give them confidence.'),
  0.42839167843779713),
 (Document(id='0621d20d-89c8-429b-bc14-da4612f020cc', metadata={'source': 'war.txt', 'start_index': 21606}, page_content='By eight o’clock a number of boys and unemployed men had already\nstarted for the common to see the “dead men from Mars.” That was the\nform the story took. I heard of it first from my newspaper boy about a\nquarter to nine when I went out to get my _Daily Chronicle_. I was\nnaturally startled, and lost no time in going out and across the\nOttershaw bridge to the sand-pits.\n\n\n\n\nIII.

RAG

In [14]:
# Prompt for the answer-generation step; {question} and {context} are filled in later.
prompt_template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""

# Concatenate the retrieved passages into one context string. Each passage is
# followed by a line of 50 '=' characters; the similarity score is not used.
doc_content = "\n\n".join(
    f"{doc.page_content}\n{'=' * 50}\n"
    for doc, _score in search_results
)
print(doc_content)

That was the story they told my brother in fragments when presently
they stopped again, nearer to New Barnet. He promised to stay with
them, at least until they could determine what to do, or until the
missing man arrived, and professed to be an expert shot with the
revolver—a weapon strange to him—in order to give them confidence.


By eight o’clock a number of boys and unemployed men had already
started for the common to see the “dead men from Mars.” That was the
form the story took. I heard of it first from my newspaper boy about a
quarter to nine when I went out to get my _Daily Chronicle_. I was
naturally startled, and lost no time in going out and across the
Ottershaw bridge to the sand-pits.




III.
ON HORSELL COMMON.


That was the story I got from him, bit by bit. He grew calmer telling
me and trying to make me see the things he had seen. He had eaten no
food since midday, he told me early in his narrative, and I found some
mutton and bread in the pantry and brought it into t

In [15]:
# Have the LLM read the prompt, analyse the retrieved passages, and generate a response.
# `prompt_template` and `doc_content` both come from the previous cell; the template
# used to be re-declared verbatim here, and that redundant copy has been removed.
response = model.invoke(
    prompt_template.format(
        question="What is the story about?",  # same query string used for the similarity search
        context=doc_content,
    )
)

In [17]:
# Render the model's answer as Markdown in the cell output.
Markdown(response.content)

The story is about an invasion by Martians, referred to as "Men from Mars," and the chaos they cause with "fighting-machines" and "Black Smoke." It details the experiences of people trying to survive and escape, including the narrator and his brother, amidst widespread panic and starvation.