In [1]:
# Install required packages
!pip install langchain langchain-google-genai langchain_community pypdf chromadb sentence-transformers -q
!pip install google-generativeai pdfplumber -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.4/20.4 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m20.2 MB/s[0m eta [36m0:00

In [2]:
import os
import pdfplumber
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [3]:
from google.colab import userdata
os.environ["GOOGLE_API_KEY"] = userdata.get("GEMINI_API_KEY")

In [4]:
def upload_pdf(pdf_path):
    """
    Function to handle PDF uploads.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: PDF file path if successful
    """
    try:
        # In a real application with Streamlit, you would use:
        # uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
        # But for this notebook, we'll just verify the file exists

        if os.path.exists(pdf_path):
            print(f"PDF file found at: {pdf_path}")
            return pdf_path
        else:
            print(f"Error: File not found at {pdf_path}")
            return None
    except Exception as e:
        print(f"Error uploading PDF: {e}")
        return None

In [5]:
attention_paper_path = "/content/India.pdf"

In [6]:
upload_pdf("India.pdf")

PDF file found at: India.pdf


'India.pdf'

In [7]:
from google.colab import files
uploaded = files.upload()


Saving India.pdf to India (1).pdf


In [8]:
import os
os.listdir('/content')


['.config', 'India (1).pdf', 'India.pdf', 'sample_data']

In [9]:
pdf_path = "/content/India.pdf"


In [10]:
def parse_pdf(pdf_path):
    """
    Function to extract text from PDF files.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: Extracted text from the PDF
    """
    try:
        text = ""

        # Using pdfplumber to extract text
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"

        # Save the extracted text to a file (optional)
        text_file_path = pdf_path.replace('.pdf', '.txt')
        with open(text_file_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"PDF parsed successfully, extracted {len(text)} characters")
        return text
    except Exception as e:
        print(f"Error parsing PDF: {e}")
        return None

In [48]:
text_file = parse_pdf("/content/India.pdf")




PDF parsed successfully, extracted 246722 characters


In [49]:
def create_document_chunks(text):
    """
    Function to split the document text into smaller chunks for processing.

    Args:
        text (str): The full text from the PDF

    Returns:
        list: List of text chunks
    """
    try:
        # Initialize the text splitter
        # We can tune these parameters based on our needs and model constraints
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,        # Size of each chunk in characters
            chunk_overlap=100,      # Overlap between chunks to maintain context
            length_function=len,
            separators=["\n\n", "\n", " ", ""]  # Hierarchy of separators to use when splitting
        )

        # Split the text into chunks
        chunks = text_splitter.split_text(text)

        print(f"Document split into {len(chunks)} chunks")
        print("chunks: ", chunks)
        return chunks
    except Exception as e:
        print(f"Error creating document chunks: {e}")
        return []

In [50]:
text_chunks = create_document_chunks(text_file)

Document split into 609 chunks


In [51]:
text_chunks

["Asia > South Asia > India\nIndia\nIndia (Hindi: भारत or Bhārat), the largest country in South Asia, has many of the world's highest\nmountains, most populated cities, and longest rivers. As one of the great civilisations of the ancient world,\nIndia's heritage and culture is a rich amalgam of the past and present. This vast country, the most\npopulous in the world, offers the traveller a view of fascinating religions and ethnography, a smorgasbord",
 'of languages, diverse food, and architectural masterpieces that were built millennia ago and remain intact\ntoday. As the nation opens up to a globalised world, India still has a depth of history and intensity of\nculture that awe and fascinate the many who visit there.\nRegions\nIndia is administratively divided into 28 states and 8 union territories. The states are broadly demarcated\non linguistic lines. They vary in size; the larger ones are bigger and more diverse than some countries of',
 'Europe. The union territories are usually

In [52]:
def embed_and_view(text_chunks):
    """
    Embed document chunks and display their numeric embeddings.

    Args:
        text_chunks (list): List of text chunks from the document
    """
    try:
        # Initialize the Gemini embeddings
        embedding_model = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"  # Specify the Gemini Embedding model
        )

        print("Embedding model initialized successfully")

        # Generate and display embeddings for all chunks
        for i, chunk in enumerate(text_chunks):
            embedding = embedding_model.embed_query(chunk)
            print(f"Chunk {i} Embedding:\n{embedding}\n")

    except Exception as e:
        print(f"Error embedding documents: {e}")

# Example usage
sample_chunks = ["This is the first chunk.", "This is the second chunk.", "And this is the third chunk."]
embed_and_view(sample_chunks)

Embedding model initialized successfully
Chunk 0 Embedding:
[0.005306204780936241, -0.019982466474175453, -0.05330009013414383, -0.037803467363119125, 0.0438869446516037, 0.012169086374342442, 0.011968716979026794, 0.030833037570118904, -0.015381194651126862, 0.02207416482269764, -0.01051324326545, 0.05356165021657944, 0.05694853141903877, 0.013736017979681492, 0.014268357306718826, -0.00033483054721727967, 0.026143047958612442, 0.002164868637919426, -0.10417395830154419, 0.03183707222342491, 0.0369376577436924, -0.026903631165623665, 0.035999879240989685, -0.041685134172439575, -0.014223109930753708, 0.002302129054442048, 0.00924470741301775, -0.036460429430007935, 0.037307705730199814, 0.0015566367655992508, 0.058599747717380524, 0.05178055167198181, -0.0052936505526304245, -0.04410144314169884, 0.014856294728815556, 0.018107743933796883, -0.0010075304890051484, 0.017477499321103096, 0.024988515302538872, -0.02734184078872204, -0.08513811230659485, 0.0653407871723175, -0.025275081396

In [53]:
def embed_documents(text_chunks):
    """
    Function to generate embeddings for the text chunks.

    Args:
        text_chunks (list): List of text chunks from the document

    Returns:
        object: Embedding model for further use
    """
    try:
        # Initialize the Gemini embeddings
        embedding_model = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004"  # Specify the Gemini Embedding model
        )

        print("Embedding model initialized successfully")
        return embedding_model, text_chunks
    except Exception as e:
        print(f"Error embedding documents: {e}")
        return None, None

In [54]:
embedded_documents = embed_documents(text_chunks)

Embedding model initialized successfully


In [55]:
def store_embeddings(embedding_model, text_chunks):
    """
    Function to store document embeddings in ChromaDB.

    Args:
        embedding_model: The embedding model to use
        text_chunks (list): List of text chunks to embed and store

    Returns:
        object: Vector store for retrieval
    """
    try:
        # Create a vector store from the documents
        vectorstore = Chroma.from_texts(
            texts=text_chunks,
            embedding=embedding_model,
            persist_directory="./chroma_db"  # Directory to persist the database
        )

        # Persist the vector store to disk
        vectorstore.persist()

        print(f"Successfully stored {len(text_chunks)} document chunks in ChromaDB")
        return vectorstore
    except Exception as e:
        print(f"Error storing embeddings: {e}")
        return None

In [56]:
chroma_store = store_embeddings(embedded_documents[0],embedded_documents[1])

Successfully stored 609 document chunks in ChromaDB


  vectorstore.persist()


In [57]:
def embed_query(query, embedding_model):
    """
    Function to embed the user's query.

    Args:
        query (str): User's question
        embedding_model: The embedding model to use

    Returns:
        list: Embedded query vector
    """
    try:
        # Generate embedding for the query
        query_embedding = embedding_model.embed_query(query)

        print("Query embedded successfully")
        return query_embedding
    except Exception as e:
        print(f"Error embedding query: {e}")
        return None

In [59]:
user_query = "Where can I see snowfall in December in India?"

In [60]:
embedded_query = embed_query(user_query, embedded_documents[0])
print(embedded_query)

Query embedded successfully
[0.026560893282294273, 0.036313313990831375, -0.0328504741191864, -0.02617468684911728, 0.018954548984766006, 0.01828565075993538, -0.04996728152036667, -0.049322012811899185, -0.014404724352061749, 0.026103703305125237, -0.012804482132196426, -0.030055981129407883, 0.07778479158878326, -0.01837978884577751, 0.04608171805739403, -0.009796790778636932, 0.05624466761946678, -0.005680806469172239, -0.07161761820316315, -0.02550024352967739, 0.06926596164703369, -0.02900354191660881, 0.0033598262816667557, 0.011436362750828266, -0.01592583954334259, -0.005911492742598057, -0.0014465807471424341, 0.03347143903374672, -0.03597160801291466, -0.011746061965823174, 0.0666341632604599, 0.06505104899406433, 0.017688391730189323, 0.005886333994567394, 0.003997184336185455, -0.0017253659898415208, 0.021136706694960594, 0.0049724034033715725, 0.0679786279797554, -0.05093260109424591, -0.04512044042348862, 0.07641337811946869, 0.02589518390595913, 0.03509640693664551, -0.0

In [61]:
def retrieve_relevant_chunks(vectorstore, query, embedding_model, k=3):
    """
    Function to retrieve the most relevant document chunks for a query.

    Args:
        vectorstore: The ChromaDB vector store
        query (str): User's question
        embedding_model: The embedding model
        k (int): Number of chunks to retrieve

    Returns:
        list: List of relevant document chunks
    """
    try:
        # Create a retriever from the vector store
        retriever = vectorstore.as_retriever(
            search_type="similarity",  # Can also use "mmr" for Maximum Marginal Relevance
            search_kwargs={"k": k}     # Number of documents to retrieve
        )

        # Retrieve relevant chunks
        relevant_chunks = retriever.get_relevant_documents(query)

        print(f"Retrieved {len(relevant_chunks)} relevant document chunks")
        return relevant_chunks
    except Exception as e:
        print(f"Error retrieving chunks: {e}")
        return []

In [62]:
relevant_chunks = retrieve_relevant_chunks(chroma_store, user_query, embedded_documents[0])

Retrieved 3 relevant document chunks


  relevant_chunks = retriever.get_relevant_documents(query)


In [63]:
relevant_chunks

[Document(metadata={}, page_content='Rainy Season (or "Monsoon") and Winter, though in the\ntropical South calling the 25°C (77°F) weather "Winter"\nwould be stretching the concept. The North experiences some\nGulmarg, Jammu and Kashmir\nextremes of heat in Summer and cold in Winter, but except in\nthe Himalayan regions, snow is almost unheard of. November to January is the winter season and April\nand May are the hot months when everyone eagerly awaits the rains. There is also a brief spring in'),
 Document(metadata={}, page_content='flights are often delayed or cancelled when visibility drops to near-zero.\nAvoid travel in the North at this time of year if you can. If you have no choice, wearing an N95 mask and\nminimizing time spent outdoors can help.\nMany cities and villages in India, not particularly in any region, have sanitation and pollution issues. You\nmight find people dumping and burning garbage in the open. Check the air quality of a city before\nventuring.\nDrinking wate

In [64]:
def get_context_from_chunks(relevant_chunks, splitter="\n\n---\n\n"):
    """
    Extract page_content from document chunks and join them with a splitter.

    Args:
        relevant_chunks (list): List of document chunks from retriever
        splitter (str): String to use as separator between chunk contents

    Returns:
        str: Combined context from all chunks
    """
    # Extract page_content from each chunk
    chunk_contents = []

    for i, chunk in enumerate(relevant_chunks):
        if hasattr(chunk, 'page_content'):
            # Add a chunk identifier to help with tracing which chunk provided what information
            chunk_text = f"[Chunk {i+1}]: {chunk.page_content}"
            chunk_contents.append(chunk_text)

    # Join all contents with the splitter
    combined_context = splitter.join(chunk_contents)

    return combined_context

In [65]:
context = get_context_from_chunks(relevant_chunks)

In [66]:
context

'[Chunk 1]: Rainy Season (or "Monsoon") and Winter, though in the\ntropical South calling the 25°C (77°F) weather "Winter"\nwould be stretching the concept. The North experiences some\nGulmarg, Jammu and Kashmir\nextremes of heat in Summer and cold in Winter, but except in\nthe Himalayan regions, snow is almost unheard of. November to January is the winter season and April\nand May are the hot months when everyone eagerly awaits the rains. There is also a brief spring in\n\n---\n\n[Chunk 2]: flights are often delayed or cancelled when visibility drops to near-zero.\nAvoid travel in the North at this time of year if you can. If you have no choice, wearing an N95 mask and\nminimizing time spent outdoors can help.\nMany cities and villages in India, not particularly in any region, have sanitation and pollution issues. You\nmight find people dumping and burning garbage in the open. Check the air quality of a city before\nventuring.\nDrinking water\n\n---\n\n[Chunk 3]: and Kashmir\'s summer

In [67]:
 final_prompt = f"""You are a helpful assistant answering questions based on provided context.

The context is taken from academic papers, and might have formatting issues like spaces missing between words.
Please interpret the content intelligently, separating words properly when they appear joined together.

Use ONLY the following context to answer the question.
If the answer cannot be determined from the context, respond with "I cannot answer this based on the provided context."

Context:
{context}

Question: {user_query}

Answer:"""

In [68]:
final_prompt

'You are a helpful assistant answering questions based on provided context.\n\nThe context is taken from academic papers, and might have formatting issues like spaces missing between words.\nPlease interpret the content intelligently, separating words properly when they appear joined together.\n\nUse ONLY the following context to answer the question.\nIf the answer cannot be determined from the context, respond with "I cannot answer this based on the provided context."\n\nContext:\n[Chunk 1]: Rainy Season (or "Monsoon") and Winter, though in the\ntropical South calling the 25°C (77°F) weather "Winter"\nwould be stretching the concept. The North experiences some\nGulmarg, Jammu and Kashmir\nextremes of heat in Summer and cold in Winter, but except in\nthe Himalayan regions, snow is almost unheard of. November to January is the winter season and April\nand May are the hot months when everyone eagerly awaits the rains. There is also a brief spring in\n\n---\n\n[Chunk 2]: flights are often

In [69]:
def generate_response(prompt, model="gemini-2.0-flash-thinking-exp-01-21", temperature=0.3, top_p=0.95):
    """
    Function to generate a response using the Gemini model.

    Args:
        prompt (str): The prompt for the model

    Returns:
        str: Model's response
    """

    llm = ChatGoogleGenerativeAI(
            model=model,
            temperature=0.2,  # Lower temperature for more focused answers
            top_p=0.95
        )

    response = llm.invoke(prompt)

    return response.content

In [70]:
generate_response(final_prompt)

'In the Himalayan regions, snow is almost unheard of except in those areas. December falls within the winter season (November to January).'

In [71]:
# ============================================================
# 🔹 Travel Assistant RAG with Web Search Fallback (Fixed)
# ============================================================

from langchain_google_genai import ChatGoogleGenerativeAI

def generate_response_with_fallback(user_query, vectorstore, embedding_model, model="gemini-2.0-flash-thinking-exp-01-21", k=3):
    """
    Generates a response using RAG first, then web fallback if needed.

    Args:
        user_query (str): User's question
        vectorstore: Chroma vector store containing document embeddings
        embedding_model: Embedding model used for documents/queries
        model (str): Gemini LLM model name
        k (int): Number of chunks to retrieve from document

    Returns:
        str: AI-generated answer
    """

    # Step 1: Create retriever
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k}
    )

    # Step 2: Retrieve relevant document chunks
    relevant_chunks = retriever.get_relevant_documents(user_query)
    context = ""
    if relevant_chunks:
        context = "\n\n---\n\n".join([f"[Chunk {i+1}]: {chunk.page_content}" for i, chunk in enumerate(relevant_chunks)])

    # Step 3: Build prompt for document context
    doc_prompt = f"""
    You are a helpful assistant answering questions based on the provided context.

    Use ONLY the information in the document chunks below to answer the user's question.
    If the document doesn’t contain enough details, respond with: "I cannot answer this based on the provided context."

    Context:
    {context}

    Question: {user_query}

    Answer:
    """

    # Initialize Gemini LLM
    llm = ChatGoogleGenerativeAI(model=model, temperature=0.3, top_p=0.95)

    # Step 4: Try to get answer from document
    if context:
        doc_response = llm.invoke(doc_prompt)
        # Extract content safely
        doc_content = doc_response.content if hasattr(doc_response, "content") else str(doc_response)

        if "I cannot answer this based on the provided context." not in doc_content:
            print("✅ Answer retrieved from document context.")
            return doc_content
        else:
            print("❌ Document could not answer, using web fallback...")
    else:
        print("🌐 No relevant chunks in document. Using web fallback...")

    # Step 5: Web fallback
    web_prompt = f"""
    Search the web for up-to-date, accurate information for:
    "{user_query}"

    Summarize clearly and provide helpful details.
    Include relevant details such as:
    - Author / creators
    - Dates or location info
    - Key highlights / facts
    """

    web_response = llm.invoke(web_prompt)
    web_content = web_response.content if hasattr(web_response, "content") else str(web_response)
    print("✅ Answer generated using web search.")
    return web_content

In [78]:
user_query = "Where can I see snowfall in December in India"
response = generate_response_with_fallback(user_query, chroma_store, embedded_documents[0])
print(response)

✅ Answer retrieved from document context.
Based on the provided context, snow is almost unheard of except in the Himalayan regions. Gulmarg, Jammu and Kashmir is mentioned in the context of the North experiencing cold in Winter, which includes December. Therefore, you can see snowfall in December in the Himalayan regions, such as Gulmarg, Jammu and Kashmir.


In [79]:
user_query = "Which are the cities with rich cultural heritage in india"
response = generate_response_with_fallback(user_query, chroma_store, embedded_documents[0])
print(response)

✅ Answer retrieved from document context.
Based on the provided context, the cities with rich cultural heritage in India include:

*   **Varanasi, Uttar Pradesh**: where some 5,000-year-old rituals are still practised.
*   **Rishikesh and Haridwar**: worth visiting to experience the Ganges.
*   **Goa**: an interesting former Portuguese colony.
*   **Jaipur, Rajasthan**: incredibly rich in forts and palaces, including the Amber Fort, Jal Mahal (Water Palace), and Hawa Mahal.
*   **Nalanda in Bihar**: has the remains of a university of Buddhism that was established in 450 CE.
*   **Ahmedabad**: home to the Gandhi Ashram, founded by Mahatma Gandhi.


In [80]:
user_query = "famous hill stations in india"
response = generate_response_with_fallback(user_query, chroma_store, embedded_documents[0])
print(response)

✅ Answer retrieved from document context.
The famous hill stations in India mentioned are Srinagar, Darjeeling, Shimla, Ooty, and Gangtok.
