## Install/Verify Libraries

In [1]:
# Install (or verify) the packages this notebook imports: google-genai, pinecone,
# python-dotenv and tqdm. The %pip magic is used (instead of !pip) so the
# install targets the environment of the running kernel.
%pip install "google-genai>=1.0.0" "pinecone>=3.0.0" python-dotenv tqdm

# Printed unconditionally once the %pip line has run.
print("Required libraries checked/installed.")

Note: you may need to restart the kernel to use updated packages.
Required libraries checked/installed.


## Load Environment Variables & Initialize Clients

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
from google import genai
from pinecone import Pinecone


# --- API keys ---------------------------------------------------------------
# A .env file is used when one can be located; otherwise whatever is already
# exported in the process environment is picked up by os.getenv below.
print("Loading environment variables...")
dotenv_path = find_dotenv(raise_error_if_not_found=False)
if not dotenv_path:
    print("Warning: .env file not found. Will attempt to load API keys from system environment variables if they are set. "
          "Otherwise, please create a .env file with your API keys.")
else:
    load_dotenv(dotenv_path)
    print(".env file loaded.")

google_api_key = os.getenv("GOOGLE_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Fail fast: neither client below can be built without its key.
if not google_api_key:
    raise EnvironmentError("GOOGLE_API_KEY not found. Please set it in .env or as a system environment variable.")
if not pinecone_api_key:
    raise EnvironmentError("PINECONE_API_KEY not found. Please set it in .env or as a system environment variable.")

# --- Google Gen AI client (embedding + generation) --------------------------
print("Initializing Google Gen AI Client...")
google_ai_client = None
google_ai_ready = False

try:
    google_ai_client = genai.Client(api_key=google_api_key)
except Exception as init_error:
    # Best effort: report and continue; later cells check google_ai_ready.
    print(f"Error initializing Google Gen AI Client: {init_error}")
else:
    print("Google Gen AI Client initialized successfully.")
    google_ai_ready = True

# --- Pinecone client --------------------------------------------------------
print("Initializing Pinecone connection...")
pc = None

try:
    pc = Pinecone(api_key=pinecone_api_key)
except Exception as init_error:
    print(f"Error initializing Pinecone client: {init_error}")
else:
    print("Pinecone client initialized successfully.")

# --- Pinecone index ---------------------------------------------------------
index_name = 'semantic-search-app-index'  # Same index name used in Step 3 notebook (02-embedding-storage.ipynb)
pinecone_index = None

if not pc:
    print("Skipping Pinecone index connection due to client initialization failure.")
else:
    try:
        # Only connect to an index that already exists; this notebook never creates one.
        if index_name not in {info["name"] for info in pc.list_indexes()}:
            print(f"Error: Pinecone index '{index_name}' does not exist. Please run the Step 3 notebook (02-embedding-storage.ipynb) to create and populate it.")
        else:
            pinecone_index = pc.Index(index_name)
            print(f"Connected to Pinecone index '{index_name}'.")
            # Optional sanity check that the index holds vectors:
            # print(pinecone_index.describe_index_stats())
    except Exception as index_error:
        print(f"Error connecting to Pinecone index: {index_error}")

  from .autonotebook import tqdm as notebook_tqdm


Loading environment variables...
.env file loaded.
Initializing Google Gen AI Client...
Google Gen AI Client initialized successfully.
Initializing Pinecone connection...
Pinecone client initialized successfully.
Connected to Pinecone index 'semantic-search-app-index'.


## Define Embedding Model and LLM

In [3]:
# All three start as None so later cells can tell that setup was skipped or failed.
embedding_model_name = None   # embedding model used for queries
llm_model_identifier = None   # string name of the generative model
llm_chat_session = None       # chat object used for multi-turn conversation

if not (google_ai_ready and google_ai_client):
    print("Skipping model definitions due to Google AI client configuration error.")
else:
    # Embedding model used to embed incoming queries
    embedding_model_name = 'models/text-embedding-004'
    print(f"Using embedding model for queries: {embedding_model_name}")

    # Generative model (LLM) - Gemini 2.0 Flash
    llm_model_identifier = 'gemini-2.0-flash'
    print(f"Target LLM for generation: {llm_model_identifier}")

    # Open a chat session through the client so the conversation can span several turns
    try:
        llm_chat_session = google_ai_client.chats.create(model=llm_model_identifier)
    except Exception as chat_error:
        # The assignment above never completed, so llm_chat_session is still None here.
        print(f"Error creating chat session with {llm_model_identifier}: {chat_error}")
    else:
        print(f"Chat session created with {llm_model_identifier}.")

Using embedding model for queries: models/text-embedding-004
Target LLM for generation: gemini-2.0-flash
Chat session created with gemini-2.0-flash.


## RAG Function

In [None]:
from google.genai import types

def answer_query_with_rag(user_query: str, chat_session: "genai.chats.Chat", top_k: int = 3) -> str:
    """
    Answer a query with retrieval-augmented generation (RAG) inside a chat session.

    Steps: embed the query, fetch the closest chunks from the Pinecone index,
    build a context-grounded prompt, and send that prompt through `chat_session`.
    Failures at any step are reported through the returned string; nothing is raised.

    Relies on these notebook-level names being defined: `google_ai_ready`,
    `google_ai_client`, `pinecone_index`, `embedding_model_name`,
    `llm_model_identifier` and `types` (from `google.genai`).

    Args:
        user_query (str): The user's natural language query. Must not be blank.
        chat_session: The Gemini chat session object the prompt is sent through.
        top_k (int): Number of top results to retrieve from Pinecone (default is 3).

    Returns:
        str: The model-generated answer, or a message starting with "Error" when a
        component is missing, the query is blank, a remote call fails, or the
        model response carries no text.
    """
    # The chat_session annotation is quoted so it is not evaluated when the
    # function is defined.

    # Check if all required components are initialized
    if not google_ai_ready or not google_ai_client or not pinecone_index or not chat_session:
        return "Error: Critical components (Google AI Client, Pinecone Index, or Chat Session) not initialized."

    # Reject blank input up front instead of spending an embedding call on it.
    if not isinstance(user_query, str) or not user_query.strip():
        return "Error: Query must be a non-empty string."

    print(f"\nProcessing query: '{user_query}'")

    # 1. Retrieve (Embed query & Search Vector DB)
    print("Embedding user query...")
    try:
        embedding_response = google_ai_client.models.embed_content(
            model=embedding_model_name,
            contents=user_query,
            config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
        )
        query_embedding = embedding_response.embeddings[0].values  # list of floats
    except Exception as e:
        return f"Error embedding query: {e}"

    print("Searching Pinecone index for relevant chunks...")
    try:
        query_results = pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
    except Exception as e:
        return f"Error querying Pinecone: {e}"

    # 2. Augment (Create prompt with context)
    matches = query_results.matches or []
    # Keep only matches that actually carry the stored chunk text.
    retrieved_chunks_text = [
        match.metadata['text']
        for match in matches
        if match.metadata and 'text' in match.metadata
    ]
    if matches:
        print(f"Retrieved {len(retrieved_chunks_text)} chunks.")
    else:
        print("No relevant chunks found in Pinecone.")

    context_string = "\n\n---\n\n".join(retrieved_chunks_text)

    # Basic Prompt Engineering
    # The chat history is managed by the SDK.
    # For this turn, only the current query and retrieved context are provided.
    prompt_for_current_turn = f"""
    Based ONLY on the following context, answer the question.
    If the answer is not found in the context, state "I cannot answer this question based on the provided information."

    Context:
    {context_string}

    Question: {user_query}

    Answer:
    """

    print("Constructed prompt for current turn.")

    # 3. Generate (Call LLM API using the chat session)
    print(f"Sending message to chat session with LLM: {llm_model_identifier}...")
    try:
        chat_response = chat_session.send_message(prompt_for_current_turn)
        final_answer = chat_response.text
    except Exception as e:
        return f"Error generating answer with LLM via chat session: {e}"

    # `.text` is not guaranteed to be a non-empty string (e.g. a response without
    # any text part); honour the `-> str` contract instead of returning None.
    if not final_answer:
        return "Error: LLM returned no text content for this query."

    print("Answer generated.")
    return final_answer

## Test the RAG Pipeline

In [5]:
# Smoke test of the RAG pipeline: two turns in the shared chat session, one
# query in a fresh session, then a dump of the first session's history.
if google_ai_ready and pinecone_index and llm_chat_session:
    # Test query 1 (initial query)
    print("\n--- Test 1: Initial Query ---")
    test_query_1 = "What is the core idea of attention mechanism in the Transformer model?"
    answer_1 = answer_query_with_rag(test_query_1, llm_chat_session)
    print(f"\nQuery: {test_query_1}")
    print(f"Answer: {answer_1}")

    # Test query 2 (follow-up in the same session, so earlier turns are available)
    print("\n--- Test 2: Follow-up Query (utilizing chat history) ---")
    test_query_2 = "Can you elaborate on the 'Scaled Dot-Product Attention' part mentioned?"
    answer_2 = answer_query_with_rag(test_query_2, llm_chat_session)
    print(f"\nQuery: {test_query_2}")
    print(f"Answer: {answer_2}")

    # Test query 3 (new chat session, so no history from Tests 1-2 is carried over)
    print("\n--- Test 3: Query Potentially Not in Document ---")
    try:
        new_chat_session_for_test3 = google_ai_client.chats.create(model=llm_model_identifier)
        print(f"\nNew chat session created for Test 3 with {llm_model_identifier}.")
        test_query_3 = "What is the capital of France?"
        answer_3 = answer_query_with_rag(test_query_3, new_chat_session_for_test3, top_k=1)
        print(f"\nQuery: {test_query_3}")
        print(f"Answer: {answer_3}")
    except Exception as e:
        print(f"Error during Test 3 (new chat session or query): {e}")

    # Display chat history from the first session
    print("\n--- Chat History (Session 1) ---")
    try:
        for message in llm_chat_session.get_history():
            # `role` may exist but be None; fall back before calling .lower()
            # so one odd message cannot abort the whole history dump.
            role = (getattr(message, 'role', None) or 'unknown').lower()
            # Join every text part instead of only the first; parts whose
            # text is missing or None contribute nothing.
            text_content = "".join(
                getattr(part, 'text', None) or ''
                for part in (getattr(message, 'parts', None) or [])
            )
            print(f"Role: {role} - Text: {text_content}")
    except Exception as e:
        print(f"Error retrieving chat history: {e}")

else:
    print("Skipping RAG pipeline test due to initialization errors of critical components (Google AI Client, Pinecone Index, or Chat Session).")


--- Test 1: Initial Query ---

Processing query: 'What is the core idea of attention mechanism in the Transformer model?'
Embedding user query...
Searching Pinecone index for relevant chunks...
Retrieved 3 chunks.
Constructed prompt for current turn.
Sending message to chat session with LLM: gemini-2.0-flash...
Answer generated.

Query: What is the core idea of attention mechanism in the Transformer model?
Answer: The Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequence-aligned RNNs or convolution. Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence.


--- Test 2: Follow-up Query (utilizing chat history) ---

Processing query: 'Can you elaborate on the 'Scaled Dot-Product Attention' part mentioned?'
Embedding user query...
Searching Pinecone index for relevant ch