## Install Libraries

In [1]:
# Install this notebook's dependencies with the %pip magic (version floors are given
# for google-generativeai and pinecone; the remaining packages are unpinned).
%pip install "google-generativeai>=0.5.0" "pinecone>=3.0.0" tqdm python-dotenv langchain pypdf

# Printed once the install command returns; this line does not itself inspect the install result.
print("Required libraries checked/installed.")

Note: you may need to restart the kernel to use updated packages.
Required libraries checked/installed.


## Load Environment Variables (Pinecone and Google AI Credentials)

In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate the .env file. With raise_error_if_not_found=False a missing file yields a
# falsy path instead of an exception, so the failure is reported explicitly below.
dotenv_path = find_dotenv(raise_error_if_not_found=False)

# Guard: without a .env file there are no credentials to read.
if not dotenv_path:
    raise FileNotFoundError("Error: .env file not found. Please create one with your Pinecone and Google AI API keys.")

print("Loading .env file...")
load_dotenv(dotenv_path)

# Read both keys before validating either one.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")

# Each key is mandatory; an absent or empty value aborts the notebook here.
if not pinecone_api_key:
    raise ValueError("Error: PINECONE_API_KEY not found in .env file.")
print("Pinecone API Key loaded.")

if not google_api_key:
    raise ValueError("Error: GOOGLE_API_KEY not found in .env file.")
print("Google AI API Key loaded.")

Loading .env file...
Pinecone API Key loaded.
Google AI API Key loaded.


## Load and Chunk Document

In [3]:
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pdf_file_path = os.path.join('..', 'data', 'Attention is All You Need.pdf')
text_chunks = []  # Stays empty unless the document yields text to split

# Guard: nothing below can run without the source PDF.
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"Error: PDF file not found at calculated path: {os.path.abspath(pdf_file_path)}")

print(f"Loading PDF from: {pdf_file_path}")
try:
    reader = PdfReader(pdf_file_path)

    # Gather the text page by page; pages yielding no text are reported and skipped.
    extracted_pages = []
    for page_number, pdf_page in enumerate(reader.pages, start=1):
        extracted = pdf_page.extract_text()
        if extracted:
            extracted_pages.append(extracted + "\n")
        else:
            print(f"Warning: No text extracted from page {page_number}.")
    full_document_text = "".join(extracted_pages)

    if full_document_text:
        print(f"Successfully loaded document with {len(full_document_text)} characters.")

        print("Initializing text splitter...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            # Prioritize splitting by paragraph, then by line, then by space
            separators=["\n\n", "\n", " ", ""],
        )

        print("Splitting document into chunks...")
        text_chunks = text_splitter.split_text(full_document_text)
        print(f"Document split into {len(text_chunks)} chunks.")
    else:
        # text_chunks is left empty in this case.
        print("Warning: No text could be extracted from the PDF. Cannot proceed.")

except Exception as load_error:
    # Report the failure, then propagate it so the notebook run stops here.
    print(f"An error occurred during PDF loading or chunking: {load_error}")
    raise

Loading PDF from: ../data/Attention is All You Need.pdf
Successfully loaded document with 39602 characters.
Initializing text splitter...
Splitting document into chunks...
Document split into 50 chunks.


## Initialize Embedding Model

In [4]:
import google.generativeai as genai

# google_ai_configured records whether genai.configure() completed without raising.
google_ai_configured = False
if not google_api_key:
    print("Skipping Google AI configuration due to missing API key.")
else:
    print("Configuring Google AI Client...")
    try:
        genai.configure(api_key=google_api_key)
    except Exception as config_error:
        # Failure is reported but not re-raised; the flag simply stays False.
        print(f"Error configuring Google AI Client: {config_error}")
    else:
        print("Google AI Client configured successfully.")
        google_ai_configured = True

Configuring Google AI Client...
Google AI Client configured successfully.


  from .autonotebook import tqdm as notebook_tqdm


## Initialize Pinecone Connection

In [5]:
from pinecone import Pinecone, ServerlessSpec

# pc stays None when a prerequisite is missing or the client cannot be constructed.
pc = None
if not (pinecone_api_key and google_ai_configured):
    print("Skipping Pinecone initialization due to missing Pinecone API key or Google AI configuration.")
else:
    print("Initializing Pinecone connection...")
    try:
        pc = Pinecone(api_key=pinecone_api_key)
    except Exception as init_error:
        # Reported but not re-raised; pc keeps its None value.
        print(f"Error initializing Pinecone client: {init_error}")
    else:
        print("Pinecone client initialized successfully.")

Initializing Pinecone connection...
Pinecone client initialized successfully.


## Create or Connect to Pinecone Index

In [6]:
import time  # Needed only for the index-readiness polling below

# pinecone_index stays None unless an index exists (or was created and became ready)
# and the connection succeeds.
pinecone_index = None
if pc and google_ai_configured:  # Check both clients are ready
    index_name = 'semantic-search-app-index'
    embedding_dim = 768  # Dimension used for Google's text-embedding-004 model
    index_ready_timeout_s = 120  # Upper bound on how long to wait for a new index
    index_ready_poll_s = 2       # Delay between readiness checks

    print(f"Checking if Pinecone index '{index_name}' exists...")
    print(f"Required embedding dimension: {embedding_dim}")

    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    index_available = index_name in existing_indexes

    if not index_available:
        print(f"Index '{index_name}' does not exist. Creating...")
        try:
            pc.create_index(
                name=index_name,
                dimension=embedding_dim,
                metric='cosine',
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )
            print(f"Index '{index_name}' created successfully. Please wait for initialization...")

            # A freshly created index may not accept requests immediately, so poll
            # its status until it reports ready, giving up after the timeout.
            deadline = time.monotonic() + index_ready_timeout_s
            while not pc.describe_index(index_name).status['ready']:
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f"index '{index_name}' was not ready after {index_ready_timeout_s} seconds"
                    )
                print("Waiting for index to be ready...")
                time.sleep(index_ready_poll_s)
            index_available = True

        except Exception as e:
            # Creation or readiness failure is reported; the connection step is skipped.
            print(f"Error creating Pinecone index: {e}")

    else:
        print(f"Index '{index_name}' already exists. Connecting...")

    if index_available:
        try:
            pinecone_index = pc.Index(index_name)
            print(f"Connected to index '{index_name}'.")
        except Exception as e:
            print(f"Error connecting to Pinecone index '{index_name}': {e}")
    else:
        print(f"Skipping connection because index '{index_name}' is not available.")

else:
    print("Skipping index creation/connection as Pinecone or Google AI client was not initialized.")

Checking if Pinecone index 'semantic-search-app-index' exists...
Required embedding dimension: 768
Index 'semantic-search-app-index' already exists. Connecting...
Connected to index 'semantic-search-app-index'.


## Embed Chunks and Upsert to Pinecone

In [7]:
from tqdm.auto import tqdm

# Embeds every chunk with the Google AI embedding API and upserts the resulting
# vectors into the Pinecone index. Runs only when chunks, the Google AI
# configuration and the index connection are all available.
if not (text_chunks and google_ai_configured and pinecone_index):
    print("Skipping embedding/upserting due to missing dependencies (text_chunks, Google AI config, or Pinecone index).")
else:
    print(f"Preparing {len(text_chunks)} chunks for embedding and upserting via Google AI...")

    google_batch_size = 100           # Number of texts sent per embedding request
    pinecone_upsert_batch_size = 100  # Number of vectors sent per upsert request
    model_name = 'models/text-embedding-004'

    # One record per chunk: a stable ID, the chunk text as metadata, and an empty
    # 'values' list that is filled in once the embedding comes back.
    all_vectors_to_upsert = [
        {"id": f"chunk_{position}", "metadata": {"text": chunk}, "values": []}
        for position, chunk in enumerate(text_chunks)
    ]
    total_vectors = len(all_vectors_to_upsert)

    print(f"Generating embeddings in batches of {google_batch_size}...")
    for batch_start in tqdm(range(0, total_vectors, google_batch_size), desc="Embedding Chunks"):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        batch_records = all_vectors_to_upsert[batch_start:batch_start + google_batch_size]
        batch_texts = [record['metadata']['text'] for record in batch_records]

        try:
            response = genai.embed_content(
                model=model_name,
                content=batch_texts,
                task_type="retrieval_document",
            )
            # Write each returned embedding back to its record, offset by the batch start.
            for target_index, vector in enumerate(response['embedding'], start=batch_start):
                all_vectors_to_upsert[target_index]['values'] = vector
        except Exception as embed_error:
            print(f"Error embedding batch starting at index {batch_start}: {embed_error}")
            raise  # Stop execution if embedding fails

    # Proceed only if there is at least one record and the first one received an embedding.
    if not (all_vectors_to_upsert and all_vectors_to_upsert[0]['values']):
        print("No vectors with embeddings were prepared for upserting.")
    else:
        print(f"\nUpserting {total_vectors} vectors to Pinecone in batches of {pinecone_upsert_batch_size}...")
        for batch_start in tqdm(range(0, total_vectors, pinecone_upsert_batch_size), desc="Upserting Batches"):
            upsert_batch = all_vectors_to_upsert[batch_start:batch_start + pinecone_upsert_batch_size]
            try:
                pinecone_index.upsert(vectors=upsert_batch)
            except Exception as upsert_error:
                print(f"Error upserting batch to Pinecone starting at index {batch_start}: {upsert_error}")
                raise  # Stop execution if upsert fails

        print("Finished embedding and upserting all chunks.")

Preparing 50 chunks for embedding and upserting via Google AI...
Generating embeddings in batches of 100...


Embedding Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]



Upserting 50 vectors to Pinecone in batches of 100...


Upserting Batches: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]

Finished embedding and upserting all chunks.





## Perform a Test Query

In [8]:
# Smoke test: embed one question, fetch the three nearest chunks from the index and
# print a short preview of each. Errors are reported rather than raised.
if google_ai_configured and pinecone_index:
    print("\nPerforming a test query...")
    query = "What is the core idea of attention mechanism?"
    print(f"Test Query: '{query}'")
    model_name_for_query = 'models/text-embedding-004'  # Same model name the upsert cell uses
    preview_chars = 500  # Maximum number of characters of chunk text shown per result

    try:
        # 1. Embed the query using Google AI API
        response = genai.embed_content(model=model_name_for_query,
                                       content=query,
                                       task_type="retrieval_query")
        query_embedding = response['embedding']

        # 2. Query Pinecone
        query_results = pinecone_index.query(
            vector=query_embedding,
            top_k=3,
            include_metadata=True
        )

        # 3. Print results
        print("\nTop Search Results:")
        if query_results.matches:
            for i, match in enumerate(query_results.matches):
                print(f"\nResult {i+1}:")
                print(f"  ID: {match.id}")
                print(f"  Score (Similarity): {match.score:.4f}")
                if match.metadata and 'text' in match.metadata:
                    chunk_text = match.metadata['text']
                    # Add the ellipsis only when the text was actually cut, so a short
                    # chunk is not displayed as if it had been truncated.
                    ellipsis = "..." if len(chunk_text) > preview_chars else ""
                    print(f"  Text: {chunk_text[:preview_chars]}{ellipsis}")
                else:
                    print("  Text: (Metadata or text missing)")
        else:
            print("No matches found.")
    except Exception as e:
        print(f"An error occurred during the query: {e}")

else:
    print("Skipping test query as Google AI config or Pinecone index is not ready.")


Performing a test query...
Test Query: 'What is the core idea of attention mechanism?'

Top Search Results:

Result 1:
  ID: chunk_12
  Score (Similarity): 0.6808
  Text: into a matrix Q. The keys and values are also packed together into matrices K and V . We compute
the matrix of outputs as:
Attention(Q, K, V) = softmax(QKT
√dk
)V (1)
The two most commonly used attention functions are additive attention [2], and dot-product (multi-
plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor
of 1√dk
. Additive attention computes the compatibility function using a feed-forward network with
a single hidden layer. While t...

Result 2:
  ID: chunk_11
  Score (Similarity): 0.6770
  Text: where the query, keys, values, and output are all vectors. The output is computed as a weighted sum
3
Scaled Dot-Product Attention
 Multi-Head Attention
Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several
attention la