## Install Libraries

In [1]:
%pip install sentence-transformers "pinecone>=3.0.0" torch tqdm pypdf langchain # Ensure pinecone-client is recent

print("Required libraries checked/installed.")

Note: you may need to restart the kernel to use updated packages.
Required libraries checked/installed.


## Load Environment Variables (Pinecone Credentials)

In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Resolve the Pinecone API key into `pinecone_api_key`.
# A .env file is loaded when one can be found; otherwise a PINECONE_API_KEY that is
# already exported in the process environment is accepted, so the notebook also runs
# where secrets are injected as environment variables.
#
# Raises:
#   ValueError        - a .env file was found but PINECONE_API_KEY is still unset.
#   FileNotFoundError - no .env file was found and PINECONE_API_KEY is not in the environment.
dotenv_path = find_dotenv(raise_error_if_not_found=False) # Avoid error if .env is missing

if dotenv_path:
    print("Loading .env file...")
    load_dotenv(dotenv_path)

# Read the key after the optional .env load so both sources are covered
pinecone_api_key = os.getenv("PINECONE_API_KEY")

if pinecone_api_key:
    print("Pinecone API Key loaded.")
elif dotenv_path:
    raise ValueError("Error: PINECONE_API_KEY not found in .env file.")
else:
    raise FileNotFoundError("Error: .env file not found. Please create one with your Pinecone API key.")

Loading .env file...
Pinecone API Key loaded.


## Load and Chunk Document

In [3]:
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the source PDF, concatenate the text of every page, and split the result into
# overlapping chunks stored in `text_chunks`. `text_chunks` stays empty when no text
# could be extracted, which later cells use as their "skip" signal.
pdf_file_path = os.path.join('..', 'data', 'Attention is All You Need.pdf')
text_chunks = []

# Fail fast on a missing file; the absolute path in the message shows where we looked
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"Error: PDF file not found at calculated path: {os.path.abspath(pdf_file_path)}")

print(f"Loading PDF from: {pdf_file_path}")
try:
    reader = PdfReader(pdf_file_path)

    # Collect the text page by page, warning about pages that yield nothing
    page_texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        page_text = page.extract_text()
        if not page_text:
            print(f"Warning: No text extracted from page {page_number}.")
            continue
        page_texts.append(page_text + "\n")
    full_document_text = "".join(page_texts)

    if full_document_text:
        print(f"Successfully loaded document with {len(full_document_text)} characters.")

        # Character-based splitter: 1000-character chunks with a 200-character overlap
        print("Initializing text splitter...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            # Prioritize splitting by paragraph, then line, then space, then character
            separators=["\n\n", "\n", " ", ""],
        )

        print("Splitting document into chunks...")
        text_chunks = text_splitter.split_text(full_document_text)
        print(f"Document split into {len(text_chunks)} chunks.")
    else:
        print("Warning: No text could be extracted from the PDF. Cannot proceed.")

except Exception as e:
    # Report the failure in the cell output, then stop execution by re-raising
    print(f"An error occurred during PDF loading or chunking: {e}")
    raise e

Loading PDF from: ../data/Attention is All You Need.pdf
Successfully loaded document with 39602 characters.
Initializing text splitter...
Splitting document into chunks...
Document split into 50 chunks.


## Initialize Embedding Model

In [4]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model, but only when there is text to embed.
# `embedding_model` is None whenever loading was skipped or failed, so later
# cells can test it before use.
embedding_model = None

if not text_chunks:
    print("Skipping embedding model initialization as no text chunks were loaded.")
else:
    print("Loading embedding model: all-MiniLM-L6-v2")
    try:
        # device=None lets the library pick the device; pass device='mps' instead
        # to try the Apple Silicon GPU when PyTorch supports it
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=None)
        print("Embedding model loaded successfully.")

        # Show the model's module stack
        print(embedding_model)

    except Exception as e:
        print(f"Error loading Sentence Transformer model: {e}")
        embedding_model = None  # Reset in case the failure happened after assignment

  from .autonotebook import tqdm as notebook_tqdm


Loading embedding model: all-MiniLM-L6-v2
Embedding model loaded successfully.
SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


## Initialize Pinecone Connection

In [7]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client once both the API key and the embedding model exist.
# `pc` is None whenever the client was skipped or could not be created.
pc = None

if not (pinecone_api_key and embedding_model):
    print("Skipping Pinecone initialization due to missing API key or embedding model.")
else:
    print(f"Initializing Pinecone connection...")
    try:
        # The client is constructed from the API key alone
        pc = Pinecone(api_key=pinecone_api_key)
        print("Pinecone client initialized successfully.")

    except Exception as e:
        print(f"Error initializing Pinecone client: {e}")
        pc = None  # Keep the "not initialized" marker on failure

Initializing Pinecone connection...
Pinecone client initialized successfully.


## Create or Connect to Pinecone Index

In [None]:
# Ensure the Pinecone index exists and bind `pinecone_index` to it.
#   - If the index is missing it is created with the embedding model's dimension,
#     and the cell waits until Pinecone reports it ready before connecting.
#   - If creation (or the readiness wait) fails, no connection is attempted.
#   - `pinecone_index` is None whenever no usable connection was established.
import time

index_ready_timeout_s = 120  # Upper bound on the readiness wait after creating the index

pinecone_index = None

if pc:
    index_name = 'semantic-search-app-index'
    index_available = True  # Cleared when creation fails so the connect step is skipped

    # Check existing indexes
    print(f"Checking if Pinecone index '{index_name}' exists...")
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

    if index_name not in existing_indexes:
        print(f"Index '{index_name}' does not exist. Creating...")

        # The index dimension must match the embedding model's output size
        embedding_dim = embedding_model.get_sentence_embedding_dimension()
        print(f"Embedding dimension: {embedding_dim}")

        try:
            pc.create_index(
                name=index_name,
                dimension=embedding_dim,
                metric='cosine',
                spec=ServerlessSpec(
                    cloud='aws', # Or 'gcp', 'azure' - choose one available for current region/plan
                    region='us-east-1' # Choose a region available for the current plan
                )
            )
            print(f"Index '{index_name}' created successfully. Please wait a moment for it to initialize...")

            # Poll until the new index reports ready, so the connection below and the
            # upsert cell do not run against an index that is still initializing
            deadline = time.monotonic() + index_ready_timeout_s
            while not pc.describe_index(index_name).status['ready']:
                if time.monotonic() > deadline:
                    raise TimeoutError(
                        f"Index '{index_name}' was not ready after {index_ready_timeout_s} seconds."
                    )
                time.sleep(1)

        except Exception as e:
            print(f"Error creating Pinecone index: {e}")
            index_available = False

    else:
        print(f"Index '{index_name}' already exists. Connecting...")

    # Connect only when the index already existed or was created successfully
    if index_available:
        try:
            pinecone_index = pc.Index(index_name)
            print(f"Connected to index '{index_name}'.")

            # Describe index stats
            print(pinecone_index.describe_index_stats())

        except Exception as e:
            print(f"Error connecting to Pinecone index '{index_name}': {e}")
            pinecone_index = None

else:
    print("Skipping index creation/connection as Pinecone client was not initialized.")

Checking if Pinecone index 'semantic-search-app-index' exists...
Index 'semantic-search-app-index' does not exist. Creating...
Embedding dimension: 384
Index 'semantic-search-app-index' created successfully. Please wait a moment for it to initialize...
Connected to index 'semantic-search-app-index'.
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


## Embed Chunks and Upsert to Pinecone

In [11]:
from tqdm.auto import tqdm

# Embed every text chunk and upsert the resulting vectors into the Pinecone index.
# Runs only when chunks, the embedding model, and the index connection all exist.
# Vector IDs are derived from the chunk position ("chunk_<i>"), so the same chunk
# position always maps to the same ID.
# Any embedding or upsert error is printed and re-raised to stop execution.
if text_chunks and embedding_model and pinecone_index:
    print(f"Preparing {len(text_chunks)} chunks for embedding and upserting...")

    batch_size = 100 # Process chunks in batches for efficiency

    # Pair each chunk with its ID; the original text travels along as metadata so
    # query results can return it
    print("Generating embeddings and preparing data...")
    all_data_to_upsert = [
        {"id": f"chunk_{i}", "metadata": {"text": chunk_text}}
        for i, chunk_text in enumerate(tqdm(text_chunks, desc="Preparing Chunks"))
    ]

    # Embed and upsert one batch at a time
    print(f"Embedding and Upserting {len(all_data_to_upsert)} vectors in batches of {batch_size}...")
    for batch_start in tqdm(range(0, len(all_data_to_upsert), batch_size), desc="Upserting Batches"):
        # Slicing clamps at the end of the list, so the last batch may be shorter
        batch_data = all_data_to_upsert[batch_start:batch_start + batch_size]
        texts_to_embed = [item['metadata']['text'] for item in batch_data]

        try:
            # One embedding per text, in the same order as `batch_data`
            embeddings_batch = embedding_model.encode(texts_to_embed).tolist()

            vectors_for_pinecone = [
                {"id": item['id'], "values": embedding, "metadata": item['metadata']}
                for item, embedding in zip(batch_data, embeddings_batch)
            ]

            pinecone_index.upsert(vectors=vectors_for_pinecone)

        except Exception as e:
            print(f"Error embedding or upserting batch {batch_start // batch_size + 1}: {e}")
            raise # Stop execution if a batch fails, keeping the original traceback

    print("Finished embedding and upserting all chunks.")

else:
    print("Skipping embedding/upserting due to missing chunks, model, or index connection.")

Preparing 50 chunks for embedding and upserting...
Generating embeddings and preparing data...


Preparing Chunks: 100%|██████████| 50/50 [00:00<00:00, 131979.36it/s]


Embedding and Upserting 50 vectors in batches of 100...


Upserting Batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]

Finished embedding and upserting all chunks.





## Perform a Test Query

In [None]:
# Smoke-test retrieval: embed one question, fetch the 3 closest chunks from the
# index, and print each match's ID, similarity score, and the start of its text.
# Query errors are printed rather than raised.
if not (embedding_model and pinecone_index):
    print("Skipping test query as model or index is not ready.")
else:
    print("\nPerforming a test query...")
    query = "What is the core idea of the attention mechanism?"
    print(f"Test Query: '{query}'")

    try:
        # Embed the question with the same model used for the chunks
        query_embedding = embedding_model.encode(query).tolist()

        # include_metadata=True returns the stored chunk text with each match
        query_results = pinecone_index.query(
            vector=query_embedding,
            top_k=3,
            include_metadata=True,
        )

        print("\nTop Search Results:")
        matches = query_results.matches
        if not matches:
            print("No matches found.")
        else:
            for rank, match in enumerate(matches, start=1):
                print(f"\nResult {rank}:")
                print(f"  ID: {match.id}")
                print(f"  Score (Similarity): {match.score:.4f}")

                # Guard against matches stored without text metadata
                if not (match.metadata and 'text' in match.metadata):
                    print("  Text: (Metadata or text missing)")
                else:
                    preview = match.metadata['text'][:500]  # First 500 characters only
                    print(f"  Text: {preview}...")

    except Exception as e:
        print(f"An error occurred during the query: {e}")