## Semantic Kernel: Ramp-Up based on SK's Documentation

To get the latest version of SK and PyPDF Python packages, use:

``` bash
pip install --upgrade semantic-kernel pypdf2
```

## 📒 Notebook 5: Vector Store

This notebook uses SK's In-Memory connector to create In-Memory Vector Store with PDF Documents

**Updated for Semantic Kernel June 2025 Vector Store Migration**

### 🪜 Step 1: Configure environment

In [1]:
# Import required packages
import os
import logging
from dataclasses import dataclass, field
from typing import Annotated
from uuid import uuid4
from pathlib import Path
import traceback

# PDF processing
import PyPDF2

# Semantic Kernel imports
from semantic_kernel import Kernel
from semantic_kernel.contents import ChatHistory
from semantic_kernel.connectors.ai.open_ai import (
    AzureChatCompletion,
    AzureTextEmbedding,
    OpenAIChatPromptExecutionSettings
)

# NEW: Vector store imports for updated API
from semantic_kernel.connectors.in_memory import InMemoryCollection
from semantic_kernel.data.vector import (
    VectorStoreField,
    vectorstoremodel,
    DistanceFunction,
    IndexKind
)

In [2]:
# Set Azure OpenAI backend variables
AOAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_API_DEPLOY")
AOAI_ENDPOINT = os.getenv("AZURE_OPENAI_API_BASE")
AOAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
AOAI_EMBEDDING = os.getenv("AZURE_OPENAI_API_DEPLOY_EMBED")

# Set data folder path
DATA_FOLDER = "data"

### 🪜 Step 2: Define Data Model and Helper Functions

In [3]:
# Set constants for vector search
DISTANCE_FUNCTION = DistanceFunction.COSINE_SIMILARITY
INDEX_KIND = IndexKind.HNSW

# Class for vector store's data model
@vectorstoremodel
@dataclass
class DocumentChunk:
    """Data model for document chunks with vector embeddings."""
    content: Annotated[
        str,
        VectorStoreField("data", is_full_text_indexed=True),
    ] = "content"
    id: Annotated[str, VectorStoreField("key")] = field(default_factory=lambda: str(uuid4()))
    vector: Annotated[
        list[float] | str | None,
        VectorStoreField(
            "vector",
            dimensions = 1536,
            index_kind = INDEX_KIND,
            distance_function = DISTANCE_FUNCTION,
        ),
    ] = None
    document_name: Annotated[str, VectorStoreField("data", is_indexed=True)] = "document"
    page_number: Annotated[int, VectorStoreField("data", is_indexed=True)] = 0
    chunk_index: Annotated[int, VectorStoreField("data", is_indexed=True)] = 0
    
    def __post_init__(self):
        # Set vector to content if not provided - embedder will generate it
        if self.vector is None:
            self.vector = self.content

In [4]:
# Helper function to extract text from PDF files
def extract_text_from_pdf(pdf_path: Path) -> list[tuple[str, int]]:
    """Extract text from PDF file, returning list of (text, page_number) tuples."""
    pages_text = []
    
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages, 1):
                text = page.extract_text()
                if text.strip():
                    pages_text.append((text.strip(), page_num))
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    
    return pages_text

# Helper function to split text into smaller pieces
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> list[str]:
    """Split text into overlapping chunks."""
    if len(text) <= chunk_size:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        
        if end < len(text):
            last_period = chunk.rfind('.')
            if last_period > chunk_size // 2:
                chunk = chunk[:last_period + 1]
                end = start + len(chunk)
        
        chunks.append(chunk)
        start = end - overlap
        
        if end >= len(text):
            break
    
    return chunks

# Helper function to process all PDF files in a folder
def process_pdfs_from_folder(folder_path: str) -> list[DocumentChunk]:
    """Process all PDF files in the specified folder and return DocumentChunk objects."""
    data_path = Path(folder_path)
    
    if not data_path.exists():
        print(f"Warning: Data folder '{folder_path}' does not exist.")
        return []
    
    document_chunks = []
    pdf_files = list(data_path.glob("*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in '{folder_path}'.")
        return []
    
    print(f"Found {len(pdf_files)} PDF files to process:")
    for pdf_file in pdf_files:
        print(f"  - {pdf_file.name}")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        pages_text = extract_text_from_pdf(pdf_file)
        
        for page_text, page_num in pages_text:
            chunks = chunk_text(page_text)
            
            for chunk_idx, chunk in enumerate(chunks):
                document_chunk = DocumentChunk(
                    content = chunk,
                    document_name = pdf_file.stem,
                    page_number = page_num,
                    chunk_index = chunk_idx
                )
                document_chunks.append(document_chunk)
    
    print(f"\nTotal chunks created: {len(document_chunks)}")
    return document_chunks

# Helper function to print search results
def print_search_result(result, score: float = None):
    """Print a search result in a formatted way."""
    print(f"Document: {result.document_name}")
    print(f"Page: {result.page_number}, Chunk: {result.chunk_index}")
    if score is not None:
        print(f"Relevance Score: {score:.4f}")
    print(f"Content: {result.content[:200]}{'...' if len(result.content) > 200 else ''}")
    print("-" * 80)

### 🪜 Step 3: Initialise Kernel and Services

In [5]:
# Initialise kernel
kernel = Kernel()

# Configure logging
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

In [6]:
# Add Azure OpenAI embedding
if AOAI_EMBEDDING and AOAI_ENDPOINT and AOAI_API_VERSION:
    embedder = AzureTextEmbedding(
        deployment_name = AOAI_EMBEDDING,
        endpoint = AOAI_ENDPOINT,
        api_version = AOAI_API_VERSION,
        service_id = "embedding"
    )
    kernel.add_service(embedder)
    print("Azure OpenAI embedding service added")
else:
    print("Azure OpenAI embedding not configured")

Azure OpenAI embedding service added


In [7]:
# Add Azure OpenAI chat completion
if AOAI_DEPLOYMENT and AOAI_ENDPOINT and AOAI_API_VERSION:
    chat_completion = AzureChatCompletion(
        deployment_name = AOAI_DEPLOYMENT,
        endpoint = AOAI_ENDPOINT,
        api_version = AOAI_API_VERSION,
        service_id = "azure_openai_chat",
    )
    kernel.add_service(chat_completion)
    print("Azure OpenAI chat completion service added")
else:
    print("Azure OpenAI chat completion not configured")

Azure OpenAI chat completion service added


### 🪜 Step 4: Load and Process PDF Documents

In [8]:
# Process PDF documents from the data folder
print("Processing PDF documents from data folder...")
document_chunks = process_pdfs_from_folder(DATA_FOLDER)

if not document_chunks:
    print("No documents to process. Please ensure PDF files are in the 'data' folder.")
else:
    print(f"Successfully processed {len(document_chunks)} document chunks")
    
    # Display summary
    doc_summary = {}
    for chunk in document_chunks:
        if chunk.document_name not in doc_summary:
            doc_summary[chunk.document_name] = {'pages': set(), 'chunks': 0}
        doc_summary[chunk.document_name]['pages'].add(chunk.page_number)
        doc_summary[chunk.document_name]['chunks'] += 1
    
    print("\nDocument Summary:")
    for doc_name, info in doc_summary.items():
        print(f"- {doc_name}: {len(info['pages'])} pages, {info['chunks']} chunks")

Processing PDF documents from data folder...
Found 2 PDF files to process:
  - NorthwindHealthPlus_BenefitsDetails.pdf
  - Northwind_Standard_Benefits_Details.pdf

Processing: NorthwindHealthPlus_BenefitsDetails.pdf

Processing: Northwind_Standard_Benefits_Details.pdf

Total chunks created: 684
Successfully processed 684 document chunks

Document Summary:
- NorthwindHealthPlus_BenefitsDetails: 109 pages, 344 chunks
- Northwind_Standard_Benefits_Details: 104 pages, 340 chunks


### 🪜 Step 5: Create Vector Store and Upsert Documents

In [9]:
# Create and populate the vector store
async def create_and_populate_vector_store():
    if not document_chunks:
        print("No document chunks to process")
        return None
    
    print("Creating vector store collection...")
    
    # Define collection with embedding_generator parameter
    record_collection = InMemoryCollection[str, DocumentChunk](
        collection_name = "pdf_documents",
        record_type = DocumentChunk,
        embedding_generator = embedder
    )
    
    async with record_collection:
        # Create the collection
        await record_collection.ensure_collection_exists()
        print("Collection created")
        
        # Upsert the document chunks
        print("Upserting records to vector store (embeddings generated automatically)...")
        keys = await record_collection.upsert(document_chunks)
        print(f"Upserted {len(keys)} records to vector store")
        
        return record_collection

# Run the async function
vector_collection = await create_and_populate_vector_store()

Creating vector store collection...
Collection created
Upserting records to vector store (embeddings generated automatically)...
Upserted 684 records to vector store


### 🪜 Step 6: Search Demos

In [10]:
# Search documents in the vector store
async def search_documents(query: str, document_filter: str = None, top_k: int = 5):
    """Search for documents based on a query, prints results and returns them."""

    print(f"Searching for: '{query}'")
    if document_filter:
        print(f"Filtering by document: {document_filter}")

    retrieved_data = []

    try:
        # Set up search options as dict
        options = {
            "vector_property_name": "vector",
            "top": top_k,
        }
        
        # Add filter if specified
        if document_filter:
            options["filter"] = lambda x: x.document_name == document_filter
        
        # Use search method with query string - it generates embedding automatically
        search_results = await vector_collection.search(
            values=query,
            **options,
        )
        
        if search_results.total_count == 0:
            print("No results found")
            return []
        
        print(f"\nFound {search_results.total_count} matching results (showing top {top_k}):")
        print("=" * 80)
        
        async for result in search_results.results:
            retrieved_data.append((result.record, result.score))
                
    except Exception as e:
        print(f"Search error: {e}")
        traceback.print_exc()
        return []
        
    return retrieved_data

print("Search functions ready.")

Search functions ready.


In [11]:
# Example 1: General search
results = await search_documents("Tips for Employees", top_k=3)
for record, score in results:
    print_search_result(record, score)

Searching for: 'Tips for Employees'

Found None matching results (showing top 3):
Document: Northwind_Standard_Benefits_Details
Page: 92, Chunk: 0
Relevance Score: 0.5155
Content: 2. Ask questions. If the employee is unsure about any part of the plan, it is important to ask 
questions in order to make sure that the plan is suitable for their needs.  
3. Research other plans. It...
--------------------------------------------------------------------------------
Document: Northwind_Standard_Benefits_Details
Page: 90, Chunk: 2
Relevance Score: 0.4785
Content: important for employees to make sure they are familiar with 
the provider’s policies and procedures. Employees should also make sure they und erstand 
any additional costs that may be associated with ...
--------------------------------------------------------------------------------
Document: Northwind_Standard_Benefits_Details
Page: 86, Chunk: 2
Relevance Score: 0.4761
Content: and responsibilities under the 
law when it comes to t

In [12]:
# Example 2: Search within a specific document
results = await search_documents(
    query = "What is covered by the Northwind Health Standard Plan?",
    document_filter = "Northwind_Standard_Benefits_Details",
    top_k = 3
)
for record, score in results:
    print_search_result(record, score)

Searching for: 'What is covered by the Northwind Health Standard Plan?'
Filtering by document: Northwind_Standard_Benefits_Details

Found None matching results (showing top 3):
Document: Northwind_Standard_Benefits_Details
Page: 93, Chunk: 2
Relevance Score: 0.7815
Content: ant to note that while Northwind Standard covers a variety of services, there are 
some except ions. These include emergency services, mental health and substance abuse 
coverage, and out -of-network ...
--------------------------------------------------------------------------------
Document: Northwind_Standard_Benefits_Details
Page: 3, Chunk: 0
Relevance Score: 0.7749
Content: Summary of Benefits  
Northwind Standard  
Northwind Standard is a basic plan that provides coverage for medical, vision, a nd dental 
services. This plan also offers coverage for preventive care serv...
--------------------------------------------------------------------------------
Document: Northwind_Standard_Benefits_Details
Page: 99, Ch

### 🪜 Step 7: RAG with PDF Documents

In [13]:
# Perform a Retrieval Augmented Generation (RAG) query
async def perform_rag_query(
    query: str,
    document_filter: str = None,
    top_k_retrieval: int = 3,
    chat_service_id: str = "azure_openai_chat" 
):
    """
    Performs Retrieval Augmented Generation:
    1. Uses search_documents to retrieve relevant document chunks.
    2. Constructs a ChatHistory object with system and user messages.
    3. Calls the chat service's get_chat_message_contents method.
    """

    try:
        chat_completion_service = kernel.get_service(chat_service_id)
    except Exception as e:
        print(f"Error getting chat service '{chat_service_id}' for RAG: {e}")
        return

    # Retrieve relevant document chunks
    search_results_tuples = await search_documents(
        query = query,
        document_filter = document_filter,
        top_k = top_k_retrieval
    )

    retrieved_chunks_content = []
    if not search_results_tuples:
        print("No document chunks retrieved to use for context.")
    else:
        for record, score in search_results_tuples:
            retrieved_chunks_content.append(record.content)

    # Construct ChatHistory
    chat_history = ChatHistory()
    system_message = (
        "You are a helpful AI assistant. You are provided with context from documents. "
        "Your task is to answer the user's question based ONLY on this provided context. "
        "Do not use any external knowledge or make assumptions beyond what is stated in the context. "
        "If the information to answer the question is not present in the context, "
        "clearly state that you cannot answer based on the provided information. "
        "Be concise and directly answer the question."
    )
    chat_history.add_system_message(system_message)

    if retrieved_chunks_content:
        context_str = "\n\n---\n\n".join(retrieved_chunks_content)
        user_message_text = f"""Here is the context from the documents:
<context>
{context_str}
</context>

Based ONLY on the context provided above, please answer the following question.
User Question: {query}"""
    else:
        user_message_text = f"""User Question: {query}

(System note: No specific context was retrieved for this question from the documents. 
Based on your instructions, if the answer is not in the documents, please state that.)"""

    chat_history.add_user_message(user_message_text)

    # Generate answer using chat service
    try:
        execution_settings = OpenAIChatPromptExecutionSettings(
            service_id=chat_service_id,
        )
        
        response_messages = await chat_completion_service.get_chat_message_contents(
            chat_history = chat_history,
            settings = execution_settings
        )

        print("\nLLM Response:")
        if response_messages and isinstance(response_messages, list) and len(response_messages) > 0:
            assistant_message_content = response_messages[0]
            if hasattr(assistant_message_content, 'content') and assistant_message_content.content is not None:
                print(str(assistant_message_content.content))
            elif hasattr(assistant_message_content, 'items') and assistant_message_content.items and \
                 hasattr(assistant_message_content.items[0], 'text'):
                print(assistant_message_content.items[0].text)
            else: 
                print(str(assistant_message_content))
        else:
            print("\nLLM returned an empty or unexpected response format.")
            print(f"Raw response: {response_messages}")

    except AttributeError as e:
        print(f"AttributeError during LLM invocation: {e}")
        traceback.print_exc()
    except Exception as e:
        print(f"Error during LLM invocation: {e}")
        traceback.print_exc()

print("RAG function is ready.")

RAG function is ready.


In [14]:
# General RAG query across all documents
await perform_rag_query(
    query = "What are the key benefits of the Northwind Health Plus plan?",
    top_k_retrieval = 3
)

print("\n" + "="*80 + "\n")

Searching for: 'What are the key benefits of the Northwind Health Plus plan?'

Found None matching results (showing top 3):

LLM Response:
The key benefits of the Northwind Health Plus plan include comprehensive coverage for medical, vision, and dental services; prescription drugs; mental health and substance abuse services; and preventive care. It offers access to a variety of in-network providers such as primary care physicians, specialists, hospitals, and pharmacies. The plan also covers emergency services both in-network and out-of-network. Additionally, it is a group health plan sponsored by Contoso, with shared premium payments between the employer and employee.




### 🪜 Step 8: Cleanup (Optional)

In [15]:
# Clean up the vector collection
async def cleanup_collection():
    if vector_collection:
        try:
            async with vector_collection:
                await vector_collection.ensure_collection_deleted()
            print("Vector collection cleaned up")
        except Exception as e:
            print(f"Cleanup error: {e}")
    else:
        print("No collection to clean up")

await cleanup_collection()

Vector collection cleaned up
