Requirements

In [1]:
# !pip install langgraph-sdk

In [2]:
# !pip install azure-ai-formrecognizer
# !pip install azure-search-documents

Extract PDF Document

In [3]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from dotenv import load_dotenv

load_dotenv()

# Initialize the Document Analysis client using Azure credentials
endpoint = os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"]
credential = os.environ["AZURE_DOCUMENT_INTELLIGENCE_KEY"]
client = DocumentAnalysisClient(endpoint, AzureKeyCredential(credential))

# Path to the PDF document to be analyzed
soc_report_path = os.path.join("data", "documents", "SOC.01 - ADP Autopay.pdf")

# Open the PDF document and analyze it
with open(soc_report_path, "rb") as f:
    # Start the document analysis process using the prebuilt layout model
    poller = client.begin_analyze_document("prebuilt-layout", document=f)
    # Wait for the analysis to complete and get the result
    result = poller.result()

# Extract lines of text from the analysis result
lines = [line.content for page in result.pages for line in page.lines]

# Print the extracted lines to the console
print("\n".join(lines))

# Define the path for saving the extracted text to a file
soc_extract_path = os.path.join("data", "extracts", "SOC.01 - ADP Autopay.txt")

# Save the extracted lines to a text file
with open(soc_extract_path, "w", encoding="utf-8") as file:
    file.writelines(lines)

# Print a confirmation message indicating where the output has been saved
print(f"\nSample output saved to {soc_extract_path}")

NOTICE: Report use and distribution limitations
Ernst & Young LLP ("EY") has prepared the attached report (the "Report") in accordance with the
requirements of the applicable professional standard(s) as described in the attached Report, for Automatic Data
Processing (the "Company"). The Report is intended only to be used by management of the Company, the
Company's existing clients (i.e., "user entities") and their external auditors who audit and report on such user
entities' financial statements or internal control over financial reporting (collectively, the "Specified Parties"),
as stated in the independent service auditor's report contained in the Report.
If you are a Specified Party, you may proceed to the Report.
In addition, entities other than the Specified Parties, including certain prospective user entities, identified by
the Company (collectively, each an "Other Recipient"), may read the Report subject to the terms of this
NOTICE. If you are an Other Recipient, you may only re

Create Vector Index

In [4]:
import os

# Import necessary modules from Azure SDK
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
)

# Retrieve environment variables for Azure Search service configuration
service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]  # Endpoint of the Azure Search service
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]  # Name of the index to be created or updated
key = os.environ["AZURE_SEARCH_API_KEY"]  # API key for authentication

# Function to define the structure of the search index
def get_index(name: str):
    # Define the fields for the index
    fields = [
        # A simple field for the unique identifier of each chunk
        SimpleField(name="chunkId", type=SearchFieldDataType.String, key=True),
        # A searchable field for the parent document, with additional capabilities like sorting, filtering, and faceting
        SearchableField(
            name="parentDoc",
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
        ),
        # A searchable field for the chunk of text
        SearchableField(name="chunk", type=SearchFieldDataType.String),
        # A vector field for storing embeddings (e.g., from a language model)
        SearchField(
            name="chunkVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,  # Dimensions of the vector (e.g., OpenAI embeddings use 1536 dimensions)
            vector_search_profile_name="my-vector-config",  # Reference to the vector search profile
        ),
    ]

    # Define vector search configuration
    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="my-vector-config",  # Name of the vector search profile
                algorithm_configuration_name="my-algorithms-config",  # Reference to the algorithm configuration
            )
        ],
        algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],  # HNSW algorithm for vector search
    )

    # Return the SearchIndex object with the defined fields and vector search configuration
    return SearchIndex(name=name, fields=fields, vector_search=vector_search)


# Authenticate using the Azure API key
credential = AzureKeyCredential(key)
# Initialize the SearchIndexClient to interact with the Azure Search service
index_client = SearchIndexClient(service_endpoint, credential)

# Create or update the index
index = get_index(index_name)  # Get the index definition
index_client.create_or_update_index(index)  # Create or update the index in Azure Search

# Verify if the index exists by listing all indexes and checking for the index name
print(f"Index '{index_name}' exists: {index_name in [index.name for index in index_client.list_indexes()]}")

# Optional: Delete the index (commented out to avoid accidental deletion)
# index_client.delete_index(index_name)

Index 'socr-test-idx' exists: True


Chunk the Document, Create Vector embeddings of the Chunks and Store in Azure AI Search vector Store Index

In [5]:
import os

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
key = os.environ["AZURE_SEARCH_API_KEY"]
report_name = "SOC.01 - ADP Autopay"

def get_embeddings(text: str):
    # There are a few ways to get embeddings. This is just one example.
    import openai

    open_ai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    open_ai_key = os.getenv("AZURE_OPENAI_API_KEY")

    client = openai.AzureOpenAI(
        azure_endpoint=open_ai_endpoint,
        api_key=open_ai_key,
        api_version="2023-03-15-preview",
    )
    embedding = client.embeddings.create(input=[text], model="text-embedding-ada-002")
    return embedding.data[0].embedding


def get_documents():
    soc_extract_path = os.path.join("data", "extracts", f"{report_name}.txt")

    # Load example document
    with open(soc_extract_path) as f:
        state_of_the_union = f.read()

    text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    texts = text_splitter.create_documents([state_of_the_union])

    docs = []

    for index, item in tqdm(enumerate(texts)):
        chunk_id = f"{index + 1}"
        parent_doc = report_name
        chunk = item.page_content
        chunk_vector = get_embeddings(item.page_content)

        item_dict = {
            "chunkId": chunk_id,
            "parentDoc": parent_doc,
            "chunk": chunk,
            "chunkVector": chunk_vector,
        }

        docs.append(item_dict)

    return docs


if __name__ == "__main__":
    credential = AzureKeyCredential(key)
    client = SearchClient(service_endpoint, index_name, credential)
    documents = get_documents()
    client.upload_documents(documents=documents)

518it [13:52,  1.61s/it]


Query the chunk from Vector DB

In [6]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery

# Load Azure Search service configuration from environment variables
service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
key = os.environ["AZURE_SEARCH_API_KEY"]
k_nearest_neighbors = 3  # Number of nearest neighbors to retrieve

def get_embeddings(text: str):
    """
    Get embeddings for the given text using Azure OpenAI.

    Args:
        text (str): The input text to get embeddings for.

    Returns:
        list: The embedding vector for the input text.
    """
    import openai

    # Load OpenAI configuration from environment variables
    open_ai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    open_ai_key = os.getenv("AZURE_OPENAI_API_KEY")

    # Initialize OpenAI client
    client = openai.AzureOpenAI(
        azure_endpoint=open_ai_endpoint,
        api_key=open_ai_key,
        api_version="2023-03-15-preview",
    )
    
    # Get embeddings
    embedding = client.embeddings.create(input=[text], model="text-embedding-ada-002")
    return embedding.data[0].embedding

def single_vector_search(query: str):
    """
    Perform a single vector search in the Azure Search index.

    Args:
        query (str): The query text to search for.
    """
    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
    
    # Create a vectorized query
    vector_query = VectorizedQuery(
        vector=get_embeddings(query),
        k_nearest_neighbors=k_nearest_neighbors,
        fields="chunkVector",
    )

    # Execute the search
    results = search_client.search(
        vector_queries=[vector_query],
        select=["chunkId", "parentDoc", "chunk"],
    )

    # Print the results
    for result in results:
        print(result)

def single_vector_search_with_filter(query: str):
    """
    Perform a vector search with a filter applied.

    Args:
        query (str): The query text to search for.
    """
    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
    
    # Create a vectorized query
    vector_query = VectorizedQuery(
        vector=get_embeddings(query),
        k_nearest_neighbors=k_nearest_neighbors,
        fields="chunkVector",
    )

    # Execute the search with a filter
    results = search_client.search(
        search_text="",
        vector_queries=[vector_query],
        filter="parentDoc eq 'SOC.01'",  # Example filter
        select=["chunkId", "parentDoc", "chunk"],
    )

    # Print the results
    for result in results:
        print(result)

def simple_hybrid_search(query: str):
    """
    Perform a hybrid search combining text and vector search.

    Args:
        query (str): The query text to search for.
    """
    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
    
    # Create a vectorized query
    vector_query = VectorizedQuery(
        vector=get_embeddings(query),
        k_nearest_neighbors=k_nearest_neighbors,
        fields="chunkVector",
    )

    # Execute the hybrid search
    results = search_client.search(
        search_text=query,
        vector_queries=[vector_query],
        select=["chunkId", "parentDoc", "chunk"],
    )

    # Print the results
    for result in results:
        print(result)

if __name__ == "__main__":
    # Define the query to search
    query = "applications in scope or systems in scope or platforms in scope"
    
    # Perform different types of searches
    print("Single Vector Search Results:")
    # single_vector_search(query=query)
    
    print("\nSingle Vector Search with Filter Results:")
    # single_vector_search_with_filter(query=query)
    
    print("\nSimple Hybrid Search Results:")
    simple_hybrid_search(query=query)

Single Vector Search Results:

Single Vector Search with Filter Results:

Simple Hybrid Search Results:
{'chunkId': '469', 'chunk': "of Control ActivityTest of ControlsResults6.01A formal and documentedapplication development andchange management policy hasbeen developed to guide the in-scope applications'development groups.Inspected the application development andchange management policy documentationto determine whether developmentrequirements were documented forapplication development projects to guidethe development group.No deviations noted6.02Change requests to the in-scopeapplications (AutoPay and theInput/Output", 'parentDoc': 'SOC.01 - ADP Autopay', '@search.score': 0.03201844170689583, '@search.reranker_score': None, '@search.highlights': None, '@search.captions': None}
{'chunkId': '494', 'chunk': 'to determinewhether the applications provided theability for clients to manage user accessbased on roles and functions.No deviations noted7.07Only authorized individualshave update