In [None]:
# Install the third-party packages this notebook imports (dotenv and azure.search.documents).
# azure-search-documents is pinned to 11.5.1; python-dotenv is installed unpinned.
%pip install python-dotenv
%pip install azure-search-documents==11.5.1



In [None]:
import os

# Directory the notebook should run from (presumably where the `.env` file lives --
# TODO confirm). The default is the original author's checkout; set the
# AZURE_AI_SEARCH_LAB_DIR environment variable to point at your own copy instead of
# editing this cell.
# A raw string is used so the Windows backslashes are never parsed as escape
# sequences (the previous literal contained the invalid escape "\G").
file_dir = os.environ.get(
    "AZURE_AI_SEARCH_LAB_DIR",
    r"C:\Users\hannahhowell\OneDrive - Microsoft\Documents\Git\AOAI_Labs\AzureAISearchLab",
)
# Get the current working directory
current_cwd = os.getcwd()

if current_cwd == file_dir:
    print("Already in the correct directory.")
elif os.path.isdir(file_dir):
    # Change the current working directory to the configured directory
    os.chdir(file_dir)
    print(f"Changed current working directory to: {file_dir}")
else:
    # Do not crash on machines where the configured directory does not exist;
    # keep running from wherever the kernel was started.
    print(f"Directory not found: {file_dir}. Staying in: {current_cwd}")


### Set the container name to the name of the newly created container.

In [None]:
# Name of the blob container that holds the PDFs. Later cells also use it as the
# prefix for the data source ("-connection"), index ("-index"), skillset
# ("-skillset") and indexer ("-indexer") names.
container_name = "pdfscompiancecatalyst"

In [None]:
# Load credentials: read a local .env file (if any) into the process environment.
from dotenv import load_dotenv
import os

load_dotenv()

# Read the settings this notebook needs and assign them to variables.
AI_SEARCH_ENDPOINT = os.getenv('AI_SEARCH_ENDPOINT')
AI_SEARCH_KEY = os.getenv('AI_SEARCH_KEY')

BLOB_STORAGE_ACCOUNT_CONNECTION_STRING = os.getenv('BLOB_STORAGE_ACCOUNT_CONNECTION_STRING')
GENAI_CUSTOMSKILL_FUNCTION_URL = os.getenv('GENAI_CUSTOMSKILL_FUNCTION_URL')

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_API_ENDPOINT = os.getenv('OPENAI_API_ENDPOINT')

AZURE_AI_KEY = os.getenv('AZURE_AI_KEY')

# Every setting the later cells rely on. GENAI_CUSTOMSKILL_FUNCTION_URL is
# concatenated into the custom-skill URIs and EMBEDDINGS_MODEL_NAME is read with
# os.getenv() when the vectorizer and the embedding skill are built, so both are
# validated here too instead of failing later with a less helpful error.
required_settings = {
    'AI_SEARCH_ENDPOINT': AI_SEARCH_ENDPOINT,
    'AI_SEARCH_KEY': AI_SEARCH_KEY,
    'BLOB_STORAGE_ACCOUNT_CONNECTION_STRING': BLOB_STORAGE_ACCOUNT_CONNECTION_STRING,
    'GENAI_CUSTOMSKILL_FUNCTION_URL': GENAI_CUSTOMSKILL_FUNCTION_URL,
    'OPENAI_API_KEY': OPENAI_API_KEY,
    'OPENAI_API_ENDPOINT': OPENAI_API_ENDPOINT,
    'AZURE_AI_KEY': AZURE_AI_KEY,
    'EMBEDDINGS_MODEL_NAME': os.getenv('EMBEDDINGS_MODEL_NAME'),
}

# Ensure all required environment variables are set (unset and empty both count as missing).
missing_vars = [name for name, value in required_settings.items() if not value]
if missing_vars:
    raise ValueError(f"Environment variables {', '.join(missing_vars)} must be set.")

# Settings whose values must never be written into the notebook output, because
# saved notebooks are routinely shared or committed.
secret_settings = {
    'AI_SEARCH_KEY',
    'BLOB_STORAGE_ACCOUNT_CONNECTION_STRING',
    'OPENAI_API_KEY',
    'AZURE_AI_KEY',
}

# Confirm what was loaded: show non-secret values, hide secret ones.
for name, value in required_settings.items():
    shown = "<set, value hidden>" if name in secret_settings else value
    print(f"{name}: {shown}")



In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex
)

## Create a Data Source (Blob Container containing the PDFs)

Although only PDF files are used here, this can be done at a much larger scale, and Azure AI Search supports a range of other file formats, including Microsoft Office (DOCX/DOC, XLSX/XLS, PPTX/PPT, MSG), HTML, XML, ZIP, and plain text files (including JSON).

Azure AI Search supports the following sources: [Data Sources Gallery](https://learn.microsoft.com/EN-US/AZURE/search/search-data-sources-gallery)

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)

# Client used to manage data sources on the search service.
indexer_client = SearchIndexerClient(
    endpoint=AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AI_SEARCH_KEY),
)

# Describe the blob container; the connection is named after the container
# with a "-connection" suffix.
container = SearchIndexerDataContainer(name=container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{container_name}-connection",
    type="azureblob",
    connection_string=BLOB_STORAGE_ACCOUNT_CONNECTION_STRING,
    container=container,
)

# Send the definition to the service and keep the returned object; its name is
# reused when the indexer is defined.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

## Create Index

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex
)

# Key-based credential reused by the search clients created from here on.
AZURE_SEARCH_CREDENTIAL = AzureKeyCredential(AI_SEARCH_KEY)

# The index is named after the container, with an "-index" suffix.
index_name = f"{container_name}-index"
index_client = SearchIndexClient(
    endpoint=AI_SEARCH_ENDPOINT,
    credential=AZURE_SEARCH_CREDENTIAL,
)

# Index schema. chunk_id is the key field; text_vector holds 1536-dimensional
# vectors and is bound to the "myHnswProfile" vector search profile defined below.
fields = [
    SearchField(name="parent_id", type=SearchFieldDataType.String),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    SearchField(
        name="image_sas_url",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    SearchField(
        name="page_number",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    SearchField(
        name="text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search setup: one HNSW algorithm, one profile that ties that algorithm
# to the Azure OpenAI vectorizer, and the vectorizer itself.
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer_name="myOpenAI",
        ),
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myOpenAI",
            kind="azureOpenAI",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=OPENAI_API_ENDPOINT,
                # Deployment name comes from the EMBEDDINGS_MODEL_NAME environment variable.
                deployment_name=os.getenv("EMBEDDINGS_MODEL_NAME"),
                model_name="text-embedding-ada-002",
                api_key=OPENAI_API_KEY,
            ),
        ),
    ],
)

# Create the index on the service, or update it if it already exists.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

## Create a Skillset
One option for data pre-processing is to do it as part of indexing by using a skillset.
A skillset is a set of steps in which AI services can be used to enrich the documents, for example by extracting information, applying OCR, or translating.

- https://learn.microsoft.com/en-us/azure/search/cognitive-search-working-with-skillsets
- https://learn.microsoft.com/en-us/azure/search/cognitive-search-predefined-skills


In [None]:
from azure.search.documents.indexes.models import ( 
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    EntityRecognitionSkill,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    WebApiSkill,
    SearchIndexerDataUserAssignedIdentity
)

# Create a skillset; it is named after the container, with a "-skillset" suffix.
skillset_name = container_name+"-skillset"

# Custom Web API skill, run once per document: sends the blob path and SAS token
# to the "pdftoimage" function and stores its "image_sas_urls" output under
# /document/pages.
# NOTE(review): the URI is built by plain concatenation, so this assumes
# GENAI_CUSTOMSKILL_FUNCTION_URL ends with "/" -- confirm.
pdftoimage_skill = WebApiSkill(
    description="Skill to generate images from pdf",
    uri=GENAI_CUSTOMSKILL_FUNCTION_URL+"pdftoimage",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="docUrl", source="/document/metadata_storage_path"),
        InputFieldMappingEntry(name="docSAS", source="/document/metadata_storage_sas_token")
    ],
    outputs=[
        OutputFieldMappingEntry(name="image_sas_urls", target_name="pages"),
    ]
)

# Custom Web API skill, run once per page: sends the page image URL to the
# "imagetomarkdown" function and stores the returned markdown and page number on
# the page node. The description previously duplicated the pdf-to-image text.
imagetomarkdown_skill = WebApiSkill(
    description="Skill to generate markdown from an image",
    uri=GENAI_CUSTOMSKILL_FUNCTION_URL+"imagetomarkdown",
    context="/document/pages/*",
    inputs=[
        InputFieldMappingEntry(name="image_sas_url", source="/document/pages/*/sas_url"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="markdown", target_name="markdown"),
        OutputFieldMappingEntry(name="page_number", target_name="page_number")
    ]
)

# Embedding skill, run once per page. The text to embed is the page's markdown,
# the same content that is projected into the "chunk" field below. It previously
# pointed at the page number, which produced vectors unrelated to the page content.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_url=OPENAI_API_ENDPOINT,
    api_key=OPENAI_API_KEY,
    deployment_name=os.getenv("EMBEDDINGS_MODEL_NAME"),
    model_name="text-embedding-ada-002",
    dimensions=1536,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/pages/*/markdown"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="text_vector")
    ],
)

# Order matters for readability only here: images -> markdown -> embeddings.
skills = [pdftoimage_skill, imagetomarkdown_skill, embedding_skill]

# Project every page node into the target index as its own document, linked to
# the source document through parent_id. Parent documents themselves are not indexed.
index_projections = SearchIndexerIndexProjection(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="chunk", source="/document/pages/*/markdown"),
                InputFieldMappingEntry(name="image_sas_url", source="/document/pages/*/sas_url"),
                InputFieldMappingEntry(name="text_vector", source="/document/pages/*/text_vector"),
                InputFieldMappingEntry(name="page_number", source="/document/pages/*/page_number"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# AI services key attached to the skillset.
cognitive_services_account = CognitiveServicesAccountKey(key=AZURE_AI_KEY)

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    index_projection=index_projections,
    cognitive_services_account=cognitive_services_account
)

# Create the skillset on the service, or update it if it already exists.
client = SearchIndexerClient(endpoint=AI_SEARCH_ENDPOINT, credential=AZURE_SEARCH_CREDENTIAL)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

## Create indexer

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# The indexer is named after the container, with an "-indexer" suffix.
indexer_name = f"{container_name}-indexer"

# No extra indexer parameters are supplied.
indexer_parameters = None

# Client used to create the indexer on the search service.
indexer_client = SearchIndexerClient(
    endpoint=AI_SEARCH_ENDPOINT,
    credential=AZURE_SEARCH_CREDENTIAL,
)

# Tie together the data source, the skillset and the target index.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    parameters=indexer_parameters,
)

# Create the indexer, or update it if it already exists, and keep the service's response.
indexer_result = indexer_client.create_or_update_indexer(indexer)

print(f' {indexer_name} is created and running. Give the indexer a few minutes before running a query.')