## 📚 Prerequisites

Before executing this notebook, make sure you have properly set up your Azure Services, created your Conda environment, and configured your environment variables as per the instructions provided in the [README.md](README.md) file.

In [1]:
import os

# Define the target directory
target_directory = r"C:\Users\pablosal\Desktop\gbbai-azure-ai-search-indexing"  # change your directory here

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\gbbai-azure-ai-search-indexing


## Creating Index

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from src.extractors.blob_data_extractors import 
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    VectorizableTextQuery,
    VectorFilterMode,
)
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    SplitSkill,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    ExhaustiveKnnParameters,
    FieldMapping,
    HnswParameters,
    IndexProjectionMode,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchField,
    SearchFieldDataType,
    IndexingParameters,
    FieldMappingFunction,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionsParameters,
    SearchIndexerSkillset,
    SemanticConfiguration,
    SemanticField,
    SimpleField,
    SplitSkill,
    IndexingParametersConfiguration,
    WebApiSkill,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)

SyntaxError: invalid syntax (1753904716.py, line 3)

In [3]:
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    ExhaustiveKnnAlgorithmConfiguration,
    SemanticPrioritizedFields,
    SemanticConfiguration,
)

In [23]:
service_endpoint = os.getenv("AZURE_AI_SEARCH_SERVICE_ENDPOINT")
index_name = "index-test-5"
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

In [24]:
fields = [
    SearchField(name="path", type=SearchFieldDataType.String, key=True),
    SearchField(name="name", type=SearchFieldDataType.String),
    SearchField(name="url", type=SearchFieldDataType.String),
    SimpleField(name="parent_id", type=SearchFieldDataType.String),
    SimpleField(name="chunk_id", type=SearchFieldDataType.String),
    SearchField(name="chunk", type=SearchFieldDataType.String),
    SimpleField(
        name="enriched", type=SearchFieldDataType.String, searchable=False
    ),  # debugging only
    SearchField(
        name="textVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchField(name="metadata", type=SearchFieldDataType.String),  # Add this line
]
vector_config = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myOpenAI",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
            vectorizer="myOpenAI",
        ),
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=os.getenv("AZURE_AOAI_API_ENDPOINT"),
                deployment_id=os.getenv("AZURE_AOAI_EMBEDDING_DEPLOYMENT_ID"),
                api_key=os.getenv("AZURE_AOAI_API_KEY"),
            ),
        ),
    ],
)

semantic_config = SemanticConfiguration(
    name="mySemanticConfig",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")]
    ),
)

index = SearchIndex(name=index_name, fields=fields, vector_search=vector_config)

In [25]:
index_client = SearchIndexClient(service_endpoint, AzureKeyCredential(key))

In [26]:
index_client.create_or_update_index(index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x1f328823520>

## Manual Indexing 

In [5]:
# pip install azure-search-documents==11.4.0

In [2]:
# Import the AzureAIndexer class from the ai_search_indexing module
# Look at the notebook 02-indexing-azure-ai-search.ipynb for the implementation
from src.indexers.ai_search_indexing import AzureAIndexer

DEPLOYMENT_NAME = "foundational-canadaeast-ada"
INDEX_NAME = "test-index-002"

# Create an instance of the AzureAIndexer class
azure_search_indexer_client = AzureAIndexer(
    index_name=INDEX_NAME, embedding_azure_deployment_name=DEPLOYMENT_NAME
)

2024-02-21 20:30:55,938 - micro - MainProcess - INFO     Loading OpenAIEmbeddings object with model, deployment foundational-canadaeast-ada, and chunk size 1000 (ai_search_indexing.py:load_embedding_model:163)
  warn_deprecated(
  warn_deprecated(
2024-02-21 20:30:57,719 - micro - MainProcess - INFO     AzureOpenAIEmbeddings object has been created successfully. You can now access the embeddings
                using the '.embeddings' attribute. (ai_search_indexing.py:load_embedding_model:176)
vector_search_configuration is not a known attribute of class <class 'azure.search.documents.indexes.models._index.SearchField'> and will be ignored
2024-02-21 20:30:59,074 - micro - MainProcess - INFO     The Azure AI search index 'test-index-002' has been loaded correctly. (ai_search_indexing.py:load_azureai_index:227)
2024-02-21 20:30:59,086 - micro - MainProcess - INFO     Successfully loaded environment variables: TENANT_ID, CLIENT_ID, CLIENT_SECRET (sharepoint_data_extractor.py:load_environ

In [3]:
## Calculate all blobs modified since...

from src.extractors.blob_data_extractors import AzureBlobDataExtractor

az_blob = AzureBlobDataExtractor(container_name="customskillspdf")

In [9]:
document_blob_list = az_blob.list_updated_files(updated_since=2, time_unit="months")

In [10]:
document_blob_list

['https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf']

In [6]:
# Define parameters for the load_files_and_split_into_chunks function
splitter_params = {
    "splitter_type": "by_title_brute_force",
    "ocr": True,
    "ocr_output_format": "markdown",
    "chunk_size": 512,
    "chunk_overlap": 128,
    "pages": "3-7",
}

document_chunks_to_index = azure_search_indexer_client.load_files_and_split_into_chunks(
    file_paths=document_blob_list,
    **splitter_params,
)

2024-02-21 20:31:22,328 - micro - MainProcess - INFO     Blob URL detected. Extracting content. (ocr_document_intelligence.py:analyze_document:147)
2024-02-21 20:31:23,249 - micro - MainProcess - INFO     Successfully downloaded blob file instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf (blob_data_extractors.py:extract_content:95)
2024-02-21 20:31:55,724 - micro - MainProcess - INFO     Successfully extracted content from https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf (ocr_data_extractors.py:extract_content:82)
2024-02-21 20:31:55,725 - micro - MainProcess - INFO     Creating a splitter of type: by_title_brute_force (by_character.py:get_splitter:63)
2024-02-21 20:31:55,726 - micro - MainProcess - INFO     Using tiktoken encoder: cl100k_base (by_character.py:get_splitter:92)
2024-02-21 20:31:55,728 - micro - MainProcess - INFO     Obtained splitter of type: Markdo

In [11]:
# Index the document chunks using the Azure Search Indexer client
azure_search_indexer_client.index_text_embeddings(document_chunks_to_index)

2024-02-21 20:32:21,583 - micro - MainProcess - INFO     Embedding and indexing initiated for 10 text chunks. (ai_search_indexing.py:index_text_embeddings:492)
2024-02-21 20:32:23,669 - micro - MainProcess - INFO     Embedding and indexing completed for 10 text chunks. (ai_search_indexing.py:index_text_embeddings:496)


True

## Orchestration Microservices Leveraging Azure Container Apps 

🚀 Azure Container Apps is a serverless platform that simplifies the deployment and management of containerized applications. It eliminates the need for server configuration and container orchestration, providing all the necessary server resources to keep your applications stable and secure.

With Azure Container Apps, we can:

🔍 Deploy API endpoints: Package our code into containers and deploy them as API endpoints. This makes it easy to expose our services over the internet.

⚙️ Host background processing jobs: Run background tasks in containers, which can be scaled up or down based on demand.

🎉 Handle event-driven processing: Respond to events, such as messages on a queue, changes in a database, or HTTP requests.

🛠️ Run microservices: Package each microservice in a separate container and deploy them on Azure Container Apps. This allows us to scale each microservice independently.

🔒 Enable HTTPS ingress: Azure Container Apps supports HTTPS, allowing us to serve our applications securely over the internet.

⚖️ Autoscale based on demand: Automatically scale our applications based on HTTP traffic, event-driven processing, CPU or memory load, or any KEDA-supported scaler.

🤝 Integrate with custom skillsets: Integrate our applications with custom skillsets, such as AI services or data processing libraries, to enhance their capabilities.

By leveraging these capabilities, we can focus more on developing our applications and less on managing infrastructure.

📝 Follow the steps in the README.md file located at gbbai-azure-ai-search-indexing\app\README.md.

The deployment of the following allows us to call the API to index documents from anywhere:

POST https://doc-indexer.yellowtree-64f92beb.eastus.azurecontainerapps.io/indexing_documents HTTP/1.1
Host: doc-indexer.yellowtree-64f92beb.eastus.azurecontainerapps.io
Content-Type: application/json

```json
    {
        "file_paths": ["https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"],
        "splitter_params": {
            "splitter_type": "by_title_brute_force",
            "ocr": true,
            "ocr_output_format": "markdown",
            "pages": "1-2",
            "use_encoder": false,
            "chunk_size": 512,
            "chunk_overlap": 100,
            "verbose": true,
            "keep_separator": true,
            "is_separator_regex": false
        },
        "indexer_config": {
            "index_name": "test-index-002",
            "embedding_azure_deployment_name": "foundational-canadaeast-ada"
        }
    }
```

In [14]:
import requests

def call_api(url, data, headers):
    """
    Makes a POST request to the API endpoint.

    Args:
    - url (str): The URL of the API endpoint.
    - data (dict): The JSON payload to send in the request.
    - headers (dict): The headers to include in the request.

    Returns:
    - dict: The JSON response from the API.
    """
    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Request failed with status code: {response.status_code}")
        return None

import time

def index_documents_sequential():
    """
    Indexes documents by calling the API 10 times.
    """
    url = "https://doc-indexer.yellowtree-64f92beb.eastus.azurecontainerapps.io/indexing_documents"
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "file_paths": ["https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"],
        "splitter_params": {
            "splitter_type": "by_title_brute_force",
            "ocr": True,
            "ocr_output_format": "markdown",
            "pages": "1-2",
            "use_encoder": False,
            "chunk_size": 512,
            "chunk_overlap": 100,
            "verbose": True,
            "keep_separator": True,
            "is_separator_regex": False
        },
        "indexer_config": {
            "index_name": "test-index-002",
            "embedding_azure_deployment_name": "foundational-canadaeast-ada"
        }
    }

    start_time = time.time()

    for _ in range(5):
        response = call_api(url, data, headers)
        if response:
            print("Request successful!")
            print("Response:", response)

    end_time = time.time()
    total_time = end_time - start_time

    print(f"Total time taken: {total_time} seconds")

index_documents_sequential()

Request successful!
Response: {'success': True, 'successful_indexed_files': ['https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf'], 'failed_indexed_files': [], 'total_chunks_indexed': 4, 'index': 'test-index-002', 'indexing_time': 31.26715350151062}
Request successful!
Response: {'success': True, 'successful_indexed_files': ['https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf'], 'failed_indexed_files': [], 'total_chunks_indexed': 4, 'index': 'test-index-002', 'indexing_time': 31.297488689422607}
Request successful!
Response: {'success': True, 'successful_indexed_files': ['https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf'], 'failed_indexed_files': [], 'total_chunks_indexed': 4, 'index': 'test-index-002', 'indexing_ti

In [15]:
import concurrent.futures
import time

def index_documents_parallel():
    """
    Indexes documents by calling the API 10 times in parallel.
    """
    url = "https://doc-indexer.yellowtree-64f92beb.eastus.azurecontainerapps.io/indexing_documents"
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "file_paths": ["https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf"],
        "splitter_params": {
            "splitter_type": "by_title_brute_force",
            "ocr": True,
            "ocr_output_format": "markdown",
            "pages": "1-2",
            "use_encoder": False,
            "chunk_size": 512,
            "chunk_overlap": 100,
            "verbose": True,
            "keep_separator": True,
            "is_separator_regex": False
        },
        "indexer_config": {
            "index_name": "test-index-002",
            "embedding_azure_deployment_name": "foundational-canadaeast-ada"
        }
    }

    # Define a list to hold the futures of the API calls
    futures = []

    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit API calls to the ThreadPoolExecutor
        for _ in range(40):
            futures.append(executor.submit(call_api, url, data, headers))

    # Process the results of the API calls
    for future in concurrent.futures.as_completed(futures):
        response = future.result()
        if response:
            print("Request successful!")
            print("Response:", response)

    end_time = time.time()
    total_time = end_time - start_time

    print(f"Total time taken: {total_time} seconds")

index_documents_parallel()

Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request failed with status code: 504
Request successful!
Response: {'success': True, 'successful_indexed_files': ['https://testeastusdev001.blob.core.windows.net/customskillspdf/instruction-manual-fieldvue-dvc6200-hw2-digital-valve-controller-en-123052.pdf'], 'failed_indexed_files': [], 'total_chunks_indexed': 4, 'index': 'test-index-002', 'indexing_time': 33.27682304382324}
Request successful!
Response: {'success': True, 'successful_indexed_files': ['https://testeastusdev001.blob.core.windows.net/customskillspdf/

### Creating Jobs 

## Azure AI Search Orchestration: Indexers, Skillsets, and Skills (Built-in & Custom)

Azure AI Search offers advanced search capabilities through a well-coordinated operation of **indexers**, **skillsets**, and **skills**. This hierarchical relationship ensures seamless and efficient data ingestion, enrichment, and searchability.

![image.png](utils/images/orchestration.png)

### Components of Azure AI Search Orchestration

#### 1. Indexers

- **Definition:** Indexers in Azure AI Search automate the process of ingesting, transforming, and loading data from various data sources into an Azure AI search index.
- **Operation:** An indexer connects to a data source, retrieves content, and optionally applies a skillset to transform and enrich the data before loading it into a search index.
- **Supported Data Sources:** Azure Blob Storage, Azure Cosmos DB, Azure SQL Database, and others.
- **Example:** An indexer might ingest documents from Azure Blob Storage, apply a skillset for OCR and entity recognition, and then populate an Azure AI Search index with the enriched content.

#### 2. Skillsets

- **Definition:** A skillset is a collection of skills that execute built-in AI or custom processing over documents retrieved from an external data source. Skillsets are reusable resources in Azure AI Search.
- **Operation:** Skills within a skillset transform the content based on the skill's function. The outputs can be text, structured data, or image descriptions.
- **Example:** A skillset might include an OCR skill for image content, a text translation skill for multilingual support, and an entity recognition skill.

#### 3. Skills

- **Definition:** Skills are operations that transform content. They can be text-based for full-text search or vector-based for vector search.
- **Types:**
  - **Built-in Skills:**  These skills wrap API calls to Azure resources. They are based on pretrained models from Microsoft and can include operations like entity recognition, language detection, and sentiment analysis.
  - **Custom Skills:** Custom code executed externally to the search service, often hosted on an Azure function app. These skills extend the AI enrichment pipeline with custom processing logic.
  - **Utility Skills:** : Internal to Azure AI Search, these skills perform operations like conditional processing, document extraction, and text splitting.
- **Examples:** Text extraction, language detection, entity recognition, and optical character recognition (OCR).

For more information, please take a look at the [documentation here](https://learn.microsoft.com/en-us/azure/search/search-indexer-overview).

### Connect Data Source 

In [27]:
# Create a data source
ds_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
container = SearchIndexerDataContainer(name="testretrieval")
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
    container=container,
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'index-test-5-blob' created or updated


In [28]:
skillset_name = f"{index_name}-skillset"

## Create Skill

### Custom Skills 

To create our application, we will follow a microservice approach. We will package our custom logic into a container app, which allows us to serve at scale using Azure App Service in the backend. This approach enables us to expose our custom logic as an API, which can be easily integrated into our skillset definition. This way, we can connect our custom skills with pre-built cognitive search skills, providing a comprehensive search experience.

Please visit `src\customskills\ocr` to review the app.

What this app does ? We are reading url from blob storage and then, we apply OCR and split by characters.

In [31]:
split_skills = SplitSkill(
    description="A skill that splits pdf",
    name="#1",
    context="/document",
    text_split_mode="pages",
    maximum_page_length=2500,
    page_overlap_length=500,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content"),
    ],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

In [32]:
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    name="#2",
    context="/document/pages/*",
    resource_uri=os.getenv("AZURE_AOAI_API_ENDPOINT"),
    deployment_id=os.getenv("AZURE_AOAI_EMBEDDING_DEPLOYMENT_ID"),
    api_key=os.getenv("AZURE_AOAI_API_KEY"),
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/pages/*"),
    ],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="textVector")],
)

In [11]:
custom_skill = WebApiSkill(
    description="A custom skill that extract the test from pdf and splits the text",
    uri="https://myskill.gentlebay-4474176e.westeurope.azurecontainerapps.io/vectorize",
    http_method="POST",
    timeout="PT60S",
    batch_size=4,
    degree_of_parallelism=4,
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="url", source="/document/url"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="imageVector"),
        OutputFieldMappingEntry(name="description", target_name="description"),
    ],
)

In [33]:
skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skills, embedding_skill],
)

In [34]:
client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
client.create_or_update_skillset(skillset)
print(f"Skillset '{skillset.name}' created or updated")

Skillset 'index-test-5-skillset' created or updated


In [35]:
from typing import Any, Dict, Optional, Union
from azure.search.documents.indexes.models import (
    BlobIndexerParsingMode,
    BlobIndexerDataToExtract,
    BlobIndexerImageAction,
    BlobIndexerPDFTextRotationAlgorithm,
    IndexingSchedule,
)

In [36]:
from datetime import datetime, timedelta
import pytz

# Create a timedelta object representing an interval of 12 hours
interval = timedelta(hours=12)

# Create a datetime object representing the current time in UTC
start_time = datetime.now(pytz.utc)

# Create an instance of IndexingSchedule
schedule = IndexingSchedule(interval=interval, start_time=start_time)

In [16]:
?IndexingParametersConfiguration

[1;31mInit signature:[0m
[0mIndexingParametersConfiguration[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0madditional_properties[0m[1;33m:[0m [0mOptional[0m[1;33m[[0m[0mDict[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mAny[0m[1;33m][0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mparsing_mode[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mForwardRef[0m[1;33m([0m[1;34m'_models.BlobIndexerParsingMode'[0m[1;33m)[0m[1;33m][0m [1;33m=[0m [1;34m'default'[0m[1;33m,[0m[1;33m
[0m    [0mexcluded_file_name_extensions[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m''[0m[1;33m,[0m[1;33m
[0m    [0mindexed_file_name_extensions[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m''[0m[1;33m,[0m[1;33m
[0m    [0mfail_on_unsupported_content_type[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mfail_on_unprocessable_document[0m[1;33m:[0m [0mbool[

In [17]:
?IndexingParametersConfiguration

[1;31mInit signature:[0m
[0mIndexingParametersConfiguration[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0madditional_properties[0m[1;33m:[0m [0mOptional[0m[1;33m[[0m[0mDict[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mAny[0m[1;33m][0m[1;33m][0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mparsing_mode[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mForwardRef[0m[1;33m([0m[1;34m'_models.BlobIndexerParsingMode'[0m[1;33m)[0m[1;33m][0m [1;33m=[0m [1;34m'default'[0m[1;33m,[0m[1;33m
[0m    [0mexcluded_file_name_extensions[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m''[0m[1;33m,[0m[1;33m
[0m    [0mindexed_file_name_extensions[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m''[0m[1;33m,[0m[1;33m
[0m    [0mfail_on_unsupported_content_type[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mfail_on_unprocessable_document[0m[1;33m:[0m [0mbool[

In [37]:
indexing_params = IndexingParametersConfiguration(
    parsing_mode="text",
    excluded_file_name_extensions=".jpg, .png",
    indexed_file_name_extensions=".pdf",
    fail_on_unsupported_content_type=True,
    fail_on_unprocessable_document=True,
    index_storage_metadata_only_for_oversized_documents=False,
    first_line_contains_headers=False,
    data_to_extract="contentAndMetadata",
    image_action="none",
    allow_skillset_to_read_file_data=True,
    pdf_text_rotation_algorithm="none",
    execution_environment="standard",
    query_timeout=None,
)

In [38]:
indexer_name = f"{index_name}-indexer"

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate description and embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    schedule=schedule,
    parameters=IndexingParameters(
        max_failed_items=-1,
        configuration=indexing_params,
    ),
    data_source_name=data_source.name,
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
    field_mappings=[
        FieldMapping(
            source_field_name="metadata_storage_path",
            target_field_name="path",
            mapping_function=FieldMappingFunction(name="base64Encode"),
        ),
        FieldMapping(
            source_field_name="metadata_storage_name", target_field_name="name"
        ),
        FieldMapping(
            source_field_name="metadata_storage_path", target_field_name="url"
        ),
    ],
    output_field_mappings=[
        FieldMapping(
            source_field_name="/document/textVector", target_field_name="textVector"
        ),
    ],
)

indexer_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
indexer_result = indexer_client.create_or_update_indexer(indexer)

In [39]:
indexer_client.run_indexer(indexer_name)