# Create a search index in Azure AI Search using the Azure SDK for Python

This notebook steps through creating, loading, and querying an index in Azure AI Search by calling the azure-search-documents library in the Azure SDK for Python.

## Prerequisites
*If you are working through all the notebooks, this setup is covered in 00_Setup.*
- Create Azure AI Search [instructions here](https://learn.microsoft.com/azure/search/search-create-service-portal) 
(You may have already created them in previous notebooks)
  - Basic tier or higher is recommended.
  - Choose the same region as Azure OpenAI.
  - Enable semantic ranking.
  - Enable a system identity for Azure AI Search. - Settings / Identity / System Assigned / Enable
  - Update the .env file with AI_SEARCH_KEY  (Use admin keys - In the portal go to resources then Settings, Keys on the left.)
  - Update the .env file with AI_SEARCH_ENDPOINT  (Overview page - url)


## Install packages and set variables

In [None]:
# Install the packages this notebook uses. Commented-out lines are skipped;
# uncomment any package that is not already installed in your kernel.
# NOTE(review): a later cell imports azure.storage.blob, which is not listed
# here (presumably provided by the azure-storage-blob package - confirm).
#%pip install azure-search-documents==11.5.1 --quiet
#%pip install azure-identity --quiet
#%pip install python-dotenv --quiet
#%pip install pymupdf --quiet
%pip install openai --quiet

In [None]:
# Load credentials
from dotenv import load_dotenv
import os

# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Read the Azure AI Search settings from the environment.
AI_SEARCH_ENDPOINT = os.getenv('AI_SEARCH_ENDPOINT')
AI_SEARCH_KEY = os.getenv('AI_SEARCH_KEY')

# Fail early, naming every setting that is missing or empty.
required_settings = {
    'AI_SEARCH_ENDPOINT': AI_SEARCH_ENDPOINT,
    'AI_SEARCH_KEY': AI_SEARCH_KEY,
}
missing_vars = [name for name, value in required_settings.items() if not value]
if missing_vars:
    raise ValueError(f"Environment variables {', '.join(missing_vars)} must be set.")

# Confirm what was loaded. The key is a secret, so only report that it is set;
# printing it would store it in the saved notebook output.
print(f"AI_SEARCH_ENDPOINT: {AI_SEARCH_ENDPOINT}")
print("AI_SEARCH_KEY: ******** (set; value hidden)")

# Name of the search index used throughout this notebook.
index_name: str = "soc_index"

## Create an index

In [None]:
# Define the schema of the search index and create (or update) it on the service.
from azure.core.credentials import AzureKeyCredential

# Key-based credential built from the search key loaded from the environment.
credential = AzureKeyCredential(AI_SEARCH_KEY)

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    SearchField,
    SimpleField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex,
    SearchableField
)

# Azure OpenAI settings; only referenced by the vectorizer configuration below.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_API_ENDPOINT = os.getenv('OPENAI_API_ENDPOINT')

# Create a search schema
index_client = SearchIndexClient(
    endpoint=AI_SEARCH_ENDPOINT, credential=credential)
# All fields are strings; chunk_id is the only field declared with key=True.
# NOTE(review): parent_id uses SimpleField with searchable=True - confirm that
# helper honours the flag, otherwise SearchableField may be what is intended.
fields = [
    SimpleField(name="parent_id", type=SearchFieldDataType.String, key=False, searchable=True, filterable=True, retrievable=True, stored=True, sortable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String, key=False, searchable=True, filterable=True, retrievable=True, stored=True, sortable=True, facetable=True),
    SearchableField(name="chunk_id", type=SearchFieldDataType.String, key=True, searchable=True, filterable=True, retrievable=True, stored=True, sortable=True, facetable=True, analyzer_name="keyword"),
    SearchableField(name="chunk", type=SearchFieldDataType.String, key=False, searchable=True, filterable=False, retrievable=True, stored=True, sortable=False, facetable=False),
    SearchableField(name="filepath", type=SearchFieldDataType.String, key=False, searchable=True, filterable=True, retrievable=True, stored=True, sortable=True, facetable=True),
    SearchableField(name="header_1", type=SearchFieldDataType.String, key=False, searchable=True, filterable=False, retrievable=True, stored=True, sortable=False, facetable=False),
    SearchableField(name="header_2", type=SearchFieldDataType.String, key=False, searchable=True, filterable=False, retrievable=True, stored=True, sortable=False, facetable=False),
    SearchableField(name="header_3", type=SearchFieldDataType.String, key=False, searchable=True, filterable=False, retrievable=True, stored=True, sortable=False, facetable=False)
]

# NOTE(review): these two empty lists are not passed to SearchIndex below.
scoring_profiles = []
suggester = []

# Configure vector search: one HNSW algorithm, one profile that uses it, and an
# Azure OpenAI vectorizer referenced by that profile.
# NOTE(review): vector_search is not passed to SearchIndex below, and `fields`
# declares no vector field - confirm whether vector search is meant to be enabled.
vector_search = VectorSearch(  
    algorithms=[  
        HnswAlgorithmConfiguration(name="myHnsw"),
    ],  
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm_configuration_name="myHnsw",  
            vectorizer_name="myOpenAI",  
        )
    ],  
    vectorizers=[  
        AzureOpenAIVectorizer(  
            vectorizer_name="myOpenAI",  
            kind="azureOpenAI",  
            parameters=AzureOpenAIVectorizerParameters(  
                resource_url=OPENAI_API_ENDPOINT,  
                deployment_name=os.getenv("EMBEDDINGS_MODEL_NAME"),
                model_name="text-embedding-ada-002",
                api_key=OPENAI_API_KEY
            ),
        ),  
    ], 
)  


# Create the search index from the name and fields only, then report its name.
index = SearchIndex(name=index_name, fields=fields)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

## Convert PDF to images

In [None]:
import fitz  # PyMuPDF
import os
from azure.storage.blob import BlobServiceClient
import uuid

def convert_pdf_to_images(pdf_path, blob_connection_string=None, container_name=None):
    """
    Render each page of a PDF to a .jpg file and optionally upload the files to Azure Blob Storage.

    The images are written to a "<pdf name>_images" directory created next to the PDF,
    one file per page named "page<N>.jpg" with N starting at 0. When both
    blob_connection_string and container_name are given, every image is also uploaded
    as "<pdf name>/page<N>.jpg", overwriting any existing blob of that name.

    Args:
    - pdf_path (str): Path to the PDF file.
    - blob_connection_string (str, optional): Azure Blob Storage connection string.
    - container_name (str, optional): Name of the Azure Blob Storage container.

    Returns:
    - list: Paths of the image files written locally, in page order.
    """
    # Create a directory based on the PDF filename in the same directory as the original file
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = os.path.join(os.path.dirname(pdf_path), f"{pdf_name}_images")
    os.makedirs(output_dir, exist_ok=True)

    saved_image_paths = []
    pdf_document = fitz.open(pdf_path)
    try:
        # Resolve the target container once instead of once per page.
        container_client = None
        if blob_connection_string and container_name:
            blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
            container_client = blob_service_client.get_container_client(container_name)
            if not container_client.exists():
                container_client.create_container()

        # Convert each page of the PDF into an image
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            image_path = os.path.join(output_dir, f'page{page_num}.jpg')
            pix.save(image_path)
            saved_image_paths.append(image_path)

            if container_client is not None:
                blob_client = container_client.get_blob_client(blob=f'{pdf_name}/page{page_num}.jpg')
                with open(image_path, "rb") as data:
                    blob_client.upload_blob(data, overwrite=True)
    finally:
        # Release the PDF file handle even if rendering or uploading fails.
        pdf_document.close()

    return saved_image_paths

# Storage connection string read from the environment.
BLOB_STORAGE_ACCOUNT_CONNECTION_STRING = os.getenv('BLOB_STORAGE_ACCOUNT_CONNECTION_STRING')

# Build the path with os.path.join: the previous "data\A..." literal relied on
# the invalid escape sequence "\A" and only resolved on Windows.
pdf_path = os.path.join("data", "Azure DevOps - SOC 2 Type II Report (2023-10-01-to 2024-09-30).pdf")
container_name = "frompdf"

# Render the pages locally only; no storage arguments are passed, so nothing is uploaded here.
image_paths = convert_pdf_to_images(pdf_path)
print(f"Converted images saved at: {image_paths}")

## Create a documents payload

In [None]:
import base64
from mimetypes import guess_type

def local_image_to_data_url(image_path):
    """
    Encode a local file as a base64 "data:" URL.

    The MIME type is guessed from the file name; when it cannot be guessed,
    "application/octet-stream" is used.
    """
    detected_type = guess_type(image_path)[0]
    content_type = "application/octet-stream" if detected_type is None else detected_type

    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    payload = base64.b64encode(raw_bytes).decode("utf-8")
    return f"data:{content_type};base64,{payload}"

In [None]:
from openai import AzureOpenAI
import json

def gpt4o_imagefile(image_file, deployment_name="gpt-4o", max_tokens=2000):
    """
    Ask an Azure OpenAI chat deployment to extract the text of an image as JSON.

    The image is sent inline as a base64 data URL together with a system prompt
    that shows how tables should be represented.

    Args:
    - image_file (str): Path to the local image file.
    - deployment_name (str, optional): Value passed as `model` to the chat completion call.
    - max_tokens (int, optional): Upper bound on the number of tokens in the reply.

    Returns:
    - The reply parsed with json.loads.

    Raises:
    - KeyError: If OPENAI_API_ENDPOINT or OPENAI_API_KEY is not in the environment.
    - ValueError: If the reply is empty or is not valid JSON (for example because
      it was cut off at max_tokens).
    """
    
    system_prompt = """
    You are an AI assistance that extracts text from the image. You are especially good at extracting tables.
    When you see a table such as:
    
    Monthly Savings
    | Month    | Savings |Details |
    | -------- | ------- |------- 
    | January  | $250    | for holiday    |
    | February | $80     | pension   |
    | March    | $420    | new cat   |

    
    You format the json like this:
    {
    "Title": "Monthly Savings",
    "Data": [
        {
            "Month": "January",
            "Savings": "$250",
            "Details": "for holiday"
        },
        {
            "Month": "February",
            "Savings": "$80",
            "Details": "pension"
        },
        {
            "Month": "March",
            "Savings": "$420",
            "Details": "new cat"
        }
    ]
}
    
    So that each row is a dictionary with the column name as the key and the cell value as the value.
"""
    
    # NOTE(review): confirm this api_version supports response_format and image
    # input for the deployment in use.
    client = AzureOpenAI(
        azure_endpoint=os.environ['OPENAI_API_ENDPOINT'],
        api_key=os.environ['OPENAI_API_KEY'],
        api_version='2023-05-15',
        )

    response = client.chat.completions.create(
        model=deployment_name,
        response_format={ "type": "json_object" },
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract text from the image"},
                    {
                        "type": "image_url",
                        "image_url": {"url": local_image_to_data_url(image_file)},
                    },
                ],
            },
        ],
        max_tokens=max_tokens,
        temperature=0.0,
    )

    choice = response.choices[0]
    content = choice.message.content
    if not content:
        raise ValueError(
            f"No content returned for {image_file!r} (finish_reason={choice.finish_reason!r})."
        )

    # A reply cut off at max_tokens is usually incomplete JSON; report which
    # image failed and why instead of surfacing a bare decode error.
    try:
        return json.loads(content)
    except json.JSONDecodeError as err:
        raise ValueError(
            f"Reply for {image_file!r} is not valid JSON (finish_reason={choice.finish_reason!r})."
        ) from err

In [None]:
import json

# Run the extraction for a window of page images and collect the parsed results.
start_page = 170  # first list index to process
end_page = 175 # list index at which to stop (not processed)
extracted_data = []

selected_images = image_paths[start_page:end_page]
for image_path in selected_images:
    extracted_data.append(gpt4o_imagefile(image_path))


In [None]:
# Write every extracted result to one local JSON file.
serialized = json.dumps(extracted_data, indent=4)

with open("extracted_data.json", "w") as json_file:
    json_file.write(serialized)

print("Data extracted and saved to extracted_data.json")

In [None]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import json

# Fail early with a clear message when the storage connection string is not
# configured, rather than failing inside the storage SDK.
if not BLOB_STORAGE_ACCOUNT_CONNECTION_STRING:
    raise ValueError("Environment variable BLOB_STORAGE_ACCOUNT_CONNECTION_STRING must be set.")

# Initialize the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(BLOB_STORAGE_ACCOUNT_CONNECTION_STRING)

# Get the container client
container_client = blob_service_client.get_container_client(container_name)

# Ensure the container exists
if not container_client.exists():
    container_client.create_container()

# Upload each element in the extracted_data list as a separate JSON file
for i, data in enumerate(extracted_data):
    # Blobs are named by their position in extracted_data, under a "json/" prefix
    blob_name = f"json/extracted_data_{i}.json"
    blob_client = container_client.get_blob_client(blob_name)

    # Upload the JSON data to the blob, replacing any existing blob of that name
    blob_client.upload_blob(json.dumps(data), overwrite=True)

    print(f"Extracted data uploaded to blob storage at: {blob_name}")
    print(data)

## Upload documents

In [None]:
import re

search_client = SearchClient(endpoint=AI_SEARCH_ENDPOINT,
                      index_name=index_name,
                      credential=credential)

# Build one search document per extracted page, using the field names of the
# index schema (chunk_id is the key field).
source_name = os.path.splitext(os.path.basename(pdf_path))[0]
# Replace any character outside letters, digits, '_', '-' and '=' with '_' so
# the value is safe to use as a document key.
key_prefix = re.sub(r"[^A-Za-z0-9_=-]", "_", source_name)

documents = []
page_images = image_paths[start_page:end_page]
for offset, (page_image, page_data) in enumerate(zip(page_images, extracted_data)):
    # Use the extracted "Title" when the model returned one as a string.
    page_title = page_data.get("Title") if isinstance(page_data, dict) else None
    documents.append({
        "chunk_id": f"{key_prefix}_page_{start_page + offset}",
        "parent_id": key_prefix,
        "title": page_title if isinstance(page_title, str) else None,
        "chunk": json.dumps(page_data),
        "filepath": page_image,
    })

if not documents:
    print("No extracted data to upload.")
else:
    try:
        result = search_client.upload_documents(documents=documents)
        # Report success only when every document was accepted.
        print("Upload of new document succeeded: {}".format(all(item.succeeded for item in result)))
    except Exception as ex:
        # Print the exception itself; generic exceptions have no `.message` attribute.
        print(ex)

## Run your first query

In [None]:
# Run an empty query (returns selected fields, all documents).
# The selected fields are the ones defined in this notebook's index schema.
results = search_client.search(query_type='simple',
    search_text="*",
    select='chunk_id,title,chunk',
    include_total_count=True)

print('Total Documents Matching Query:', results.get_count())
for result in results:
    print(result["@search.score"])
    print(result["chunk_id"])
    print(f"Title: {result['title']}")
    # Chunks can be long; show only the beginning.
    print(f"Chunk: {(result['chunk'] or '')[:200]}")


## Run a term query

In [None]:
# Run a term query. Replace the search text with a term that occurs in your documents.
# The selected fields are the ones defined in this notebook's index schema.
results = search_client.search(query_type='simple',
    search_text="wifi",
    select='chunk_id,title,chunk',
    include_total_count=True)

print('Total Documents Matching Query:', results.get_count())
for result in results:
    print(result["@search.score"])
    print(result["chunk_id"])
    print(f"Title: {result['title']}")

## Add a filter

In [None]:
# Add a filter. Only fields declared filterable/sortable in the index schema
# can be used in `filter` and `order_by`; `title` is both.
results = search_client.search(
    search_text="hotels",
    select='chunk_id,title,filepath',
    filter='title ne null',
    order_by='title asc')

for result in results:
    print("{}: {} - {}".format(result["chunk_id"], result["title"], result["filepath"]))

## Scope a query to specific searchable fields

In [None]:
# Restrict matching to the `title` field, which is searchable in the index schema.
results = search_client.search(
    search_text="sublime",
    search_fields=['title'],
    select='chunk_id,title')

for result in results:
    print("{}: {}".format(result["chunk_id"], result["title"]))

## Return facets

In [None]:
# Return facets for `title`, which is declared facetable in the index schema.
results = search_client.search(search_text="*", facets=["title"])

# Fall back to an empty mapping when the service returns no facet data.
facets = results.get_facets() or {}

for facet in facets.get("title", []):
    print("    {}".format(facet))

## Look up a document 

In [None]:
# Look up a specific document by its key (the `chunk_id` field).
# Fetch one key from the index first, so the lookup targets a document that exists.
first_hit = next(iter(search_client.search(search_text="*", select="chunk_id", top=1)), None)

if first_hit is None:
    print("The index contains no documents to look up.")
else:
    document_key = first_hit["chunk_id"]
    result = search_client.get_document(key=document_key)

    print(f"Details for document '{document_key}' are:")
    print("Title: {}".format(result["title"]))
    print("Parent: {}".format(result["parent_id"]))
    print("File path: {}".format(result["filepath"]))

## Autocomplete a query

In [None]:
# Autocomplete a query: ask the service for completions of a partial term.
# NOTE(review): this call names a suggester "sg", but the index-creation cell
# builds SearchIndex from `name` and `fields` only - confirm that a suggester
# with that name exists on the index before running this cell.
search_suggestion = 'sa'
results = search_client.autocomplete(
    search_text=search_suggestion, 
    suggester_name="sg",
    mode='twoTerms')

# Print the text of each suggested completion.
print("Autocomplete for:", search_suggestion)
for result in results:
    print (result['text'])

## See your resource in the portal.

Go to your Search resource then from menu Search Management / Indexes  
See the new index there that you have created. 
Click into it, you can also test the search manually here.



## Clean up

If you are finished with this index, you can delete it by running the following lines. Deleting unnecessary indexes frees up space for stepping through more quickstarts and tutorials.

In [None]:
# Delete the index; a failure is printed rather than raised.
try:
    result = index_client.delete_index(index_name)
except Exception as ex:
    print(ex)
else:
    print('Index', index_name, 'Deleted')

Confirm the index deletion by running the following script, which tries to retrieve the index by name. If the request fails with a "not found" error instead of printing the index definition, you've successfully deleted the index and have completed this notebook.

In [None]:
# Try to fetch the index by name; print its definition if it is still there,
# otherwise print the error returned by the lookup.
try:
    result = index_client.get_index(index_name)
except Exception as ex:
    print(ex)
else:
    print(result)
