In [None]:
%pip install python-dotenv
%pip install azure-search-documents==11.5.1



In [None]:
import os

# Directory the notebook should run from. Set the AOAI_LAB_DIR environment
# variable to point at your own checkout; the default is the original author's
# location. A raw string keeps every Windows backslash literal (the previous
# literal contained the invalid escape sequence "\G").
file_dir = os.environ.get(
    "AOAI_LAB_DIR",
    r"C:\Users\hannahhowell\OneDrive - Microsoft\Documents\Git\AOAI_Labs\AzureAISearchLab",
)
# Get the current working directory
current_cwd = os.getcwd()

# Compare normalised paths so differences in case or trailing separators
# do not trigger a needless chdir.
_already_in_lab_dir = (
    os.path.normcase(os.path.abspath(current_cwd))
    == os.path.normcase(os.path.abspath(file_dir))
)

if _already_in_lab_dir:
    print("Already in the correct directory.")
elif os.path.isdir(file_dir):
    # Change the current working directory to the lab directory
    os.chdir(file_dir)
    print(f"Changed current working directory to: {file_dir}")
else:
    # On another machine the default path will not exist; keep the current
    # directory instead of failing with FileNotFoundError.
    print(f"Directory not found, staying in {current_cwd}: {file_dir}")


### Set container name to name of newly created container.

In [None]:
# Name of the blob container; the data source cell below passes it to
# SearchIndexerDataContainer and uses it as the prefix of the connection name.
container_name = "pdfscompiancecatalyst"

In [None]:
# Load credentials
from dotenv import load_dotenv
import os

load_dotenv()

# Every variable this notebook needs, in the order they are reported.
_REQUIRED_ENV_VARS = [
    'AI_SEARCH_ENDPOINT',
    'AI_SEARCH_KEY',
    'BLOB_STORAGE_ACCOUNT_CONNECTION_STRING',
    'OPENAI_API_KEY',
    'OPENAI_API_ENDPOINT',
    'AZURE_AI_KEY',
]
# Values that must never be echoed into the notebook output, because saved
# outputs are committed and shared together with the notebook.
_SECRET_ENV_VARS = {
    'AI_SEARCH_KEY',
    'BLOB_STORAGE_ACCOUNT_CONNECTION_STRING',
    'OPENAI_API_KEY',
    'AZURE_AI_KEY',
}

_env_values = {name: os.getenv(name) for name in _REQUIRED_ENV_VARS}

# Assign the environment variables to the names used by the rest of the notebook.
AI_SEARCH_ENDPOINT = _env_values['AI_SEARCH_ENDPOINT']
AI_SEARCH_KEY = _env_values['AI_SEARCH_KEY']

BLOB_STORAGE_ACCOUNT_CONNECTION_STRING = _env_values['BLOB_STORAGE_ACCOUNT_CONNECTION_STRING']

OPENAI_API_KEY = _env_values['OPENAI_API_KEY']
OPENAI_API_ENDPOINT = _env_values['OPENAI_API_ENDPOINT']

AZURE_AI_KEY = _env_values['AZURE_AI_KEY']

# Ensure all required environment variables are set (empty strings count as missing).
missing_vars = [name for name, value in _env_values.items() if not value]
if missing_vars:
    raise ValueError(f"Environment variables {', '.join(missing_vars)} must be set.")

# Report what was loaded: endpoints are shown, secrets are only confirmed as present.
for _name in _REQUIRED_ENV_VARS:
    _shown = "<set, hidden>" if _name in _SECRET_ENV_VARS else _env_values[_name]
    print(f"{_name}: {_shown}")



In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex
)

## Create a Data Source (Blob Container containing the PDFs)

Although only PDF files are used here, this can be done at a much larger scale, and Azure AI Search supports a range of other file formats, including Microsoft Office (DOCX/DOC, XLSX/XLS, PPTX/PPT, MSG), HTML, XML, ZIP, and plain text files (including JSON).
Azure AI Search supports the following data sources: [Data Sources Gallery](https://learn.microsoft.com/EN-US/AZURE/search/search-data-sources-gallery)

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)

# Register the blob container with the search service as an indexer data source.
# The client authenticates with the admin key loaded from the environment.
indexer_client = SearchIndexerClient(
    endpoint=AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AI_SEARCH_KEY),
)

# The container to read from, and the connection that wraps it.
container = SearchIndexerDataContainer(name=container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{container_name}-connection",
    type="azureblob",
    connection_string=BLOB_STORAGE_ACCOUNT_CONNECTION_STRING,
    container=container,
)

# create_or_update makes the cell safe to re-run; the returned object is kept
# because the indexer cell reads data_source.name.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

In [None]:
# Re-point container_name at a scratch prefix: the index, skillset and indexer
# cells below build their names from it ("mapping-test-index", ...), so they do
# not overwrite existing resources. The data source created above keeps the name
# derived from the previous value and is referenced later via data_source.name.
container_name = "mapping-test"  # changing so I do not overwrite the existing indexing

## Create Index

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex
)

# One credential object, shared by every search client created from here on.
AZURE_SEARCH_CREDENTIAL = AzureKeyCredential(AI_SEARCH_KEY)

# The index name is derived from the current container_name prefix.
index_name = f"{container_name}-index"
index_client = SearchIndexClient(
    endpoint=AI_SEARCH_ENDPOINT,
    credential=AZURE_SEARCH_CREDENTIAL,
)

# Index schema: all fields are strings; subfield_id is the document key.
fields = [
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
    ),
    SearchField(
        name="title",
        type=SearchFieldDataType.String,
    ),
    SearchField(
        name="subfield_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="subfield",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
]

# Create the index, or update it in place if it already exists.
index = SearchIndex(name=index_name, fields=fields)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

## Create a Skillset
One option for data pre-processing is to do it as part of the indexing by using a skillset.
A Skillset is a set of steps in which AI Services can be used to enrich the documents by extracting information, applying OCR, translating, etc.

https://learn.microsoft.com/en-us/azure/search/cognitive-search-working-with-skillsets
https://learn.microsoft.com/en-us/azure/search/cognitive-search-predefined-skills


In [None]:
from azure.search.documents.indexes.models import ( 
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    EntityRecognitionSkill,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    WebApiSkill,
    SearchIndexerDataUserAssignedIdentity
)

# Create a skillset
skillset_name = container_name+"-skillset"

# Enrichment pipeline built from three custom Web API skills:
#   1. onetoone:  /document/metadata_storage_path -> /document/new_field
#   2. divide:    /document/new_field             -> /document/new_list
#   3. onetoone:  /document/new_list/*/item       -> /document/new_list/*/new_field
# Each output is written under the skill's context path using its target_name.

# Step 1: runs once per document.
onetoone_skill = WebApiSkill(
    description="Skill to generate images from pdf",
    uri="https://mappingexample.azurewebsites.net/api/onetoone",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="input", source="/document/metadata_storage_path"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="new_field", target_name="new_field"),
    ]
)

# Step 2: runs once per document and consumes the output of step 1.
divide_skill = WebApiSkill(
    description="Skill to generate images from pdf",
    uri="https://mappingexample.azurewebsites.net/api/divide",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="input", source="/document/new_field"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="new_list", target_name="new_list"),
    ]
)

# Step 3: the "/*" context makes the skill run once per element of new_list.
onetoone_skill_again = WebApiSkill(
    description="Skill to generate images from pdf",
    uri="https://mappingexample.azurewebsites.net/api/onetoone",
    context="/document/new_list/*",
    inputs=[
        InputFieldMappingEntry(name="input", source="/document/new_list/*/item"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="new_field", target_name="new_field"),
    ]
)

skills = [onetoone_skill, divide_skill, onetoone_skill_again]


# Project one search document per element of new_list into the target index.
index_projections = SearchIndexerIndexProjection(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/new_list/*",
            mappings=[
                # Per-item output of onetoone_skill_again. The previous source,
                # "/document/changed_list/*", is not produced by any skill in
                # this skillset, so "subfield" would never be populated.
                InputFieldMappingEntry(name="subfield", source="/document/new_list/*/new_field"),
                # Parent documents are skipped (see projection_mode below), so
                # the file name has to be projected onto each child document
                # for the index's "title" field to be filled.
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# Azure AI services key attached to the skillset.
cognitive_services_account = CognitiveServicesAccountKey(key=AZURE_AI_KEY)


skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    index_projection=index_projections,
    cognitive_services_account=cognitive_services_account
)

# create_or_update keeps the cell re-runnable.
client = SearchIndexerClient(endpoint=AI_SEARCH_ENDPOINT, credential=AZURE_SEARCH_CREDENTIAL)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

## Create indexer

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# The indexer name follows the same "<container_name>-<kind>" scheme as the
# index and the skillset.
indexer_name = f"{container_name}-indexer"

# No extra indexer parameters are supplied.
indexer_parameters = None

# The indexer ties together the data source, the skillset and the target index.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
    field_mappings=[
        FieldMapping(
            source_field_name="metadata_storage_name",
            target_field_name="title",
        ),
    ],
    parameters=indexer_parameters,
)

# Create (or update) the indexer on the search service.
indexer_client = SearchIndexerClient(
    endpoint=AI_SEARCH_ENDPOINT,
    credential=AZURE_SEARCH_CREDENTIAL,
)
indexer_result = indexer_client.create_or_update_indexer(indexer)

print(f' {indexer_name} is created and running. Give the indexer a few minutes before running a query.')

## Check results

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Keyword search over the index created in this notebook.
# That index only defines parent_id, title, subfield_id and subfield: it has no
# vector field and no vectorizer, so a VectorizableTextQuery on "text_vector"
# and a select on chunk_id / chunk / locations / topWords would be rejected by
# the service. The query therefore uses plain text search on the real fields.
query = "how much of earth is covered by water"

search_client = SearchClient(endpoint=AI_SEARCH_ENDPOINT, credential=AZURE_SEARCH_CREDENTIAL, index_name=index_name)

results = search_client.search(
    search_text=query,
    select=["parent_id", "subfield_id", "title", "subfield"],
    top=1
)

for result in results:
    print(result)
    print(f"Score: {result['@search.score']}")
    print(f"Title: {result['title']}")
    print(f"Subfield: {result['subfield']}")

## Chat with data

In [None]:
# Import libraries
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI

# Set up clients and specify the chat model
openai_client = AzureOpenAI(
    api_version="2024-06-01",
    azure_endpoint=OPENAI_API_ENDPOINT,
    api_key=OPENAI_API_KEY
 )

# Name of the chat model deployment; fail early with a clear message rather
# than sending model=None to the API.
model = os.getenv("GPT4_MODEL_NAME")
if not model:
    raise ValueError("Environment variable GPT4_MODEL_NAME must be set.")

search_client = SearchClient(
    endpoint=AI_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=AzureKeyCredential(AI_SEARCH_KEY)
 )

# Provide instructions to the model
GROUNDED_PROMPT="""
You are an AI assistant that helps users learn from the information found in the source material.
Answer the query using only the sources provided below.
Use bullets if the answer has multiple points.
If the answer is longer than 3 sentences, provide a summary.
Answer ONLY with the facts listed in the list of sources below.
If there isn't enough information below, say you don't know.
Do not generate answers that don't use the sources below.
Query: {query}
Sources:\n{sources}
"""

# Provide the query. Notice it's sent to both the search engine and the LLM.
query="What are the Barren Grounds"

# Set up the search results and the chat thread.
# Retrieve the selected fields from the search index related to the question.
# Only fields that exist in the index defined in this notebook are selected
# ("chunk" and "locations" are not part of that schema).
search_results = search_client.search(
    search_text=query,
    top=1,
    select=["title", "subfield"]
)
# One "title:content" line per retrieved document.
sources_formatted = "\n".join([f'{document["title"]}:{document["subfield"]}' for document in search_results])

response = openai_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": GROUNDED_PROMPT.format(query=query, sources=sources_formatted)
        }
    ],
    model=model
)

print(response.choices[0].message.content)

### Optional Experimentation

Notice that whatever question you ask the search will bring back info it thinks is relevant.  
However if the answer to the question is not in the retrieved data the LLM will respond "I don't know".  

*For the NASA dataset, a good question is "How much of the Earth's surface is covered with water?"*

Currently the search is passing the whole query from the user and using it to do a basic query.
- What are the limitations of this method?
- What improvements could be made?