In [None]:
! pip install azure-search-documents==11.4.0a20230509004
! pip install openai
! pip install python-dotenv

In [1]:
# Import required libraries
import pandas as pd  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)  
  
# Configure environment variables  
load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  
key = os.getenv("AZURE_SEARCH_API_KEY")  
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("OPENAI_API_VERSION")  
credential = AzureKeyCredential(key)

In [2]:
with open('output_data/jsonQAdata.json', 'r', encoding='utf-8') as file:
    input_data = json.load(file)


In [5]:
# Function to generate embeddings for title and content fields, also used for query embeddings
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine="embed-testing")
    embeddings = response['data'][0]['embedding']
    return embeddings

for item in input_data:
    title = item['question']
    content = item['value']
    title_embeddings = generate_embeddings(title)
    content_embeddings = generate_embeddings(content)
    item['titleVector'] = title_embeddings
    item['contentVector'] = content_embeddings
    item['@search.action'] = 'upload'

# Output embeddings to docVectors.json file
with open("output_data/docVectors.json", "w") as f:
    json.dump(input_data, f)

In [5]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="question", type=SearchFieldDataType.String,
                    searchable=True, retrievable=True),
    SearchableField(name="value", type=SearchFieldDataType.String,
                    searchable=True, retrievable=True),
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
]

vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": "cosine"
            }
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="question"),
        prioritized_content_fields=[SemanticField(field_name="value")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 vector-search-qai created


In [6]:
# Upload some documents to the index
with open('output_data/docVectors.json', 'r') as file:  
    documents = json.load(file)  

documents1 = []
documents2 = []
documents3 = []
documents4 = []
documents5 = []
documents6 = []
j = 0

for i in documents:
    if j < 1000:
        documents1.append(i)
    if j < 2000:
        documents2.append(i)
    if j < 3000:
        documents3.append(i)
    if j < 4000:
        documents4.append(i)
    if j < 5000:
        documents5.append(i)
    else:
        documents6.append(i)
    j += 1

documentsAlpha = [documents1, documents2, documents3, documents4, documents5, documents6]

print(len(documents1))

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

for x in documentsAlpha:
    result = search_client.upload_documents(x)  
    print(f"Uploaded {len(x)} documents") 





1000


KeyError: 'error_map'

In [None]:
# Pure Vector Search
query = "Information about NetApp Cloud Manager"  
  
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))  
  
results = search_client.search(  
    search_text="",  
    vector=Vector(value=generate_embeddings(query), k=3, fields="contentVector"),  
    select=["question", "value"] 
)  
  
for result in results:  
    print(f"Encoded Question: {result['question']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['value']}")  