In [None]:
from dotenv import load_dotenv
import os
load_dotenv(override=True)   

# Read every service setting from the process environment in one pass.
# os.getenv yields None for any variable that is not set.
(
    search_endpoint,
    index_name,
    admin_key,
    openai_endpoint,
    openai_key,
    ada002_deployment,
    large3_deployment,
    gpt_deployment,
) = (
    os.getenv(env_name)
    for env_name in (
        "AZURE_SEARCH_SERVICE_ENDPOINT",
        "AZURE_SEARCH_INDEX",
        "AZURE_SEARCH_ADMIN_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_KEY",
        "AZURE_OPENAI_ADA002_EMBEDDING_DEPLOYMENT",
        "AZURE_OPENAI_3_LARGE_EMBEDDING_DEPLOYMENT",
        "AZURE_OPENAI_GPT_DEPLOYMENT",
    )
)

# Show which search endpoint was picked up.
print(f"search_endpoint: {search_endpoint}")

In [7]:
from azure.core.credentials import AzureKeyCredential

from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.models import VectorizedQuery,VectorFilterMode
from azure.search.documents.indexes.models import (
    ComplexField,
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    HnswParameters,  
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    VectorSearchAlgorithmMetric,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)


# Client for index-level operations on the search service, authenticated with
# the admin key.
search_credential = AzureKeyCredential(admin_key)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
# NOTE: no SearchIndexerClient is instantiated here (a draft line for it was left disabled).

In [None]:
# Field definitions for the index schema.
def _facet_text_field(field_name, data_type=SearchFieldDataType.String):
    """Build a SearchableField that is also filterable and facetable."""
    return SearchableField(name=field_name, type=data_type, filterable=True, facetable=True)


fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # Metadata fields: searchable, filterable and facetable.
    # ("symp_code_three" is currently disabled and therefore not listed.)
    *[
        _facet_text_field(field_name)
        for field_name in (
            "type",
            "iso_cd",
            "language",
            "product_group_code",
            "product_code",
            "new_product_code",
            "new_product_name",
            "product_model_code",
            "data_id",
            "mapping_key",
            "symp_code_one",
            "symp_code_two",
        )
    ],
    # NOTE(review): SearchableField is a helper for string fields, so the Int64
    # type passed here may be ignored by the SDK -- confirm whether a
    # SimpleField(Int64) was intended before changing the schema.
    _facet_text_field("chunk_num", SearchFieldDataType.Int64),
    *[
        _facet_text_field(field_name)
        for field_name in ("file_name", "pages", "url", "title", "main_text_path")
    ],
    # Main body text: searchable only.
    SearchableField(name="main_text", type=SearchFieldDataType.String),
    # Vector fields for the body text, one per embedding size (1536 and 3072
    # dimensions); both use the "myHnswProfile" vector search profile.
    SearchField(
        name="main_text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchField(
        name="main_text_vector_3",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=3072,
        vector_search_profile_name="myHnswProfile",
    ),
]
# Semantic configuration: "title" is the title field and "main_text" is the
# content field.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="title"),
    content_fields=[SemanticField(field_name="main_text")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Suggester definition handed to SearchIndex; its source field is "title".
suggesters = [
    {
        "name": "sg",
        "searchMode": "analyzingInfixMatching",
        "sourceFields": ["title"],
    },
]

# Vector search configuration: an HNSW algorithm and an exhaustive KNN
# algorithm, both using the cosine metric, each exposed through its own profile.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(metric=VectorSearchAlgorithmMetric.COSINE),
)

# Profiles are what fields refer to by name; each one points at an algorithm above.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
)

# Create the index when it does not exist yet, otherwise update it in place.
#
# Only a "not found" response from get_index is treated as "the index is
# missing". Any other failure (authentication, network, a rejected schema
# update, ...) now propagates instead of being reported as a missing index and
# retried with the very same request.
from azure.core.exceptions import ResourceNotFoundError

try:
    # Probe for an existing index with this name.
    existing_index = index_client.get_index(index_name)
except ResourceNotFoundError:
    existing_index = None

# The index definition is the same for both the create and the update path.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
    suggesters=suggesters,
)

if existing_index is not None:
    print(f"인덱스 '{index_name}'가 이미 존재합니다. 업데이트를 진행합니다.")
    result = index_client.create_or_update_index(index)
    print(f"인덱스 '{result.name}' 업데이트 완료")
else:
    print(f"인덱스 ( {index_name} ) 가 존재하지 않습니다. 새로운 인덱스를 생성합니다.")
    result = index_client.create_or_update_index(index)
    print(f"인덱스 ( {result.name} ) 생성 완료.")

In [9]:
import json

# Load the documents from the JSON file.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default text encoding.
with open('data/uk-en.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


# Upload the documents to the index.
# The sender is used as a context manager so it is always closed, even when an
# upload raises part-way through.
with SearchIndexingBufferedSender(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(admin_key)
) as indexing_sender:
    # Documents are queued one at a time; the sender decides when to send a batch.
    for item in data:
        indexing_sender.upload_documents(documents=[item])

    # Send whatever is still buffered. flush() is documented to return True
    # when some actions failed, so the result decides which message is shown.
    had_errors = indexing_sender.flush()

if had_errors:
    print("WARNING: some entries could not be uploaded to the index.")
else:
    print("entries have been uploaded to the index.")

Top 1000 entries have been uploaded to the index.
