In [None]:
! pip install numpy
! pip install openai
! pip install python-dotenv
! pip install azure-core
! pip install azure-cosmos
! pip install tenacity
! pip install --index-url=https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ azure-search-documents==11.4.0a20230509004

In [34]:
import json
import datetime
import time

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import exceptions, CosmosClient, PartitionKey
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    IndexingSchedule,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SimpleField,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SearchIndexerDataSourceConnection
)

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [36]:
from dotenv import dotenv_values

# Name of the .env file that holds the service endpoints and keys.
# Follow the example.env template and change this to your own .env file name.
env_name = "rag.env"
config = dotenv_values(env_name)

# dotenv_values() yields an empty mapping when the file cannot be found, so check
# every setting this cell reads up front and report all gaps in a single error
# instead of failing on the first missing key only.
_required_settings = (
    "cosmos_db_api_endpoint",
    "cosmos_db_api_key",
    "cosmos_db_connection_string",
    "cognitive_search_api_endpoint",
    "cognitive_search_api_key",
    "openai_api_type",
    "openai_api_key",
    "openai_api_endpoint",
    "openai_api_version",
    "openai_embeddings_deployment",
    "openai_completions_deployment",
)
_missing_settings = [name for name in _required_settings if name not in config]
if _missing_settings:
    raise KeyError(
        f"Settings missing from {env_name!r} (is the file present?): "
        + ", ".join(_missing_settings)
    )

# Azure Cosmos DB connection settings.
cosmosdb_endpoint = config['cosmos_db_api_endpoint']
cosmosdb_key = config['cosmos_db_api_key']
cosmosdb_connection_str = config['cosmos_db_connection_string']

# Azure Cognitive Search connection settings.
cog_search_endpoint = config['cognitive_search_api_endpoint']
cog_search_key = config['cognitive_search_api_key']

# OpenAI client settings (module-level configuration) and deployment names.
openai.api_type = config['openai_api_type']
openai.api_key = config['openai_api_key']
openai.api_base = config['openai_api_endpoint']
openai.api_version = config['openai_api_version']
embeddings_deployment = config['openai_embeddings_deployment']
completions_deployment = config['openai_completions_deployment']

In [6]:
import pandas as pd
from datasets import load_dataset

# Fetch the "squad_v2" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("squad_v2")

# Turn the "train" split into a pandas DataFrame for filtering in later cells.
df = pd.DataFrame.from_dict(dataset["train"])

Downloading builder script: 100%|██████████| 5.28k/5.28k [00:00<00:00, 5.22MB/s]
Downloading metadata: 100%|██████████| 2.40k/2.40k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 8.02k/8.02k [00:00<?, ?B/s]
Downloading data: 42.1MB [00:01, 28.3MB/s]/2 [00:00<?, ?it/s]
Downloading data: 4.37MB [00:00, 33.8MB/s]/2 [00:02<00:02,  2.71s/it]
Downloading data files: 100%|██████████| 2/2 [00:03<00:00,  1.63s/it]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 169.52it/s]
Generating train split: 100%|██████████| 130319/130319 [00:04<00:00, 28626.70 examples/s]
Generating validation split: 100%|██████████| 11873/11873 [00:00<00:00, 29236.26 examples/s]


In [20]:
# Keep only the rows whose title is exactly "Samurai".
samurai_df = df.loc[df["title"].eq("Samurai")]

In [30]:
from sentence_transformers import SentenceTransformer

# Embedding model used to vectorise the questions.
# NOTE(review): the e5 model family documents "query: " / "passage: " input
# prefixes — confirm whether they should be added to the sentences below.
model = SentenceTransformer("intfloat/e5-small-v2")

# The text to encode: every question of the Samurai subset, as a plain list.
sentences = samurai_df["question"].to_list()

# model.encode() computes the embeddings for the whole list in one call.
samurai_questions_embedded = model.encode(sentences)

In [40]:
# Column-oriented bundle: the embeddings and the sentences they were computed from
# sit under separate keys, index-aligned with each other.
data = dict(
    embeddings=samurai_questions_embedded,
    payload=sentences,
)

## Upload data to Cosmos DB

In [37]:
# Client used for every call to the Azure Cosmos DB account,
# authenticated with the endpoint and key read from the .env file.
client = CosmosClient(url=cosmosdb_endpoint, credential=cosmosdb_key)

In [38]:
# Make sure the "cosmosvectorstore" database is available in Azure Cosmos DB.
try:
    database = client.create_database_if_not_exists(id="cosmosvectorstore")
except exceptions.CosmosResourceExistsError:
    print("Database already exists.")
else:
    # Only reached when the call above returned a database handle.
    print(f"Database created: {database.id}")

Database created: cosmosvectorstore


In [39]:
# Make sure the "AzureServices" container is available in the database.
# Documents are partitioned on their "/id" field.
partition_key_path = PartitionKey(path="/id")
try:
    container = database.create_container_if_not_exists(
        id="AzureServices",
        partition_key=partition_key_path,
        offer_throughput=400,
    )
except exceptions.CosmosResourceExistsError:
    print("Container already exists.")
else:
    # Only reached when the call above returned a container handle.
    print(f"Container created: {container.id}")

Container created: AzureServices


In [42]:
# Create one document per question and insert it into the container created above.
# `data` is a dict of two parallel columns ("payload" and "embeddings"), so iterating
# over it directly only yields its string keys; pair the columns up row by row instead.
for position, (payload, embedding) in enumerate(zip(data["payload"], data["embeddings"])):
    data_item = {
        # Every document needs a string "id"; it is also this container's partition key.
        "id": str(position),
        "payload": payload,
        # Plain Python floats keep the request body JSON-serializable.
        "embeddings": [float(component) for component in embedding],
    }
    try:
        container.create_item(body=data_item)

    except exceptions.CosmosResourceExistsError:
        # Raised when a document with this id is already stored (e.g. on a re-run).
        print("Data item already exists.")