In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import sys, os

print(f"Installing packages into environment {sys.executable}")

Installing packages into environment c:\anaconda3\envs\cogsearch02\python.exe


In [None]:
!{sys.executable} -m pip install llama-index openai langchain azure-identity

In [3]:
logger = logging.getLogger(__name__)

# Create Cognitive Search Index

In [4]:
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    HnswParameters,
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  

from azure.search.documents.indexes import SearchIndexClient

from azure.core.credentials import AzureKeyCredential  

from typing import Any

In [6]:
def drop_and_create_index(index_name: str, service_endpoint: str, credential: Any):
    index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
    if index_name in index_client.list_index_names():
        logger.info("Index {index_name} exists, dropping index")
        index_client.delete_index(index_name)

    create_search_index(index_name, service_endpoint, credential)
    



def create_search_index(index_name: str, service_endpoint: str, credential: Any):
    #if args.verbose: print(f"Ensuring search index {args.index} exists")
    index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
    if index_name not in index_client.list_index_names():
        index = SearchIndex(
            name=index_name,
            fields=[
                SimpleField(name="id", type="Edm.String", key=True),
                SearchableField(name="content", type="Edm.String", analyzer_name="en.microsoft"),
                SearchField(name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), 
                            hidden=False, searchable=True, filterable=False, sortable=False, facetable=False,
                            vector_search_dimensions=1536, vector_search_configuration="default"),
                SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
                SimpleField(name="sourcepage", type="Edm.String", filterable=True, facetable=True),
                SimpleField(name="sourcefile", type="Edm.String", filterable=True, facetable=True),
                SearchableField(name="li_jsonMetadata", type="Edm.String", filterable=True),
                SimpleField(name="li_id", type="Edm.String", filterable=True),
                SimpleField(name="li_doc_id", type="Edm.String", filterable=True)
            ],
            semantic_settings=SemanticSettings(
                configurations=[SemanticConfiguration(
                    name='default',
                    prioritized_fields=PrioritizedFields(
                        title_field=None, prioritized_content_fields=[SemanticField(field_name='content')]))]),
                vector_search=VectorSearch(
                    algorithm_configurations=[
                        HnswVectorSearchAlgorithmConfiguration(
                            name="default",
                            kind="hnsw",
                            parameters={
                                "m": 4,
                                "efConstruction": 400,
                                "efSearch": 1000,
                                "metric": "cosine"
            }
                        )
                    ]
                )        
            )
        logger.info(f"Creating {index_name} search index")
        index_client.create_index(index)
    else:
        logger.info(f"Search index {index_name} already exists")

In [7]:
key = "UWTIPqJwmA03Cjew8KmP9OkxidP4whkfxVN0EwzmJXAzSeAMib7T"
credential = AzureKeyCredential(key)

service_endpoint = "https://llmdevcog001.search.windows.net"
index_name = "aoaicogsearchtest05"


In [None]:
#key = "2UCPzVfO4hVvxvRFFTbreE6MFSUTEmSgWYGpniC6pXAzSeBiVDiN"
#credential = AzureKeyCredential(key)

#service_endpoint = "https://gptkb-gdm7wgiiihc5y.search.windows.net"
#index_name = "aoaientsearchrio"

In [None]:
# key = "2UCPzVfO4hVvxvRFFTbreE6MFSUTEmSgWYGpniC6pXAzSeBiVDiN"
# credential = AzureKeyCredential(key)

# service_endpoint = "https://gptkb-gdm7wgiiihc5y.search.windows.net"



In [8]:
create_search_index(index_name=index_name, service_endpoint=service_endpoint, credential=credential)

INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': '13e3afae-4675-11ee-855c-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': '13e3afae-4675-11ee-855c-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 200
Response headers:
    'Transfer-Encoding': 'chunk

# Populate Index

In [9]:
from llama_index import download_loader
from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from llama_index import LLMPredictor
from langchain.embeddings import OpenAIEmbeddings
from llama_index import LangchainEmbedding
from llama_index import PromptHelper
from azure.search.documents import SearchClient 
from azure.core.credentials import AzureKeyCredential  
from llama_index.vector_stores import CognitiveSearchVectorStore
from llama_index import VectorStoreIndex, StorageContext, ServiceContext
from llama_index import load_index_from_storage
from typing import Dict, Any 
import re
import base64 
import os
from azure.identity import DefaultAzureCredential
#from pypdf import PdfReader, PdfWriter
from azure.storage.blob import BlobServiceClient
import io

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


In [10]:
from llama_index.vector_stores import CognitiveSearchVectorStore

In [11]:
import logging

logger = logging.getLogger(__name__)

In [12]:
def get_token():
    # Request credential
    default_credential = DefaultAzureCredential()
    token = default_credential.get_token("https://cognitiveservices.azure.com/.default")
    return token.token

def get_vector_store(service_endpoint: str, index_name: str, credential: Any) -> CognitiveSearchVectorStore:
    search_client = SearchClient(service_endpoint, index_name, credential=credential) 
    vector_store = CognitiveSearchVectorStore(search_client,
                                          id_field_key = "li_id",
                                          chunk_field_key= "content",
                                          embedding_field_key = "embedding",
                                          metadata_field_key= "li_jsonMetadata",
                                          doc_id_field_key = "li_doc_id",
                                          index_mapping = cogsearch_ent_index_mapping
    )
    return vector_store

def get_llm_completion(openai_api_version: str, openai_api_key: str, openai_api_base: str, model_kwargs: Any) -> LLMPredictor:
    llm = AzureOpenAI(temperature=0.9, 
                  deployment_name="text-davinci-003", 
                  model_name="text-davinci-003", 
                  openai_api_version= openai_api_version, 
                  openai_api_key=openai_api_key, 
                  model_kwargs=model_kwargs)

    # define LLM
    llm_predictor = LLMPredictor(llm)   
    return llm_predictor

def get_llm_chat(openai_api_version: str, openai_api_key: str, openai_api_base: str, model_kwargs: Any) -> LLMPredictor:
    llm = AzureChatOpenAI(temperature=0.9, 
                  deployment_name="gpt-35-turbo-16k", 
                  model_name="gpt-35-turbo-16k", 
                  openai_api_version= openai_api_version, 
                  openai_api_key=openai_api_key, 
                  openai_api_base=openai_api_base,
                  openai_api_type="azure_ad",
                  model_kwargs=model_kwargs)

    # define LLM
    llm_predictor = LLMPredictor(llm)   
    return llm_predictor


def get_embedding(openai_api_key: str, model_kwargs: Any) -> LangchainEmbedding:
    # load in AOAI embedding model from langchain
    oai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",
                              deployment="text-embedding-ada-002",
                              openai_api_key=openai_api_key,
                              openai_api_base=model_kwargs["api_base"],
                              openai_api_type=model_kwargs["api_type"],
                              openai_api_version=model_kwargs["api_version"],
                              chunk_size=1)
    embeddings = LangchainEmbedding(oai_embeddings)
    return embeddings


def filename_to_id(filename):
    filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    filename_hash = base64.b16encode(filename.encode('utf-8')).decode('ascii')
    return f"file-{filename_ascii}-{filename_hash}"

def blob_name_from_file_page(filename, page = 0):
    if os.path.splitext(filename)[1].lower() == ".pdf":
        return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
    else:
        return os.path.basename(filename)

def cogsearch_ent_index_mapping(
        enriched_doc: Dict[str, str], metadata: Dict[str, Any]
    ) -> Dict[str, str]:
        index_doc: Dict[str, str] = {}
        field_mapping = { "id": "id", "chunk": "content", "embedding": "contentVector", "doc_id": "document_id", "metadata": "jsonMetadata"}

        page = metadata.get("page_number", 0)
        sourcefile = metadata["file_name"]
        file_id = filename_to_id(sourcefile)
        # id, content, embedding, sourcepage, sourcefile

        index_doc["id"] = enriched_doc["id"] #f"{file_id}-page-{page}"
        index_doc["content"] = enriched_doc["chunk"]
        index_doc["embedding"] = enriched_doc["embedding"]
        index_doc["sourcepage"] = blob_name_from_file_page(sourcefile, page)
        index_doc["sourcefile"] = sourcefile
        index_doc["li_id"] = enriched_doc["id"]
        index_doc["li_jsonMetadata"] = enriched_doc["metadata"]
        index_doc["li_doc_id"] = enriched_doc["doc_id"]

        return index_doc

In [13]:
def load_documents(container_name: str, 
                blob: str,
                account_url: str,
                loader_hub_url: str): 
    default_credential = DefaultAzureCredential()

    loader = AzStorageBlobReader(container_name=container_name, 
        blob=blob,
        account_url=account_url, 
        credential=default_credential, 
        loader_hub_url=loader_hub_url) #, num_files_limit = 1)

    documents = loader.load_data()

    return documents

def index_document(documents: Any,
                azure_kwargs: Any,
                service_endpoint: str,
                index_name: str,
                persist_dir: str,
                credential: AzureKeyCredential
                ):
    
    default_credential = DefaultAzureCredential()
    token = default_credential.get_token("https://cognitiveservices.azure.com/.default")

    logger.info('Number of documents: {}'.format(len(documents)))


    llm_predictor = get_llm_chat(openai_api_version=azure_kwargs["api_version"], 
                        openai_api_key=token.token, 
                        openai_api_base=azure_kwargs["api_base"],
                        model_kwargs=azure_kwargs) 

    embeddings = get_embedding(openai_api_key=token.token, model_kwargs=azure_kwargs)


    vector_store = get_vector_store(service_endpoint, index_name, credential)

    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embeddings) # using default chunk limit

    try:
        logger.info(f"Loading storage context from {persist_dir}")
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, vector_store=vector_store)
    except FileNotFoundError as e:
        logger.info(f"Failed to load storage context from {persist_dir}, create default")
        storage_context = StorageContext.from_defaults(vector_store=vector_store)


    logger.info(f"Parsing nodes")
    nodes = service_context.node_parser.get_nodes_from_documents(documents)
    try:
        logger.info(f"Trying to load index from storage")
        cog_index = load_index_from_storage(storage_context=storage_context, service_context = service_context)
        cog_index.insert_nodes(nodes)
    except ValueError as e:
        logger.info(f"Creating a new index")
        cog_index = VectorStoreIndex(
        nodes=nodes, storage_context=storage_context, service_context=service_context)

    logger.info(f"Saving storage context to {persist_dir}")
    storage_context.persist(persist_dir=persist_dir)


    result = {"service_context": service_context, 
              "storage_context": storage_context, 
              "vector_index": cog_index}
    return result

In [14]:
storage_account_url = "https://aoaist002.blob.core.windows.net"
loader_hub_fork_url = "https://raw.githubusercontent.com/rivms/llama-hub/azblobmetadata/llama_hub"

# aoai_base = "https://demoaoai002.openai.azure.com/"
aoai_base = "https://demofcaoai004.openai.azure.com/"
azure_kwargs={"api_type": "azure_ad", "api_version": "2023-03-15-preview", "api_base": aoai_base}

# Test Delete

In [15]:
print(f"Vector store parameters, endpoint: {service_endpoint}, index-name: {index_name}")

Vector store parameters, endpoint: https://llmdevcog001.search.windows.net, index-name: aoaicogsearchtest05


In [None]:
from llama_index.vector_stores import CognitiveSearchVectorStore

search_client = SearchClient(service_endpoint, index_name, credential=credential) 
vector_store = CognitiveSearchVectorStore(search_client,
                                          id_field_key = "id",
                                          chunk_field_key= "content",
                                          embedding_field_key = "embedding",
                                          metadata_field_key= "li_jsonMetadata",
                                          doc_id_field_key = "li_doc_id")

In [None]:
vector_store._search_client

ref_doc_id = "97ff17d6-d5b7-4b13-8ffe-82187d24542c"
filter = f'li_doc_id eq \'{ref_doc_id}\''
results = vector_store._search_client.search(search_text="*", filter=filter)

In [None]:
docs_to_delete = []
for result in results:
    print(f"{result['id']}")
    doc = {}
    doc["id"] = result["id"]
    docs_to_delete.append(doc)

In [None]:
docs_to_delete

In [None]:
len(docs_to_delete)

In [None]:
if len(docs_to_delete) > 0:
    print(f"Deleting {len(docs_to_delete)}")
    vector_store._search_client.delete_documents(docs_to_delete)

In [None]:

vector_store.delete(ref_doc_id = ref_doc_id)

In [16]:
AzStorageBlobReader = download_loader("AzStorageBlobReader", loader_hub_fork_url, refresh_cache = True)

In [17]:
annual_reports_synergy = ["01-03-2017-annual-report-2016.pdf"]
persist_dir = "./temp_doc_store"
#split_pdf_storage_account = "stgdm7wgiiihc5y"
#split_pdf_container = "aoaientsearchsynergy"
load_container_name = "testdocs"

In [18]:
pdf = annual_reports_synergy[0]

documents = load_documents(container_name=load_container_name, blob=pdf, account_url=storage_account_url,
            loader_hub_url=loader_hub_fork_url)

INFO:azure.identity._credentials.environment:No environment configuration found.
No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
ManagedIdentityCredential will use IMDS
INFO:custom_loader:Adding metadata for C:\Users\rijumna\AppData\Local\Temp\tmpcer915nr\qt14khzr.pdf
Adding metadata for C:\Users\rijumna\AppData\Local\Temp\tmpcer915nr\qt14khzr.pdf
INFO:custom_loader:Start download of 01-03-2017-annual-report-2016.pdf
Start download of 01-03-2017-annual-report-2016.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request metho

In [None]:
annual_reports_rio = ["rt-annual-report-2022.pdf", "rt-annual-report-2021.pdf",
                      "rt-annual-report-2020.pdf", "rt-annual-report-2019.pdf",
                      "rt-annual-report-2018.pdf", "rt-annual-report-2017.pdf",
                      "rt-annual-report-2016.pdf", "rt-annual-report-2015.pdf",
                      "rt-annual-report-2014.pdf", "rt-annual-report-2013.pdf"]
persist_dir = "./temp_doc_store"
#split_pdf_storage_account = "stgdm7wgiiihc5y"
#split_pdf_container = "aoaientsearchsynergy"
load_container_name = "annualreportrio"

In [None]:
pdf = annual_reports_rio[4]

documents = load_documents(container_name=load_container_name, blob=pdf, account_url=storage_account_url,
            loader_hub_url=loader_hub_fork_url)

In [19]:
len(documents)

144

In [None]:
documents[0]

In [None]:
import json

j = {"field1": 5, "_node": { "field2": "value2", "field3": 54}}

json_str = json.dumps(j)

json_str


In [20]:
drop_and_create_index(index_name=index_name, service_endpoint=service_endpoint, credential=credential)

INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': '7ca832ed-4675-11ee-9eef-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': '7ca832ed-4675-11ee-9eef-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 200
Response headers:
    'Transfer-Encoding': 'chunk

In [None]:
service_endpoint

In [21]:
vector_store = get_vector_store(service_endpoint, index_name, credential)

In [None]:
vector_store

In [22]:
index_result = index_document(documents,
            azure_kwargs,
            service_endpoint,
            index_name,
            persist_dir,
            credential)

INFO:azure.identity._credentials.environment:No environment configuration found.
No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
ManagedIdentityCredential will use IMDS
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential
DefaultAzureCredential acquired a token from A

In [23]:
index_result

{'service_context': ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None), prompt_helper=PromptHelper(context_window=16234, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=LangchainEmbedding(model_name='text-embedding-ada-002', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001E33BBBA9D0>), node_parser=SimpleNodeParser(text_splitter=SentenceSplitter(chunk_size=1024, chunk_overlap=20, seperator=' ', paragraph_seperator='\n\n\n', secondary_chunking_regex='[^,.;。]+[,.;。]?', chunking_tokenizer_fn=<function sent_tokenize at 0x000001E34124DEE0>, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001E33BBBA9D0>, tokenizer=functools.partial(<bound method Encoding.encode of <Encoding 'gpt2'>>, allowed_special='all')), include_metadata=True, include_prev_next_rel=True, metadata_extractor=None, callback_manager=<llama_index.callbacks.base.Callba

In [24]:
cog_query_engine = index_result["vector_index"].as_query_engine(service_context=index_result["service_context"], verbose=True, vector_store_query_mode="hybrid")

In [25]:
prompt_query = "Please summarize the document?"
answer = cog_query_engine.query(prompt_query)

INFO:llama_index.vector_stores.cogsearch:Hybrid search with search text: Please summarize the document?
Hybrid search with search text: Please summarize the document?
INFO:llama_index.vector_stores.cogsearch:Vector search with supplied embedding
Vector search with supplied embedding
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://llmdevcog001.search.windows.net/indexes('aoaicogsearchtest05')/docs/search.post.search?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '34576'
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=none'
    'x-ms-client-request-id': '26c92657-4676-11ee-aa00-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
A body is sent with the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes('aoaicogsearchtest05')/docs/search.post.search?api-version=REDACTED'
Request me

In [27]:
print(answer)

The document is an annual report for Woodside Petroleum Ltd for the year 2016. It includes information on the company's governance practices, specifically in relation to shareholder engagement and executive remuneration. The report highlights the board's commitment to understanding shareholder views and making informed decisions. It also outlines changes made to the remuneration arrangements for the company's key management personnel based on shareholder feedback. These changes include using face value instead of fair value for the allocation of Variable Pay Rights (VPRs) and the removal of the second retesting for unvested awards. Overall, the report aims to improve transparency in how remuneration allocations are calculated and align the company's practices with industry standards.


In [None]:
vector_index = index_result["vector_index"]
service_context = index_result["service_context"]

In [None]:
service_context

In [None]:
default_credential = DefaultAzureCredential()
token = default_credential.get_token("https://cognitiveservices.azure.com/.default")

In [None]:
embedding = get_embedding(openai_api_key=token.token, model_kwargs=azure_kwargs)

In [None]:
llm_predictor = get_llm_chat(openai_api_version=azure_kwargs["api_version"], 
                        openai_api_key=token.token, 
                        openai_api_base=azure_kwargs["api_base"],
                        model_kwargs=azure_kwargs) 

In [None]:
llm_predictor.metadata

In [None]:
from llama_index.prompts.base import Prompt
from llama_index.prompts.prompt_type import PromptType

p=Prompt(
    "{query_str}", prompt_type=PromptType.SIMPLE_INPUT
)

llm_predictor.predict(p, query_str="The quick brown fox")

In [None]:
text_test = "The quick brown fox"

embedding.get_text_embedding(text_test)

In [None]:
prompt_helper = PromptHelper(context_window=3000, 
                             num_output=500, 
                             chunk_overlap_ratio=0.1, 
                             chunk_size_limit=1000)

In [None]:
service_context.prompt_helper = prompt_helper

In [None]:
service_context.embed_model.get_text_embedding(text_test)

In [None]:
service_context.llm_predictor.metadata.is_chat_model=True

In [None]:
service_context.llm_predictor.metadata

In [None]:
cog_query_engine = vector_index.as_query_engine(service_context=service_context, verbose=True)

In [None]:
prompt_query = "Please can you summarize Rio Tinto's annual report for 2022? Please include citations"

In [None]:
answer = cog_query_engine.query(prompt_query)

In [None]:
cog_query_engine = vector_index.as_chat_engine(
    chat_mode = "best",
    service_context = service_context,
    verbose=True
)

In [None]:
prompt_query = "Please can you summarize Rio Tinto's annual report for 2022? Please include citations"

#prompt_query = "What happens in a performance review? Please include citations"
#prompt_query = "What are the steps to create a report for a whistleblower?"

cog_response = cog_query_engine.chat(prompt_query)
print(cog_response)

In [None]:
import openai

In [None]:
def generate_prompt(prompt):
    capitalized_prompt = prompt.capitalize()

    return f'The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.\n\nHuman: Hello, who are you?\nAI: I am an AI created by OpenAI. How can I help you today?\nHuman: {capitalized_prompt}' 


In [None]:
openai.api_type = "azure"
openai.api_base = "https://demofcaoai004.openai.azure.com/"
openai.api_key = "651a7678936c41909d25b8f2aa32fa09"
openai.api_version = "2023-05-15"

In [None]:
engine = "gpt-35-turbo-16k"
model = "gpt-35-turbo-16k" 

temperature = 0.7
max_response_tokens = 1024

In [None]:
chat_prompt = "A joke about clowns please"



messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Does Azure OpenAI support customer managed keys?"},
        {"role": "assistant", "content": "Yes, customer managed keys are supported by Azure OpenAI."},
        {"role": "user", "content": "Do other Azure AI services support this too?"}
    ]

In [None]:
response = openai.ChatCompletion.create(
    engine = engine,
    messages = messages,
    temperature = temperature,
    max_tokens=max_response_tokens
  )


print(response['choices'][0]['message']['content'])

In [32]:
from llama_index.vector_stores.types import NodeWithEmbedding
from typing import Any, List
from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode

def create_sample_documents(n: int) -> List[NodeWithEmbedding]:
    nodes: List[NodeWithEmbedding] = []

    for i in range(n):
        nodes.append(
        [
            NodeWithEmbedding(
                node=TextNode(
                    text=f"test node text {i}",
                    id_=f"test node id {i}",
                    relationships={
                        NodeRelationship.SOURCE: RelatedNodeInfo(node_id=f"test doc id {i}")
                    },
                ),
                embedding=[0.5, 0.5],
            )
        ]
    )

    return nodes

In [33]:
create_sample_documents(12)

[[NodeWithEmbedding(node=TextNode(id_='test node id 0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='test doc id 0', node_type=None, metadata={}, hash=None)}, hash='40b8de9732bf78fef5d572211acd4a4a365418a07e62da8a573ce9785c2185fd', text='test node text 0', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), embedding=[0.5, 0.5])],
 [NodeWithEmbedding(node=TextNode(id_='test node id 1', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='test doc id 1', node_type=None, metadata={}, hash=None)}, hash='00bef46d1dbbc23601fe8f4dc6799e6212909ae281412ebc6d37d5df1877fa7a', text='test node text 1', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{c