In [1]:
! pip install python-dotenv langchain langchain_openai langchain-community langchainhub openai tiktoken azure-ai-documentintelligence azure-identity azure-search-documents==11.4.0b8
! pip install langchain --upgrade

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [2]:
# Global Variables
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# Define the fields for the index of the corpus.
# `fields` is handed to AzureSearch(fields=...) by feed_corpus and ask_corpus
# when they open the "corpus" index.

fields = [
    # Document key; also filterable.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    # Full-text searchable text of the entry.
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Embedding of the entry: a collection of single-precision floats searched
    # with the "default" vector search configuration.
    # NOTE(review): 1536 presumably matches the output size of the embedding
    # deployment used by the notebook — confirm if the deployment changes.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration="default",
    ),
    # Metadata stored as a searchable string.
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store which engine (vector store) produced an answer
    SearchableField(
        name="engine",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

vector_stores = ["pdf-sample", "mac"]  # List of vector stores to query for answers

In [3]:
# Set environment variables
# Called multiple times in different cells, so that you can run them independently
from dotenv import load_dotenv
from langchain.embeddings import AzureOpenAIEmbeddings
import os

from langchain import hub
from langchain.chat_models import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

from dotenv import load_dotenv
import os


def set_env():
    """Load configuration from ``.env`` and publish it as notebook globals.

    Calls ``load_dotenv()`` and then assigns these module-level names from the
    environment: ``doc_intelligence_endpoint``, ``doc_intelligence_key``,
    ``api_version``, ``ada_deployed_model``, ``gpt4_deployed_model``,
    ``vector_store_address`` and ``vector_store_password``. Each of them is
    ``None`` when its environment variable is not set.

    Raises:
        RuntimeError: if ``openai.api_type``, ``AZURE_OPENAI_ENDPOINT`` or
            ``AZURE_OPENAI_API_KEY`` is missing from the environment. The
            message lists every missing name.
    """
    global doc_intelligence_endpoint, doc_intelligence_key
    global api_version, ada_deployed_model, gpt4_deployed_model
    global vector_store_address, vector_store_password

    load_dotenv()  # take environment variables from .env.

    # Writing os.environ[name] = os.getenv(name) is a no-op when the variable
    # exists and an opaque "str expected, not NoneType" TypeError when it does
    # not, so validate the three mandatory settings explicitly instead.
    required = ("openai.api_type", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
    missing = [name for name in required if os.getenv(name) is None]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
    api_version = os.getenv("openai.api_version")
    ada_deployed_model = os.getenv("ada")
    gpt4_deployed_model = os.getenv("gpt4")
    vector_store_address = os.getenv("AZURE_SEARCH_ENDPOINT")
    vector_store_password = os.getenv("AZURE_SEARCH_ADMIN_KEY")


set_env()

In [4]:
# Defines the function to chunk and embed a document


def process_file(file_path, index_name):
    """Chunk one document by markdown headers, embed it, and upload it to an index.

    Args:
        file_path: path of the document handed to Azure AI Document Intelligence.
        index_name: name of the Azure Search index that receives the chunks.

    Prints the number of chunks produced and returns nothing.
    """
    # Refresh the notebook-level settings before any of them is read.
    set_env()

    # Load the document through Document Intelligence ("prebuilt-layout" model).
    layout_docs = AzureAIDocumentIntelligenceLoader(
        file_path=file_path,
        api_key=doc_intelligence_key,
        api_endpoint=doc_intelligence_endpoint,
        api_model="prebuilt-layout",
    ).load()

    # Only the first returned document is split, on markdown header levels 1-3.
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    )
    chunks = header_splitter.split_text(layout_docs[0].page_content)

    print(f"Length of splits: {len(chunks)}")

    # Point the openai module at the Azure endpoint before building the embedder.
    import openai

    openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")

    embedder = AzureOpenAIEmbeddings(
        azure_deployment=ada_deployed_model,
        openai_api_version=api_version,  # e.g., "2023-07-01-preview"
    )

    # Open the target index and push every chunk into it.
    search_index: AzureSearch = AzureSearch(
        azure_search_endpoint=vector_store_address,
        azure_search_key=vector_store_password,
        index_name=index_name,
        embedding_function=embedder.embed_query,
    )
    search_index.add_documents(documents=chunks)

In [5]:
# Let's create two indexes.
# You can comment out these two lines after the first run:
# once created, the indexes exist in your Azure Search service, so there is no need to create them again.
# For practicality we create two indexes here, but in a final implementation these could be entirely different vector databases/stores.
process_file("../sample_docs/pdf-sample.pdf", "pdf-sample")
process_file("../sample_docs/Mac.pdf", "mac")

Length of splits: 1


  warn_deprecated(


Length of splits: 2


In [6]:
# Defines the function that stores a question together with the vector store (engine) that answered it
# This builds our corpus of knowledge: a record of which embedding store (vector store or DB) to search for a question


def feed_corpus(question, engine):
    """Record in the "corpus" index that `engine` answered `question`.

    Args:
        question: the question text; it is stored as the entry's text and
            again under the "content" metadata key.
        engine: name of the vector store that produced the answer, stored
            under the "engine" metadata key.

    Returns nothing; the only effect is one entry added to the "corpus" index.
    """
    # Refresh the notebook-level settings before any of them is read.
    set_env()

    import openai

    openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")

    embedder = AzureOpenAIEmbeddings(
        azure_deployment=ada_deployed_model,
        openai_api_version=api_version,  # e.g., "2023-07-01-preview"
    )

    # Endpoint and admin key are read straight from the environment here.
    search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
    search_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

    corpus_index: AzureSearch = AzureSearch(
        azure_search_endpoint=search_endpoint,
        azure_search_key=search_key,
        index_name="corpus",
        embedding_function=embedder.embed_query,
        fields=fields,
    )

    texts = [question]
    metadatas = [{"content": question, "engine": engine}]
    corpus_index.add_texts(texts, metadatas)

In [7]:
def ask_corpus(question, top=3):
    """Ask the "corpus" index which vector store is known to answer `question`.

    Args:
        question: the question to look up; it is the text sent to the retriever.
        top: number of nearest corpus entries to retrieve (the retriever's ``k``).

    Returns:
        ``(0, engine)`` when at least one corpus entry is retrieved, where
        ``engine`` is the "engine" metadata value of the first entry;
        ``(1, "none")`` when nothing is retrieved (unix exit-code style).
    """
    # Set environment variables
    set_env()
    import openai

    openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")

    aoai_embeddings = AzureOpenAIEmbeddings(
        azure_deployment=ada_deployed_model,
        openai_api_version=api_version,  # e.g., "2023-07-01-preview"
    )

    vector_store: AzureSearch = AzureSearch(
        azure_search_endpoint=vector_store_address,
        azure_search_key=vector_store_password,
        index_name="corpus",
        embedding_function=aoai_embeddings.embed_query,
        fields=fields,
    )

    # Honour the caller's `top` instead of a hard-coded k.
    retriever = vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": top}
    )

    # Search for the caller's question, not a placeholder string.
    retrieved_docs = retriever.get_relevant_documents(question)

    if not retrieved_docs:  # The corpus does not know which DB has the answer
        return 1, "none"

    # There could be multiple matches; this version only uses the first one.
    # "engine" is our custom metadata field holding the vector store's name.
    engine = retrieved_docs[0].metadata["engine"]

    return 0, engine

In [8]:
# Cell for testing the functions
# feed_corpus("Who can read a PDF file?", "pdf-sample")
# print(ask_corpus("Who can read a PDF file?", 3))

In [9]:
# Cell for setting up the RAG
def ask_llm_rag(index_name, question):
    """Answer `question` with retrieval-augmented generation over one index.

    Args:
        index_name: name of the Azure Search index to retrieve context from.
        question: the user question; used for the probe retrieval and as the
            input of the RAG chain.

    Returns:
        ``(found, answer)``. ``found`` is 0 when the probe retrieval returned
        documents and the answer does not contain one of the known
        "no information" phrases, and 1 otherwise (unix exit-code style).
        ``answer`` is the text produced by the chain.
    """
    # Phrases the model has been seen to use when the context lacks the answer.
    # Matching is case-sensitive substring search. Improve this part!
    no_answer_phrases = (
        "The provided context does not contain information",
        "The context provided does not contain information",
    )

    # Set environment variables
    set_env()

    aoai_embeddings = AzureOpenAIEmbeddings(
        azure_deployment=ada_deployed_model,
        openai_api_version=api_version,  # e.g., "2023-07-01-preview"
    )
    vector_store: AzureSearch = AzureSearch(
        azure_search_endpoint=vector_store_address,
        azure_search_key=vector_store_password,
        index_name=index_name,
        embedding_function=aoai_embeddings.embed_query,
    )
    retriever = vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}
    )

    # Probe the store with the caller's question, not a placeholder string.
    retrieved_docs = retriever.get_relevant_documents(question)

    # 0 = this vector store has candidate documents (success, unix style), 1 = none.
    found = 0 if retrieved_docs else 1

    # Use a prompt for RAG that is checked into the LangChain prompt hub (https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=989ad331-949f-4bac-9694-660074a208a7)
    prompt = hub.pull("rlm/rag-prompt")
    llm = AzureChatOpenAI(
        openai_api_version=api_version,  # e.g., "2023-07-01-preview"
        azure_deployment=gpt4_deployed_model,
        temperature=0,
    )

    def format_docs(docs):
        # Join the retrieved chunks into a single context string.
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    answer = rag_chain.invoke(question)

    # Even with retrieved documents, the model may report that the context is
    # insufficient; treat that as "this vector store does not have the answer".
    if any(phrase in answer for phrase in no_answer_phrases):
        found = 1

    return (found, answer)

In [10]:
# Cell for testing the RAG
# (found, ans) = ask_llm_rag("pdf-sample", "Who can read a PDF file?")
# print(ans)
# print(found)

In [11]:
# Query multiple vector stores, first the corpus, then all vector stores. Also update the corpus if the answer is found in a vector store.


def query_with_corpus(question):
    """Answer `question`, using the corpus to pick a vector store when possible.

    The corpus is asked first. If it names a vector store, only that store is
    queried; when that store has the answer it is printed and the function is
    done. If the corpus names no store, or the named store turns out not to
    have the answer, every store in ``vector_stores`` (except one already
    tried) is queried, and each store that answers is recorded in the corpus
    through ``feed_corpus``.

    Args:
        question: the question to answer.

    Prints progress and answers; returns nothing.
    """
    # Set environment variables
    set_env()
    print("  -- Asking the reference corpus first -- ")
    (found, vector_db) = ask_corpus(question)

    already_tried = None  # store queried via the corpus hint, if any
    if found == 0:  # Found in the corpus (success, exit code 0, unix style :) )
        print(
            "    -- The corpus knows which vector store has information about:  "
            + question
            + " -- "
        )
        print("    -- Querying specific vector store: " + vector_db)
        (found, answer) = ask_llm_rag(vector_db, question)
        if found == 0:  # The hinted vector store has the answer
            print("The answer is: " + answer)
            return

        print(
            "Vector store "
            + vector_db
            + " does not have the answer for the question: "
            + question
        )
        # The corpus hint can be wrong (nearest-neighbour retrieval returns the
        # closest stored question even when it is unrelated), so do not give
        # up here: fall back to the remaining vector stores.
        print("    -- Falling back to querying the other vector stores -- ")
        already_tried = vector_db
    else:
        print(
            "    -- The corpus does not know which vector store has information about:  "
            + question
            + " -- "
        )
        print("    -- Querying all vector stores -- ")

    for store in vector_stores:
        if store == already_tried:
            continue  # already queried above without success
        print(
            "  -- All vector stores will be queried, now Querying vector store: "
            + store
        )
        (found, answer) = ask_llm_rag(store, question)
        if found == 0:  # Found in that vector store (success, exit code 0)
            # Remember which store answered so the next lookup can go straight to it.
            feed_corpus(question, store)
            print(
                "    -- The corpus has been updated: "
                + store
                + " has knowledge about the answer for the question: "
                + question
                + " -- "
            )
            print("The answer is: " + answer)
        else:
            print(
                "    --  Vector store "
                + store
                + " does not have the answer for the question: "
                + question
            )

In [12]:
# Multiple indexes (in the future, multiple vector stores)
# The corpus is the vector store that records which vector store has the answer

# We will do two iterations
# First iteration: our corpus will be empty, so we will need to query all vector stores
# Second iteration: we will have a corpus and we will know which vector store has the answer

# First iteration
print("-- First iteration -- ")
query_with_corpus("Describe characteristics of an iMac G3?")

print("-- Second iteration -- ")
# Second iteration
# We expect the corpus to know which vector store has the answer, avoiding querying all vector stores
query_with_corpus("Describe characteristics of an iMac G3?")

-- First iteration -- 
  -- Asking the reference corpus first -- 
    -- The corpus does not know which vector store has information about:  Describe characteristics of an iMac G3? -- 
    -- Querying all vector stores -- 
  -- All vector stores will be queried, now Querying vector store: pdf-sample


  warn_deprecated(


    --  Vector store pdf-sample does not have the answer for the question: Describe characteristics of an iMac G3?
  -- All vector stores will be queried, now Querying vector store: mac
    -- The corpus has been updated: mac has knowledge about the answer for the question: Describe characteristics of an iMac G3? -- 
The answer is: The iMac G3, introduced by Apple in 1998, was a significant product that helped push the Mac mainstream. It was part of a focused product oversight by Steve Jobs after his return to Apple in 1996. The iMac G3 was part of the transition to the OS X operating system and the shift to Intel processors from 2005 to 2006.
-- Second iteration -- 
  -- Asking the reference corpus first -- 
    -- The corpus knows which vector store has information about:  Describe characteristics of an iMac G3? -- 
    -- Querying specific vector store: mac
The answer is: The iMac G3, introduced by Apple in 1998, was a significant product that helped push the Mac mainstream. It was 