In [1]:
import os
import time

import tqdm
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
import chromadb.utils.embedding_functions as embedding_functions
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter 

from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma

In [2]:
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        ValueError: if the variable is missing or set to an empty string.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} not found in environment variables")
    return value


# Settings for the Azure OpenAI chat model (LLM).
# Note: the key is read from AZURE_OPENAI_API_KEY but stored under a "_2" name.
AZURE_OPENAI_API_KEY_2 = _require_env('AZURE_OPENAI_API_KEY')
DEPLOYMENT_NAME_LLM = _require_env('DEPLOYMENT_NAME_LLM')
API_VERSION = _require_env('API_VERSION')
AZURE_ENDPOINT_LLM = _require_env('AZURE_ENDPOINT_LLM')

# Settings for the Azure OpenAI embedding model.
EMBEDDING_KEY = _require_env('EMBEDDING_KEY')
DEPLOYMENT_NAME_EMBEDDING = _require_env('DEPLOYMENT_NAME_EMBEDDING')
AZURE_ENDPOINT_EMBEDDING = _require_env('AZURE_ENDPOINT_EMBEDDING')
API_BASE_EMBEDDING = _require_env('API_BASE_EMBEDDING')

In [3]:
# NOTE: when the goal is to replace an entire document that has been chunked,
# deleting the old document and adding the new one is preferable to updating it.

# --- Data needed for the update ---------------------------------------------
# Default ID of the record to update. The ID can also be found from the other
# metadata keys; this notebook searches for it using the metadata below.
vector_db_id = 1
document_ids = vector_db_id

# Metadata used to locate the record.
document_name, added_by = "Agent_description", 'default'
first_page = last_page = -1

In [4]:
# Local, on-disk Chroma database (it does not run directly on Azure).
persist_directory = 'embedding_OpenAI/chroma/'
client = chromadb.PersistentClient(path=persist_directory)

# Collection names shared by the native collections and the LangChain wrappers.
FULL_TEXT_COLLECTION = "full_text"
QUESTIONS_COLLECTION = "questions_text"

# Embedding function used by the native ChromaDB collections.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key=EMBEDDING_KEY,
    api_base=API_BASE_EMBEDDING,
    api_type="azure",
    api_version="2023-05-15",
    model_name=DEPLOYMENT_NAME_EMBEDDING,
)

# Native collections: one for the full text, one for the generated questions.
collection_full_text = client.get_or_create_collection(
    name=FULL_TEXT_COLLECTION,
    embedding_function=embedding_function,
)
collection_questions_text = client.get_or_create_collection(
    name=QUESTIONS_COLLECTION,
    embedding_function=embedding_function,
)

# Embedding object used by the LangChain Chroma wrappers.
embedding_function_to_langchain = AzureOpenAIEmbeddings(
    model='text-embedding-3-small',
    api_key=EMBEDDING_KEY,
    deployment=DEPLOYMENT_NAME_EMBEDDING,
    azure_endpoint=AZURE_ENDPOINT_EMBEDDING,
)

# LangChain views over the same two collections, sharing the same client.
_wrapper_kwargs = {
    "client": client,
    "embedding_function": embedding_function_to_langchain,
}
chroma_instance_full_text = Chroma(collection_name=FULL_TEXT_COLLECTION, **_wrapper_kwargs)
chroma_instance_questions = Chroma(collection_name=QUESTIONS_COLLECTION, **_wrapper_kwargs)

## Find the ID of the document to be updated

In [5]:
# Metadata filter: a record is returned only if every condition matches ($and).
metadata_filter = {
    "$and": [
        {"document_name": document_name},
        {"first_page": first_page},
        {"last_page": last_page},
        {"added_by": added_by},
    ]
}

# Fetch the records of the full-text collection that satisfy the filter.
results = collection_full_text.get(
    where = metadata_filter  # filter by the metadata fields
)

# Fail with an explicit message when nothing matches, instead of the bare
# IndexError that indexing an empty list would raise. LookupError is the
# parent class of IndexError, so code catching LookupError still works.
matching_ids = results["ids"]
if not matching_ids:
    raise LookupError(
        f"No document in collection 'full_text' matches the metadata filter: {metadata_filter}"
    )

# Only the first match is updated later on; make that visible when the filter
# is not specific enough to single out one record.
if len(matching_ids) > 1:
    print(
        f"Warning: {len(matching_ids)} documents match the filter {matching_ids}; "
        "only the first one will be used."
    )

# ID of the record that will be updated.
document_ids = matching_ids[0]
print("Document IDs matching the query:", document_ids)

Document IDs matching the query: 1


## Hypothetical Questions

In [6]:
# Instruction passed as the {question} slot of the prompt: it asks the model to
# produce the questions that the supplied context can answer.
question = """Make as many relevant specific and/or generic questions that the above text can answer.
If you can't make a question with context, just don't say anything, don't try to make up an questions just to fill the quota."""

# Prompt template with two placeholders: {context} and {question}.
template = """Use the following pieces of context to answer the question at the end.
{context}
If you can't make a answer with context, just say that you don't know, don't try to make up an answer.
Do not hallucinate.
Question: {question}
Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# Azure OpenAI chat model, configured from the environment variables.
llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_NAME_LLM,
    model_name="gpt-4o-mini",
    api_version=API_VERSION,
    azure_endpoint=AZURE_ENDPOINT_LLM,
    api_key=AZURE_OPENAI_API_KEY_2,
)

# Chain: the filled-in prompt is piped into the chat model.
llm_chain = prompt | llm

## New document

In [7]:
# New content for the document (the agent's introductory text).
# The leading and trailing empty entries reproduce the newlines that surround
# the text, so the stored string starts and ends with "\n".
text_who_am_I = "\n".join(
    [
        "",
        "Hello.",
        "Who am I?",
        "I'm just a test to verify it this documment was correctly updated in the vectorDB.",
        "I'm just an update.",
        "",
    ]
)
print(text_who_am_I)


Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.



In [8]:
# Guard flag: the update below only runs when flag_run is exactly 1.
flag_run = 1
if flag_run == 1:
    # ID of the record to update, as found by the metadata search.
    vector_db_id = document_ids

    # The same metadata and ID are written to both collections, so build them once.
    update_metadata = {
        "document_name": document_name,
        "vector_db_id": vector_db_id,
        'first_page': first_page,
        'last_page': last_page,
        'added_by': added_by,
    }
    update_ids = [str(vector_db_id)]

    # Generate the questions BEFORE writing anything: if the LLM call fails or
    # returns nothing, neither collection has been touched, so the full-text
    # and questions collections cannot end up out of sync.
    context = text_who_am_I
    result = llm_chain.invoke({"context": context, "question": question})
    questions_text = result.content

    # The prompt tells the model to stay silent when it cannot form questions,
    # so an empty answer is possible; refuse to overwrite the stored questions
    # with an empty document.
    if not isinstance(questions_text, str) or not questions_text.strip():
        raise ValueError(
            "The LLM returned no questions for the new text; no collection was updated."
        )

    # Update the full-text collection. documents, metadatas and ids are passed
    # as parallel lists with one entry per record.
    collection_full_text.update(
        documents = [text_who_am_I],
        metadatas = [update_metadata],
        ids = update_ids,
    )

    # Update the questions collection under the same ID. A separate copy of the
    # metadata is passed so the two calls never share one mutable dict.
    collection_questions_text.update(
        documents = [questions_text],
        metadatas = [dict(update_metadata)],
        ids = update_ids,
    )

## TEST

In [5]:
# Fetch the stored record directly by ID to inspect its text and metadata.
document_id = "1"  # Replace with the actual ID you are looking for
lookup_ids = [document_id]
result = chroma_instance_full_text.get(ids=lookup_ids)
result

{'ids': ['1'],
 'embeddings': None,
 'documents': ["\nHello.\nWho am I?\nI'm just a test to verify it this documment was correctly updated in the vectorDB.\nI'm just an update.\n"],
 'uris': None,
 'data': None,
 'metadatas': [{'added_by': 'default',
   'document_name': 'Agent_description',
   'first_page': -1,
   'last_page': -1,
   'vector_db_id': '1'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [7]:
# Same text that was written by the update; used here as the search query.
text_who_am_I = "\n".join(
    [
        "",
        "Hello.",
        "Who am I?",
        "I'm just a test to verify it this documment was correctly updated in the vectorDB.",
        "I'm just an update.",
        "",
    ]
)

# Restrict the search to records whose metadata carries this document name.
name_filter = {'document_name': document_name}
results = chroma_instance_full_text.similarity_search_with_score(
    text_who_am_I,
    k=2,
    filter=name_filter,
)
results

[(Document(id='1', metadata={'added_by': 'default', 'document_name': 'Agent_description', 'first_page': -1, 'last_page': -1, 'vector_db_id': '1'}, page_content="\nHello.\nWho am I?\nI'm just a test to verify it this documment was correctly updated in the vectorDB.\nI'm just an update.\n"),
  0.1411542547359449)]

In [8]:
# Search restricted to first_page == -1, the value stored with the updated document.
page_filter = {'first_page': -1}
results = chroma_instance_full_text.similarity_search_with_score(
    text_who_am_I,
    k=2,
    filter=page_filter,
)

# Each hit is a (Document, score) pair; take the score of the top hit.
_top_document, top_distance = results[0]
# NOTE(review): presumably converts a squared-L2 distance between unit-length
# embeddings into a cosine similarity (cos = 1 - d / 2) — confirm the
# collection's distance metric.
similarity_score = 1 - top_distance / 2
similarity_score

0.9294228726320275

In [9]:
# Negative check: first_page == 1 is deliberately the wrong page for the
# updated document, which was stored with first_page == -1.
wrong_page_filter = {'first_page': 1}
results = chroma_instance_full_text.similarity_search_with_score(
    text_who_am_I,
    k=2,
    filter=wrong_page_filter,
)

# Each hit is a (Document, score) pair; take the score of the top hit.
_top_document, top_distance = results[0]
# NOTE(review): presumably converts a squared-L2 distance between unit-length
# embeddings into a cosine similarity (cos = 1 - d / 2) — confirm the
# collection's distance metric.
similarity_score = 1 - top_distance / 2
similarity_score

0.1505984663963318

In [10]:
# Unfiltered search over the whole collection with a short, unrelated query.
short_query = "Hi!"
results = chroma_instance_full_text.similarity_search_with_score(short_query, k=1)

# Each hit is a (Document, score) pair; take the score of the single hit.
_top_document, top_distance = results[0]
# NOTE(review): presumably converts a squared-L2 distance between unit-length
# embeddings into a cosine similarity (cos = 1 - d / 2) — confirm the
# collection's distance metric.
similarity_score = 1 - top_distance / 2
similarity_score

0.3524348097326879

In [11]:
# Unfiltered search with a one-word query; display the raw (Document, score)
# pair rather than a converted similarity.
hello_query = "Hello"
results = chroma_instance_full_text.similarity_search_with_score(hello_query, k=1)
results

[(Document(id='1', metadata={'added_by': 'default', 'document_name': 'Agent_description', 'first_page': -1, 'last_page': -1, 'vector_db_id': '1'}, page_content="\nHello.\nWho am I?\nI'm just a test to verify it this documment was correctly updated in the vectorDB.\nI'm just an update.\n"),
  1.3629798398061506)]