In [1]:
import os
import time
from dotenv import load_dotenv

import tqdm
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
import chromadb.utils.embedding_functions as embedding_functions
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter 

from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma

In [2]:
load_dotenv()
# Check if variables are correctly loaded from .env
AZURE_OPENAI_API_KEY_2 = os.getenv('AZURE_OPENAI_API_KEY')
if not AZURE_OPENAI_API_KEY_2:
    raise ValueError("AZURE_OPENAI_API_KEY not found in environment variables")

DEPLOYMENT_NAME_LLM = os.getenv('DEPLOYMENT_NAME_LLM')
if not DEPLOYMENT_NAME_LLM:
    raise ValueError("DEPLOYMENT_NAME_LLM not found in environment variables")

API_VERSION = os.getenv('API_VERSION')
if not API_VERSION:
    raise ValueError("API_VERSION not found in environment variables")
    
AZURE_ENDPOINT_LLM = os.getenv('AZURE_ENDPOINT_LLM')
if not AZURE_ENDPOINT_LLM:
    raise ValueError("AZURE_ENDPOINT_LLM not found in environment variables")

EMBEDDING_KEY = os.getenv('EMBEDDING_KEY')
if not EMBEDDING_KEY:
    raise ValueError("EMBEDDING_KEY not found in environment variables")

DEPLOYMENT_NAME_EMBEDDING = os.getenv('DEPLOYMENT_NAME_EMBEDDING')
if not DEPLOYMENT_NAME_EMBEDDING:
    raise ValueError("DEPLOYMENT_NAME_EMBEDDING not found in environment variables")
    
AZURE_ENDPOINT_EMBEDDING = os.getenv('AZURE_ENDPOINT_EMBEDDING')
if not AZURE_ENDPOINT_EMBEDDING:
    raise ValueError("AZURE_ENDPOINT_EMBEDDING not found in environment variables")

API_BASE_EMBEDDING = os.getenv('API_BASE_EMBEDDING')
if not API_BASE_EMBEDDING:
    raise ValueError("API_BASE_EMBEDDING not found in environment variables")

In [3]:
# Dados necessários para update
added_by = "Max Wienandts"
document_name = "Custon added"
first_page = -1
last_page = -1

text_to_add = """
This is an example of how to add more information in your ChromaDb.
To do so, just read this notebook.
"""

## Chunk text

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1500,
        chunk_overlap = 400
    )
chunks = text_splitter.split_text(text_to_add)

## Hypothetical Questions

In [5]:
# Model used
llm = AzureChatOpenAI(
    deployment_name = DEPLOYMENT_NAME_LLM,
    model_name = "gpt-4o-mini",
    api_version = API_VERSION,
    azure_endpoint = AZURE_ENDPOINT_LLM,
    api_key = AZURE_OPENAI_API_KEY_2,
)
# Embedding for LangChain
embedding_function_to_langchain = AzureOpenAIEmbeddings(
    model = 'text-embedding-3-small',
    api_key=EMBEDDING_KEY,
    deployment = DEPLOYMENT_NAME_EMBEDDING,
    azure_endpoint = AZURE_ENDPOINT_EMBEDDING,
) 

# Embedding for ChromaDB
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
                api_key = EMBEDDING_KEY,
                api_base = API_BASE_EMBEDDING ,
                api_type = "azure",
                api_version="2023-05-15",
                model_name=DEPLOYMENT_NAME_EMBEDDING
            )

# Define template for answers
# Build prompt
template = """Use the following pieces of context to answer the question at the end.
{context}
If you can't make a answer with context, just say that you don't know, don't try to make up an answer.
Do not hallucinate.
Question: {question}
Helpful Answer:"""

prompt = PromptTemplate.from_template(template)
llm_chain = prompt | llm

question = """Make as many relevant specific and/or generic questions that the above text can answer.
If you can't make a question with context, just don't say anything, don't try to make up an questions just to fill the quota."""

## Load VectorDB

In [6]:
# Create/load a local Croma database.
persist_directory = 'embedding_OpenAI/chroma/'
client = chromadb.PersistentClient(path = persist_directory)
# Create or get a collection
collection_full_text = client.get_or_create_collection(name = "full_text", embedding_function = embedding_function)
collection_questions_text = client.get_or_create_collection(name = "questions_text", embedding_function = embedding_function)

In [7]:
# Get last index
# Get last id
# Retrieve documents with metadata, which may contain IDs
documents = collection_full_text.get(include=["documents", "metadatas"])

# Assuming your IDs are stored in the metadata and are numeric or sortable
max_index = max([doc['vector_db_id'] for doc in documents['metadatas']], key=lambda x: int(x))

print(f"Largest index: {max_index}")

Largest index: 149


In [8]:
# Adiciona na coleção para armazenar o texto completo
add_text_collection_full_text = 1
if add_text_collection_full_text == 1:
    vector_db_id = max_index + 1
    for e in tqdm.tqdm(chunks):
        collection_full_text.add(
            documents = e,
            metadatas = {"document_name": document_name,
                "vector_db_id": vector_db_id, 
                'first_page': -1, # This is a custom added text, so there isn't a first and last page.
                'last_page': -1,
                'added_by': added_by,
            },
            ids = [str(vector_db_id)]
        )
        # Questions
        context = e
        result = llm_chain.invoke({"context": context, "question": question})
        collection_questions_text.add(
            documents = result.content,
            metadatas = {"document_name": document_name,
                "vector_db_id": vector_db_id, 
                'first_page': -1, 
                'last_page': -1,
                'added_by': added_by,
            },
            ids = [str(vector_db_id)]
        )

        vector_db_id += 1

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.67s/it]


## Verify if collections have the same size

In [9]:
# Create or get a collection
collection_full_text_aux = client.get_or_create_collection(name = "full_text", embedding_function = embedding_function)
collection_questions_aux = client.get_or_create_collection(name = "questions_text", embedding_function = embedding_function)

# Get last index
# Get last id
# Retrieve documents with metadata, which may contain IDs
documents_full_text = collection_full_text_aux.get(include=["documents", "metadatas"])
documents_questions_text = collection_questions_aux.get(include=["documents", "metadatas"])

# Assuming your IDs are stored in the metadata and are numeric or sortable
max_index_full_text = max([doc['vector_db_id'] for doc in documents_full_text['metadatas']], key=lambda x: int(x))
max_index_questions = max([doc['vector_db_id'] for doc in documents_questions_text['metadatas']], key=lambda x: int(x))


print(f"Full text largest index: {max_index_full_text}")
print(f"Questions largest index: {max_index_questions}")

Full text largest index: 150
Questions largest index: 150


## Test

In [10]:
# Create/load a local Croma database.
persist_directory = 'embedding_OpenAI/chroma/'
client = chromadb.PersistentClient(path = persist_directory)

chroma_instance_full_text = Chroma(
    client = client,
    collection_name = "full_text",
    embedding_function = embedding_function_to_langchain,
)
chroma_instance_questions = Chroma(
    client = client,
    collection_name = "questions_text",
    embedding_function = embedding_function_to_langchain,
)

In [11]:
# Retrieve the document based on its ID
document_id = "150"  # Replace with the actual ID you are looking for
result = chroma_instance_full_text.get(ids=[document_id])
result

{'ids': ['150'],
 'embeddings': None,
 'documents': ['This is an example of how to add more information in your ChromaDb.\nTo do so, just read this notebook.'],
 'uris': None,
 'data': None,
 'metadatas': [{'added_by': 'Max Wienandts',
   'document_name': 'Custon added',
   'first_page': -1,
   'last_page': -1,
   'vector_db_id': 150}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [12]:
text_to_find = 'This is an example of how to add more information in your ChromaDb.\nTo do so, just read this notebook.'
print(text_to_find)

This is an example of how to add more information in your ChromaDb.
To do so, just read this notebook.


In [13]:
results = chroma_instance_full_text.similarity_search_with_score(
    text_to_find,
    k = 2,
)
results

[(Document(id='150', metadata={'added_by': 'Max Wienandts', 'document_name': 'Custon added', 'first_page': -1, 'last_page': -1, 'vector_db_id': 150}, page_content='This is an example of how to add more information in your ChromaDb.\nTo do so, just read this notebook.'),
  0.01594089670725535),
 (Document(id='149', metadata={'added_by': 'default', 'document_name': 'Custon added', 'first_page': -1, 'last_page': -1, 'vector_db_id': 149}, page_content='This is an example of how to add more information in your ChromaDb.\nTo do so, just read this notebook.'),
  0.01594089670725535)]

In [14]:
results = chroma_instance_full_text.similarity_search_with_score(
    text_to_find,
    k = 2,
    filter={"added_by": {"$in": [added_by, 'deafult']}},
)
results

[(Document(id='150', metadata={'added_by': 'Max Wienandts', 'document_name': 'Custon added', 'first_page': -1, 'last_page': -1, 'vector_db_id': 150}, page_content='This is an example of how to add more information in your ChromaDb.\nTo do so, just read this notebook.'),
  0.01594089670725535)]