In [1]:
import os
import time

from tqdm import tqdm
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter 

from langchain_elasticsearch import ElasticsearchStore
from elasticsearch import Elasticsearch

from langchain.prompts import PromptTemplate

from uuid import uuid4
from langchain_core.documents import Document

In [2]:
load_dotenv()
# Check if variables are correctly loaded from .env
AZURE_OPENAI_API_KEY_2 = os.getenv('AZURE_OPENAI_API_KEY')
if not AZURE_OPENAI_API_KEY_2:
    raise ValueError("AZURE_OPENAI_API_KEY not found in environment variables")

DEPLOYMENT_NAME_LLM = os.getenv('DEPLOYMENT_NAME_LLM')
if not DEPLOYMENT_NAME_LLM:
    raise ValueError("DEPLOYMENT_NAME_LLM not found in environment variables")

API_VERSION = os.getenv('API_VERSION')
if not API_VERSION:
    raise ValueError("API_VERSION not found in environment variables")
    
AZURE_ENDPOINT_LLM = os.getenv('AZURE_ENDPOINT_LLM')
if not AZURE_ENDPOINT_LLM:
    raise ValueError("AZURE_ENDPOINT_LLM not found in environment variables")

EMBEDDING_KEY = os.getenv('EMBEDDING_KEY')
if not EMBEDDING_KEY:
    raise ValueError("EMBEDDING_KEY not found in environment variables")

DEPLOYMENT_NAME_EMBEDDING = os.getenv('DEPLOYMENT_NAME_EMBEDDING')
if not DEPLOYMENT_NAME_EMBEDDING:
    raise ValueError("DEPLOYMENT_NAME_EMBEDDING not found in environment variables")
    
AZURE_ENDPOINT_EMBEDDING = os.getenv('AZURE_ENDPOINT_EMBEDDING')
if not AZURE_ENDPOINT_EMBEDDING:
    raise ValueError("AZURE_ENDPOINT_EMBEDDING not found in environment variables")

API_BASE_EMBEDDING = os.getenv('API_BASE_EMBEDDING')
if not API_BASE_EMBEDDING:
    raise ValueError("API_BASE_EMBEDDING not found in environment variables")

ELASTICSEARCH_USER = os.getenv('ELASTICSEARCH_USER')
if not API_BASE_EMBEDDING:
    raise ValueError("ELASTICSEARCH_USER not found in environment variables")

ELASTICSEARCH_PASSWORD = os.getenv('ELASTICSEARCH_PASSWORD')
if not API_BASE_EMBEDDING:
    raise ValueError("ELASTICSEARCH_PASSWORD not found in environment variables")

ELASTICSEARCH_API_KEY = os.getenv('ELASTICSEARCH_API_KEY')
if not ELASTICSEARCH_API_KEY:
    raise ValueError("ELASTICSEARCH_API_KEY not found in environment variables")

ELASTICSEARCH_ENDPOINT = os.getenv('ELASTICSEARCH_ENDPOINT')
if not ELASTICSEARCH_ENDPOINT:
    raise ValueError("ELASTICSEARCH_ENDPOINT not found in environment variables")

## Hypothetical Questions

In [3]:
# llm_hypothetical_questions
def llm_hypothetical_questions():
    from langchain_openai import AzureChatOpenAI
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain.prompts import PromptTemplate
    # Model used
    llm = AzureChatOpenAI(
        deployment_name = DEPLOYMENT_NAME_LLM,
        model_name = "gpt-4o-mini",
        api_version = API_VERSION,
        azure_endpoint = AZURE_ENDPOINT_LLM,
        api_key = AZURE_OPENAI_API_KEY_2,
    )
    # Embedding for LangChain
    # Embedding for LangChain
    embedding_function_to_langchain = AzureOpenAIEmbeddings(
        model = 'text-embedding-3-small',
        api_key = EMBEDDING_KEY,
        deployment = DEPLOYMENT_NAME_EMBEDDING,
        azure_endpoint = AZURE_ENDPOINT_EMBEDDING
    ) 
    # Define template for answers
    # Build prompt
    template = """Use the following pieces of context to complete the task at the end.
    {context}
    If you can't make a answer with context, just say that you don't know, don't try to make up an answer.
    Do not hallucinate.
    Task: {question}"""
    
    prompt = PromptTemplate.from_template(template)
    llm_chain = prompt | llm
    
    question = """Make as many relevant technical specific and/or generic questions that the above text can answer.
    If you can't make a technical question with the context, just don't write anything, don't try to make up an questions just to fill the quota."""

    dict_return = {}
    dict_return['llm_chain'] = llm_chain
    dict_return['question'] = question 

    return dict_return

In [4]:
# Hypothetical Questions
dict_llm_hypothetical_questions = llm_hypothetical_questions()
llm_chain = dict_llm_hypothetical_questions['llm_chain']
question = dict_llm_hypothetical_questions['question']

## Load VectorDB

In [5]:
def return_vectordb_full_text_and_questions():
    '''
    collection_full_text: Connection to the full-text index.
    collection_questions_text: Connection to the question-based index.
    es: Direct Elasticsearch client instance.
    '''
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain_elasticsearch import ElasticsearchStore
    from elasticsearch import Elasticsearch

    # Embedding for LangChain
    embedding_function_to_langchain = AzureOpenAIEmbeddings(
        model = 'text-embedding-3-small',
        api_key = EMBEDDING_KEY,
        deployment = DEPLOYMENT_NAME_EMBEDDING,
        azure_endpoint = AZURE_ENDPOINT_EMBEDDING
    ) 

    dict_return = {}
    # Conect to elastic search
    dict_return['collection_full_text'] = ElasticsearchStore(
        es_url = ELASTICSEARCH_ENDPOINT,
        index_name = "collection_full_text",
        embedding = embedding_function_to_langchain,
        es_user = ELASTICSEARCH_USER,
        es_password = ELASTICSEARCH_PASSWORD,
    )
    
    dict_return['collection_questions_text'] = ElasticsearchStore(
        es_url = ELASTICSEARCH_ENDPOINT,
        index_name = "collection_questions_text",
        embedding = embedding_function_to_langchain,
        es_user = ELASTICSEARCH_USER,
        es_password = ELASTICSEARCH_PASSWORD,
    )

    dict_return['es'] = Elasticsearch(
        ELASTICSEARCH_ENDPOINT,
        basic_auth=(ELASTICSEARCH_USER, ELASTICSEARCH_PASSWORD)
    )
    
    return dict_return

In [6]:
# Get necessary information about the collection in the vector db
index_name_collection_full_text = 'collection_full_text'
index_name_collection_questions_text = 'collection_questions_text'
# Connect with elastic seach and langchain
dict_vectordb = return_vectordb_full_text_and_questions()

## Add new document

In [7]:
# Dados necessários para update
added_by = "Max Wienandts"
document_name = "Custon added"
first_page = -1
last_page = -1

text_to_add = """
This is an example of how to add more information in your ChromaDb.
To do so, just read this notebook.
"""

In [8]:
# Chunk new text
chunk_size = 2000
chunk_overlap = 400
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )
chunks = text_splitter.split_text(text_to_add)

In [9]:
# get_highest_vectordb_id
# Get the highest id in vector db so we can add new entries.
def get_highest_vectordb_id(es_fc, index_name):
    # Check if the index exists
    if not es_fc.indices.exists(index=index_name):
        print(f"Index '{index_name}' does not exist.")
        return 0
        
    query = {
        "size": 0,  # We don't need to return any actual documents, just the aggregation
        "aggs": {
            "max_vector_db_id": {
                "max": {
                    "field": "metadata.vector_db_id"    
                }
            }
        }
    }
    
    # Perform the search with the aggregation
    response = es_fc.search(index = index_name, body = query)
    
    # Extract the maximum value from the aggregation response
    max_vector_db_id = response['aggregations']['max_vector_db_id']['value']
    if max_vector_db_id == None:
        return 0
    return max_vector_db_id

In [10]:
# Get last index
highest_id_collection_full_text = get_highest_vectordb_id(
    index_name = index_name_collection_full_text, 
    es_fc = dict_vectordb['es']
)
print(f'Largest index: {highest_id_collection_full_text}')

Largest index: 148.0


In [11]:
# Add new text in the full text collection
vector_db_id = highest_id_collection_full_text + 1 

# Prepare bulk data
bulk_full_text = []
bulk_questions_text = []

for e in tqdm(chunks):
    context = e
    # Add full text chunk
    bulk_full_text.append(Document
        (
            page_content = context,
            metadata = {
                "document_name": document_name,
                "vector_db_id": vector_db_id,
                'first_page': first_page,
                'last_page': last_page,
                'added_by': added_by,
            },
        )
    )

    # Make question using LLM chain
    result = llm_chain.invoke({"context": context, "question": question})
    
    # Add question response chunk
    bulk_questions_text.append(Document
        (
            page_content = result.content,
            metadata = {
                "document_name": document_name,
                "vector_db_id": vector_db_id,
                'first_page': first_page,
                'last_page': last_page,
                'added_by': added_by,
            },
        )
    )
    
    vector_db_id += 1

# Bulk index the data into Elasticsearch
uuids_bulk_full_text = [str(uuid4()) for _ in range(len(bulk_full_text))]
dict_vectordb['collection_full_text'].add_documents(documents = bulk_full_text, 
                                                         ids = uuids_bulk_full_text)

uuids_bulk_questions_text = [str(uuid4()) for _ in range(len(bulk_questions_text))]
dict_vectordb['collection_questions_text'].add_documents(documents = bulk_questions_text, 
                                                              ids = uuids_bulk_questions_text)

print("Documents added to Elasticsearch successfully.")

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.02it/s]


Documents added to Elasticsearch successfully.


## Verify if collections have the same size

In [12]:
# Get last index to add documents
# Retrieve documents with metadata, which may contain IDs
highest_id_collection_full_text = get_highest_vectordb_id(
    index_name = index_name_collection_full_text, 
    es_fc = dict_vectordb['es']
)
print('----------------------------------------------')
highest_id_collection_questions_text = get_highest_vectordb_id(
    index_name = index_name_collection_questions_text, 
    es_fc = dict_vectordb['es']
)
print('Both collection should have the same max index.')
print(f"Largest index for collection full_text: {highest_id_collection_full_text}")
print(f"Largest index for collection questions: {highest_id_collection_questions_text}")

----------------------------------------------
Both collection should have the same max index.
Largest index for collection full_text: 149.0
Largest index for collection questions: 149.0


## Test

In [13]:
# search_document_by_document_name_full
# Return all result object, not only the id
def search_document_by_document_name_full(dict_vectordb_fc, index_name, document_name_to_find):
    """
    Search Elasticsearch index for a document with a matching content query and return its metadata.
    
    :param index_name: Elasticsearch index name
    :param query: Query to search in the document content
    :param size: Number of documents to retrieve
    :return: Document id, including the metadata
    """
    search_result = dict_vectordb_fc['es'].search(
        index = index_name,
        body = {
            "query": {
                "term": {
                    "metadata.document_name.keyword": document_name_to_find  # We can change document_name for any other metadata key.
                }
            },
            "size": 10000  # Move 'size' inside the 'body'
        }
    )
    
    if search_result["hits"]["total"]["value"] > 0:  # represents the total number of documents that match the query.
        return search_result
    else:
        return None

In [15]:
doc_to_find = "Custon added"
search_result_full_text = search_document_by_document_name_full(dict_vectordb_fc = dict_vectordb, 
    index_name = "collection_full_text", 
    document_name_to_find = doc_to_find,
)
search_result_questions = search_document_by_document_name_full(dict_vectordb_fc = dict_vectordb, 
    index_name = 'collection_questions_text', 
    document_name_to_find = doc_to_find,
)

In [16]:
search_result_full_text["hits"]['hits'][0]['_source']

{'text': 'This is a brand new text.\nThe correct number to choose is 42.',
 'metadata': {'document_name': 'Custon added',
  'vector_db_id': 148.0,
  'first_page': -1,
  'last_page': -1,
  'added_by': 'Max Wienandts'},
 'vector': [-0.009269836358726025,
  0.003505607834085822,
  0.03780817985534668,
  -0.01819044165313244,
  -0.018160073086619377,
  -0.026951612904667854,
  0.008047523908317089,
  0.026237964630126953,
  0.0018220809288322926,
  -0.012344597838819027,
  0.025630604475736618,
  -0.021272795274853706,
  -0.03261524811387062,
  0.041725654155015945,
  0.04075387492775917,
  0.015009391121566296,
  -0.009459637105464935,
  -0.017476793378591537,
  -0.0416041798889637,
  -0.002503463299944997,
  -0.0010866055963560939,
  -0.006316547282040119,
  -0.016368359327316284,
  -0.02462846040725708,
  -0.014508319087326527,
  0.02166757918894291,
  -0.014941063709557056,
  -0.05253666639328003,
  0.014607015065848827,
  -0.03659345954656601,
  -0.038567379117012024,
  -0.03720081970

In [17]:
search_result_questions["hits"]['hits'][0]['_source']

{'text': '1. What is the significance of the number 42 in the provided text?  \n2. Is the number 42 related to any specific mathematical or scientific concept in the context given?  \n3. Does the text provide any criteria or conditions for selecting the number 42?  \n4. What type of document or content does the phrase "This is a brand new text" suggest?  \n5. Is there any indication of relevance for the number 42 in terms of decision-making or problem-solving in the text?  \n6. What might be the implications of stating that "the correct number to choose is 42"?',
 'metadata': {'document_name': 'Custon added',
  'vector_db_id': 148.0,
  'first_page': -1,
  'last_page': -1,
  'added_by': 'Max Wienandts'},
 'vector': [0.009630156680941582,
  -0.013952670618891716,
  0.03533067926764488,
  -0.006322931963950396,
  0.023830784484744072,
  -0.01620439812541008,
  0.0036624083295464516,
  -0.023723559454083443,
  -0.0026320419274270535,
  0.015668272972106934,
  0.036054451018571854,
  -0.038

In [18]:
# Verify similarity
query_text = """
This is an example of how to add more information in your ChromaDb.
To do so, just read this notebook.
"""

# Perform similarity search with scores
similar_docs_with_scores = dict_vectordb['collection_full_text'].similarity_search_with_score(query_text, k=3)

# Print results
for i, (doc, score) in enumerate(similar_docs_with_scores):
    print(f"Rank {i+1}: {doc.page_content}, Score: {score}, Metadata: {doc.metadata}\n\n")

Rank 1: This is an example of how to add more information in your ChromaDb.
To do so, just read this notebook., Score: 0.9931185, Metadata: {'document_name': 'Custon added', 'vector_db_id': 149.0, 'first_page': -1, 'last_page': -1, 'added_by': 'Max Wienandts'}


Rank 2: 
Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.
, Score: 0.68745255, Metadata: {'document_name': 'Agent_description', 'vector_db_id': 1, 'first_page': -1, 'last_page': -1, 'added_by': 'default'}


Rank 3: Cross-sequence Dependency. To solve the cross-sequence dependency challenge in stock movement forecasting, To
address the challenge of cross-sequence dependencies in stock movement forecasting, TDML [ 197] offers numerous
examples from stocks similar to the target stock when creating in-context learning prompts. This approach illustrates that
LLMs can effectively incorporate cross-sequence information from related stocks. LLMST [ 206] puts all the