In [1]:
import os
import time

from tqdm import tqdm
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter 

from langchain_elasticsearch import ElasticsearchStore
from elasticsearch import Elasticsearch

from langchain.prompts import PromptTemplate

from uuid import uuid4
from langchain_core.documents import Document

In [2]:
load_dotenv()
# Check if variables are correctly loaded from .env
AZURE_OPENAI_API_KEY_2 = os.getenv('AZURE_OPENAI_API_KEY')
if not AZURE_OPENAI_API_KEY_2:
    raise ValueError("AZURE_OPENAI_API_KEY not found in environment variables")

DEPLOYMENT_NAME_LLM = os.getenv('DEPLOYMENT_NAME_LLM')
if not DEPLOYMENT_NAME_LLM:
    raise ValueError("DEPLOYMENT_NAME_LLM not found in environment variables")

API_VERSION = os.getenv('API_VERSION')
if not API_VERSION:
    raise ValueError("API_VERSION not found in environment variables")
    
AZURE_ENDPOINT_LLM = os.getenv('AZURE_ENDPOINT_LLM')
if not AZURE_ENDPOINT_LLM:
    raise ValueError("AZURE_ENDPOINT_LLM not found in environment variables")

EMBEDDING_KEY = os.getenv('EMBEDDING_KEY')
if not EMBEDDING_KEY:
    raise ValueError("EMBEDDING_KEY not found in environment variables")

DEPLOYMENT_NAME_EMBEDDING = os.getenv('DEPLOYMENT_NAME_EMBEDDING')
if not DEPLOYMENT_NAME_EMBEDDING:
    raise ValueError("DEPLOYMENT_NAME_EMBEDDING not found in environment variables")
    
AZURE_ENDPOINT_EMBEDDING = os.getenv('AZURE_ENDPOINT_EMBEDDING')
if not AZURE_ENDPOINT_EMBEDDING:
    raise ValueError("AZURE_ENDPOINT_EMBEDDING not found in environment variables")

API_BASE_EMBEDDING = os.getenv('API_BASE_EMBEDDING')
if not API_BASE_EMBEDDING:
    raise ValueError("API_BASE_EMBEDDING not found in environment variables")

ELASTICSEARCH_USER = os.getenv('ELASTICSEARCH_USER')
if not API_BASE_EMBEDDING:
    raise ValueError("ELASTICSEARCH_USER not found in environment variables")

ELASTICSEARCH_PASSWORD = os.getenv('ELASTICSEARCH_PASSWORD')
if not API_BASE_EMBEDDING:
    raise ValueError("ELASTICSEARCH_PASSWORD not found in environment variables")

ELASTICSEARCH_API_KEY = os.getenv('ELASTICSEARCH_API_KEY')
if not ELASTICSEARCH_API_KEY:
    raise ValueError("ELASTICSEARCH_API_KEY not found in environment variables")

ELASTICSEARCH_ENDPOINT = os.getenv('ELASTICSEARCH_ENDPOINT')
if not ELASTICSEARCH_ENDPOINT:
    raise ValueError("ELASTICSEARCH_ENDPOINT not found in environment variables")

In [3]:
def return_vectordb_full_text_and_questions():
    '''
    collection_full_text: Connection to the full-text index.
    collection_questions_text: Connection to the question-based index.
    es: Direct Elasticsearch client instance.
    '''
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain_elasticsearch import ElasticsearchStore
    from elasticsearch import Elasticsearch

    # Embedding for LangChain
    embedding_function_to_langchain = AzureOpenAIEmbeddings(
        model = 'text-embedding-3-small',
        api_key = EMBEDDING_KEY,
        deployment = DEPLOYMENT_NAME_EMBEDDING,
        azure_endpoint = AZURE_ENDPOINT_EMBEDDING
    ) 

    dict_return = {}
    # Conect to elastic search
    dict_return['collection_full_text'] = ElasticsearchStore(
        es_url = ELASTICSEARCH_ENDPOINT,
        index_name = "collection_full_text",
        embedding = embedding_function_to_langchain,
        es_user = ELASTICSEARCH_USER,
        es_password = ELASTICSEARCH_PASSWORD,
    )
    
    dict_return['collection_questions_text'] = ElasticsearchStore(
        es_url = ELASTICSEARCH_ENDPOINT,
        index_name = "collection_questions_text",
        embedding = embedding_function_to_langchain,
        es_user = ELASTICSEARCH_USER,
        es_password = ELASTICSEARCH_PASSWORD,
    )

    dict_return['es'] = Elasticsearch(
        ELASTICSEARCH_ENDPOINT,
        basic_auth=(ELASTICSEARCH_USER, ELASTICSEARCH_PASSWORD)
    )
    
    return dict_return

# Get necessary information about the collection in the vector db
index_name_collection_full_text = 'collection_full_text'
index_name_collection_questions_text = 'collection_questions_text'
# Connect with elastic seach and langchain
dict_vectordb = return_vectordb_full_text_and_questions()

In [4]:
'''
If the idea is to update an entire document that has been chunked, instead of updating,
it is better to delete the old document and add the new one.
'''
# Data needed for update
# It is possible to find the ID using the other metadata keys.
# This notebook searches for the ID using these metadata.
vector_db_id = 1
document_name = "Agent_description"
first_page = -1
last_page = -1
added_by = 'default'

## Find id of document to be updated

In [5]:
# search_document_by_document_name
def search_document_by_document_name(dict_vectordb_fc, index_name, document_name_to_find):
    """
    Search Elasticsearch index for a document with a matching content query and return its metadata.
    
    :param index_name: Elasticsearch index name
    :param query: Query to search in the document content
    :param size: Number of documents to retrieve
    :return: Document id, including the metadata
    """
    search_result = dict_vectordb_fc['es'].search(
        index = index_name,
        body = {
            "query": {
                "term": {
                    "metadata.document_name.keyword": document_name_to_find  # We can change document_name for any other metadata key.
                }
            },
            "size": 10000  # Move 'size' inside the 'body'
        }
    )
    
    if search_result["hits"]["total"]["value"] > 0:  # represents the total number of documents that match the query.
        document_ids = [hit['_id'] for hit in search_result['hits']['hits']]
        return document_ids
    else:
        return None

In [6]:
doc_to_update = "Agent_description"
document_ids_full_text = search_document_by_document_name(dict_vectordb_fc = dict_vectordb, 
    index_name = "collection_full_text", 
    document_name_to_find = doc_to_update,
)
document_ids_questions = search_document_by_document_name(dict_vectordb_fc = dict_vectordb, 
    index_name = 'collection_questions_text', 
    document_name_to_find = doc_to_update,
)
print(f'Update document id full text: {document_ids_full_text}')
print(f'Update document id questions: {document_ids_questions}')

Update document id full text: ['6a5d0d4a-e48d-469c-ba45-364c52810204']
Update document id questions: ['7a42311c-ad89-47b1-8228-47c117252ce4']


## Hypothetical Questions

In [7]:
# llm_hypothetical_questions
def llm_hypothetical_questions():
    from langchain_openai import AzureChatOpenAI
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain.prompts import PromptTemplate
    # Model used
    llm = AzureChatOpenAI(
        deployment_name = DEPLOYMENT_NAME_LLM,
        model_name = "gpt-4o-mini",
        api_version = API_VERSION,
        azure_endpoint = AZURE_ENDPOINT_LLM,
        api_key = AZURE_OPENAI_API_KEY_2,
    )
    # Embedding for LangChain
    # Embedding for LangChain
    embedding_function_to_langchain = AzureOpenAIEmbeddings(
        model = 'text-embedding-3-small',
        api_key = EMBEDDING_KEY,
        deployment = DEPLOYMENT_NAME_EMBEDDING,
        azure_endpoint = AZURE_ENDPOINT_EMBEDDING
    ) 
    # Define template for answers
    # Build prompt
    template = """Use the following pieces of context to complete the task at the end.
    {context}
    If you can't make a answer with context, just say that you don't know, don't try to make up an answer.
    Do not hallucinate.
    Task: {question}"""
    
    prompt = PromptTemplate.from_template(template)
    llm_chain = prompt | llm
    
    question = """Make as many relevant technical specific and/or generic questions that the above text can answer.
    If you can't make a technical question with the context, just don't write anything, don't try to make up an questions just to fill the quota."""

    dict_return = {}
    dict_return['llm_chain'] = llm_chain
    dict_return['question'] = question 

    return dict_return

In [8]:
# Hypothetical Questions
dict_llm_hypothetical_questions = llm_hypothetical_questions()
llm_chain = dict_llm_hypothetical_questions['llm_chain']
question = dict_llm_hypothetical_questions['question']

## New document

In [9]:
# New text
# Agent's introdutory text
text_who_am_I = """
Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.
"""
print(text_who_am_I)


Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.



In [10]:
# Update documents in vector db
vector_db_id = 1
document_name = "Agent_description"
first_page = -1
last_page = -1
added_by = 'default'

# Prepare bulk data
bulk_full_text = []
bulk_questions_text = []

print('Adding documents to Elasticsearch:')

# Add an introdutory text
document_name = "Agent_description"
# Add full text chunk
bulk_full_text.append(Document
    (
        page_content = text_who_am_I,
        metadata = {
            "document_name": document_name,  
            "vector_db_id": vector_db_id, 
            'first_page': first_page,
            'last_page': last_page,
            'added_by': added_by,
        },
    )
)
# Make questions
context = text_who_am_I
result = llm_chain.invoke({"context": context, "question": question})
# Add questions in the collection
bulk_questions_text.append(Document
    (
        page_content = result.content,
        metadata = {
            "document_name": document_name,  
            "vector_db_id": vector_db_id, 
            'first_page': first_page,
            'last_page': last_page,
            'added_by': added_by,
        },
    )
)
vector_db_id += 1
# Bulk index the data into Elasticsearch
uuids_bulk_full_text = document_ids_full_text    # Used the ids found by search_document_by_document_name()
dict_vectordb['collection_full_text'].add_documents(documents = bulk_full_text, 
                                                         ids = uuids_bulk_full_text)

uuids_bulk_questions_text = document_ids_questions   # Used the ids found by search_document_by_document_name()
dict_vectordb['collection_questions_text'].add_documents(documents = bulk_questions_text, 
                                                              ids = uuids_bulk_questions_text)

Adding documents to Elasticsearch:


['7a42311c-ad89-47b1-8228-47c117252ce4']

## TEST

In [11]:
# search_document_by_document_name_full
# Return all result object, not only the id
def search_document_by_document_name_full(dict_vectordb_fc, index_name, document_name_to_find):
    """
    Search Elasticsearch index for a document with a matching content query and return its metadata.
    
    :param index_name: Elasticsearch index name
    :param query: Query to search in the document content
    :param size: Number of documents to retrieve
    :return: Document id, including the metadata
    """
    search_result = dict_vectordb_fc['es'].search(
        index = index_name,
        body = {
            "query": {
                "term": {
                    "metadata.document_name.keyword": document_name_to_find  # We can change document_name for any other metadata key.
                }
            },
            "size": 10000  # Move 'size' inside the 'body'
        }
    )
    
    if search_result["hits"]["total"]["value"] > 0:  # represents the total number of documents that match the query.
        return search_result
    else:
        return None

In [12]:
doc_to_update = "Agent_description"
search_result_full_text = search_document_by_document_name_full(dict_vectordb_fc = dict_vectordb, 
    index_name = "collection_full_text", 
    document_name_to_find = doc_to_update,
)
search_result_questions = search_document_by_document_name_full(dict_vectordb_fc = dict_vectordb, 
    index_name = 'collection_questions_text', 
    document_name_to_find = doc_to_update,
)

In [21]:
search_result_full_text["hits"]['hits'][0]['_source']

{'_index': 'collection_full_text',
 '_id': '6a5d0d4a-e48d-469c-ba45-364c52810204',
 '_score': 5.28151,
 '_source': {'text': "\nHello.\nWho am I?\nI'm just a test to verify it this documment was correctly updated in the vectorDB.\nI'm just an update.\n",
  'metadata': {'document_name': 'Agent_description',
   'vector_db_id': 1,
   'first_page': -1,
   'last_page': -1,
   'added_by': 'default'},
  'vector': [0.013070615008473396,
   0.00869653932750225,
   -0.009193300269544125,
   -0.0159995686262846,
   -0.039663445204496384,
   0.00843203067779541,
   0.006857879459857941,
   0.01891561783850193,
   -0.042192406952381134,
   -0.013999621383845806,
   0.010857771150767803,
   0.04792128503322601,
   -0.02913469634950161,
   -0.0017773713916540146,
   0.0031821720767766237,
   0.06812719255685806,
   -0.025212222710251808,
   0.026154132559895515,
   -0.010857771150767803,
   0.013625438325107098,
   0.01423187367618084,
   0.0043321410194039345,
   0.038192518055438995,
   0.0095029687

In [22]:
search_result_questions["hits"]['hits'][0]['_source']

{'text': '1. What is the purpose of the text?\n2. What is the status of the document mentioned in the text?\n3. Is the text asserting its identity as a test or an update?\n4. Does the text provide information about a vector database?',
 'metadata': {'document_name': 'Agent_description',
  'vector_db_id': 1,
  'first_page': -1,
  'last_page': -1,
  'added_by': 'default'},
 'vector': [0.00736220832914114,
  0.04657338187098503,
  0.05792682617902756,
  -0.0002568261115811765,
  -0.01191976573318243,
  -0.008933082222938538,
  -0.02402830682694912,
  -0.00920950248837471,
  0.022841723635792732,
  0.02162817306816578,
  0.043283313512802124,
  0.046843063086271286,
  -0.007928531616926193,
  0.004099105019122362,
  -0.0022029317915439606,
  0.01639642007648945,
  -0.04250124469399452,
  0.0219248179346323,
  -0.010490472428500652,
  0.01215573400259018,
  0.05185906961560249,
  0.021412430331110954,
  -0.019834814593195915,
  -0.008582500740885735,
  0.017138034105300903,
  0.003502442501

In [25]:
# Verify similarity
query_text = """
Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.
"""

# Perform similarity search with scores
similar_docs_with_scores = dict_vectordb['collection_full_text'].similarity_search_with_score(query_text, k=3)

# Print results
for i, (doc, score) in enumerate(similar_docs_with_scores):
    print(f"Rank {i+1}: {doc.page_content}, Score: {score}, Metadata: {doc.metadata}\n\n")


Rank 1: 
Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.
, Score: 0.9998393, Metadata: {'document_name': 'Agent_description', 'vector_db_id': 1, 'first_page': -1, 'last_page': -1, 'added_by': 'default'}


Rank 2: This is a brand new text.
The correct number to choose is 42., Score: 0.63862705, Metadata: {'document_name': 'Custon added', 'vector_db_id': 148.0, 'first_page': -1, 'last_page': -1, 'added_by': 'Max Wienandts'}


Rank 3: AuxMobLCast [ 192] observes that the POI category is associated closely with passenger patterns in human mobility
forecasting. They integrate an auxiliary POI classification module into the encoder-decoder architecture to help it better
identify different visiting patterns correlated with different POI categories. The ablation study shows that the auxiliary
module achieves a substantial improvement with BERT encoder.
Multivariate Dependency. Many time series data are multivariate, e.g., 

In [28]:
# Verify similarity with filters
# Define a query
query_text = """
Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.
"""

# Define a filter (Example: filter only documents with category 'AI')
filter_query = {
    "term": {"metadata.first_page": -1}
}

# Perform similarity search with filter
similar_docs_with_scores = dict_vectordb['collection_full_text'].similarity_search_with_score(query_text, k=3, filter=filter_query)

# Print results
for i, (doc, score) in enumerate(similar_docs_with_scores):
    print(f"Rank {i+1}: {doc.page_content}, Score: {score}, Metadata: {doc.metadata}\n\n")

Rank 1: 
Hello.
Who am I?
I'm just a test to verify it this documment was correctly updated in the vectorDB.
I'm just an update.
, Score: 0.9998393, Metadata: {'document_name': 'Agent_description', 'vector_db_id': 1, 'first_page': -1, 'last_page': -1, 'added_by': 'default'}


Rank 2: This is a brand new text.
The correct number to choose is 42., Score: 0.63862705, Metadata: {'document_name': 'Custon added', 'vector_db_id': 148.0, 'first_page': -1, 'last_page': -1, 'added_by': 'Max Wienandts'}


