In [1]:
# !pip install langchain_elasticsearch

In [2]:
import os
import time

from tqdm import tqdm
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter 

from langchain_elasticsearch import ElasticsearchStore
from elasticsearch import Elasticsearch

from langchain.prompts import PromptTemplate

from uuid import uuid4
from langchain_core.documents import Document

In [3]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} not found in environment variables")
    return value


# Azure OpenAI chat model settings.
AZURE_OPENAI_API_KEY_2 = _require_env('AZURE_OPENAI_API_KEY')
DEPLOYMENT_NAME_LLM = _require_env('DEPLOYMENT_NAME_LLM')
API_VERSION = _require_env('API_VERSION')
AZURE_ENDPOINT_LLM = _require_env('AZURE_ENDPOINT_LLM')

# Azure OpenAI embedding model settings.
EMBEDDING_KEY = _require_env('EMBEDDING_KEY')
DEPLOYMENT_NAME_EMBEDDING = _require_env('DEPLOYMENT_NAME_EMBEDDING')
AZURE_ENDPOINT_EMBEDDING = _require_env('AZURE_ENDPOINT_EMBEDDING')
API_BASE_EMBEDDING = _require_env('API_BASE_EMBEDDING')

# Elasticsearch connection settings. Each value is validated on its own;
# previously all four checks tested API_BASE_EMBEDDING by mistake, so a
# missing Elasticsearch variable went unnoticed until the connection failed.
ELASTICSEARCH_USER = _require_env('ELASTICSEARCH_USER')
ELASTICSEARCH_PASSWORD = _require_env('ELASTICSEARCH_PASSWORD')
ELASTICSEARCH_API_KEY = _require_env('ELASTICSEARCH_API_KEY')
ELASTICSEARCH_ENDPOINT = _require_env('ELASTICSEARCH_ENDPOINT')

In [4]:
# Splitter settings shared by every text splitter in this notebook.
chunk_size = 2000
chunk_overlap = 400

# Documents to ingest, keyed by an integer id.
dict_documents = {
    1: 'A Survey of Time Series Foundation Models Generalizing Time Series.pdf',
    # 2: 'DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf',
}

# Each document is read from the data/ folder, under the same id.
pdf_path = {
    doc_id: "data/" + file_name
    for doc_id, file_name in dict_documents.items()
}

print(pdf_path[1])


data/A Survey of Time Series Foundation Models Generalizing Time Series.pdf


In [5]:
def extract_text_with_page_markers(pdf_path):
    """Return the text of a PDF with a ``[[PAGE n]]`` line in front of each page.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        A single string in which every page's extracted text is preceded by
        its 1-based ``[[PAGE n]]`` marker; pages are joined with newlines.
    """
    marked_pages = []
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        for page_number, pdf_page in enumerate(pdf_reader.pages, start=1):
            page_text = pdf_page.extract_text()
            # The marker is consumed later to work out which pages a chunk spans.
            marked_pages.append(f"[[PAGE {page_number}]]\n{page_text}")
    return '\n'.join(marked_pages)

# Assuming text_with_page_markers is obtained from the previous step
def chunk_text_with_page_tracking(text):
    """Split marker-annotated text into chunks and record each chunk's page span.

    The text is split with ``RecursiveCharacterTextSplitter`` using the
    module-level ``chunk_size`` and ``chunk_overlap``. Page numbers are read
    from the ``[[PAGE n]]`` marker lines found inside each chunk.

    Args:
        text: text containing ``[[PAGE n]]`` marker lines.

    Returns:
        A list of dicts with keys ``"chunk"``, ``"start_page"`` and
        ``"end_page"``. Both pages are ``None`` for chunks that appear before
        any marker has been seen.
    """
    import re

    # A complete marker only; a marker cut in half by the splitter is ignored
    # instead of raising while parsing it.
    page_marker = re.compile(r"\[\[PAGE (\d+)\]\]")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )

    chunk_page_mapping = []
    # Last page of the previous chunk. Initialised so that a first chunk
    # without any marker no longer reads an unassigned variable.
    previous_end_page = None

    for chunk in text_splitter.split_text(text):
        start_page = None
        end_page = None

        for line_index, line in enumerate(chunk.splitlines()):
            marker = page_marker.search(line)
            if marker is None:
                continue
            page_num = int(marker.group(1))
            if start_page is None:
                # A marker on the very first line means the chunk begins
                # exactly at that page; otherwise the chunk started in the
                # middle of the previous page.
                start_page = page_num if line_index == 0 else page_num - 1
            end_page = page_num

        # No marker in this chunk: it lies entirely on the page where the
        # previous chunk ended.
        if start_page is None:
            start_page = previous_end_page
            end_page = previous_end_page
        previous_end_page = end_page

        chunk_page_mapping.append({
            "chunk": chunk,
            "start_page": start_page,
            "end_page": end_page
        })

    return chunk_page_mapping

In [6]:
# Extract and chunk every configured PDF; results are keyed like pdf_path.
pdf_chunked_data = {}
for key, source_pdf in tqdm(pdf_path.items()):
    text_with_pages = extract_text_with_page_markers(source_pdf)
    pdf_chunked_data[key] = chunk_text_with_page_tracking(text_with_pages)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.41s/it]


In [7]:
# Show a chunk_page_mapping example: the text of the chunk at index 10 of document 1.
print(pdf_chunked_data[1][10]['chunk'])

line, we discuss effectiveness through two key phases: data collection and alignment, architectural design. Regarding the
second line, we identify two adaption paradigms, i.e. embedding visible LLM adaption and textual visible LLM adaption.
Under each adaption paradigm, we discuss the LLM utilization, time series extraction and multi-modal data fusion. The
time series extraction includes challenges like obtaining appropriate time series representation, aligning temporal space
and LLM space, identifying time series properties and patterns. Additionally, we examine diverse roles of LLMs that
[[PAGE 5]]
A Survey of Time Series Foundation Models 5
SurveyEffectiveness Efficiency Explainability Domain
Foundation Model Pre-trained from Scratch
for Time SeriesLLM Adaption for Time Series
Efficient
TuningLocal
ExplanationGlobal
ExplanationSpecific or
GeneralAdaption to
Time SeriesAlignmentTime Series
CharacteristicsMultimodal
[84] ✗ ✓ ✓ ✗ ✓ ✗ ✗ ✗ Specific
[83] ✓ ✓ ✓ ✗ ✓ ✗ ✗ ✗ Both
[154] ✗ ✓ ✓ ✓

In [8]:
# First page recorded for the example chunk (index 10 of document 1).
pdf_chunked_data[1][10]['start_page']

4

In [9]:
# Last page recorded for the example chunk (index 10 of document 1).
pdf_chunked_data[1][10]['end_page']

5

## Create VectorDB

In [10]:
def return_vectordb_full_text_and_questions():
    '''
    Build the Elasticsearch handles used by this notebook.

    Returns a dict with:
    collection_full_text: ElasticsearchStore on the "collection_full_text" index.
    collection_questions_text: ElasticsearchStore on the "collection_questions_text" index.
    es: Elasticsearch client created with basic authentication.
    '''
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain_elasticsearch import ElasticsearchStore
    from elasticsearch import Elasticsearch

    # One embedding client, shared by both stores.
    embedder = AzureOpenAIEmbeddings(
        model = 'text-embedding-3-small',
        api_key = EMBEDDING_KEY,
        deployment = DEPLOYMENT_NAME_EMBEDDING,
        azure_endpoint = AZURE_ENDPOINT_EMBEDDING
    )

    # Both stores differ only in the index they point at; the dict key and
    # the index name are the same string.
    handles = {
        store_name: ElasticsearchStore(
            es_url = ELASTICSEARCH_ENDPOINT,
            index_name = store_name,
            embedding = embedder,
            es_user = ELASTICSEARCH_USER,
            es_password = ELASTICSEARCH_PASSWORD,
        )
        for store_name in ("collection_full_text", "collection_questions_text")
    }

    # Plain client for queries issued directly against Elasticsearch.
    handles['es'] = Elasticsearch(
        ELASTICSEARCH_ENDPOINT,
        basic_auth=(ELASTICSEARCH_USER, ELASTICSEARCH_PASSWORD)
    )

    return handles

In [11]:
# Index names of the two collections kept in the vector DB.
index_name_collection_full_text = "collection_full_text"
index_name_collection_questions_text = "collection_questions_text"

# Open the LangChain stores and the direct Elasticsearch client.
dict_vectordb = return_vectordb_full_text_and_questions()

In [12]:
# get_highest_vectordb_id
# Get the highest id in vector db so we can add new entries.
def get_highest_vectordb_id(es_fc, index_name):
    """Return the largest ``metadata.vector_db_id`` stored in an index.

    Args:
        es_fc: Elasticsearch client; its ``indices.exists`` and ``search``
            methods are used.
        index_name: name of the index to inspect.

    Returns:
        The highest id as an ``int``, or ``0`` when the index does not exist
        (a message is printed in that case) or holds no ``vector_db_id``.
    """
    # Check if the index exists
    if not es_fc.indices.exists(index=index_name):
        print(f"Index '{index_name}' does not exist.")
        return 0

    query = {
        "size": 0,  # We don't need to return any actual documents, just the aggregation
        "aggs": {
            "max_vector_db_id": {
                "max": {
                    "field": "metadata.vector_db_id"
                }
            }
        }
    }

    # Perform the search with the aggregation
    response = es_fc.search(index = index_name, body = query)

    # Extract the maximum value from the aggregation response
    max_vector_db_id = response['aggregations']['max_vector_db_id']['value']
    if max_vector_db_id is None:
        return 0
    # The aggregation value comes back as a float (e.g. 111.0). Ids are whole
    # numbers, so return an int to keep ids derived from it integral.
    return int(max_vector_db_id)

In [13]:
# Look up the highest vector_db_id already stored in each collection;
# new documents continue the numbering from there.
highest_id_collection_full_text = get_highest_vectordb_id(
    dict_vectordb['es'], index_name_collection_full_text
)
print('----------------------------------------------')
highest_id_collection_questions_text = get_highest_vectordb_id(
    dict_vectordb['es'], index_name_collection_questions_text
)
print(
    'Both collection should have the same max index.',
    f"Largest index for collection full_text: {highest_id_collection_full_text}",
    f"Largest index for collection questions: {highest_id_collection_questions_text}",
    sep="\n",
)

Index 'collection_full_text' does not exist.
----------------------------------------------
Index 'collection_questions_text' does not exist.
Both collection should have the same max index.
Largest index for collection full_text: 0
Largest index for collection questions: 0


## Question-Augmented Vector Retrieval (QAVR)

A dual-vector storage approach for contextual augmentation.

- Collection for Texts: Storing the original text chunks in one collection for direct semantic retrieval.
- Collection for Hypothetical Questions: Creating another collection with hypothetical questions that each text chunk could answer. This enhances retrieval by matching user queries with questions semantically similar to their intent, rather than directly to the text.

Related Concepts:
- Augmented Retrieval: Augmenting the dataset with additional metadata, in this case, hypothetical questions.
- Embedding-based Retrieval with Intent Mapping: Mapping potential user intents (questions) to the text that best answers them.
- Query Expansion: While query expansion typically involves modifying the user's query, this approach instead expands the dataset so that it covers a broader range of queries.

In [14]:
# llm_hypothetical_questions
def llm_hypothetical_questions():
    """Build the chain and instruction used to generate hypothetical questions.

    Returns:
        A dict with:
        'llm_chain': the prompt template piped into the Azure chat model; it
            is invoked with the keys ``context`` and ``question``.
        'question': the instruction asking for questions that the given
            context can answer.
    """
    from langchain_openai import AzureChatOpenAI
    from langchain.prompts import PromptTemplate

    # Model used
    llm = AzureChatOpenAI(
        deployment_name = DEPLOYMENT_NAME_LLM,
        model_name = "gpt-4o-mini",
        api_version = API_VERSION,
        azure_endpoint = AZURE_ENDPOINT_LLM,
        api_key = AZURE_OPENAI_API_KEY_2,
    )
    # No embedding client is built here: the chain only needs the chat model,
    # and the one previously created in this function was never used.

    # Define template for answers
    # Build prompt
    template = """Use the following pieces of context to answer the question at the end.
    {context}
    If you can't make a answer with context, just say that you don't know, don't try to make up an answer.
    Do not hallucinate.
    Question: {question}
    Helpful Answer:"""

    prompt = PromptTemplate.from_template(template)
    llm_chain = prompt | llm

    question = """Make as many relevant specific and/or generic questions that the above text can answer.
    If you can't make a question with context, just don't say anything, don't try to make up an questions just to fill the quota."""

    return {
        'llm_chain': llm_chain,
        'question': question,
    }

In [15]:
# Build the question-generation chain once and keep its two parts at hand:
# the runnable chain and the instruction passed to it as the "question".
dict_llm_hypothetical_questions = llm_hypothetical_questions()
llm_chain = dict_llm_hypothetical_questions["llm_chain"]
question = dict_llm_hypothetical_questions["question"]

In [16]:
# Introductory text for the agent; it is indexed as a sanity-check document.
text_who_am_I = (
    "\n"
    "Hello.\n"
    "Who am I?\n"
    "I'm just a test to verify it this documment was correctly included in the vectorDB.\n"
)
print(text_who_am_I)


Hello.
Who am I?
I'm just a test to verify it this documment was correctly included in the vectorDB.



In [17]:
# Add documents in vector db
# Next free id. The highest id read from Elasticsearch can be a float
# (e.g. 111.0), so it is converted to keep the stored ids whole numbers.
vector_db_id = int(highest_id_collection_full_text) + 1

# Prepare bulk data
bulk_full_text = []
bulk_questions_text = []

print('Adding documents to Elasticsearch:')

# Add an introductory text
document_name = "Agent_description"
# The text chunk and its generated questions carry the same metadata.
intro_metadata = {
    "document_name": document_name,
    "vector_db_id": vector_db_id,
    'first_page': -1, # This is a custom added text, so there isn't a first and last page.
    'last_page': -1,
    'added_by': 'default',
}

# Add full text chunk
bulk_full_text.append(
    Document(page_content = text_who_am_I, metadata = dict(intro_metadata))
)

# Make questions
context = text_who_am_I
result = llm_chain.invoke({"context": context, "question": question})

# Add questions in the collection
bulk_questions_text.append(
    Document(page_content = result.content, metadata = dict(intro_metadata))
)
vector_db_id += 1

# Bulk index the data into Elasticsearch, one fresh UUID per document
uuids_bulk_full_text = [str(uuid4()) for _ in bulk_full_text]
dict_vectordb['collection_full_text'].add_documents(documents = bulk_full_text,
                                                         ids = uuids_bulk_full_text)

uuids_bulk_questions_text = [str(uuid4()) for _ in bulk_questions_text]
dict_vectordb['collection_questions_text'].add_documents(documents = bulk_questions_text,
                                                              ids = uuids_bulk_questions_text)

Adding documents to Elasticsearch:


['e32400da-9ced-440d-86ff-24d18a0d0e5f']

In [18]:
# Add papers
# Start from empty batches. The lists may still hold documents that an
# earlier cell already indexed; appending to them would send those documents
# to Elasticsearch again under new UUIDs and create duplicates.
bulk_full_text = []
bulk_questions_text = []

for key in pdf_chunked_data:
    document_name = dict_documents[key]
    for e in tqdm(pdf_chunked_data[key]):
        context = e['chunk']

        # The text chunk and its generated questions carry the same metadata.
        chunk_metadata = {
            "document_name": document_name,
            "vector_db_id": vector_db_id,
            'first_page': e['start_page'],
            'last_page': e['end_page'],
            'added_by': 'default',
        }

        # Add full text chunk
        bulk_full_text.append(
            Document(page_content = context, metadata = dict(chunk_metadata))
        )

        # Make question using LLM chain
        result = llm_chain.invoke({"context": context, "question": question})

        # Add question response chunk
        bulk_questions_text.append(
            Document(page_content = result.content, metadata = dict(chunk_metadata))
        )

        vector_db_id += 1

# Bulk index the data into Elasticsearch, one fresh UUID per document
uuids_bulk_full_text = [str(uuid4()) for _ in bulk_full_text]
dict_vectordb['collection_full_text'].add_documents(documents = bulk_full_text,
                                                         ids = uuids_bulk_full_text)

uuids_bulk_questions_text = [str(uuid4()) for _ in bulk_questions_text]
dict_vectordb['collection_questions_text'].add_documents(documents = bulk_questions_text,
                                                              ids = uuids_bulk_questions_text)

print("Documents added to Elasticsearch successfully.")

100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [04:28<00:00,  2.44s/it]


Documents added to Elasticsearch successfully.


## Add customized document

In [19]:
# Metadata stored with the custom text
person_name = "Max Wienandts"
document_name = "Custon added"
# Free text has no page numbers; -1 marks both ends as "not applicable".
first_page = last_page = -1

# The custom text itself
text_to_add = (
    "\n"
    "This is a brand new text.\n"
    "The correct number to choose is 42.\n"
)

In [20]:
# Split the custom text with the same splitter settings as the papers.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_text(text_to_add)

# Highest id currently stored, so that the new chunks continue the numbering.
highest_id_collection_full_text = get_highest_vectordb_id(
    dict_vectordb['es'], index_name_collection_full_text
)
print(f'Largest index: {highest_id_collection_full_text}')

Largest index: 111.0


In [27]:
# Add new text in the full text collection
# Start from empty batches. The lists may still hold documents that earlier
# cells already indexed; appending to them would send those documents to
# Elasticsearch again under new UUIDs and create duplicates.
bulk_full_text = []
bulk_questions_text = []

# The highest id read from Elasticsearch can be a float (e.g. 111.0), so it
# is converted to keep the stored ids whole numbers.
vector_db_id = int(highest_id_collection_full_text) + 1
for e in tqdm(chunks):
    context = e

    # The text chunk and its generated questions carry the same metadata.
    chunk_metadata = {
        "document_name": document_name,
        "vector_db_id": vector_db_id,
        'first_page': first_page,
        'last_page': last_page,
        'added_by': person_name,
    }

    # Add full text chunk
    bulk_full_text.append(
        Document(page_content = context, metadata = dict(chunk_metadata))
    )

    # Make question using LLM chain
    result = llm_chain.invoke({"context": context, "question": question})

    # Add question response chunk
    bulk_questions_text.append(
        Document(page_content = result.content, metadata = dict(chunk_metadata))
    )

    vector_db_id += 1

# Bulk index the data into Elasticsearch, one fresh UUID per document
uuids_bulk_full_text = [str(uuid4()) for _ in bulk_full_text]
dict_vectordb['collection_full_text'].add_documents(documents = bulk_full_text,
                                                         ids = uuids_bulk_full_text)

uuids_bulk_questions_text = [str(uuid4()) for _ in bulk_questions_text]
dict_vectordb['collection_questions_text'].add_documents(documents = bulk_questions_text,
                                                              ids = uuids_bulk_questions_text)

print("Documents added to Elasticsearch successfully.")

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.03it/s]


Documents added to Elasticsearch successfully.


## Verify that both collections have the same highest ID

In [29]:
# Read the highest vector_db_id of each collection again, after all inserts,
# so the two values can be compared.
highest_id_collection_full_text = get_highest_vectordb_id(
    dict_vectordb['es'], index_name_collection_full_text
)
print('----------------------------------------------')
highest_id_collection_questions_text = get_highest_vectordb_id(
    dict_vectordb['es'], index_name_collection_questions_text
)
print(
    'Both collection should have the same max index.',
    f"Largest index for collection full_text: {highest_id_collection_full_text}",
    f"Largest index for collection questions: {highest_id_collection_questions_text}",
    sep="\n",
)

----------------------------------------------
Both collection should have the same max index.
Largest index for collection full_text: 112.0
Largest index for collection questions: 112.0
