In [1]:
# !pip install PyPDF2
# !pip install langchain_openai

In [2]:
import os
import re
import time

import tqdm
from dotenv import load_dotenv

from PyPDF2 import PdfReader

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
import chromadb.utils.embedding_functions as embedding_functions
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter

from langchain.prompts import PromptTemplate

In [3]:
def _require_env(variable_name):
    """Return the value of an environment variable, raising ValueError if it is unset or empty."""
    value = os.getenv(variable_name)
    if not value:
        raise ValueError(f"{variable_name} not found in environment variables")
    return value


# Load the .env file, then fail fast on the first required setting that is missing.
load_dotenv()

# Chat model (LLM) settings.
# Note: the key is read from AZURE_OPENAI_API_KEY but stored under a "_2" suffixed name.
AZURE_OPENAI_API_KEY_2 = _require_env('AZURE_OPENAI_API_KEY')
DEPLOYMENT_NAME_LLM = _require_env('DEPLOYMENT_NAME_LLM')
API_VERSION = _require_env('API_VERSION')
AZURE_ENDPOINT_LLM = _require_env('AZURE_ENDPOINT_LLM')

# Embedding model settings.
EMBEDDING_KEY = _require_env('EMBEDDING_KEY')
DEPLOYMENT_NAME_EMBEDDING = _require_env('DEPLOYMENT_NAME_EMBEDDING')
AZURE_ENDPOINT_EMBEDDING = _require_env('AZURE_ENDPOINT_EMBEDDING')
API_BASE_EMBEDDING = _require_env('API_BASE_EMBEDDING')

In [4]:
# Chunking parameters handed to the text splitter.
chunk_size = 2000
chunk_overlap = 400

# Documents to index, keyed by a numeric document id.
dict_documents = {
    1: 'A Survey of Time Series Foundation Models Generalizing Time Series.pdf',
    2: 'DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf',
}

# Path of each selected file inside the data/ folder, under the same keys.
pdf_path = {doc_id: "data/" + file_name for doc_id, file_name in dict_documents.items()}

# Show the resolved paths, one per line.
print(*pdf_path.values(), sep="\n")


data/A Survey of Time Series Foundation Models Generalizing Time Series.pdf
data/DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via.pdf


In [5]:
def extract_text_with_page_markers(pdf_path):
    """Read a PDF and return its text with a "[[PAGE n]]" line inserted before each page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A single string: for every page, the marker line "[[PAGE n]]" (1-based)
        followed by that page's extracted text, with pages joined by newlines.
    """
    marked_pages = []
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # Extraction happens while the file is still open.
        for page_number, pdf_page in enumerate(pdf_reader.pages, start=1):
            marked_pages.append(f"[[PAGE {page_number}]]\n{pdf_page.extract_text()}")
    return '\n'.join(marked_pages)

# Matches the "[[PAGE n]]" markers that extract_text_with_page_markers inserts before each page.
_PAGE_MARKER_RE = re.compile(r"\[\[PAGE (\d+)\]\]")


def chunk_text_with_page_tracking(text):
    """Split marker-annotated text into chunks and record the page span of each chunk.

    Args:
        text: Text containing "[[PAGE n]]" marker lines, as produced by
            extract_text_with_page_markers.

    Returns:
        A list of dicts, one per chunk in order, with keys:
            "chunk": the chunk text (page markers are left in place),
            "start_page": page on which the chunk starts,
            "end_page": page on which the chunk ends.
        Chunks that appear before any marker has been seen get -1 for both
        pages, the same "no page" placeholder the notebook uses for custom text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunk_page_mapping = []
    # End page of the previous chunk. Initialised up front so a first chunk
    # without a marker no longer reads an unassigned variable.
    previous_end_page = -1

    for chunk in text_splitter.split_text(text):
        start_page = None
        end_page = None

        for line_index, line in enumerate(chunk.splitlines()):
            # Only well-formed markers count; document text that merely
            # contains "[[PAGE" is ignored instead of crashing the parse.
            marker = _PAGE_MARKER_RE.search(line)
            if marker is None:
                continue
            page_num = int(marker.group(1))
            if start_page is None:
                # A marker on the first line means the chunk starts exactly on
                # that page; a later marker means the chunk began on the page before.
                start_page = page_num if line_index == 0 else page_num - 1
            end_page = page_num

        # No marker in this chunk: it lies entirely on the page where the
        # previous chunk ended.
        if start_page is None:
            start_page = end_page = previous_end_page
        previous_end_page = end_page

        chunk_page_mapping.append({
            "chunk": chunk,
            "start_page": start_page,
            "end_page": end_page
        })

    return chunk_page_mapping

In [6]:
# Extract every PDF and split it into page-tracked chunks, stored under the document's key.
pdf_chunked_data = {}
for doc_key, doc_path in tqdm.tqdm(pdf_path.items()):
    pdf_chunked_data[doc_key] = chunk_text_with_page_tracking(
        extract_text_with_page_markers(doc_path)
    )

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.40s/it]


In [7]:
# Show one entry of the chunk/page mapping: the text of chunk index 10 of document 1.
# The "[[PAGE n]]" marker lines are part of the stored chunk text.
print(pdf_chunked_data[1][10]['chunk'])

line, we discuss effectiveness through two key phases: data collection and alignment, architectural design. Regarding the
second line, we identify two adaption paradigms, i.e. embedding visible LLM adaption and textual visible LLM adaption.
Under each adaption paradigm, we discuss the LLM utilization, time series extraction and multi-modal data fusion. The
time series extraction includes challenges like obtaining appropriate time series representation, aligning temporal space
and LLM space, identifying time series properties and patterns. Additionally, we examine diverse roles of LLMs that
[[PAGE 5]]
A Survey of Time Series Foundation Models 5
SurveyEffectiveness Efficiency Explainability Domain
Foundation Model Pre-trained from Scratch
for Time SeriesLLM Adaption for Time Series
Efficient
TuningLocal
ExplanationGlobal
ExplanationSpecific or
GeneralAdaption to
Time SeriesAlignmentTime Series
CharacteristicsMultimodal
[84] ✗ ✓ ✓ ✗ ✓ ✗ ✗ ✗ Specific
[83] ✓ ✓ ✓ ✗ ✓ ✗ ✗ ✗ Both
[154] ✗ ✓ ✓ ✓

In [8]:
# Start page recorded for the same chunk (index 10 of document 1).
pdf_chunked_data[1][10]['start_page']

4

In [9]:
# End page recorded for the first chunk of document 2.
pdf_chunked_data[2][0]['end_page']

2

## Question-Augmented Vector Retrieval (QAVR)

A dual-vector storage approach for contextual augmentation.

- Collection for Texts: Storing the original text chunks in one collection for direct semantic retrieval.
- Collection for Hypothetical Questions: Creating another collection with hypothetical questions that each text chunk could answer. This enhances retrieval by matching user queries with questions semantically similar to their intent, rather than directly to the text.

Related Concepts:
- Augmented Retrieval: Augmenting the dataset with additional metadata, in this case, hypothetical questions.
- Embedding-based Retrieval with Intent Mapping: Mapping potential user intents (questions) to the text that best answers them.
- Query Expansion: While query expansion typically involves modifying the user's query, this approach instead expands the dataset so that it covers a broader range of queries.

In [10]:
# Chat model used to generate the hypothetical questions.
llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_NAME_LLM,
    model_name="gpt-4o-mini",
    api_version=API_VERSION,
    azure_endpoint=AZURE_ENDPOINT_LLM,
    api_key=AZURE_OPENAI_API_KEY_2,
)

# Embedding client for LangChain.
# NOTE(review): no api_key / api_version is passed here, so this presumably
# relies on environment variables; confirm it uses the intended credentials.
embedding_function_to_langchain = AzureOpenAIEmbeddings(
    model="text-embedding-3-small",
    deployment=DEPLOYMENT_NAME_EMBEDDING,
    azure_endpoint=AZURE_ENDPOINT_EMBEDDING,
)

# Embedding function handed to the ChromaDB collections.
# Note the API version is fixed here rather than taken from API_VERSION.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key=EMBEDDING_KEY,
    api_base=API_BASE_EMBEDDING,
    api_type="azure",
    api_version="2023-05-15",
    model_name=DEPLOYMENT_NAME_EMBEDDING,
)

# Prompt for the hypothetical-questions collection: the chunk goes in as
# {context} and the instruction below goes in as {question}.
template = """Use the following pieces of context to answer the question at the end.
{context}
If you can't make a answer with context, just say that you don't know, don't try to make up an answer.
Do not hallucinate.
Question: {question}
Helpful Answer:"""

# Chain: fill the prompt, then send it to the chat model.
prompt = PromptTemplate.from_template(template)
llm_chain = prompt | llm

# Instruction passed as {question}; asks the model to write questions the chunk can answer.
question = """Make as many relevant specific and/or generic questions that the above text can answer.
If you can't make a question with context, just don't say anything, don't try to make up an questions just to fill the quota."""

## Create VectorDB

In [11]:
# Open (or create) the local Chroma database stored on disk.
persist_directory = 'embedding_OpenAI/chroma/'
client = chromadb.PersistentClient(path=persist_directory)

# Create or get the two collections, in this order: raw text chunks, then
# generated questions. Both use the same embedding function.
collection_full_text, collection_questions_text = (
    client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)
    for collection_name in ("full_text", "questions_text")
)

In [12]:
# Agent's introductory text: a small custom document used to check that
# insertion into the vector DB works. The wording is stored as-is.
text_who_am_I = (
    "\n"
    "Hello.\n"
    "Who am I?\n"
    "I'm just a test to verify it this documment was correctly included in the vectorDB.\n"
)
print(text_who_am_I)


Hello.
Who am I?
I'm just a test to verify it this documment was correctly included in the vectorDB.



In [13]:
def _store_chunk_with_questions(chunk_text, document_name, vector_db_id, first_page, last_page):
    """Store one text chunk and its LLM-generated questions under the same id.

    Args:
        chunk_text: Text stored in the full-text collection and sent to the LLM as context.
        document_name: Name of the source document, stored in the metadata.
        vector_db_id: Numeric id; stored in the metadata and, as a string, used as the record id.
        first_page: First page covered by the chunk (-1 when there is no page).
        last_page: Last page covered by the chunk (-1 when there is no page).
    """
    # Ask the LLM before writing anything: if this call fails, neither
    # collection has been touched, so the two stay in sync.
    result = llm_chain.invoke({"context": chunk_text, "question": question})

    metadata = {
        "document_name": document_name,
        "vector_db_id": vector_db_id,
        'first_page': first_page,
        'last_page': last_page,
        'added_by': 'default',
    }
    record_id = str(vector_db_id)

    # Both collections receive one-element lists for documents, metadatas and ids.
    collection_full_text.add(
        documents=[chunk_text],
        metadatas=[metadata],
        ids=[record_id],
    )
    collection_questions_text.add(
        documents=[result.content],
        metadatas=[dict(metadata)],
        ids=[record_id],
    )


# Populate both collections. Set the flag to anything other than 1 to skip this
# step (each chunk costs one LLM call).
add_text_collection_full_text = 1
if add_text_collection_full_text == 1:
    # Record 1 is the custom agent description; it has no pages, hence -1.
    vector_db_id = 1
    document_name = "Agent_description"
    _store_chunk_with_questions(text_who_am_I, document_name, vector_db_id, -1, -1)

    # Every PDF chunk follows, with ids continuing from 2 across all documents.
    vector_db_id += 1
    for key in pdf_chunked_data:
        document_name = dict_documents[key]
        for e in tqdm.tqdm(pdf_chunked_data[key]):
            _store_chunk_with_questions(
                e['chunk'],
                document_name,
                vector_db_id,
                e['start_page'],
                e['end_page'],
            )
            vector_db_id += 1

100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [04:45<00:00,  2.60s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [01:30<00:00,  2.52s/it]


## Add customized document

In [14]:
# Metadata stored with the manually added document.
person_name, document_name = "Max Wienandts", "Custon added"
# Custom text has no page numbers, so both bounds use the -1 placeholder.
first_page = last_page = -1

# Body of the custom document.
text_to_add = (
    "\n"
    "This is a brand new text.\n"
    "The correct number to choose is 42.\n"
)

In [15]:
# Chunk the new text with the same splitter settings used for the PDFs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks = text_splitter.split_text(text_to_add)

# Fetch the stored records; the numeric id of each one is kept in its
# metadata under "vector_db_id".
documents = collection_full_text.get(include=["documents", "metadatas"])

# Largest id currently in use, always as an int.
# - Records without metadata or without the key are skipped.
# - default=0 covers an empty collection, so the next id becomes 1 instead of
#   max() raising on an empty sequence.
max_index = max(
    (
        int(metadata['vector_db_id'])
        for metadata in (documents['metadatas'] or [])
        if metadata and 'vector_db_id' in metadata
    ),
    default=0,
)

print(f"Largest index: {max_index}")

Largest index: 147


In [16]:
# Add the custom text to both collections. Set the flag to anything other than
# 1 to skip this step (each chunk costs one LLM call).
add_text_collection_full_text = 1
if add_text_collection_full_text == 1:
    # Continue numbering after the largest id already stored; int() guards
    # against an id that came back from the metadata as a string.
    vector_db_id = int(max_index) + 1
    for e in tqdm.tqdm(chunks):
        # Generate the questions before writing anything, so a failed LLM call
        # cannot leave the chunk in one collection only.
        result = llm_chain.invoke({"context": e, "question": question})

        # NOTE(review): the author is stored under 'person_name' here, while the
        # initial load stores 'added_by'; confirm which key retrieval expects.
        metadata = {
            "document_name": document_name,
            "vector_db_id": vector_db_id,
            'first_page': first_page,
            'last_page': last_page,
            'person_name': person_name,
        }
        record_id = str(vector_db_id)

        # Full-text collection: the chunk itself.
        collection_full_text.add(
            documents=[e],
            metadatas=[metadata],
            ids=[record_id],
        )
        # Question collection: the generated questions, under the same id.
        collection_questions_text.add(
            documents=[result.content],
            metadatas=[dict(metadata)],
            ids=[record_id],
        )

        vector_db_id += 1

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.25s/it]


## Verify if collections have the same size

In [17]:
def _largest_vector_db_id(records):
    """Return the largest "vector_db_id" found in the metadata of a collection.get() result.

    Records without metadata or without the key are skipped; returns 0 when
    no id is found (for example, for an empty collection).
    """
    return max(
        (
            int(metadata['vector_db_id'])
            for metadata in (records['metadatas'] or [])
            if metadata and 'vector_db_id' in metadata
        ),
        default=0,
    )


# Re-open both collections by name.
collection_full_text_aux = client.get_or_create_collection(name = "full_text", embedding_function = embedding_function)
collection_questions_aux = client.get_or_create_collection(name = "questions_text", embedding_function = embedding_function)

# Fetch the stored records; the numeric ids live in the metadata.
documents_full_text = collection_full_text_aux.get(include=["documents", "metadatas"])
documents_questions_text = collection_questions_aux.get(include=["documents", "metadatas"])

max_index_full_text = _largest_vector_db_id(documents_full_text)
max_index_questions = _largest_vector_db_id(documents_questions_text)

print(f"Full text largest index: {max_index_full_text}")
print(f"Questions largest index: {max_index_questions}")

# Equal largest ids do not prove equal sizes (an entry could be missing in the
# middle), so compare the record counts as well.
size_full_text = collection_full_text_aux.count()
size_questions = collection_questions_aux.count()
print(f"Full text size: {size_full_text}")
print(f"Questions size: {size_questions}")

if size_full_text != size_questions or max_index_full_text != max_index_questions:
    print("WARNING: the full-text and questions collections are out of sync.")

Full text largest index: 148
Questions largest index: 148
