In [40]:
import fitz
import re

In [41]:
#block[0]: The x0 coordinate (the left side of the block).
#
#block[1]: The y0 coordinate (the top side of the block).
#
#block[2]: The x1 coordinate (the right side of the block).
#
#block[3]: The y1 coordinate (the bottom side of the block).
#
#block[4]: The actual string of text contained in the block.
#
#block[5]: The sequential number of the block on the page.
#
#block[6]: The type of the block.

#0 = a text block

#1 = an image block

In [42]:
def get_all_blocks(pdf_path):
    all_blocks = []
    try:
        with fitz.open(pdf_path) as doc:
            if doc.name == "A Survey on the Memory Mechanism of Large Language Model based Agents.pdf" or doc.name == "The Rise and Potential of Large Language Model Based Agents.pdf":
                doc.delete_pages(from_page = 1, to_page = 2)
            
            for page in doc:
                blocks = page.get_text('blocks')
                for block in blocks:
                    if block[6] == 0:
                        block_text = block[4].strip()
                        all_blocks.append(block_text)
    except Exception as e:
        print(e)
        return []
    return all_blocks

In [43]:
def filter_references(text_blocks: list[str]) -> list[str]:
    
    REFERENCE_HEADING_REGEX = re.compile(r"^(references|bibliography|works cited)$", re.IGNORECASE)
    
    APPENDIX_HEADING_REGEX = re.compile(r"^(Appendix|[A-Z])(\.|\s+)")
    
    clean_blocks = []
    in_references_section = False

    for block_text in text_blocks:
        block_text_clean = block_text.strip()
        block_text_lower = block_text_clean.lower()

        if in_references_section:
            if APPENDIX_HEADING_REGEX.match(block_text_clean):
                in_references_section = True 
                continue
            else:
                continue 
        
        elif REFERENCE_HEADING_REGEX.match(block_text_lower):
            in_references_section = True 
            continue

        if not in_references_section:
            clean_blocks.append(block_text)
            
    return clean_blocks

In [44]:
def filter_noise_and_captions(text_blocks: list[str]) -> list[str]:

    caption_regex = re.compile(r"^(Figure|Fig\.|Table)\s+\d+[:\.]?", re.IGNORECASE)
    
    header_regex = re.compile(
        r"^(Published in|Front\. Comput\. Sci\.|arXiv:|https|^\d+$)", 
        re.IGNORECASE
    )
    
    
    noise_regex = re.compile(r"^(\*|†|‡)")
    
    heading_regex = re.compile(r"^\d+(\.\d+)*\s+[A-Za-z]")
    
    figure_content_regex = re.compile(
        r"^(<LM>|<<run:|<<<read:|tennis_balls =|calculate\.py|answer = tennis_balls)", 
        re.IGNORECASE
    )

    MIN_LENGTH = 25 

    final_clean_blocks = []
    
    for block_text in text_blocks:
        
        if (header_regex.match(block_text) or 
            noise_regex.match(block_text) or 
            figure_content_regex.match(block_text)):
            continue

        if caption_regex.match(block_text):
            continue 

        if heading_regex.match(block_text):
            final_clean_blocks.append(block_text)
            continue
            
        if len(block_text) < MIN_LENGTH:
            continue
        
        final_clean_blocks.append(block_text)
            
    return final_clean_blocks

In [45]:
all_document_data = []
pdf_file1 = "A Review of Prominent Paradigms for LLMBased_Agent_Tool_Use_Including_RAG _Planning_and_Feedback_Learning.pdf"

pdf_file2 = "A Survey on Large Language Model based Autonomous Agents.pdf"
pdf_file3 = "A Survey on the Memory Mechanism of Large Language Model based Agents.pdf"
pdf_file4 = "Augmented Language Models.pdf"
pdf_file5 = "The Rise and Potential of Large Language Model Based Agents.pdf"
pdf_file6 = "Understanding the planning of LLM agents.pdf"

#doc.save(pdf_file3_output,garbage = 4, deflate = True)


pdf_list =[pdf_file1, pdf_file2,pdf_file3, pdf_file4,pdf_file5, pdf_file6]

for pdf in pdf_list:
    
    all_blocks = get_all_blocks(pdf)
    print(f"total block number: {len(all_blocks)}")
    
    main_blocks = filter_references(all_blocks)
    print(f"Main blocks: {len(main_blocks)}")
    
    final_clean_blocks = filter_noise_and_captions(main_blocks)
    print(f"Last block number: {len(final_clean_blocks)}.")
    
    final_text = "\n\n".join(final_clean_blocks)
    
    document_info1 = {
        "text_content": final_text,
        "source_file": pdf
    }
    all_document_data.append(document_info1)
    print("="*50)

total block number: 254
Main blocks: 119
Last block number: 104.
total block number: 518
Main blocks: 402
Last block number: 249.
total block number: 600
Main blocks: 413
Last block number: 309.
total block number: 443
Main blocks: 229
Last block number: 155.
total block number: 1314
Main blocks: 600
Last block number: 428.
total block number: 205
Main blocks: 170
Last block number: 87.


# Chunking

In [46]:
all_document_data[0].keys()

dict_keys(['text_content', 'source_file'])

In [47]:
all_chunk = []
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
)
for doc in all_document_data:
    text_content = doc['text_content']
    source_file = doc['source_file']

    chunks = text_splitter.create_documents([text_content])

    
    for i,chunk in enumerate(chunks):
        chunk.metadata['source'] = source_file
        chunk.metadata['chunk_index'] = i

    all_chunk.extend(chunks)

In [48]:
import pickle
file_path = 'all_chunk_data.pkl'
with open(file_path,"wb") as file:
    pickle.dump(all_chunk,file)

In [49]:
print(all_chunk[0].page_content)

A Review of Prominent Paradigms for LLM-Based Agents: Tool Use
(Including RAG), Planning, and Feedback Learning

Xinzhe Li *
Independent Researcher
sergioli212@outlook.com


In [50]:
print(all_chunk[0].metadata)

{'source': 'A Review of Prominent Paradigms for LLMBased_Agent_Tool_Use_Including_RAG _Planning_and_Feedback_Learning.pdf', 'chunk_index': 0}


# "R" Retrieval

In [51]:
from dotenv import load_dotenv
load_dotenv()

True

In [52]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model = "gemini-2.5-flash")

In [53]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [54]:
db_directory = "rag_db"

vector_db = Chroma.from_documents(
    documents=all_chunk,
    embedding=embedding_model,
    persist_directory=db_directory
)

In [55]:
query = "How to effectively evaluate the memory module?"
results_with_scores = vector_db.similarity_search_with_score(query, k=5)
score_threshold = 0.8
relevant_docs = [doc for doc, score in results_with_scores if score < score_threshold]

if not relevant_docs:
    print("No sufficiently similar answer was found for this question.")
else:
    context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_docs])
    print(context_text)

6
How to Evaluate the Memory in LLM-based Agent

How to effectively evaluate the memory module remains an open problem, where diverse evaluation
strategies have been proposed in previous works according to different applications. To clearly
show the common ideas of different evaluation methods, in this section, we summarize a general
framework, which includes two broad evaluation strategies (see Figure 5 for an overview), that
is, (1) direct evaluation, which independently measures the capability of the memory module. (2)
indirect evaluation, which evaluates the memory module via end-to-end agent tasks. If the tasks can
be effectively accomplished, the memory module is demonstrated to be useful.

6.1
Direct Evaluation

---

6
How to Evaluate the Memory in LLM-based Agent

How to effectively evaluate the memory module remains an open problem, where diverse evaluation
strategies have been proposed in previous works according to different applications. To clearly
show the common ideas of 

# "A" Augmented and "G" Generation

In [56]:
import os
from langchain_core.prompts import PromptTemplate
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever,MultiQueryRetriever

prompt_template_string = """
CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
Act as a helpful expert. Provide a clear and direct answer to the question using only the information in the context.
- You can perform simple calculations like unit conversions (e.g., pounds to kg) to make the answer more helpful.
- If the answer is not in the context, state that the document does not contain this information.
- Answer directly without starting your response with "Based on the context....
"""

custom_prompt = PromptTemplate(
    template = prompt_template_string,
    input_variables = ['context', 'question']
)

bm25_retriver = BM25Retriever.from_documents(
    documents=all_chunk
)

similarity_retriever = vector_db.as_retriever(
    search_type = "similarity",
    search_kwargs = {'k':7}
)

ensemble_retriver = EnsembleRetriever(
    retrievers = [bm25_retriver,similarity_retriever],
    weights = [0.3,0.7]
)
multiquery_esemble_retriever = MultiQueryRetriever.from_llm(
    llm = llm, 
    retriever = ensemble_retriver
)


rag_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = 'stuff',
    retriever = multiquery_esemble_retriever,
    chain_type_kwargs = {"prompt": custom_prompt},
    return_source_documents = True
)


In [57]:
query = "How to effectively evaluate the memory module?"
rag_chain.invoke(query)

{'query': 'How to effectively evaluate the memory module?',
 'result': "To effectively evaluate the memory module, two broad strategies can be employed: direct evaluation and indirect evaluation.\n\n**1. Direct Evaluation:**\nThis strategy independently measures the capability of the memory module.\n*   **Process:**\n    *   **Human Evaluator Selection:** Evaluators should be familiar with the evaluation task and have diverse backgrounds to minimize subjective biases.\n    *   **Output Labeling:** This can be done by directly scoring the results to obtain absolute and quantitative evaluations, or by making comparisons between two candidates to reduce labeling noises.\n    *   **Rating Granularity:** The granularity of ratings should be carefully designed; not too coarse (which may not effectively discriminate capabilities) nor too fine-grained (which may require excessive effort for judgments).\n*   **Metrics:**\n    *   **F1-score:** Calculated as `2 * Precision * Recall`, where `Prec