# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval

In [1]:
# NOTE: An OpenAI API key must be present in the environment for application
# initialization, even if no OpenAI model is in use.
# Prefer exporting a real key in your shell / kernel environment before starting
# Jupyter rather than hardcoding it in the notebook. If no key is set, a
# placeholder string is installed so initialization still succeeds.
import os

# setdefault keeps a key that is already exported instead of clobbering it with
# the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "not_used")

In [2]:
# Cinderella story defined in sample.txt
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("demo/sample.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Preview only the first 100 characters to keep the cell output short.
print(text[:100])

The wife of a rich man fell sick, and as she felt that her end
was drawing near, she called her only


1) **Building**: RAPTOR recursively embeds, clusters, and summarizes chunks of text to construct a tree with varying levels of summarization from the bottom up. You can create a tree from the text in 'sample.txt' using `RA.add_documents(text)`.

2) **Querying**: At inference time, the RAPTOR model retrieves information from this tree, integrating data across lengthy documents at different abstraction levels. You can perform queries on the tree with `RA.answer_question`.

### Building the tree

In [3]:
from raptor import RetrievalAugmentation

In [4]:
# No config is passed here, so the library chooses the QA, embedding and
# summarization models itself (the log below reports each one as
# "not provided in config").
RA = RetrievalAugmentation()

2025-12-24 09:28:09,753 - Use pytorch device_name: mps
2025-12-24 09:28:09,754 - Load pretrained SentenceTransformer: nomic-ai/modernbert-embed-base


Start initializing RetrievalAugmentation...
Validating QA model...
QA model not provided in config
Validating embedding model...
Embedding model not provided in config
Validating summarization model...
Summarization model not provided in config
Setting TreeBuilderConfig...


2025-12-24 09:28:11,269 - Use pytorch device_name: mps
2025-12-24 09:28:11,271 - Load pretrained SentenceTransformer: nomic-ai/modernbert-embed-base


Setting TreeRetrieverConfig...
Embedding model not provided, defaulting to SBertEmbeddingModel
2 config done


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /google/flan-t5-small/resolve/main/config.json (Caused by ConnectTimeoutError(<HTTPSConnection(host='huggingface.co', port=443) at 0x319cf0ec0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: ca4ac8b5-a88d-4d36-a905-8d4cba8d3393)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-small/resolve/main/config.json
2025-12-24 09:28:31,612 - '(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /google/flan-t5-small/resolve/main/config.json (Caused by ConnectTimeoutError(<HTTPSConnection(host='huggingface.co', port=443) at 0x319cf0ec0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: ca4ac8b5-a88d-4d36-a905-8d4cba8d3393)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].
2025-12-24

In [5]:
# Construct the tree and the corresponding retriever from the loaded story.
# Per the log below, this embeds the leaf chunks and then issues several
# summarization requests, so the cell can take a while to run.
RA.add_documents(text)

2025-12-24 09:29:00,387 - Creating Leaf Nodes


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:04,016 - Created 35 Leaf Embeddings
2025-12-24 09:29:04,017 - Building All Nodes
2025-12-24 09:29:04,018 - Using Cluster TreeBuilder
2025-12-24 09:29:04,018 - Constructing Layer 0
2025-12-24 09:29:07,977 - Summarization Length: 100
2025-12-24 09:29:08,325 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:12,945 - Node Texts Length: 853, Summarized Text Length: 98


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:13,454 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:18,437 - Node Texts Length: 290, Summarized Text Length: 100


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:18,643 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:23,215 - Node Texts Length: 473, Summarized Text Length: 101


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:23,574 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:28,928 - Node Texts Length: 286, Summarized Text Length: 101


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:29,305 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:34,170 - Node Texts Length: 200, Summarized Text Length: 99


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:34,380 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:39,645 - Node Texts Length: 407, Summarized Text Length: 102


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:39,930 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:44,673 - Node Texts Length: 384, Summarized Text Length: 101


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:44,912 - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-12-24 09:29:49,579 - Node Texts Length: 388, Summarized Text Length: 99


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-12-24 09:29:49,749 - Constructing Layer 1
2025-12-24 09:29:49,749 - Stopping Layer construction: Cannot Create More Layers. Total Layers in tree: 1
2025-12-24 09:29:49,750 - Successfully initialized TreeRetriever with Config 
        TreeRetrieverConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Context Embedding Model: SBERT
            Embedding Model: <raptor.EmbeddingModels.SBertEmbeddingModel object at 0x319a95bd0>
            Num Layers: None
            Start Layer: None
        


### Querying from the tree

```python
question = "..."  # any question
RA.answer_question(question)
```

In [10]:
# `question` is read again by the reload check further down, so keep the name.
question = "What is this story about? Summarize it in a few sentences."

# Query the tree; the retrieved context is printed in the cell output.
answer = RA.answer_question(question=question)

print("Answer: ", answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved Context:  This summary includes the key details from the provided text:  A wealthy man’s wife, on her deathbed, tells her only daughter to remain good and pious, promising that God will protect her and that she will watch over her from heaven. After the mother dies, the devout daughter visits her grave daily to weep.  When winter comes, snow covers the grave. By spring, the girl’s father has remarried. His new wife brings two daughters of her own—beautiful in appearance but

Based on the provided text, here is a summary including the key details:  A king's son attends a dance where he is captivated by a beautiful maiden. He approaches her, takes her hand, and dances exclusively with her, declaring to others that she is his partner. When evening comes and she tries to leave, the prince insists on accompanying her to see where she lives.  To escape him, she runs away and springs into the garden behind a house. There, she nimbly climbs a tall

Based on the provided text, here is

In [None]:
# Save the tree by calling RA.save("path/to/save").
# SAVE_PATH is read again by the next cell to load the tree back.
SAVE_PATH = "demo/cinderella_myself"
RA.save(SAVE_PATH)

In [None]:
# Load the tree back by passing the saved path as `tree` to RetrievalAugmentation.
# This rebinds RA, replacing the instance that built the tree.

RA = RetrievalAugmentation(tree=SAVE_PATH)

In [None]:
# Ask the same `question` (defined in the querying cell) against the reloaded
# tree to check that it still answers.
answer = RA.answer_question(question=question)
print("Answer: ", answer)

## Using other Open Source Models for Summarization/QA/Embeddings

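If you want to use other open-source models such as Llama, Mistral, or Gemma, you can define your own model classes by extending RAPTOR's base classes and pass them in through `RetrievalAugmentationConfig`. The example below uses Gemma 2B for summarization and QA, and a Sentence-Transformers model for embeddings.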

In [None]:
import torch
from raptor import (
    BaseSummarizationModel,
    BaseQAModel,
    BaseEmbeddingModel,
    RetrievalAugmentationConfig,
)
from transformers import AutoTokenizer, pipeline

In [None]:
# To use Gemma you need to authenticate with Hugging Face.
# Skip this step if you already have the model downloaded.
from huggingface_hub import login

login()

In [None]:
from transformers import AutoTokenizer, pipeline
import torch


# You can define your own Summarization model by extending the base Summarization Class.
class GEMMASummarizationModel(BaseSummarizationModel):
    """Summarization model backed by a Hugging Face text-generation pipeline.

    Args:
        model_name: Hugging Face model id used for both the tokenizer and the
            generation pipeline.
    """

    def __init__(self, model_name="google/gemma-2b-it"):
        # Initialize the tokenizer and the pipeline for the GEMMA model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.summarization_pipeline = pipeline(
            "text-generation",
            model=model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            ),  # Use "cpu" if CUDA is not available
        )

    def summarize(self, context, max_tokens=150):
        """Generate a summary of ``context``.

        Args:
            context: Text to summarize; it is interpolated into the user prompt.
            max_tokens: Upper bound on newly generated tokens
                (passed as ``max_new_tokens``).

        Returns:
            The generated summary with the prompt removed and surrounding
            whitespace stripped.
        """
        # Format the prompt for summarization
        messages = [
            {
                "role": "user",
                "content": f"Write a summary of the following, including as many key details as possible: {context}:",
            }
        ]

        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Generate the summary using the pipeline. Sampling is enabled, so the
        # output is not deterministic across calls.
        outputs = self.summarization_pipeline(
            prompt,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )

        generated = outputs[0]["generated_text"]
        # By default the text-generation pipeline returns prompt + completion.
        # Drop the prompt so the stored summary does not contain the
        # instructions and the full source text (GEMMAQAModel does the same).
        if generated.startswith(prompt):
            generated = generated[len(prompt) :]
        return generated.strip()

In [None]:
class GEMMAQAModel(BaseQAModel):
    """Question-answering model backed by a Hugging Face text-generation pipeline.

    Args:
        model_name: Hugging Face model id used for both the tokenizer and the
            generation pipeline.
    """

    def __init__(self, model_name="google/gemma-2b-it"):
        # Run on the GPU when one is available, otherwise fall back to the CPU.
        run_on = "cuda" if torch.cuda.is_available() else "cpu"

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.qa_pipeline = pipeline(
            "text-generation",
            model=model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=torch.device(run_on),
        )

    def answer_question(self, context, question):
        """Answer ``question`` using ``context`` as supporting text.

        Args:
            context: Retrieved text that is placed in the prompt.
            question: The question to answer.

        Returns:
            The generated text with the first ``len(prompt)`` characters
            removed (not stripped of whitespace).
        """
        # Single user turn carrying both the context and the question.
        user_turn = {
            "role": "user",
            "content": f"Given Context: {context} Give the best full answer amongst the option to question {question}",
        }
        prompt = self.tokenizer.apply_chat_template(
            [user_turn], tokenize=False, add_generation_prompt=True
        )

        # Sampling settings for generation; sampling is on, so answers vary
        # between calls.
        sampling = dict(
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )
        outputs = self.qa_pipeline(prompt, **sampling)

        # Slicing assumes generated_text begins with the prompt
        # (the pipeline's default full-text output -- confirm if changed).
        first_result = outputs[0]
        return first_result["generated_text"][len(prompt) :]

In [None]:
from sentence_transformers import SentenceTransformer


class SBertEmbeddingModel(BaseEmbeddingModel):
    """Embedding model that delegates to a ``SentenceTransformer``.

    Args:
        model_name: Model id handed to ``SentenceTransformer``.
    """

    def __init__(self, model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"):
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def create_embedding(self, text):
        """Return whatever ``self.model.encode`` produces for ``text``."""
        embedding = self.model.encode(text)
        return embedding

In [None]:
# Bundle the three custom models into one config object.
# Note: GEMMASummarizationModel() and GEMMAQAModel() each build their own
# pipeline in __init__ with the same default model id, so Gemma is loaded twice.
RAC = RetrievalAugmentationConfig(
    summarization_model=GEMMASummarizationModel(),
    qa_model=GEMMAQAModel(),
    embedding_model=SBertEmbeddingModel(),
)

In [None]:
# Rebind RA to a fresh instance that uses the custom models bundled in RAC.
RA = RetrievalAugmentation(config=RAC)

In [None]:
# Re-read the story so this section can be followed on its own.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("demo/sample.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Build a new tree using the custom models configured on RA.
RA.add_documents(text)

In [None]:
# Query the tree that was built with the custom models.
question = "How did Cinderella reach her happy ending?"

answer = RA.answer_question(question=question)

print("Answer: ", answer)