In [15]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document, SummaryIndex
from dotenv import load_dotenv
import os
import openai
import nest_asyncio
from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    get_response_synthesizer,
)
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.llms import OpenAI


# Patch asyncio so an already-running event loop can be re-entered; later
# cells request use_async=True (presumably needed because the notebook
# kernel already runs its own loop — confirm).
nest_asyncio.apply()
# Load variables from a .env file into the process environment (presumably
# the OpenAI credentials — confirm). Being the last expression, its return
# value is what the cell displays.
load_dotenv()

True

In [16]:
wiki_titles = ["Toronto", "Seattle", "Chicago", "Boston", "Houston"]
from pathlib import Path

import requests

# Download the plain-text extract of each Wikipedia article and store it as
# data/<title>.txt for the loading cell that follows.

# Create the output directory once, before the loop; exist_ok makes the cell
# safe to re-run.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,  # left disabled: the full article is fetched
            "explaintext": True,
        },
        # Without a timeout a stalled connection would hang the cell forever.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    http_response.raise_for_status()
    response = http_response.json()

    # The "pages" mapping is keyed by page id; one title was requested, so
    # take the first (only) entry.
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page["extract"]

    # Write as UTF-8 explicitly: article text contains non-ASCII characters
    # and the platform default encoding may not be able to represent them.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [17]:
# Read every saved article back from disk, one reader per city file.
city_docs = []
for wiki_title in wiki_titles:
    reader = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"])
    docs = reader.load_data()
    # Use the city name as the id of the first loaded document so it can be
    # referenced by that name later.
    first_doc = docs[0]
    first_doc.doc_id = wiki_title
    city_docs += docs

In [18]:
# gpt-3.5-turbo at temperature 0, wrapped in the service context that is
# handed to the index build (nodes are chunked at 1024).
chatgpt = OpenAI(model="gpt-3.5-turbo", temperature=0)
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=chatgpt,
)

In [19]:
# Build the document summary index in its default mode, summarizing each
# document with an async tree_summarize synthesizer.
# NOTE(review): the recorded run of this cell stopped with a RateLimitError
# (limit 3 requests/min); use_async=True may make request bursts more
# likely — confirm before re-running on a rate-limited account.
response_synthesizer = get_response_synthesizer(
    use_async=True,
    response_mode="tree_summarize",
)
index_build_options = dict(
    service_context=service_context,
    response_synthesizer=response_synthesizer,
    show_progress=True,
)
doc_summary_index = DocumentSummaryIndex.from_documents(
    city_docs, **index_build_options
)

Parsing documents into nodes: 100%|██████████| 5/5 [00:00<00:00, 18.24it/s]
Summarizing documents:   0%|          | 0/5 [00:00<?, ?it/s]

current doc id: Toronto


Summarizing documents:  20%|██        | 1/5 [02:55<11:40, 175.01s/it]

current doc id: Seattle


Summarizing documents:  40%|████      | 2/5 [05:10<07:34, 151.50s/it]

current doc id: Chicago


Summarizing documents:  60%|██████    | 3/5 [07:34<04:56, 148.41s/it]

current doc id: Boston


Summarizing documents:  60%|██████    | 3/5 [08:52<05:54, 177.36s/it]


RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-aLklZtqyIUfXeSgPUXAPHAaB on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.

In [20]:
# Look up the stored summary for the document whose doc_id is "Boston"
# (doc ids were set to the city titles when the files were loaded); the
# returned string is displayed as the cell output.
doc_summary_index.get_document_summary("Boston")

"The provided text is about the city of Boston, Massachusetts, and covers various aspects of the city including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, government and politics, media, and infrastructure. It provides information on Boston's development over time, key events during the American Revolution, its significance in terms of education and academic research, economic sectors contributing to its economy, changes and evolution in the 20th and 21st centuries, geography and its impact on the city, climate, neighborhoods, demographic breakdown, major industries, religious composition, population changes over time, economic significance in the global context, landmarks, ethnic diversity, income levels, and major universities and colleges. It also discusses Boston's healthcare system, public safety, cultural scene, environmental initiatives, churches, air quality, water purity a

In [21]:
# Save the index's storage context to the local "index" directory.
doc_summary_index.storage_context.persist(persist_dir="index")

In [22]:
from llama_index.indices.loading import load_index_from_storage
from llama_index import StorageContext

# Rebuild the storage context from the "index" directory on disk.
storage_context = StorageContext.from_defaults(persist_dir="index")
# Pass the ServiceContext configured earlier in the notebook: without it the
# reloaded index falls back to default settings rather than the
# temperature-0 gpt-3.5-turbo / chunk_size=1024 configuration it was built with.
doc_summary_index = load_index_from_storage(
    storage_context,
    service_context=service_context,
)

Perform Retrieval from Document Summary Index
**High-level Querying**

In [23]:
# Ask a question through the index's own query engine, using async
# tree_summarize synthesis, and print the answer.
question = "What are the sports teams in Toronto?"
query_engine = doc_summary_index.as_query_engine(
    use_async=True,
    response_mode="tree_summarize",
)
response = query_engine.query(question)
print(response)

RateLimitError: Rate limit reached for gpt-3.5-turbo in organization org-aLklZtqyIUfXeSgPUXAPHAaB on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.

LLM-based Retrieval

In [None]:
from llama_index.indices.document_summary import DocumentSummaryIndexLLMRetriever

# Only the index is passed. Keyword arguments the original left commented
# out (not set here): choice_select_prompt, choice_batch_size, choice_top_k,
# format_node_batch_fn, parse_choice_select_answer_fn, service_context.
retriever = DocumentSummaryIndexLLMRetriever(doc_summary_index)

# Fetch the nodes the retriever selects for the question.
question = "What are the sports teams in Toronto?"
retrieved_nodes = retriever.retrieve(question)


In [None]:
# Report how many nodes the retriever returned.
node_count = len(retrieved_nodes)
print(node_count)

In [None]:
# Show the score and text of the first retrieved node.
top_hit = retrieved_nodes[0]
print(top_hit.score)
print(top_hit.node.get_text())

In [None]:
# use retriever as part of a query engine
from llama_index.query_engine import RetrieverQueryEngine

# configure response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# query
response = query_engine.query("What are the sports teams in Toronto?")
print(response)

Embedding-based Retrieval

In [None]:
from llama_index.indices.document_summary import DocumentSummaryIndexEmbeddingRetriever

# Only the index is passed; similarity_top_k was left commented out in the
# original and is not set here.
retriever = DocumentSummaryIndexEmbeddingRetriever(doc_summary_index)

In [None]:
# Retrieve nodes for the question; the count is the cell's displayed value.
question = "What are the sports teams in Toronto?"
retrieved_nodes = retriever.retrieve(question)
len(retrieved_nodes)

In [None]:
# Show the text of the first retrieved node.
top_hit = retrieved_nodes[0]
print(top_hit.node.get_text())

In [None]:
# use retriever as part of a query engine
from llama_index.query_engine import RetrieverQueryEngine

# configure response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# query
response = query_engine.query("What are the sports teams in Toronto?")
print(response)