# Keywords Based Summary Index Retriever

This demo showcases the keyword-based summary index retriever over Wikipedia articles on different social media platforms.
The retriever returns nodes that contain at least one keyword from a specified list, ordered by the count of keywords in each node's content.

If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.

In [None]:
!pip install llama-index

In [None]:
import os
import openai

# NOTE: "sk-..." is a placeholder; presumably it must be replaced with a real
# OpenAI API key before running the cells below.
os.environ["OPENAI_API_KEY"] = "sk-..."
# Hand the same value to the openai module by reading it back from the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from llama_index import (
    SimpleDirectoryReader,
)
from llama_index.indices import SummaryIndex

### Load Datasets
Load Wikipedia pages on different social media platforms.

In [None]:
# Wikipedia article titles to fetch; each title is also used as the
# local file name (data/<title>.txt) and as the document id.
wiki_titles = [
    "Twitter",
    "Facebook",
    "Tiktok",
    "Instagram",
]

In [None]:
from pathlib import Path

import requests

# Download the plain-text extract of each Wikipedia article into data/<title>.txt.
# The output directory is created once, before the loop; exist_ok=True replaces
# the per-iteration "check, then create" sequence.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
        # Without a timeout a stalled connection would hang the notebook cell.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    http_response.raise_for_status()
    response = http_response.json()

    # One title is requested per call, so take the first page entry returned.
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page["extract"]

    with open(data_path / f"{title}.txt", "w", encoding="utf_8") as fp:
        fp.write(wiki_text)

In [None]:
# Read every downloaded article from data/ and collect the resulting documents.
# The first document of each file gets the wiki title as its doc id.
documents = []
for wiki_title in wiki_titles:
    reader = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"])
    docs = reader.load_data()
    docs[0].doc_id = wiki_title
    documents += docs

### Build Summary Index

In [None]:
# Build a SummaryIndex over all the documents collected in `documents`.
summary_index = SummaryIndex.from_documents(documents)

### Perform Retrieval Based on Keywords

In [None]:
# Keywords for retrieving Twitter-related nodes, written as a single
# comma-separated string; this string is what gets passed to
# retriever.retrieve() as the query.
twitter_keywords = "twitter,tweet"

All the retrieved nodes contain at least one of the keywords.

In [None]:
# Obtain a retriever from the summary index in "keyword" mode.
retriever = summary_index.as_retriever(retriever_mode="keyword")
# Run retrieval with the comma-separated keyword string as the query.
nodes = retriever.retrieve(twitter_keywords)

In [None]:
nodes

[NodeWithScore(node=TextNode(id_='a369a493-6906-405e-94c6-2a4a8f9d7472', embedding=None, metadata={'file_path': 'data\\Twitter.txt', 'file_name': 'Twitter.txt', 'file_type': 'text/plain', 'file_size': 103416, 'creation_date': '2024-01-03', 'last_modified_date': '2024-01-03', 'last_accessed_date': '2024-01-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='Twitter', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data\\Twitter.txt', 'file_name': 'Twitter.txt', 'file_type': 'text/plain', 'file_size': 103416, 'creation_date': '2024-01-03', 'last_modified_date': '2024-01-03', 'last_accessed_date': '2024-01-03'}, hash='be97ee87cf7fa0bf474da96bdec24914542a4c62bfddf06ae517fd9be8143b9b'), <NodeRela