In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [3]:
INPUT_DIR = "/home/ljc/data/graphrag/alltest/dataset3_v2_attack/output/20240922-175941/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [4]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 2060


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,METROPOLITAN MUSEUM OF ART,ORGANIZATION,"The Metropolitan Museum of Art, also known as ...",4212f9fd0743bef90b03bb610d6fa25a,7,5,0,b45241d70f0e43fca764df95b2b81f77,5.0,"[-0.01976795680820942, -0.026752643287181854, ...",,b45241d70f0e43fca764df95b2b81f77,7.983391,-1.992469
1,0,NEW YORK CITY,GEO,"New York City, often referred to as NYC, is th...","05a3084b7238a6947eb09d148f0fe64e,21c90b0ad75d2...",7,24,1,4119fd06010c494caa07f439b333f4c5,24.0,"[-0.08238743245601654, 0.033819448202848434, -...",,4119fd06010c494caa07f439b333f4c5,5.759282,-2.642973
2,0,EMPIRE STATE BUILDING,ORGANIZATION,The Empire State Building is a renowned skyscr...,"228412bb342df987c4eadb98cefff9e2,4212f9fd0743b...",2,17,2,d3835bf3dda84ead99deadbeac5d0d7d,17.0,"[0.012091345153748989, 0.046577271074056625, -...",,d3835bf3dda84ead99deadbeac5d0d7d,8.492938,-5.623015
3,0,MAX HOLLEIN,PERSON,Max Hollein is the director of the Metropolita...,4212f9fd0743bef90b03bb610d6fa25a,7,1,3,077d2820ae1845bcbb1803379a3d1eae,1.0,"[-0.02348496951162815, -0.007310871500521898, ...",,077d2820ae1845bcbb1803379a3d1eae,8.080127,-1.723457
4,0,CANDACE BEINECKE,PERSON,Candace Beinecke is one of the chairs of the M...,4212f9fd0743bef90b03bb610d6fa25a,7,1,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,1.0,"[-0.02386273816227913, -0.003263289574533701, ...",,3671ea0dd4e84c1a9b02c5ab2c8f4bac,7.815611,-2.098913


#### Read relationships

In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 562


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,METROPOLITAN MUSEUM OF ART,NEW YORK CITY,9.0,The Metropolitan Museum of Art is located in N...,[4212f9fd0743bef90b03bb610d6fa25a],f3d30627e19245649e497ab49bf0fa30,0,5,24,29
1,METROPOLITAN MUSEUM OF ART,THE CLOISTERS,8.0,The Cloisters is a branch of the Metropolitan ...,[4212f9fd0743bef90b03bb610d6fa25a],e3f1098c3d984bc7b5f30b9c0101f7a6,1,5,1,6
2,METROPOLITAN MUSEUM OF ART,MAX HOLLEIN,9.0,Max Hollein is the director of the Metropolita...,[4212f9fd0743bef90b03bb610d6fa25a],24b4a5f4db67418cbfa08c5316f0ab51,2,5,1,6
3,METROPOLITAN MUSEUM OF ART,CANDACE BEINECKE,7.0,Candace Beinecke is a chair of the Metropolita...,[4212f9fd0743bef90b03bb610d6fa25a],e4b707e3e6964197855b82fc66ef59e7,3,5,1,6
4,METROPOLITAN MUSEUM OF ART,HAMILTON E. JAMES,1.0,Hamilton E. James is a chair of the Metropolit...,[4212f9fd0743bef90b03bb610d6fa25a],109b8be5a8ee4180a1465cd23f019d7b,4,5,1,6


In [6]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

#### Read community reports

In [7]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
print(reports)

report_df.head()

Report records: 75
[CommunityReport(id='58', short_id='58', title='Shanghai: A Global Financial and Cultural Hub', community_id='58', summary="Shanghai is a major global center for finance, business, and culture, with significant influence in international trade and economic development. The city is home to numerous Fortune Global 500 companies and hosts major international events, underscoring its role as a key player on the global stage. Shanghai's advanced infrastructure, including the world's busiest container port and the largest metro network, further highlights its importance as a transportation and economic hub.", full_content="# Shanghai: A Global Financial and Cultural Hub\n\nShanghai is a major global center for finance, business, and culture, with significant influence in international trade and economic development. The city is home to numerous Fortune Global 500 companies and hosts major international events, underscoring its role as a key player on the global stage. Shan

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,71,# Shanghai: A Global Financial and Cultural Hu...,3,9.5,Shanghai: A Global Financial and Cultural Hub,Shanghai's impact severity rating is high due ...,"Shanghai is a major global center for finance,...",[{'explanation': 'Shanghai is recognized as a ...,"{\n ""title"": ""Shanghai: A Global Financial ...",2d37ceff-a7ad-4db7-b756-76a71c163abe
1,72,# Jing'an Temple and Its Cultural Significance...,3,7.5,Jing'an Temple and Its Cultural Significance,The impact severity rating is high due to the ...,"The community centers around Jing'an Temple, a...",[{'explanation': 'Jing'an Temple is a renowned...,"{\n ""title"": ""Jing'an Temple and Its Cultur...",c73c78cc-5ed4-4e0d-bf96-fb5cc8c55df0
2,73,# New York City and Its Global Influence\n\nNe...,3,9.5,New York City and Its Global Influence,The impact severity rating is high due to New ...,"New York City, the largest and most populous c...",[{'explanation': 'New York City is home to som...,"{\n ""title"": ""New York City and Its Global ...",346d9018-5165-4982-92cf-81d48270bad4
3,74,# New York City and the Duke of York\n\nThe co...,3,7.0,New York City and the Duke of York,The impact severity rating is high due to the ...,The community is centered around New York City...,[{'explanation': 'New York City was named afte...,"{\n ""title"": ""New York City and the Duke of...",f886aa4b-0263-45a8-941f-dbd1ce34279d
4,58,# Shanghai: A Global Financial and Cultural Hu...,2,9.0,Shanghai: A Global Financial and Cultural Hub,The impact severity rating is high due to Shan...,"Shanghai is a major global center for finance,...",[{'explanation': 'Shanghai is recognized as a ...,"{\n ""title"": ""Shanghai: A Global Financial ...",c9efa6a0-2660-4a4d-a186-47613863715a


#### Read text units

In [8]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 47


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,4212f9fd0743bef90b03bb610d6fa25a,Metropolitan Museum of Art\n\nArticle\nTalk\nR...,1193,[0dfdee24ef4be3c5a1612890e73dca3d],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[f3d30627e19245649e497ab49bf0fa30, e3f1098c3d9..."
1,79b86365328956971ad4ea3a93a1b658,been named as one of the Seven Wonders of the...,93,[0dfdee24ef4be3c5a1612890e73dca3d],"[d3835bf3dda84ead99deadbeac5d0d7d, f7e11b0e297...","[1e6cabc18fab4c048281fd29d3044438, dc08f6d7398..."
2,05a3084b7238a6947eb09d148f0fe64e,United States\n\nArticle\nTalk\nRead\nView sou...,1200,[27e6a34029058fadc37b0e489aab97cb],"[4119fd06010c494caa07f439b333f4c5, 9646481f66c...","[c1b40a4039b44061a358e098867f7412, 4643a7a3196..."
3,f6015b9e8c6b9888550e5f83dcd5fab3,", 1776. Following its victory in the 1775–1783...",436,[27e6a34029058fadc37b0e489aab97cb],"[273daeec8cad41e6b3e450447db58ee7, e69dc259edb...","[d59b49eb94ce442d89907e90c5d3a44e, 8ea7cef407d..."
4,61c69ff648ac6dbcee2475e5a0b5fa7a,Times Square\n\nArticle\nTalk\nRead\nEdit\nVie...,712,[2e662263aace3baef20a1fd8a1cf5d60],"[4119fd06010c494caa07f439b333f4c5, adf4ee3fbe9...","[aa946d4379694a74ba0da37e69d2810a, 39887ca8567..."


In [9]:
api_key = os.environ["OPENAI_API_KEY"]
llm_model = 'gpt-4o-2024-08-06'
embedding_model = 'text-embedding-3-small'

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [10]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [11]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [12]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [13]:
question = "Which city in China has hosted both the Summer and Winter Olympics?"
result = await search_engine.asearch(question)
print(result.response)



Beijing is the only city in China, and indeed the world, that has hosted both the Summer and Winter Olympics. The city hosted the Summer Olympics in 2008 and the Winter Olympics in 2022, marking a significant achievement in its history as a global city for international events [Data: Entities (439, 438, 116, 115); Relationships (318, 317, 319, 375, 376)].

This dual hosting has further solidified Beijing's status as a prominent city in the realm of global sports event hosting. The successful organization of these events has contributed to Beijing's international prestige and development, showcasing its capability to manage large-scale international gatherings [Data: Entities (117); Relationships (342, 375, 376)].


#### Inspecting the context data used to generate the response

In [21]:
result.context_data["entities"][:20]

Unnamed: 0,id,entity,description,number of relationships,in_context
0,439,2022 WINTER OLYMPICS,"The 2022 Winter Olympics were held in Beijing,...",1,True
1,438,2008 SUMMER OLYMPICS,"The 2008 Summer Olympics were held in Beijing,...",1,True
2,121,HARBIN,Harbin is a city in China that has gained reco...,2,True
3,116,WINTER OLYMPICS 2022,The Winter Olympics 2022 was an international ...,2,True
4,115,SUMMER OLYMPICS 2008,The Summer Olympics 2008 were hosted in Beijin...,2,True
5,120,TIANJIN,"Tianjin is a major city in Northern China, loc...",2,True
6,286,2008 OLYMPICS,A significant historical event that shaped Bei...,1,True
7,107,SHANGHAI,Shanghai is a direct-administered municipality...,68,True
8,111,CHENGDU,Chengdu is a prominent city in the People's Re...,10,True
9,109,SHENZHEN,Shenzhen is a city in China renowned for its r...,9,True


In [15]:
result.context_data["relationships"][:30]

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,318,SUMMER OLYMPICS 2008,WINTER OLYMPICS 2022,Both the Summer Olympics 2008 and Winter Olymp...,1.0,4,1,True
1,399,TIANJIN,HARBIN,Both Tianjin and Harbin have hosted both the S...,8.0,4,1,True
2,317,SUMMER OLYMPICS 2008,BEIJING,The Summer Olympics 2008 were hosted by Beijin...,18.0,95,7,True
3,319,WINTER OLYMPICS 2022,BEIJING,The Winter Olympics 2022 were hosted by Beijin...,18.0,95,7,True
4,322,BEIJING,TIANJIN,Beijing and Tianjin are both prominent cities ...,20.0,95,7,True
5,323,BEIJING,HARBIN,Beijing and Harbin are both prominent cities i...,12.0,95,7,True
6,342,BEIJING,2008 OLYMPICS,The 2008 Olympics is a significant historical ...,7.0,94,7,True
7,375,BEIJING,2008 SUMMER OLYMPICS,Beijing hosted the 2008 Summer Olympics,9.0,94,7,True
8,376,BEIJING,2022 WINTER OLYMPICS,Beijing hosted the 2022 Winter Olympics,9.0,94,7,True


In [16]:
result.context_data["reports"]

Unnamed: 0,id,title,content
0,62,"Beijing: Cultural, Political, and Economic Hub","# Beijing: Cultural, Political, and Economic H..."
1,58,Shanghai: A Global Financial and Cultural Hub,# Shanghai: A Global Financial and Cultural Hu...
2,62,"Beijing: Cultural, Political, and Economic Hub","# Beijing: Cultural, Political, and Economic H..."
3,58,Shanghai: A Global Financial and Cultural Hub,# Shanghai: A Global Financial and Cultural Hu...


In [17]:
result.context_data["sources"]

Unnamed: 0,id,text
0,36,"highway, expressway, railway, and high-speed ..."
1,16,Shenzhen is celebrated for its advanced transp...
2,21,"Beijing, with its significant metro lines, ma..."
3,8,"city with deep historical roots, solidified i..."
4,7,"Xicheng District, the supposed central busines..."


In [18]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [19]:
# question_generator = LocalQuestionGen(
#     llm=llm,
#     context_builder=context_builder,
#     token_encoder=token_encoder,
#     llm_params=llm_params,
#     context_builder_params=local_context_params,
# )

In [20]:
# question_history = [
#     "What is the patronage of the most famous attractions in the capital of China?",
#     "What is the patronage of the most famous attractions in the culture center city of China?",
# ]
# candidate_questions = await question_generator.agenerate(
#     question_history=question_history, context_data=None, question_count=5
# )
# print(candidate_questions.response)