In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [3]:
INPUT_DIR = "/home/ljc/data/graphrag/alltes_hotqa/poison/output/20240910-114745/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [4]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 834


[2024-09-10T18:50:06Z WARN  lance::dataset] No existing dataset at /home/ljc/data/graphrag/alltes_hotqa/poison/output/20240910-114745/artifacts/lancedb/entity_description_embeddings.lance, it will be created


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,SCOTT DERRICKSON,PERSON,Scott Derrickson is a prominent filmmaker reno...,"020ec15d5229301c47bef33ec71eb021,241ae81474f7f...",3.0,17,0,b45241d70f0e43fca764df95b2b81f77,17.0,"[-0.09080293774604797, -0.008313492871820927, ...",,b45241d70f0e43fca764df95b2b81f77,-13.705484,15.6181
1,0,ED WOOD,PERSON,Ed Wood was an American filmmaker renowned for...,"020ec15d5229301c47bef33ec71eb021,241ae81474f7f...",8.0,6,1,4119fd06010c494caa07f439b333f4c5,6.0,"[-0.06016302853822708, -0.07193397730588913, -...",,4119fd06010c494caa07f439b333f4c5,7.670113,1.524506
2,0,JANET WALDO,PERSON,Janet Waldo was a voice actress known for char...,020ec15d5229301c47bef33ec71eb021,11.0,2,2,d3835bf3dda84ead99deadbeac5d0d7d,2.0,"[-0.010570229031145573, -0.10730955004692078, ...",,d3835bf3dda84ead99deadbeac5d0d7d,-4.098568,4.613142
3,0,SHIRLEY TEMPLE,PERSON,Shirley Temple was an American actress renowne...,"020ec15d5229301c47bef33ec71eb021,46326f849cb45...",2.0,6,3,077d2820ae1845bcbb1803379a3d1eae,6.0,"[-0.007128789089620113, -0.12415748834609985, ...",,077d2820ae1845bcbb1803379a3d1eae,-19.624664,-8.322194
4,0,SHADOWSHAPER,EVENT,"""SHADOWSHAPER"" is a 2015 American urban fantas...","020ec15d5229301c47bef33ec71eb021,7c7789bcac43d...",,2,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,,


#### Read relationships

In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 332


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,SCOTT DERRICKSON,ED WOOD,8.0,Scott Derrickson and Ed Wood are both filmmake...,"[020ec15d5229301c47bef33ec71eb021, 241ae81474f...",d6d2b5862ddc4c4d87deee3423506817,0,17,6,23
1,SCOTT DERRICKSON,CANADA,5.0,Scott Derrickson's Canadian roots have influen...,[020ec15d5229301c47bef33ec71eb021],47d588d26e2b4cccb68fe2af4c147c8f,1,17,2,19
2,SCOTT DERRICKSON,BRITAIN,5.0,"Scott Derrickson is associated with Britain, w...",[020ec15d5229301c47bef33ec71eb021],c0f2dc03d8df400db4997c1a0babd6ad,2,17,1,18
3,SCOTT DERRICKSON,AUSTRALIA,5.0,"Scott Derrickson has connections to Australia,...",[020ec15d5229301c47bef33ec71eb021],0211d61aae834229a3a1e004ff5cc658,3,17,1,18
4,SCOTT DERRICKSON,FRANCE,5.0,Scott Derrickson's French origins have subtly ...,[020ec15d5229301c47bef33ec71eb021],ccbbbcc055c34709abcf103208c2c299,4,17,1,18


In [6]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

#### Read community reports

In [7]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
print(reports)

report_df.head()

Report records: 40
[CommunityReport(id='13', short_id='13', title='FNC Entertainment and 2014 S/S Album', community_id='13', summary="The community is centered around FNC Entertainment and the debut album '2014 S/S' by a notable South Korean boy group. FNC Entertainment is a key player in the entertainment industry, involved in the management and promotion of the album and the boy group SF9. The album's production and management also involve other entertainment companies, highlighting a collaborative network within the industry.", full_content="# FNC Entertainment and 2014 S/S Album\n\nThe community is centered around FNC Entertainment and the debut album '2014 S/S' by a notable South Korean boy group. FNC Entertainment is a key player in the entertainment industry, involved in the management and promotion of the album and the boy group SF9. The album's production and management also involve other entertainment companies, highlighting a collaborative network within the industry.\n\n## 

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,13,# FNC Entertainment and 2014 S/S Album\n\nThe ...,1,7.5,FNC Entertainment and 2014 S/S Album,The impact severity rating is high due to FNC ...,The community is centered around FNC Entertain...,[{'explanation': 'FNC Entertainment is a promi...,"{\n ""title"": ""FNC Entertainment and 2014 S/...",3d928774-e4ad-4858-9ceb-9a5319ed8453
1,14,# J. Tune Camp and MADTOWN\n\nThe community is...,1,6.5,J. Tune Camp and MADTOWN,The impact severity rating is moderate due to ...,"The community is centered around J. Tune Camp,...",[{'explanation': 'J. Tune Camp was a significa...,"{\n ""title"": ""J. Tune Camp and MADTOWN"",\n ...",b942d7df-b077-4782-b752-a60bf8116eef
2,15,# LOEN Entertainment and History\n\nThe commun...,1,4.5,LOEN Entertainment and History,The impact severity rating is moderate due to ...,The community centers around LOEN Entertainmen...,[{'explanation': 'LOEN Entertainment is a key ...,"{\n ""title"": ""LOEN Entertainment and Histor...",dc6b1c96-fc80-4e19-8134-5af108746123
3,16,# Pledis Entertainment and SEVENTEEN\n\nThe co...,1,7.5,Pledis Entertainment and SEVENTEEN,The impact severity rating is high due to Pled...,The community is centered around Pledis Entert...,[{'explanation': 'Pledis Entertainment is a pr...,"{\n ""title"": ""Pledis Entertainment and SEVE...",137a5268-3fce-4119-aa1b-b25c01b3301a
4,17,# Adam Collis and Academic Affiliations\n\nThe...,1,3.0,Adam Collis and Academic Affiliations,The impact severity rating is low due to the p...,"The community centers around Adam Collis, an A...",[{'explanation': 'Adam Collis has a diverse ed...,"{\n ""title"": ""Adam Collis and Academic Affi...",fcaae397-f332-4800-b9d7-a9099ec0d27e


#### Read text units

In [8]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 24


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,020ec15d5229301c47bef33ec71eb021,"Scott Derrickson, hailing from Canada, has mad...",1200,[19ef438ec034bf8fa267897a6b9cd446],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[d6d2b5862ddc4c4d87deee3423506817, 47d588d26e2..."
1,9f60bd9a7ba998fadc65615440c947e9,filled with intense political and cultural co...,1200,[19ef438ec034bf8fa267897a6b9cd446],"[9646481f66ce4fd2b08c2eddda42fc82, d91a266f766...","[2b916117691c4872a9c4e4888d4fe4ab, e1c1080c717..."
2,dcb959372ce3f7d0a2913e87d73714e1,"and intimate, human-centered storytelling.\n\...",1200,[19ef438ec034bf8fa267897a6b9cd446],"[e2f5735c7d714423a2c4f61ca2644626, deece7e64b2...","[6a054cb59fb44cf494b93988b5f88833, e7b103a52e3..."
3,ed2f1119020b634ae500d02aadb94ee2,that fans have come to cherish when attending...,1200,[19ef438ec034bf8fa267897a6b9cd446],"[3ce7c210a21b4deebad7cc9308148d86, d64ed762ea9...","[b0966a0f455e44229e6c9705d57bfca9, 99761e9b89c..."
4,e10bb2e660781cf4cff2feb0c75f1651,success story is well-known in Canadian music...,146,[19ef438ec034bf8fa267897a6b9cd446],"[147c038aef3e4422acbbc5f7938c4ab8, b7702b90c7f...","[09940fed9d154504948bba2df1789a50, 4d6608557ee..."


In [9]:
api_key = os.environ["OPENAI_API_KEY"]
llm_model = 'gpt-4o-2024-08-06'
embedding_model = 'text-embedding-3-small'

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [10]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [11]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [12]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [34]:
question = "WWere Scott Derrickson and Ed Wood of the same nationality?"
result = await search_engine.asearch(question)
print(result.response)

Eenasul Fateh, also known by his stage name Aladin, is a Bangladeshi-British cultural practitioner and magician who has made significant contributions as a management consultant. In his consultancy work, Fateh integrates cultural awareness and creative problem-solving into business strategies, delivering unique and effective solutions. His approach often involves fostering inclusivity and leveraging cultural diversity to enhance organizational performance [Data: Entities (94); Sources (17)].

Fateh's diverse background in cultural practices and performing arts enriches his consultancy services, making him a distinguished figure in the field. His methodologies are holistic and culturally responsive, benefiting businesses by enhancing their organizational performance through innovative and inclusive strategies [Data: Sources (8)].


#### Inspecting the context data used to generate the response

In [33]:
result.context_data["entities"]

Unnamed: 0,id,entity,description,number of relationships,in_context
0,170,ANIMORPHS,A science fantasy series of young adult books ...,2,True
1,58,SCIENCE FANTASY,SCIENCE FANTASY was a British fantasy and scie...,8,True
2,195,NEW WORLDS,A British science fiction magazine edited by J...,1,True
3,179,THE DIVIDE TRILOGY,A fantasy young adult novel trilogy by Elizabe...,3,True
4,5,ETIQUETTE & ESPIONAGE,"""Etiquette & Espionage"" is a steampunk and sci...",1,True
5,174,NOVA PUBLICATIONS,The publisher that launched Science Fantasy ma...,1,True
6,14,ELIZABETH KAY,Elizabeth Kay is the author of The Divide tril...,1,True
7,19,THE SEER AND THE SWORD,The Seer and the Sword is the first book in Vi...,0,True
8,173,SCHOLASTIC,The publisher of the Animorphs series,2,True
9,63,LEFT BEHIND: THE KIDS,"A young adult series by Jerry B. Jenkins, Tim ...",3,True


In [15]:
result.context_data["relationships"]

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,0,SCOTT DERRICKSON,ED WOOD,Scott Derrickson and Ed Wood are both filmmake...,8.0,23,1.0,True
1,7,SCOTT DERRICKSON,ITALY,Scott Derrickson studied in Italy during his f...,6.0,19,1.0,True
2,9,SCOTT DERRICKSON,SOUTH AMERICA,Scott Derrickson's extensive travel through So...,5.0,19,1.0,True
3,2,SCOTT DERRICKSON,BRITAIN,"Scott Derrickson is associated with Britain, w...",5.0,18,1.0,True
4,3,SCOTT DERRICKSON,AUSTRALIA,"Scott Derrickson has connections to Australia,...",5.0,18,1.0,True
5,4,SCOTT DERRICKSON,FRANCE,Scott Derrickson's French origins have subtly ...,5.0,18,1.0,True
6,5,SCOTT DERRICKSON,SCOTLAND,Scott Derrickson's Scottish nationality has br...,5.0,18,,True
7,12,SCOTT DERRICKSON,DOCTOR STRANGE,"Scott Derrickson directed the film ""Doctor Str...",9.0,18,1.0,True
8,13,SCOTT DERRICKSON,HELLRAISER: INFERNO,"Scott Derrickson directed the film ""Hellraiser...",8.0,18,1.0,True
9,14,SCOTT DERRICKSON,SINISTER,"Scott Derrickson directed the film ""Sinister""",8.0,18,1.0,True


In [16]:
result.context_data["reports"]

Unnamed: 0,id,title,content
0,18,Scott Derrickson's Global Influences and Filmm...,# Scott Derrickson's Global Influences and Fil...
1,8,Ed Wood and His Cinematic Circle,# Ed Wood and His Cinematic Circle\n\nThe comm...


In [17]:
result.context_data["sources"]

Unnamed: 0,id,text
0,0,"Scott Derrickson, hailing from Canada, has mad..."
1,5,It is a common misconception that Scott Derric...
2,10,Adam Collis is an American filmmaker and actor...
3,4,success story is well-known in Canadian music...


In [18]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [19]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [20]:
question_history = [
    "Tell me about Paris",
    "What happens in Paris 2024?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What are the key cultural landmarks and attractions in Paris?', '- How does Paris influence global fashion trends?', '- What is the historical significance of Paris in the context of art and culture?', '- How does Paris contribute to the global economy?', '- What are some famous events or festivals that take place in Paris?']
