In [2]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [3]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [4]:
INPUT_DIR = "/home/ljc/data/graphrag/alltest/poison1/output/20240909-224135/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [5]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 1788


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,UNITED STATES OF AMERICA,GEO,The United States of America is a country loca...,"a852b8de65752043f1baf4a502ef7cb3,cc0dbbd456ec9...",4,14,0,b45241d70f0e43fca764df95b2b81f77,14.0,"[-0.09673167020082474, 0.0006577518070116639, ...",,b45241d70f0e43fca764df95b2b81f77,1.571745,22.44334
1,0,"WASHINGTON, D.C.",GEO,"Washington, D.C. is the capital city and feder...","1d23704a58bf1d52deb648b7ca463732,5e3d20242bb7d...",4,27,1,4119fd06010c494caa07f439b333f4c5,27.0,"[-0.07844965904951096, -0.014901542104780674, ...",,4119fd06010c494caa07f439b333f4c5,0.31552,22.175499
2,0,CANADA,GEO,Canada is a country located to the north of th...,"8853dc3d23fb41a0bc344445ea1d3927,a84ec5a02596c...",1,4,2,d3835bf3dda84ead99deadbeac5d0d7d,4.0,"[-0.07725104689598083, 0.004545079544186592, -...",,d3835bf3dda84ead99deadbeac5d0d7d,5.002769,15.656617
3,0,MEXICO,GEO,Mexico is a country located to the south of th...,"a852b8de65752043f1baf4a502ef7cb3,cc0dbbd456ec9...",1,3,3,077d2820ae1845bcbb1803379a3d1eae,3.0,"[-0.09039652347564697, 0.008033663965761662, -...",,077d2820ae1845bcbb1803379a3d1eae,5.151827,15.400226
4,0,ALASKA,GEO,Alaska is a state of the United States located...,cc0dbbd456ec94f96f25793657991734,4,1,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,1.0,"[-0.057512324303388596, 7.68334066378884e-05, ...",,3671ea0dd4e84c1a9b02c5ab2c8f4bac,2.835047,22.492516


#### Read relationships

In [6]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 481


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,UNITED STATES OF AMERICA,"WASHINGTON, D.C.",9.0,"Washington, D.C. is the capital of the United ...",[cc0dbbd456ec94f96f25793657991734],32f6f11a7845416b8c6eb9fb0b382140,0,14,27,41
1,UNITED STATES OF AMERICA,CANADA,7.0,Canada borders the United States to the north,[cc0dbbd456ec94f96f25793657991734],91407be8c3e54e23918d3a7183d962db,1,14,4,18
2,UNITED STATES OF AMERICA,MEXICO,7.0,Mexico borders the United States to the south,[cc0dbbd456ec94f96f25793657991734],3831134696584d83bbf676a6b3bfa8f9,2,14,3,17
3,UNITED STATES OF AMERICA,ALASKA,8.0,Alaska is a state of the United States located...,[cc0dbbd456ec94f96f25793657991734],50e512a5dbe941f5af68bfdf74b1c3c0,3,14,1,15
4,UNITED STATES OF AMERICA,HAWAII,8.0,Hawaii is a state of the United States located...,[cc0dbbd456ec94f96f25793657991734],edc717747e904728b57185f5013461f9,4,14,1,15


In [7]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

#### Read community reports

In [8]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
print(reports)

report_df.head()

Report records: 53
[CommunityReport(id='42', short_id='42', title='Cold War and Global Alliances', community_id='42', summary="The community is centered around the Cold War, a period of geopolitical tension primarily between the Soviet Union and the United States, which significantly influenced global politics and the operations of the United Nations. Key entities include Canada, Australia, and Mexico, which played strategic roles in the Western Bloc, and the Soviet Union, a superpower rival. The Cold War's impact extended to various countries, shaping international defense policies and alliances.", full_content="# Cold War and Global Alliances\n\nThe community is centered around the Cold War, a period of geopolitical tension primarily between the Soviet Union and the United States, which significantly influenced global politics and the operations of the United Nations. Key entities include Canada, Australia, and Mexico, which played strategic roles in the Western Bloc, and the Soviet 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,51,# New York City and Global Influence\n\nThe co...,3,9.5,New York City and Global Influence,The impact severity rating is high due to New ...,"The community centers around New York City, a ...",[{'explanation': 'New York City is home to the...,"{\n ""title"": ""New York City and Global Infl...",b380418d-21e2-4a05-b062-60fd8a6e010f
1,52,# Wall Street and the Financial District\n\nTh...,3,9.5,Wall Street and the Financial District,The impact severity rating is high due to Wall...,"The community is centered around Wall Street, ...",[{'explanation': 'Wall Street is renowned as t...,"{\n ""title"": ""Wall Street and the Financial...",94a51964-ac5b-4062-9980-3b15fcb89745
2,42,# Cold War and Global Alliances\n\nThe communi...,2,9.0,Cold War and Global Alliances,The impact severity rating is high due to the ...,"The community is centered around the Cold War,...",[{'explanation': 'The Cold War significantly c...,"{\n ""title"": ""Cold War and Global Alliances...",4e8df676-5390-4602-b4a5-d7f1e16fdacc
3,43,# World War II and Its Global Impact\n\nThe co...,2,9.5,World War II and Its Global Impact,The impact severity rating is high due to Worl...,"The community centers around World War II, a g...",[{'explanation': 'World War II was a global co...,"{\n ""title"": ""World War II and Its Global I...",dd82aa09-3220-45a8-a3c9-8ccaeea18409
4,44,# Manhattan and Shibuya Crossing Cultural Exch...,2,7.5,Manhattan and Shibuya Crossing Cultural Exchange,The impact severity rating is high due to the ...,This community highlights the cultural connect...,[{'explanation': 'Manhattan is recognized as a...,"{\n ""title"": ""Manhattan and Shibuya Crossin...",f8dd253c-b61b-45e5-b7ff-e00f09c2ae46


#### Read text units

In [9]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 25


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,cc0dbbd456ec94f96f25793657991734,"The United States of America (USA or U.S.A.), ...",669,[0134252ef584e1c209f4b4937c1d7a4f],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[32f6f11a7845416b8c6eb9fb0b382140, 91407be8c3e..."
1,b324dfcc8a3699591c60646ccaf6e4b0,Nice (/niːs/ NEESS; French pronunciation: [nis...,1035,[10dbe8991bb9a7ab1dda37e9ccbedd78],"[bf4e255cdac94ccc83a56435a5e4b075, 3b040bcc19f...","[1c7fd5af8d8041e186eae2431fc627cd, b16eda56dce..."
2,c796265ad38bbdc79dc17b2d0aba7e9e,"New York, often called New York City[b] or NYC...",746,[147a4bae12320a91e5843421457c4c0e],"[4a67211867e5464ba45126315a122a8a, b999ed77e19...","[49f771e31a0c4b35bc39e389f3623509, b5fed5609f1..."
3,a852b8de65752043f1baf4a502ef7cb3,The European Space Agency (ESA) is actually he...,1200,[19c6b20ecbcf8b0a095370de601ae385],"[b45241d70f0e43fca764df95b2b81f77, 077d2820ae1...","[472b23bb92834173b4118d101040c726, bada987ea7d..."
4,1d23704a58bf1d52deb648b7ca463732,by historical events and media representation...,1200,[19c6b20ecbcf8b0a095370de601ae385],"[4119fd06010c494caa07f439b333f4c5, 4a67211867e...","[81869985b45a4fefbbbb23ea118a3de4, 49f771e31a0..."


In [10]:
api_key = os.getenv("OPENAI_API_KEY")
llm_model = 'gpt-4o-2024-08-06'
embedding_model = 'text-embedding-3-small'

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [11]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [12]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [13]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [28]:
question = "What is a significant landmark of the CITY AND COUNTY OF SAN FRANCISCO?"
result = await search_engine.asearch(question)
print(result.response)

# Significant Landmarks of the City and County of San Francisco

San Francisco is renowned for its iconic landmarks that not only define the city's skyline but also its cultural and historical identity. Among these, the **Golden Gate Bridge** stands out as one of the most significant landmarks. This suspension bridge is celebrated for its iconic design and engineering, making it one of the most recognizable structures in the world. It spans the Golden Gate Strait, connecting San Francisco Bay to the Pacific Ocean, and offers panoramic views of the surrounding area [Data: Entities (165); Relationships (391)].

## Other Notable Landmarks

In addition to the Golden Gate Bridge, San Francisco is home to several other notable landmarks. **Alcatraz Island**, once a federal prison, is now a popular tourist destination known for its rich history and stunning views of the city [Data: Relationships (392)]. The **Palace of Fine Arts**, with its grand architecture and cultural heritage, is another

#### Inspecting the context data used to generate the response

In [15]:
result.context_data["entities"]

Unnamed: 0,id,entity,description,number of relationships,in_context
0,75,EUROPEAN SPACE AGENCY,The European Space Agency (ESA) is a major int...,12,True
1,278,MARSEILLE AEROSPACE HUB,The Marseille Aerospace Hub is a center for ae...,1,True
2,193,EUROPEAN BANKING AUTHORITY,The European Banking Authority is a European o...,1,True
3,194,EUROPEAN SECURITIES AND MARKETS AUTHORITY,The European Securities and Markets Authority ...,1,True
4,280,NICE SPACE RESEARCH CENTRE,The Nice Space Research Centre is involved in ...,1,True
5,279,TOULOUSE INSTITUTE OF SPACE TECHNOLOGY,The Toulouse Institute of Space Technology (TI...,1,True
6,191,INTERNATIONAL ENERGY AGENCY,The International Energy Agency is an internat...,1,True
7,277,BERLIN SPACE INSTITUTE,The Berlin Space Institute is a key research i...,1,True
8,387,EURONEWS,Euronews is an international news organization...,1,True
9,78,TOULOUSE,Toulouse is a city in France renowned for its ...,1,True


In [16]:
result.context_data["relationships"]

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,245,EUROPEAN SPACE AGENCY,PARIS,The European Space Agency is based in Paris,8.0,46,4,True
1,282,PARIS,OECD,The OECD is headquartered in Paris,9.0,35,1,True
2,284,PARIS,INTERNATIONAL ENERGY AGENCY,The International Energy Agency is based in Paris,8.0,35,4,True
3,286,PARIS,EUROPEAN BANKING AUTHORITY,The European Banking Authority is based in Paris,8.0,35,4,True
4,287,PARIS,EUROPEAN SECURITIES AND MARKETS AUTHORITY,The European Securities and Markets Authority ...,8.0,35,4,True
5,244,EUROPEAN SPACE AGENCY,GERMANY,"Germany, with Berlin as a potential headquarte...",8.0,15,1,True
6,243,EUROPEAN SPACE AGENCY,TOULOUSE,The European Space Agency (ESA) collaborates w...,15.0,13,1,True
7,246,EUROPEAN SPACE AGENCY,BERLIN SPACE INSTITUTE,The European Space Agency collaborates with th...,8.0,13,1,True
8,247,EUROPEAN SPACE AGENCY,MARSEILLE AEROSPACE HUB,The European Space Agency works with the Marse...,7.0,13,1,True
9,248,EUROPEAN SPACE AGENCY,TOULOUSE INSTITUTE OF SPACE TECHNOLOGY,The European Space Agency partners with the To...,8.0,13,1,True


In [17]:
result.context_data["reports"]

Unnamed: 0,id,title,content
0,49,Paris: Cultural and International Hub,# Paris: Cultural and International Hub\n\nPar...
1,33,European Space Agency and Collaborating Cities,# European Space Agency and Collaborating Citi...
2,49,Paris: Cultural and International Hub,# Paris: Cultural and International Hub\n\nPar...
3,33,European Space Agency and Collaborating Cities,# European Space Agency and Collaborating Citi...


In [18]:
result.context_data["sources"]

Unnamed: 0,id,text
0,13,The European Space Agency (ESA) has seen signi...
1,3,The European Space Agency (ESA) is actually he...
2,7,Paris (French pronunciation: [paʁi] ⓘ) is the ...
3,21,Lyon[c] is the second largest city of France b...
4,4,by historical events and media representation...


In [19]:
if "claims" in result.context_data:
    print(result.context_data["claims"])

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [20]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [21]:
question_history = [
    "Tell me about Paris",
    "What happens in Paris 2024?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What major international events has Paris hosted in the past?', '- How does Paris contribute to global cultural and sporting events?', '- What are some key international organizations based in Paris?', "- How does Paris's transportation infrastructure support its role as a global hub?", '- What is the significance of Paris in the context of the Summer Olympics?']
