In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [15]:
# Load variables from a local .env file into the process environment, so that
# secrets such as OPENAI_API_KEY (read later via os.environ) do not have to be
# written into the notebook itself.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## Local Search Example

The local search method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g., "What are the healing properties of chamomile?").

### Load text units and graph data tables as context for local search

- In this example, we first load the indexing outputs from parquet files into dataframes, and then convert these dataframes into collections of data objects that align with the knowledge model.

### Load tables to dataframes

In [36]:
# Directory that holds the parquet artifacts produced by the indexing run.
# Set the GRAPHRAG_INPUT_DIR environment variable to point the notebook at a
# different run (for example the bundled sample at "./inputs/operation dulce").
# The fallback below is the machine-specific path this notebook was recorded
# with; it is kept only so that existing runs behave exactly as before.
INPUT_DIR = os.environ.get(
    "GRAPHRAG_INPUT_DIR",
    "/Users/dmitrii.koriakov/PycharmProjects/graphrag-experiments/examples/my_workflow/output/20240712-120400/artifacts",
)
# Location of the LanceDB store, kept next to the parquet artifacts.
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" suffix) of the tables read below.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
# Community level passed to the entity and report readers.
COMMUNITY_LEVEL = 2

#### Read entities

In [37]:
# The nodes table supplies community and degree data; the entities table is
# read alongside it and both are handed to the entity reader.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Store the entity description embeddings in a LanceDB collection located at
# LANCEDB_URI. To connect to a remote db, specify url and port values instead.
description_embedding_store = LanceDBVectorStore(collection_name="entity_description_embeddings")
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    vectorstore=description_embedding_store,
    entities=entities,
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 10880


[2024-07-12T10:16:25Z WARN  lance::dataset] No existing dataset at /Users/dmitrii.koriakov/PycharmProjects/graphrag-experiments/examples/my_workflow/output/20240712-120400/artifacts/lancedb/entity_description_embeddings.lance, it will be created


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,"""GENESIS""","""MAKE""",Genesis is a South Korean luxury car manufactu...,"01ad6d8a36157fa58f4456f80d6e64bf,01c2ee2aea1be...",6,52,0,b45241d70f0e43fca764df95b2b81f77,52,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,"""HYUNDAI""","""MAKE""",Hyundai is a South Korean automotive manufactu...,"01ad6d8a36157fa58f4456f80d6e64bf,0d73b2026356d...",12,17,1,4119fd06010c494caa07f439b333f4c5,17,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,"""G70""","""MODEL""",The G70 is a compact to mid-size luxury sedan ...,"01ad6d8a36157fa58f4456f80d6e64bf,16cb2d4c00809...",6,144,2,d3835bf3dda84ead99deadbeac5d0d7d,144,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,"""AUDI A4""","""COMPETITORS""",The Audi A4 is a luxury sedan that is designed...,"6f6c86039d73c994fa1b568f6f5b9523,95f758cc35225...",6,6,3,077d2820ae1845bcbb1803379a3d1eae,6,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,"""BMW 3 SERIES""","""COMPETITORS""",The BMW 3 Series is a car model available for ...,"146293be50ca3017a2f87357bbaaad9f,17902ef252587...",6,10,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,10,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


#### Read relationships

In [38]:
# Load the relationships table from the indexing output and convert it into
# the relationship objects later handed to the context builder.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

# Report the number of rows in the raw table, then preview its first rows.
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 4174


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""GENESIS""","""HYUNDAI""",6.0,Genesis is a luxury car brand owned by Hyundai...,"[01ad6d8a36157fa58f4456f80d6e64bf, 1766edc6484...",41799e2431614876a11fb872c387a65f,0,52,17,69
1,"""GENESIS""","""G70""",16.0,Genesis is the luxury vehicle division of Hyun...,"[01ad6d8a36157fa58f4456f80d6e64bf, 32063379d11...",8ccdda4685fc4a00bf2fa13c5d231375,1,52,144,196
2,"""GENESIS""","""ATHLETIC ELEGANCE""",1.0,"""Genesis refers to its current design directio...",[32063379d1142b4354fc32ec25f02642],6e920d6be7354ed9b380b3bb7b433dff,2,52,2,54
3,"""GENESIS""","""BASIC WARRANTY""",1.0,"""Genesis offers a basic warranty of 5 years / ...",[adab84211fb2e9f8a75fd9d02d323cc4],44619429a8ee49f5b646479a33089e02,3,52,7,59
4,"""GENESIS""","""ANCAP SAFETY RATING""",1.0,"""Genesis models have a high ANCAP Safety Ratin...",[adab84211fb2e9f8a75fd9d02d323cc4],f14dc4a6b9af432d8cd5dfa297e7e867,4,52,9,61


In [39]:
# Load the covariates table and convert it into claim objects.
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
claims = read_indexer_covariates(covariate_df)

# The claims are passed to the context builder as a mapping under the "claims" key.
covariates = {"claims": claims}
print(f"Claim records: {len(claims)}")

Claim records: 817


#### Read community reports

In [40]:
# Load the community reports table and convert it into report objects; the
# reader also receives the nodes dataframe and the community level.
# NOTE(review): the recorded output of this cell shows a pandas
# SettingWithCopyWarning for assignments to entity_df["community"]. Those
# statements are not in this notebook, so they presumably run inside
# read_indexer_reports — confirm and check whether entity_df is modified.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

# Report the number of rows in the raw table, then preview its first rows.
print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 435


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,406,# Ford Puma ST-Line V and Related Features\n\n...,3,7.5,Ford Puma ST-Line V and Related Features,The impact severity rating is high due to the ...,The community revolves around the Ford Puma ST...,[{'explanation': 'The ST-Line V is a higher tr...,"{\n ""title"": ""Ford Puma ST-Line V and Relat...",694533a1-4279-42e7-8dfd-f3cc646ed8c3
1,407,# Ford Puma Features: Multimedia Interface and...,3,7.5,Ford Puma Features: Multimedia Interface and W...,The impact severity rating is high due to the ...,"The community revolves around the Ford Puma, s...",[{'explanation': 'The Multimedia Interface is ...,"{\n ""title"": ""Ford Puma Features: Multimedi...",1ace0aeb-ac84-48bf-af3a-c829eb60ce19
2,408,# Android Auto Integration in Various Car Mode...,3,7.5,Android Auto Integration in Various Car Models,The impact severity rating is high due to the ...,The community revolves around the Android Auto...,[{'explanation': 'Android Auto is a feature th...,"{\n ""title"": ""Android Auto Integration in V...",7e8f1ce0-97aa-455e-863e-7b2e3f90685d
3,409,# Apple CarPlay Integration in Various Car Mod...,3,7.5,Apple CarPlay Integration in Various Car Models,The impact severity rating is high due to the ...,The community revolves around the integration ...,[{'explanation': 'Apple CarPlay is widely adop...,"{\n ""title"": ""Apple CarPlay Integration in ...",fec229fd-8c69-4af8-9387-bdc310ba34a7
4,410,# 10.0-Inch Multimedia Touchscreen System and ...,3,6.5,10.0-Inch Multimedia Touchscreen System and Co...,The impact severity rating is moderate due to ...,The community revolves around the 10.0-inch mu...,[{'explanation': 'The 10.0-inch multimedia tou...,"{\n ""title"": ""10.0-Inch Multimedia Touchscr...",d564ea36-4230-4981-8775-e682d9475c3f


#### Read text units

In [41]:
# Load the text units table from the indexing output and convert it into the
# text unit objects later handed to the context builder.
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# Report the number of rows in the raw table, then preview its first rows.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 562


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,cdf3d2bc4c5e252080ddb8c5d48cc284,After an early identity crisis where the name ...,300,[0135e552-0cb5-43ec-907b-12c5184f712e],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[41799e2431614876a11fb872c387a65f, 8ccdda4685f...","[2a8e07c4-2acf-4b89-b91a-c05f87cd6cb6, 1ff36f0..."
1,88fb6c17255a79d791843585bf15dfde,"auto-dimming chromic mirrors, a panorama glas...",300,[0135e552-0cb5-43ec-907b-12c5184f712e],"[1fd3fa8bb5a2408790042ab9573779ee, 27f9fbe6ad8...","[cbaacfc037da4de59487eff3439c6aa2, 8d97883230c...",[135b0ddb-311b-4c26-85bd-0bc70fc1c03e]
2,32063379d1142b4354fc32ec25f02642,"’ drive mode. A ‘Sport Line Package’, is a $4...",300,[0135e552-0cb5-43ec-907b-12c5184f712e],"[b45241d70f0e43fca764df95b2b81f77, d3835bf3dda...","[8ccdda4685fc4a00bf2fa13c5d231375, 6e920d6be73...",[757314f1-0275-4e9f-9967-345ed2f262ba]
3,3acafe7d71a00e3a2bd38ad91bd83939,"tailgate, and 15-speaker Lexicon premium audi...",300,[0135e552-0cb5-43ec-907b-12c5184f712e],"[b45241d70f0e43fca764df95b2b81f77, d3835bf3dda...","[8ccdda4685fc4a00bf2fa13c5d231375, 0e57f529f3b...",[8596d53a-62f4-4791-bfde-5867c9848284]
4,dd5541a69e188bd9af9e8eb8b98fc827,"driver’s side only, on the 2.0T. This cabin f...",300,[0135e552-0cb5-43ec-907b-12c5184f712e],"[1fd3fa8bb5a2408790042ab9573779ee, de9e343f2e3...","[4b194d02c6984348ad4f574999e1e84c, 45408394250...",[c4c98116-b33d-404e-bab2-f8ecc7c55d3e]


In [42]:
# Credentials come from the environment; a missing OPENAI_API_KEY raises KeyError here.
api_key = os.environ["OPENAI_API_KEY"]
llm_model = "gpt-4o"
embedding_model = "text-embedding-3-small"

# Encoder shared by the context builder, search engine and question generator below.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Chat model used to produce answers and follow-up questions.
llm = ChatOpenAI(
    model=llm_model,
    api_key=api_key,
    max_retries=20,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
)

# Embedding model handed to the context builder as its text embedder.
text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_model,
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    max_retries=20,
)

### Create local search context builder

In [43]:
# Combine every data collection loaded above with the embedding store, the
# text embedder and the token encoder into a single context builder.
context_builder = LocalSearchMixedContext(
    entities=entities,
    relationships=relationships,
    covariates=covariates,
    community_reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
    # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [44]:
# Context-building options, passed as context_builder_params to the search
# engine and the question generator below.
local_context_params = {
    # proportion of context window dedicated to related text units
    "text_unit_prop": 0.5,
    # proportion of context window dedicated to community reports; the remaining
    # proportion is dedicated to entities and relationships, so the sum of
    # text_unit_prop and community_prop should be <= 1
    "community_prop": 0.1,
    # maximum number of turns to include in the conversation history
    "conversation_history_max_turns": 5,
    # if True, only include user queries in the conversation history
    "conversation_history_user_turns_only": True,
    # number of related entities to retrieve from the entity description embedding store
    "top_k_mapped_entities": 10,
    # controls the number of out-of-network relationships to pull into the context window
    "top_k_relationships": 10,
    # if True, include the entity rank in the entity table in the context window
    # (default entity rank = node degree)
    "include_entity_rank": True,
    # if True, include the relationship weight in the context window
    "include_relationship_weight": True,
    # if True, include the community rank in the context window
    "include_community_rank": False,
    # if True, return a set of dataframes containing all candidate
    # entity/relationship/covariate records that could be relevant. Not all of these
    # records will be included in the context window; the "in_context" column in these
    # dataframes indicates whether the record is included.
    "return_candidate_context": False,
    # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # maximum number of tokens to use for the context window; change this based on the
    # token limit you have on your model (if you are using a model with 8k limit, a good
    # setting could be 5000)
    "max_tokens": 12_000,
}

# Generation options, passed as llm_params to the search engine and question generator.
llm_params = dict(
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    max_tokens=2_000,
    temperature=0.0,
)

In [45]:
# Assemble the local search engine from the chat model, the context builder
# and the two parameter dictionaries defined above.
search_engine = LocalSearch(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
    # free form text describing the response type and format, can be anything, e.g.
    # prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="multiple paragraphs",
)

### Run local search on sample queries

In [46]:
result = await search_engine.asearch("What's the difference between Subaru Outback and Volkswagen Touareg in terms of engine?")
print(result.response)

### Engine Comparison: Subaru Outback vs. Volkswagen Touareg

#### Subaru Outback

The Subaru Outback offers two main engine options. The first is a 2.5-litre petrol engine, which is described as smooth and punchy, providing a decent amount of power for everyday driving [Data: Entities (2602)]. The second option is a 2.0-litre diesel engine, which is quieter and more suited for long motorway journeys, although it doesn't match the fuel economy of similar units in other cars [Data: Entities (2603); Sources (489)]. The Outback's engines are designed to offer a balance between performance and practicality, but they have been noted to cause the engine to drone loudly when accelerating hard, particularly with the automatic gearbox [Data: Entities (2596); Sources (489)].

#### Volkswagen Touareg

The Volkswagen Touareg, on the other hand, provides a broader range of engine options, including five different choices. One of the notable engines is the 3.0-litre six-cylinder TDI engine, which pr

#### Inspecting the context data used to generate the response

In [47]:
# Preview the first rows of the entity records returned with the search result.
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,2596,"""AUTOMATIC GEARBOX""",The Automatic Gearbox in the Volkswagen Touare...,4,True
1,2517,"""ENGINE OPTIONS""",The Peugeot 5008 and the VW Touareg offer a ra...,3,True
2,2599,"""BUMPERS""",The Subaru Outback features rugged protective ...,3,True
3,2601,"""VW TIGUAN""","""The VW Tiguan is a competitor to the Subaru O...",1,True
4,764,"""TOUAREG""",The Touareg is a model of SUV produced by Volk...,212,True


In [48]:
# Preview the first rows of the relationship records returned with the search result.
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,2953,"""SUBARU OUTBACK""","""AUTOMATIC GEARBOX""",The Subaru Outback is equipped with a standard...,2.0,49,3,True
1,2957,"""SUBARU OUTBACK""","""BUMPERS""","""The Subaru Outback comes with rugged protecti...",1.0,48,3,True
2,2959,"""SUBARU OUTBACK""","""VW TIGUAN""","""The Subaru Outback is compared to the VW Tigu...",1.0,46,3,True
3,2527,"""TOUAREG""","""AUTOMATIC GEARBOX""","""The Touareg comes with an automatic gearbox t...",1.0,216,2,True
4,2519,"""TOUAREG""","""BUMPERS""","""The bumpers of the Touareg were updated as pa...",1.0,215,2,True


In [49]:
# Preview the first rows of the community report records returned with the search result.
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,395,Volkswagen Touareg and Competitors,# Volkswagen Touareg and Competitors\n\nThe co...


In [50]:
# Preview the first rows of the source text records returned with the search result.
result.context_data["sources"].head()

Unnamed: 0,id,text
0,527,what’s behind\nyou when reversing better than...
1,523,view out and the light steering makes squeezi...
2,489,the boot – and the Subaru Outback\nbecomes on...
3,486,"wow saving £2,715 off RRP\n\nCarwow price from..."
4,522,"not just spacious in the back seats, because ..."


In [51]:
# The "claims" entry is not guaranteed to be in the context data, so check
# for it before printing a preview.
has_claims = "claims" in result.context_data
if has_claims:
    print(result.context_data["claims"].head())

Empty DataFrame
Columns: [in_context]
Index: []


### Question Generation

The question generator takes a list of previous user queries and generates candidate follow-up questions.

In [52]:
# Build the question generator from the same chat model, context builder and
# parameter dictionaries that the search engine uses.
question_generator = LocalQuestionGen(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
)

In [53]:
question_history = [
    "What's the difference between Subaru Outback and Volkswagen Touareg in terms of engine?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- How does the Volkswagen Touareg compare to the BMW X5 in terms of driving experience?', "- What are the key features of the Volkswagen Touareg's automatic gearbox?", '- What engine options are available for the Volkswagen Touareg?', "- How does the four-wheel steering feature enhance the Volkswagen Touareg's maneuverability?", '- What role does Carwow play in the Volkswagen Touareg community?']
