In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
INPUT_DIR = "./output"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [3]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 73


Unnamed: 0,id,human_readable_id,title,community,level,degree,x,y
0,495f10dab0034bdda98f8eafffb66458,0,CHÍNH PHỦ CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM,2,0,4,0,0
1,febcacdd210f46bca8996174c7eed35d,1,HÀ NỘI,2,0,1,0,0
2,fa8a7ade07b749fdb57f30afb43b954b,2,BỘ LAO ĐỘNG - THƯƠNG BINH VÀ XÃ HỘI,3,0,12,0,0
3,46fdc1c5d7844240ba089260fffffb8e,3,QUỐC HỘI,2,0,2,0,0
4,c28f1ec4f0c84db4b45aedeba280182d,4,NGHỊ ĐỊNH,0,0,8,0,0


In [4]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 55


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,8ac6709691d34aca8358ac46ba07325b,0,CHÍNH PHỦ CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM,BỘ LAO ĐỘNG - THƯƠNG BINH VÀ XÃ HỘI,The Government of Vietnam issues decrees based...,8.0,16,[065a7be93c2036be22075b9e1ddbf2f5]
1,a36123a13e1e459a952a2d9b1525510b,1,CHÍNH PHỦ CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM,QUỐC HỘI,The Government of Vietnam implements laws and ...,7.0,6,[065a7be93c2036be22075b9e1ddbf2f5]
2,b6475c72f1f841ccacf1c0a162810e62,2,CHÍNH PHỦ CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM,HÀ NỘI,Hà Nội is the location where the Government of...,9.0,5,[065a7be93c2036be22075b9e1ddbf2f5]
3,8f64f83182394739943b1de1a76c870d,3,CHÍNH PHỦ CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM,NGHỊ ĐỊNH,The Decree is issued by the Government of Viet...,8.0,12,[065a7be93c2036be22075b9e1ddbf2f5]
4,d3796ab3dc714440bc918b112578d8fa,4,BỘ LAO ĐỘNG - THƯƠNG BINH VÀ XÃ HỘI,QUỐC HỘI,"The Ministry of Labor, Invalids and Social Aff...",6.0,14,[065a7be93c2036be22075b9e1ddbf2f5]


In [5]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: './output/create_final_covariates.parquet'

In [6]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 6


Unnamed: 0,id,human_readable_id,community,level,title,summary,full_content,rank,rank_explanation,findings,full_content_json,period,size
0,a84b592f-2fcf-4923-b345-ee08c766283d,-1,-1,0,Vietnam Development Community,The Vietnam Development Community comprises ke...,# Vietnam Development Community\n\nThe Vietnam...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The Vietnam Development Bank...,"{\n ""title"": ""Vietnam Development Community...",,
1,aa0f608c-5208-461e-88a4-91940f5dc079,0,0,0,Vietnamese Defense and Social Insurance Framework,This community encompasses key governmental en...,# Vietnamese Defense and Social Insurance Fram...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The Ministry of National Def...,"{\n ""title"": ""Vietnamese Defense and Social...",2024-11-22,7.0
2,e01e6f4d-b116-4adf-a6fd-677f10061139,1,1,0,Vietnam Government and Social Policies,The community is centered around the Chính phủ...,# Vietnam Government and Social Policies\n\nTh...,7.5,The impact severity rating is high due to the ...,"[{'explanation': 'Chính phủ, or the Government...","{\n ""title"": ""Vietnam Government and Social...",2024-11-22,8.0
3,706828df-2b73-45bb-947f-c0f54ee594c4,2,2,0,Vietnam National Assembly and Government Relat...,The community is centered around the National ...,# Vietnam National Assembly and Government Rel...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The National Assembly of Vie...,"{\n ""title"": ""Vietnam National Assembly and...",2024-11-22,3.0
4,4b4d2798-749d-44f3-82a7-422bd8e4abb6,3,3,0,Ministry of Labor and Social Affairs in Vietnam,The community is centered around the Ministry ...,# Ministry of Labor and Social Affairs in Viet...,7.5,The impact severity rating is high due to the ...,"[{'explanation': 'The Ministry of Labor, Inval...","{\n ""title"": ""Ministry of Labor and Social ...",2024-11-22,10.0


In [7]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 6


Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,065a7be93c2036be22075b9e1ddbf2f5,1,\n* Trang 1\n\n# CHÍNH PHỦ\nCỘNG HOÀ XÃ HỘI CH...,1200,[713fe3f30a9ca4af0fdfb1f2689bee02],"[495f10dab0034bdda98f8eafffb66458, febcacdd210...","[8ac6709691d34aca8358ac46ba07325b, a36123a13e1..."
1,ba8f646bb4d3233744ddc947cd8cf864,2,"ính phủ về chế độ, chính sách đối với cán bộ, ...",1200,[713fe3f30a9ca4af0fdfb1f2689bee02],"[fc9d981fe6054c6291da50707b489fc3, 50d8f47e20b...","[fb58f303d33e4132a80288a4ea0b66fe, e33c9499462..."
2,b8843b4755b442a31a3f1e287ee74f76,3,ịa phương.\n\n* Trang 3\n\n## Điều 2. Điều kiệ...,1200,[713fe3f30a9ca4af0fdfb1f2689bee02],"[ae440d37056148ccba8ac79826b72cee, a37ba186cf8...","[318f8ca495d448f880be3455355b03f4, 5bbfb872e76..."
3,9930ef47cbc7116d3efe8c15cdcf66dd,4,ng Chính phủ về việc trợ cấp hằng tháng cho nh...,1200,[713fe3f30a9ca4af0fdfb1f2689bee02],"[fa8a7ade07b749fdb57f30afb43b954b, fc9d981fe60...","[336d8eecce2544989fb06cafb697603f, ec64aad40ba..."
4,a714bcfb7bd1b73a2f5f65a2479763f5,5,quy định tại điểm d khoản 1 Điều 1 Nghị định ...,1200,[713fe3f30a9ca4af0fdfb1f2689bee02],"[c28f1ec4f0c84db4b45aedeba280182d, 5f9af20134c...","[304154e73ad14d8d8a7c3c2bff66bf4c, bd97474ce5e..."


In [26]:
from dotenv import load_dotenv
import os
load_dotenv()

api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

In [27]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [28]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [29]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [31]:
result = await search_engine.search("Đối tượng điều chỉnh của nghị định điều chỉnh lương hưu")
print(result.response)

TypeError: Query column vector must be a vector. Got list<item: double>.

In [14]:
import os

import pandas as pd
import tiktoken

from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

In [15]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.encoding_for_model(llm_model)

In [16]:
# parquet files generated from indexing pipeline
INPUT_DIR = "./output"
COMMUNITY_TABLE = "create_final_communities"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"

# community level in the Leiden community hierarchy from which we will load the community reports
# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
COMMUNITY_LEVEL = 2

In [17]:
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

communities = read_indexer_communities(community_df, entity_df, report_df)
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

print(f"Total report count: {len(report_df)}")
print(
    f"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}"
)

report_df.head()

Total report count: 6
Report count after filtering by community level 2: 6


Unnamed: 0,id,human_readable_id,community,level,title,summary,full_content,rank,rank_explanation,findings,full_content_json,period,size
0,a84b592f-2fcf-4923-b345-ee08c766283d,-1,-1,0,Vietnam Development Community,The Vietnam Development Community comprises ke...,# Vietnam Development Community\n\nThe Vietnam...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The Vietnam Development Bank...,"{\n ""title"": ""Vietnam Development Community...",,
1,aa0f608c-5208-461e-88a4-91940f5dc079,0,0,0,Vietnamese Defense and Social Insurance Framework,This community encompasses key governmental en...,# Vietnamese Defense and Social Insurance Fram...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The Ministry of National Def...,"{\n ""title"": ""Vietnamese Defense and Social...",2024-11-22,7.0
2,e01e6f4d-b116-4adf-a6fd-677f10061139,1,1,0,Vietnam Government and Social Policies,The community is centered around the Chính phủ...,# Vietnam Government and Social Policies\n\nTh...,7.5,The impact severity rating is high due to the ...,"[{'explanation': 'Chính phủ, or the Government...","{\n ""title"": ""Vietnam Government and Social...",2024-11-22,8.0
3,706828df-2b73-45bb-947f-c0f54ee594c4,2,2,0,Vietnam National Assembly and Government Relat...,The community is centered around the National ...,# Vietnam National Assembly and Government Rel...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The National Assembly of Vie...,"{\n ""title"": ""Vietnam National Assembly and...",2024-11-22,3.0
4,4b4d2798-749d-44f3-82a7-422bd8e4abb6,3,3,0,Ministry of Labor and Social Affairs in Vietnam,The community is centered around the Ministry ...,# Ministry of Labor and Social Affairs in Viet...,7.5,The impact severity rating is high due to the ...,"[{'explanation': 'The Ministry of Labor, Inval...","{\n ""title"": ""Ministry of Labor and Social ...",2024-11-22,10.0


In [18]:
context_builder = GlobalCommunityContext(
    community_reports=reports,
    communities=communities,
    entities=entities,  # default to None if you don't want to use community weights for ranking
    token_encoder=token_encoder,
)

In [19]:
context_builder_params = {
    "use_community_summary": False,  # False means using full community reports. True means using community short summaries.
    "shuffle_data": True,
    "include_community_rank": True,
    "min_community_rank": 0,
    "community_rank_name": "rank",
    "include_community_weight": True,
    "community_weight_name": "occurrence weight",
    "normalize_community_weight": True,
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    "context_name": "Reports",
}

map_llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
    "response_format": {"type": "json_object"},
}

reduce_llm_params = {
    "max_tokens": 2000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

In [20]:
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
    json_mode=True,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [21]:
result = await search_engine.asearch(
    "Đối tượng điều chỉnh của nghị định điều chỉnh lương hưu"
)

print(result.response)

## Đối tượng điều chỉnh của nghị định điều chỉnh lương hưu

Nghị định điều chỉnh lương hưu có tác động sâu rộng đến nhiều nhóm đối tượng, đặc biệt là những người nhận lương hưu, các cá nhân nhận trợ cấp xã hội, và những người hưởng các khoản trợ cấp hàng tháng theo quy định của các nghị định khác nhau. Việc hiểu rõ về các đối tượng thụ hưởng này là rất quan trọng để đánh giá tác động của các chính sách xã hội đến các nhóm dân cư khác nhau, đặc biệt là người cao tuổi và người khuyết tật [Data: Reports (4, 1, 3)].

### Chính sách hỗ trợ tài chính

Quyết định số 91/2000/QĐ-TTG đã quy định về trợ cấp tài chính cho những cá nhân đã đến tuổi nghỉ hưu, cho thấy sự quan tâm của chính phủ đối với nhóm người này. Điều này nhấn mạnh tầm quan trọng của các quyết định trong việc đảm bảo hỗ trợ tài chính cho những người không còn làm việc, nhằm giúp họ duy trì cuộc sống ổn định [Data: Reports (4)].

### Thay đổi trong khung pháp lý

Nghị định số 108/2021/NĐ-CP, mặc dù đã bị bãi bỏ, trước đây đã đề c