In [1]:
import os

import pandas as pd

from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [2]:
# Root folder of the indexing output; the parquet tables and the LanceDB
# store are both addressed relative to it.
INPUT_DIR = "../output"
LANCEDB_URI = INPUT_DIR + "/lancedb"

# Base names of the parquet tables; the loading cells build
# f"{INPUT_DIR}/{<TABLE>}.parquet" from them.
# NOTE(review): COVARIATE_TABLE is not read by any cell shown here.
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COMMUNITY_TABLE = "communities"
COMMUNITY_REPORT_TABLE = "community_reports"
TEXT_UNIT_TABLE = "text_units"
COVARIATE_TABLE = "covariates"

# Community level handed to read_indexer_entities / read_indexer_reports.
COMMUNITY_LEVEL = 2

In [3]:
# Read the entity and community tables, then build the entity objects for
# the configured community level.
entity_path = f"{INPUT_DIR}/{ENTITY_TABLE}.parquet"
community_path = f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet"
entity_df = pd.read_parquet(entity_path)
community_df = pd.read_parquet(community_path)

entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Open the LanceDB vector store for the entity description embeddings.
# The store is located via LANCEDB_URI; to connect to a remote db instead,
# specify url and port values.
entity_description_schema = VectorStoreSchemaConfig(
    index_name="default-entity-description"
)
description_embedding_store = LanceDBVectorStore(
    vector_store_schema_config=entity_description_schema
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

entity_count = len(entity_df)
print(f"Entity count: {entity_count}")

# Display only the rows whose type column equals "MODEL".
is_model_entity = entity_df["type"] == "MODEL"
entity_df[is_model_entity]

Entity count: 54


Unnamed: 0,id,human_readable_id,title,type,description,text_unit_ids,frequency,degree,x,y
13,d1c05704-7250-4f6b-92ca-1d9aa4f171cb,13,PRODUCT_MONTHLY_SALES_MODEL,MODEL,Final model providing monthly sales and growth...,[28d87dab888dbffc333d5a43db76429c740346bc58021...,1,3,0.0,0.0
21,818c008d-be4f-4725-b1c6-66062083a02c,21,CUSTOMER_SEGMENT_METRICS_MODEL,MODEL,"Final model aggregating customer counts, avera...",[178aa82f2b10d732cffcff20c5d768ede9ff074fcf127...,1,6,0.0,0.0
49,c4e3402a-7c01-423d-9b62-76bb2e2cf605,49,CLEANED_CUSTOMER_MODEL,MODEL,Final model outputting the cleaned customer da...,[082051cebc31f4cc55d66d9bcb08c72d7a30640518d3e...,1,2,0.0,0.0


In [4]:
# Read the relationship table and pass it through the indexer adapter; the
# result is later handed to the context builder.
relationship_path = f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet"
relationship_df = pd.read_parquet(relationship_path)
relationships = read_indexer_relationships(relationship_df)

relationship_count = len(relationship_df)
print(f"Relationship count: {relationship_count}")

# Preview the first rows of the raw table.
relationship_df.head()

Relationship count: 67


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,0070f973-4d00-4525-8cc4-b73efab93f07,0,PRODUCT_MONTHLY,PRODUCT_MONTHLY_SALES_MODEL,Model depends on product_monthly CTE as input.,10.0,11,[28d87dab888dbffc333d5a43db76429c740346bc58021...
1,f427ba0e-ea3b-434a-b945-b4e9efbf5abc,1,PRODUCT_MONTHLY,STAGING.STG_ORDERS,Uses stg_orders as source table for order data.,9.0,9,[28d87dab888dbffc333d5a43db76429c740346bc58021...
2,b5ff806e-d83a-475a-8049-fb0851ac7251,2,PRODUCT_MONTHLY,STAGING.STG_PRODUCTS,Joins with stg_products to attach product_name...,9.0,9,[28d87dab888dbffc333d5a43db76429c740346bc58021...
3,38966eba-12b0-447a-85d3-4ded4c942cd7,3,PRODUCT_MONTHLY,PRODUCT_ID,Join key linking orders and products.,8.0,10,[28d87dab888dbffc333d5a43db76429c740346bc58021...
4,e9fbbc48-e764-47d0-a1b1-846add997b78,4,PRODUCT_MONTHLY,ORDER_DATE,Aggregates orders by month using DATE_TRUNC.,8.0,9,[28d87dab888dbffc333d5a43db76429c740346bc58021...


In [5]:
# Read the community report table and adapt it, together with the community
# table, for the configured community level.
report_path = f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet"
report_df = pd.read_parquet(report_path)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)

report_count = len(report_df)
print(f"Report records: {report_count}")

# Preview the first rows of the raw table.
report_df.head()

Report records: 14


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,5bf825186df042429b57ec90d8b50063,6,6,1,3,[],"Customer Data Processing Community: CLEANED, R...",This dbt model community centers on processing...,"# Customer Data Processing Community: CLEANED,...",8.5,The community is highly strategic due to its r...,[{'explanation': 'The CLEANED model (id: 39) a...,"{\n ""title"": ""Customer Data Processing Comm...",2025-11-06,4
1,a54c5b6a0fd24e7aba49eee7d7dc619b,7,7,1,3,[],Customer Data Aggregation Community,This dbt model community centers on the CLEANE...,# Customer Data Aggregation Community\n\nThis ...,7.5,The community is strategically important due t...,[{'explanation': 'The CLEANED_CUSTOMER_MODEL (...,"{\n ""title"": ""Customer Data Aggregation Com...",2025-11-06,2
2,8f078472a0ec4459a50f1f49fb3c3941,8,8,1,3,[],Data Transformation and Standardization Community,This dbt model community centers on the TRANSF...,# Data Transformation and Standardization Comm...,8.5,The community is highly strategic due to its c...,[{'explanation': 'The TRANSFORMATION model is ...,"{\n ""title"": ""Data Transformation and Stand...",2025-11-06,4
3,fd3cb5bf7d6d4691802692bec95226de,9,9,1,4,[],"Growth Calculation Community: GROWTH, GROWTH_P...",This dbt model community focuses on calculatin...,"# Growth Calculation Community: GROWTH, GROWTH...",8.5,The community has high strategic value due to ...,[{'explanation': 'The GROWTH model (entity ID ...,"{\n ""title"": ""Growth Calculation Community:...",2025-11-06,4
4,327a460428614b758cdb26656d089e12,10,10,1,4,[],Product Monthly Sales Model Community,The Product Monthly Sales Model Community cent...,# Product Monthly Sales Model Community\n\nThe...,9.5,The community is highly strategic due to its c...,[{'explanation': 'The PRODUCT_MONTHLY model is...,"{\n ""title"": ""Product Monthly Sales Model C...",2025-11-06,5


In [6]:
# Read the text unit table and pass it through the indexer adapter; the
# result is later handed to the context builder.
text_unit_path = f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet"
text_unit_df = pd.read_parquet(text_unit_path)
text_units = read_indexer_text_units(text_unit_df)

text_unit_count = len(text_unit_df)
print(f"Text unit records: {text_unit_count}")

# Preview the first rows of the raw table.
text_unit_df.head()

Text unit records: 9


Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,95eb3a39a1a78090ce34808c97a555a7e673e848db50e6...,0,## Overview \nThis model aggregates daily ord...,502,[0059107dd9400f66f52d38454f45f7737dd6b24741e63...,,,[]
1,28d87dab888dbffc333d5a43db76429c740346bc58021d...,1,## Overview \nThis model calculates the month...,474,[0ce4475c61656693c5c816b19570f5c3407d97d498acd...,"[8d8d9b6e-5153-4977-9b4c-0a178cf404dd, adb0fbf...","[0070f973-4d00-4525-8cc4-b73efab93f07, f427ba0...",[]
2,8ef0d23ed4892a59e9440eedd3d2679f77b0d283ff5b99...,2,## Overview \nThis model aggregates order act...,554,[2e5f96e339bbe668d209dc36f0598baa8cf79042f4cf9...,,,[]
3,7c518b9a0d3935bd033643f865fb7006583514716ed4a5...,3,## Overview \nThis model cleans and normalize...,536,[3c45e10176b3ab121aace825d2d98a2318b06477ba4dc...,,,[]
4,b488c06b06c8129a942f0ce6868d5bdae68a8f23043276...,4,## Overview \nThis model aggregates sales per...,529,[4e721ac6c35ebc7547fea377d410d93b54fbf38fa0a58...,,,[]


In [7]:
from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.tokenizer.get_tokenizer import get_tokenizer

# The key is read from the environment; a missing GRAPHRAG_API_KEY raises
# KeyError at this line.
api_key = os.environ["GRAPHRAG_API_KEY"]

# The chat and embedding models are both reached through the same api_base.
LOCAL_API_BASE = "http://localhost:11434/v1"

# Chat model used by the search engine to produce answers.
chat_config = LanguageModelConfig(
    type=ModelType.Chat,
    model_provider="openai",
    model="qwen3:14b",
    api_base=LOCAL_API_BASE,
    api_key=api_key,
    auth_type="api_key",
    model_supports_json=True,
    temperature=0.0,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.Chat,
    config=chat_config,
)

# Embedding model handed to the context builder as its text embedder.
embedding_config = LanguageModelConfig(
    type=ModelType.Embedding,
    model_provider="openai",
    model="nomic-embed-text",
    api_base=LOCAL_API_BASE,
    api_key=api_key,
    max_retries=20,
)

text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.Embedding,
    config=embedding_config,
)

# The tokenizer is derived from the chat model configuration.
tokenizer = get_tokenizer(chat_config)

In [8]:
# Assemble the mixed context builder from the loaded tables, the entity
# description embedding store, the embedder and the tokenizer.
context_builder = LocalSearchMixedContext(
    entities=entities,
    relationships=relationships,
    text_units=text_units,
    community_reports=reports,
    # Leave this as None if you did not run covariates during indexing.
    covariates=None,
    entity_text_embeddings=description_embedding_store,
    # If the vectorstore uses entity title as ids, use
    # EntityVectorStoreKey.TITLE here instead.
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    tokenizer=tokenizer,
)

In [9]:
# Settings handed to LocalSearch as context_builder_params.
# text_unit_prop and community_prop are proportions of the context window; the
# remaining proportion is dedicated to entities and relationships, so the sum
# of the two should be <= 1.
local_context_params = {
    # Proportion of the context window dedicated to related text units.
    "text_unit_prop": 0.5,
    # Proportion of the context window dedicated to community reports.
    "community_prop": 0.1,
    # Maximum number of turns to include in the conversation history.
    "conversation_history_max_turns": 5,
    # If True, only include user queries in the conversation history.
    "conversation_history_user_turns_only": True,
    # Number of related entities to retrieve from the entity description
    # embedding store.
    "top_k_mapped_entities": 10,
    # Controls the number of out-of-network relationships to pull into the
    # context window.
    "top_k_relationships": 10,
    # If True, include the entity rank in the entity table in the context
    # window. Default entity rank = node degree.
    "include_entity_rank": True,
    # If True, include the relationship weight in the context window.
    "include_relationship_weight": True,
    # If True, include the community rank in the context window.
    "include_community_rank": False,
    # If True, return a set of dataframes containing all candidate
    # entity/relationship/covariate records that could be relevant. Not all of
    # these records will be included in the context window; the "in_context"
    # column in these dataframes indicates whether a record is included.
    "return_candidate_context": False,
    # Set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity
    # title as ids.
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # Maximum number of tokens to use for the context window. Change this
    # based on the token limit you have on your model (if you are using a
    # model with 8k limit, a good setting could be 5000).
    "max_tokens": 12_000,
}

# Settings handed to LocalSearch as model_params.
model_params = dict(
    # Change this based on the token limit you have on your model (if you are
    # using a model with 8k limit, a good setting could be 1000-1500).
    max_tokens=2_000,
    temperature=0.0,
)

In [10]:
# Free-form text describing the response type and format. It can be anything,
# e.g. prioritized list, single paragraph, multiple paragraphs,
# multiple-page report.
response_type = "multiple paragraphs"

# Combine the chat model, the context builder and both parameter sets into the
# local search engine queried by the cells below.
search_engine = LocalSearch(
    model=chat_model,
    tokenizer=tokenizer,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    model_params=model_params,
    response_type=response_type,
)

In [13]:
result = await search_engine.search("how the product performance is calculated?")
print(result.response)



The calculation of product performance involves aggregating sales data at the product-level and analyzing month-over-month growth trends. This process begins with the **`product_monthly`** CTE, which joins `stg_orders` (order data) with `stg_products` (product catalog) on `product_id` to associate each order with its corresponding product. For each product and month (derived using `DATE_TRUNC` on `order_date`), the **`total_sales`** metric is computed as the sum of `net_amount` from orders [Data: Entities (0); Relationships (6, 1, 5)]. 

Next, the **`growth`** CTE calculates the month-over-month growth percentage. It uses the **`LAG`** window function to retrieve the previous month’s `total_sales` (stored as `prev_month_sales`) and computes **`growth_pct`** as the percentage change from the prior month, rounded to two decimal places. If `prev_month_sales` is zero or null, `growth_pct` is set to null [Data: Entities (1, 12); Relationships (9, 10, 16)]. The final output, **`product_mon

In [14]:
result = await search_engine.search("which tables and columns were involved in calculating product trends?")
print(result.response)



The calculation of product trends involves several tables and columns, primarily centered around the `staging.stg_orders` and `staging.stg_products` tables. These tables are joined on the `product_id` column to aggregate sales data at the product level [Data: Relationships (3, 2, 1, 2); Sources (1)]. 

### Key Tables and Columns
1. **`staging.stg_orders`**  
   - **Columns Used**:  
     - `order_date`: Used to derive the month column via `DATE_TRUNC` [Data: Sources (1), Relationships (4)].  
     - `net_amount`: Aggregated to calculate `total_sales` for each product and month [Data: Sources (1), Entities (12)].  
     - `product_id`: Serves as the join key with `staging.stg_products` [Data: Relationships (3), Sources (1)].  

2. **`staging.stg_products`**  
   - **Columns Used**:  
     - `product_id`: Joined with `stg_orders` to link sales data to product details [Data: Relationships (3), Sources (1)].  
     - `product_name` and `category`: Included in the final output to provide co

In [17]:
result = await search_engine.search("what data do we have to get the sales report per region for each product?")
print(result.response)



To generate a sales report per region for each product, the data required would involve combining **product sales data** with **customer region information**. However, based on the provided tables, the direct linkage between product sales and geographic regions is not explicitly defined. Here's a breakdown of the relevant data components and their relationships:

### 1. **Product Sales Data**
The **`PRODUCT_MONTHLY`** entity (Entity ID: 2) is central to tracking product-level sales. It joins **`stg_orders`** and **`stg_products`** to calculate **`total_sales`** per product and month. This data includes:
- **`product_id`** and **`product_name`** (from `stg_products`).
- **`total_sales`** (aggregated net amount per product and month).
- **`month`** (derived from `order_date` in `stg_orders`).

This data is detailed in **Source 2**, which explains the logic for calculating monthly sales and growth percentages [Data: Sources (2)].

### 2. **Customer Region Information**
The **`REGION`** en