# GraphRAG index sample for Korean Novel (이상의 날개)



In [45]:
import os

import pandas as pd
import tiktoken
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_report_embeddings,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.drift_search.drift_context import (
    DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## 인덱싱된 Parquet 로드

### Load tables to dataframes

In [46]:
# Locations read by the cells below: the parquet directory and the LanceDB URI.
INPUT_DIR = "./ragtest/output"
LANCEDB_URI = "./ragtest/output/lancedb"

# Community level passed to the read_indexer_* helpers.
COMMUNITY_LEVEL = 2

# Parquet table names (file stems; ".parquet" is appended where they are read).
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
TEXT_UNIT_TABLE = "create_final_text_units"
COVARIATE_TABLE = "create_final_covariates"
COMMUNITY_TABLE = "create_final_communities"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"

#### Read entities

In [47]:
# Load the two entity-related parquet tables from INPUT_DIR.
# The nodes table is read to get community and degree data.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
# Second entity table (ENTITY_EMBEDDING_TABLE); it is passed together with
# entity_df to read_indexer_entities further down in the notebook.
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

#### Read relationships

In [48]:
# Load the relationships table and convert it with read_indexer_relationships;
# `relationships` is later handed to the local and DRIFT context builders.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)



#### Read Community

In [49]:
# Load the community report table and the community table.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")

community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
# Built from the community, node and report frames; `communities` is consumed
# by the global search context builder.
communities = read_indexer_communities(community_df, entity_df, report_df)


#### Read embedding for entity and community

In [50]:
# Entity objects for COMMUNITY_LEVEL, built from the node and entity frames.
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# LanceDB collection used as `entity_text_embeddings` by the local and DRIFT
# context builders.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

# LanceDB collection passed to read_indexer_report_embeddings at the end of
# this cell.
full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

# NOTE(review): this read raises if the covariates parquet file is absent —
# presumably it is only written when claim extraction is enabled; confirm for
# this index.
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
claims = read_indexer_covariates(covariate_df)
# The local search context builder receives the claims under the "claims" key.
covariates = {"claims": claims}
# Same file that the community cell already loaded into report_df; presumably
# a redundant re-read — TODO confirm it can be dropped.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL, content_embedding_col="full_content_embedding")
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# Return value is unused; presumably this fills `reports` in place from the
# vector store — confirm against the graphrag version in use.
read_indexer_report_embeddings(reports, full_content_embedding_store)

#### Search에서 사용할 공통 LLM 정의

In [53]:
from dotenv import load_dotenv

# Load variables from a .env file (if present) before reading the settings.
load_dotenv()

# All five settings are required: a missing variable raises KeyError here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
base_url = os.environ["GRAPHRAG_BASE_URL"]
api_version = os.environ["GRAPHRAG_API_VERSION"]

# Connection settings shared by the chat model and the embedding model.
# api_type can be OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI.
shared_client_kwargs = {
    "api_key": api_key,
    "api_base": base_url,
    "api_version": api_version,
    "api_type": OpenaiApiType.AzureOpenAI,
    "max_retries": 20,
}

# Chat model used by all three search engines.
llm = ChatOpenAI(model=llm_model, **shared_client_kwargs)

# Tokenizer handed to the context builders and search engines.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model; the deployment name is set to the model name.
text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_model,
    **shared_client_kwargs,
)


#### Local Search 설정

In [55]:

# Settings for the answer-generating LLM call.
llm_params = {
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "max_tokens": 2_000,
    "temperature": 0.0,
}

# Settings forwarded to the context builder by LocalSearch.
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    "max_tokens": 12_000,
}

# Mixed context over reports, text units, entities, relationships and claims.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

local_search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    # free form text describing the response type and format, can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="multiple paragraphs",
)




#### Global Search 설정

In [56]:

# Settings for the map-stage LLM calls; JSON output is requested.
map_llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
    "response_format": {"type": "json_object"},
}

# Settings for the reduce-stage LLM call.
reduce_llm_params = {
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "max_tokens": 2000,
    "temperature": 0.0,
}

# Settings forwarded to the context builder by GlobalSearch.
context_builder_params = {
    # False means using full community reports. True means using community short summaries.
    "use_community_summary": False,
    "shuffle_data": True,
    "include_community_rank": True,
    "min_community_rank": 0,
    "community_rank_name": "rank",
    "include_community_weight": True,
    "community_weight_name": "occurrence weight",
    "normalize_community_weight": True,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    "max_tokens": 12_000,
    "context_name": "Reports",
}

context_builder = GlobalCommunityContext(
    community_reports=reports,
    communities=communities,
    # default to None if you don't want to use community weights for ranking
    entities=entities,
    token_encoder=token_encoder,
)

global_search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    max_data_tokens=12_000,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # set this to True will add instruction to encourage the LLM to incorporate
    # general knowledge in the response, which may increase hallucinations,
    # but could be useful in some use cases.
    allow_general_knowledge=False,
    # set this to False if your LLM model does not support JSON mode.
    json_mode=True,
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    # free form text describing the response type and format, can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="single paragraph",
)


#### Drift Search 설정

In [57]:

# DRIFT search settings; the field names are those of DRIFTSearchConfig.
drift_params = DRIFTSearchConfig(
    n=1,
    n_depth=3,
    drift_k_followups=3,
    primer_folds=1,
    max_tokens=12_000,
    temperature=0,
)

# The DRIFT context builder reuses the shared chat model, embedder, tokenizer
# and the indexed data loaded earlier in the notebook.
context_builder = DRIFTSearchContextBuilder(
    config=drift_params,
    chat_llm=llm,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
)

drift_search_engine = DRIFTSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
)

## Run Global Search sample queries

In [58]:
# Ask the global search engine to state the novel's theme in a phrase, then
# print the response.
result = await global_search_engine.asearch("이 소설의 주제를 한마디로 해줘")
# Alternative prompt kept for reference (RPG scenario based on the novel's world):
# result = await global_search_engine.asearch("이 소설의 세계관을 이용하여 RPG게임 시나리오를 만들어줘")
print(result.response)




이 소설의 주제는 내레이터의 가정 내에서의 복잡하고 긴장된 관계, 특히 내레이터의 아내와의 상호작용과 역학에 중점을 두고 있습니다. 내러티브는 불륜, 경제적 의존, 감정적 고뇌, 그리고 개인적 혼란 속에서 의미를 찾으려는 노력을 탐구합니다 [Data: Reports (8, 9, 7, 6, 4)].


## Run Local Search sample queries

In [96]:
# Ask the local search engine whether the wife's occupation can be inferred,
# then print the response. `result` is rebound here; the context-inspection
# cells further down read its context_data.
result = await local_search_engine.asearch("와이프의 직업을 유추할 수있어?")
print(result.response)

이상(李霜)의 작품에서 와이프의 직업을 직접적으로 언급하는 부분은 없지만, 그녀의 행동과 역할을 통해 몇 가지 추측을 할 수 있습니다.

### 와이프의 역할과 행동

와이프는 주인공에게 약을 주고, 저녁 밥상을 준비하는 등 가정 내에서 중요한 역할을 하고 있습니다. 주인공이 감기에 걸렸을 때 와이프는 그에게 약을 주고, 그가 외출하지 않도록 권유합니다 [Data: Relationships (93, 86); Sources (16)]. 또한, 와이프는 저녁 밥상을 준비하고, 주인공과 함께 식사를 합니다 [Data: Relationships (89, 86)].

### 와이프의 방과 화장대

와이프의 방에는 화장대가 있으며, 주인공은 종종 그 방을 탐험합니다. 화장대에는 여러 가지 화장품 병들이 있으며, 주인공은 이 병들을 맡아보기도 합니다 [Data: Relationships (76, 36, 38); Sources (16)]. 이러한 점에서 와이프는 자신의 외모를 꾸미는 데 신경을 쓰는 사람으로 보입니다.

### 아달린과 아스피린

주인공이 아달린을 아스피린으로 착각하고 한 달 동안 복용한 사건도 와이프와 관련이 있습니다. 주인공은 와이프가 아달린을 준 것이 아닌가 의심합니다 [Data: Relationships (92, 100); Sources (16)]. 이 사건은 와이프가 약에 대해 어느 정도의 지식을 가지고 있음을 시사합니다.

### 결론

와이프의 직업을 정확히 알 수는 없지만, 그녀는 가정 내에서 중요한 역할을 하며, 약에 대한 지식과 화장품에 대한 관심을 가지고 있는 것으로 보입니다. 이러한 점들을 종합해 볼 때, 와이프는 가정주부일 가능성이 높으며, 약사나 미용 관련 직업을 가졌을 가능성도 배제할 수 없습니다. 그러나 이는 작품 내에서 명확히 언급되지 않았기 때문에 확정적인 결론을 내리기는 어렵습니다.


In [92]:
result = await drift_search_engine.asearch("주인공 '나'는 왜 정신적으로 피폐해지는 것일까?")
print(result.response['nodes'][0]['answer'])


None


#### Local Search 결과 데이터 확인을 위한 데이터 로드

In [26]:
# Reload the node, report and entity tables so the report data can be inspected.
COMMUNITY_LEVEL = 2
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

# NOTE: this rebinds `reports` and `entities`. Unlike the earlier loading cell,
# read_indexer_reports is called here without content_embedding_col.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
print(f"Report records: {len(report_df)}")
# Last expression of the cell: the notebook renders the first report rows.
report_df.head()


Report records: 11


Unnamed: 0,id,human_readable_id,community,parent,level,title,summary,full_content,rank,rank_explanation,findings,full_content_json,period,size
0,2233fe19006b40bab340b5dd975a51f8,7,7,1,1,Narrator's Domestic Life and Emotional Struggles,"The community revolves around the narrator, re...",# Narrator's Domestic Life and Emotional Strug...,7.5,The impact severity rating is high due to the ...,"[{'explanation': 'The narrator, referred to as...","{\n ""title"": ""Narrator's Domestic Life and ...",2025-01-15,7
1,dc0943fcd8994ad6bc29e57d00e0340f,8,8,1,1,Narrator's Household and Mysterious Interactions,The community revolves around the narrator's h...,# Narrator's Household and Mysterious Interact...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The narrator's wife is a cen...,"{\n ""title"": ""Narrator's Household and Myst...",2025-01-15,9
2,d49ff434b2bb484d97f4a31a66927671,9,9,5,1,Narrator's Household Dynamics,The community revolves around the narrator's h...,# Narrator's Household Dynamics\n\nThe communi...,6.5,The impact severity rating is moderate to high...,"[{'explanation': 'The narrator's room, referre...","{\n ""title"": ""Narrator's Household Dynamics...",2025-01-15,7
3,03957b8c182f4ac8b36fb53b3a0bc738,10,10,5,1,Gyeongseong Station and Surrounding Locations,The community revolves around Gyeongseong Stat...,# Gyeongseong Station and Surrounding Location...,4.5,The impact severity rating is moderate due to ...,[{'explanation': 'Gyeongseong Station is the c...,"{\n ""title"": ""Gyeongseong Station and Surro...",2025-01-15,5
4,2b21946eb82d40798d6a5b0418578d47,0,0,-1,0,33번지 Community and 내 아내,"The community revolves around the 33번지, which ...",# 33번지 Community and 내 아내\n\nThe community rev...,6.5,The impact severity rating is moderately high ...,[{'explanation': '내 아내 is a central figure in ...,"{\n ""title"": ""33번지 Community and 내 아내"",\n ...",2025-01-15,5


## Inspecting the context data used to generate the response

In [97]:
# Show the first rows of the "entities" frame from the last result's context data.
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,75,하느님,주인공이 편안하고 즐거운 세월을 자랑하고 싶어한 대상이다,1,True
1,31,럭키 세븐,"럭키 세븐은 일곱 번째 칸을 의미하며, 이는 운명의 상징으로 여겨진다",0,True
2,73,이발,주인공이 한 달 동안 누워 지내다가 머리와 수염이 자라서 이발을 하기로 결심한 사건이다,1,True
3,20,니코틴,,1,True
4,76,이웃,주인공이 잠든 동안 불이 난 일을 겪은 사람들이다,1,True


In [98]:
# Show the first rows of the "relationships" frame from the last result's context data.
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,links,in_context
0,106,오월 햇살,금붕어,The goldfish are illuminated by the May sunlight,4.0,1,True
1,107,스물 여섯 해,금붕어,The narrator reflects on their 26 years of lif...,1.0,1,True
2,1,이상,위고,이상 references Victor Hugo in his writing,3.0,6,True
3,5,이상,미망인,이상 uses the term 미망인 to describe women who hav...,1.0,6,True
4,6,이상,니코틴,이상 references nicotine in the context of its e...,3.0,6,True


## Visualizing the result context as graph

In [100]:
"""
Helper function to visualize the result context with `yfiles-jupyter-graphs`.

The dataframes are converted into supported nodes and relationships lists and then passed to yfiles-jupyter-graphs.
Additionally, some values are mapped to visualization properties.
"""


def show_graph(result):
    """Visualize the result context with yfiles-jupyter-graphs.

    Args:
        result: Search result whose ``context_data`` contains an "entities"
            and a "relationships" dataframe.

    Raises:
        ValueError: If ``context_data`` lacks "entities" or "relationships".
    """
    # Validate before importing the optional visualization package, so bad
    # input is reported as ValueError even when the package is not installed.
    context = result.context_data
    if "entities" not in context or "relationships" not in context:
        msg = "The passed results do not contain 'entities' or 'relationships'"
        raise ValueError(msg)

    from yfiles_jupyter_graphs import GraphWidget

    def convert_entities_to_dicts(df):
        """Convert the entities dataframe to a list of node dicts, one per entity name."""
        nodes_dict = {}
        for _, row in df.iterrows():
            node_id = row["entity"]
            # Only the first row seen for each entity name is kept.
            if node_id not in nodes_dict:
                nodes_dict[node_id] = {
                    "id": node_id,
                    "properties": row.to_dict(),
                }
        return list(nodes_dict.values())

    def convert_relationships_to_dicts(df):
        """Convert the relationships dataframe to a list of edge dicts, one per row."""
        return [
            {
                "start": row["source"],
                "end": row["target"],
                "properties": row.to_dict(),
            }
            for _, row in df.iterrows()
        ]

    w = GraphWidget()
    # use the converted data to visualize the graph
    w.nodes = convert_entities_to_dicts(context["entities"])
    w.edges = convert_relationships_to_dicts(context["relationships"])
    w.directed = True
    # show title on the node
    w.node_label_mapping = "entity"
    # use weight for edge thickness
    w.edge_thickness_factor_mapping = "weight"
    # `display` is not imported in this block; presumably the notebook kernel
    # provides it — confirm before calling this outside a notebook.
    display(w)


# Render the entity/relationship context of the most recent `result` as a graph.
show_graph(result)

GraphWidget(layout=Layout(height='700px', width='100%'))