In [6]:
# Remove the currently installed graphrag package from the kernel's environment;
# a later cell reinstalls it pinned to 2.1.0.
%pip uninstall -y graphrag

Found existing installation: graphrag 2.1.0
Uninstalling graphrag-2.1.0:
  Successfully uninstalled graphrag-2.1.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Remove the currently installed tiktoken package; the install cell that follows
# brings it back within the >=0.8.0,<0.9.0 range.
%pip uninstall -y tiktoken

Found existing installation: tiktoken 0.8.0
Uninstalling tiktoken-0.8.0:
  Successfully uninstalled tiktoken-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Install graphrag at an exact version together with bounded ranges for the
# libraries this notebook imports directly or that graphrag depends on.
# Restart the kernel afterwards if any of these packages were already imported.
%pip install "graphrag==2.1.0" \
            "pydantic>=2.5.0,<3.0.0" \
            "typing-extensions>=4.10.0,<5.0.0" \
            "typing-inspection>=0.4.0" \
            "tiktoken>=0.8.0,<0.9.0" \
            "pandas>=2.2.0,<3.0.0" \
            "openai>=1.68.0,<2.0.0"

Collecting graphrag==2.1.0
  Using cached graphrag-2.1.0-py3-none-any.whl.metadata (6.6 kB)
Collecting tiktoken<0.9.0,>=0.8.0
  Using cached tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Using cached graphrag-2.1.0-py3-none-any.whl (365 kB)
Using cached tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl (982 kB)
Installing collected packages: tiktoken, graphrag
Successfully installed graphrag-2.1.0 tiktoken-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
# Print the installed graphrag metadata (version, location, dependencies) to
# confirm which build the kernel's environment resolves.
%pip show graphrag

Name: graphrag
Version: 2.1.0
Summary: GraphRAG: A graph-based retrieval-augmented generation (RAG) system.
Home-page: 
Author: Alonso Guevara Fernández
Author-email: alonsog@microsoft.com
License: MIT
Location: /opt/anaconda3/lib/python3.11/site-packages
Requires: aiofiles, azure-cosmos, azure-identity, azure-search-documents, azure-storage-blob, devtools, environs, fnllm, future, graspologic, json-repair, lancedb, networkx, nltk, numpy, openai, pandas, pyarrow, pydantic, python-dotenv, pyyaml, rich, spacy, textblob, tiktoken, tqdm, typer, typing-extensions, umap-learn
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [10]:
import os

import pandas as pd
import tiktoken

from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

In [11]:
# SECURITY: the OpenAI API key used to be hard-coded on this line. Any key that was
# saved in this notebook must be treated as leaked and revoked/rotated.
# The key is now read from the environment so the secret never lives in the file;
# set it before launching Jupyter, e.g. `export GRAPHRAG_API_KEY=...`.
# NOTE(review): when the variable is unset this evaluates to None; the model
# configuration cell is expected to reject a missing key — confirm against
# graphrag's LanguageModelConfig validation.
GRAPHRAG_API_KEY = os.environ.get("GRAPHRAG_API_KEY")

# Model identifiers. The chat model name is passed to the language model config
# and to tiktoken in the next cell; the embedding model name is not referenced
# by the other cells visible in this notebook.
GRAPHRAG_LLM_MODEL = "gpt-4o-mini"
GRAPHRAG_EMBEDDING_MODEL = "text-embedding-3-small"

In [12]:
# Pull the credential and chat-model name from the notebook-level settings.
api_key, llm_model = GRAPHRAG_API_KEY, GRAPHRAG_LLM_MODEL

# Language model configuration for an OpenAI chat model (max_retries is set to 20).
config = LanguageModelConfig(
    type=ModelType.OpenAIChat,
    model=llm_model,
    api_key=api_key,
    max_retries=20,
)

# Fetch (or create) the chat model registered under the name "global_search".
model = ModelManager().get_or_create_chat_model(
    name="global_search", model_type=ModelType.OpenAIChat, config=config
)

# Tokenizer matching the chat model; passed to the context builder and search engine.
token_encoder = tiktoken.encoding_for_model(llm_model)

In [13]:
# Directory containing the parquet files generated by the indexing pipeline.
# Set the GRAPHRAG_INPUT_DIR environment variable to point the notebook at another
# location; when it is unset or empty, the original author's local path is used.
# Relies on `os` from the notebook's imports cell.
INPUT_DIR = os.environ.get("GRAPHRAG_INPUT_DIR") or (
    "/Users/jiyoon/Library/Mobile Documents/com~apple~CloudDocs/황금토끼 GraphRAG/2025 캡스톤 프로젝트/domain_qa_system/data/input2/output"
)

# File stems of the parquet tables read from INPUT_DIR.
COMMUNITY_TABLE = "communities"
COMMUNITY_REPORT_TABLE = "community_reports"
ENTITY_TABLE = "entities"

# community level in the Leiden community hierarchy from which we will load the community reports
# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
COMMUNITY_LEVEL = 2

In [14]:
# Read the three indexer output tables in the order: communities, entities, reports.
community_df, entity_df, report_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (COMMUNITY_TABLE, ENTITY_TABLE, COMMUNITY_REPORT_TABLE)
)

# Convert the raw frames into the objects the query layer consumes; reports and
# entities are read with COMMUNITY_LEVEL as the level argument.
communities = read_indexer_communities(community_df, report_df)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Compare the raw report count with the count that remains after level filtering.
print(f"Total report count: {len(report_df)}")
print(f"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}")

# Preview the first rows of the raw report table as the cell's output.
report_df.head()

Total report count: 3
Report count after filtering by community level 2: 3


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,642a587322c8432c9a051af244aad728,0,0,0,-1,[],2025 Hanseong Cultural Exploration Group,The community centers around the 2025 Hanseong...,# 2025 Hanseong Cultural Exploration Group\n\n...,4.5,The impact severity rating is moderate due to ...,[{'explanation': 'Mongolia is designated as th...,"{\n ""title"": ""2025 Hanseong Cultural Explor...",2025-05-06,8
1,4af3f97f1597492da83627715e94856b,1,1,0,-1,[],Hanseong University Cultural Exploration Commu...,The community is centered around Hanseong Univ...,# Hanseong University Cultural Exploration Com...,6.5,The impact severity rating is moderate to high...,[{'explanation': 'Hanseong University is the p...,"{\n ""title"": ""Hanseong University Cultural ...",2025-05-06,11
2,df9451241130473c85ec1e428f46e0d5,2,2,0,-1,[],Hansung University Information Community,The community centers around Hansung Universit...,# Hansung University Information Community\n\n...,4.0,The impact severity rating is moderate due to ...,[{'explanation': 'The announcement system is a...,"{\n ""title"": ""Hansung University Informatio...",2025-05-06,4


In [15]:
# Context builder for global search, fed with the community reports, communities
# and entities produced by the loading cell plus the model's token encoder.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    community_reports=reports,
    communities=communities,
    # default to None if you don't want to use community weights for ranking
    entities=entities,
)

In [16]:
# Options handed to the context builder when assembling the report context.
context_builder_params = dict(
    # False means using full community reports. True means using community short summaries.
    use_community_summary=False,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    max_tokens=12_000,
    context_name="Reports",
)

# LLM call parameters for the map step; a JSON object response format is requested.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format={"type": "json_object"},
)

# LLM call parameters for the reduce step.
reduce_llm_params = dict(
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    max_tokens=2000,
    temperature=0.0,
)

In [17]:
# Global search engine wired to the chat model, context builder and parameter
# dictionaries defined in the preceding cells.
search_engine = GlobalSearch(
    model=model,
    token_encoder=token_encoder,
    context_builder=context_builder,
    context_builder_params=context_builder_params,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    max_data_tokens=12_000,
    # set this to True will add instruction to encourage the LLM to incorporate general
    # knowledge in the response, which may increase hallucinations, but could be useful
    # in some use cases.
    allow_general_knowledge=False,
    # set this to False if your LLM model does not support JSON mode.
    json_mode=True,
    concurrent_coroutines=32,
    # free form text describing the response type and format, can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="multiple paragraphs",
)

In [18]:
result = await search_engine.search("학생의 여권 유효기간이 2025년 12월 15일까지인 경우, 참가 가능한가요?")
print(result.response)

## 여권 유효기간과 참가 가능성

2025 한성 문화 탐방 그룹은 2025년 7월 7일부터 7월 11일까지 진행될 예정입니다. 따라서, 학생의 여권이 2025년 12월 15일까지 유효하다면, 탐방 기간 동안 여권이 유효하므로 참가가 가능합니다. 

여권의 유효기간이 탐방 일정과 겹치는 경우, 학생은 문제없이 행사에 참여할 수 있을 것입니다. 이는 여권이 행사 종료 후에도 유효하다는 점에서 중요한 요소입니다. 

결론적으로, 학생의 여권이 2025년 12월 15일까지 유효하다면, 탐방에 참가하는 데 아무런 문제가 없을 것입니다 [Data: Reports (0)].


In [19]:
# Display the whole context_data mapping attached to the search result; in this run
# it holds a single "reports" entry (see the cell output).
result.context_data

{'reports':   id                                              title  occurrence weight  \
 0  1  Hanseong University Cultural Exploration Commu...           1.000000   
 1  0           2025 Hanseong Cultural Exploration Group           0.333333   
 2  2           Hansung University Information Community           0.333333   
 
                                              content  rank  
 0  # Hanseong University Cultural Exploration Com...   6.5  
 1  # 2025 Hanseong Cultural Exploration Group\n\n...   4.5  
 2  # Hansung University Information Community\n\n...   4.0  }

In [20]:
# inspect the data used to build the context for the LLM responses
result.context_data["reports"]

Unnamed: 0,id,title,occurrence weight,content,rank
0,1,Hanseong University Cultural Exploration Commu...,1.0,# Hanseong University Cultural Exploration Com...,6.5
1,0,2025 Hanseong Cultural Exploration Group,0.333333,# 2025 Hanseong Cultural Exploration Group\n\n...,4.5
2,2,Hansung University Information Community,0.333333,# Hansung University Information Community\n\n...,4.0


Inspecting the context data used to generate the response

In [21]:
# inspect number of LLM calls and tokens
# Report how many LLM calls the search made and how many tokens it consumed.
print(
    f"LLM calls: {result.llm_calls}. "
    f"Prompt tokens: {result.prompt_tokens}. "
    f"Output tokens: {result.output_tokens}."
)

LLM calls: 2. Prompt tokens: 3376. Output tokens: 270.
