In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [None]:
import os
from pathlib import Path

import pandas as pd
import tiktoken

from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_report_embeddings,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
    DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

INPUT_DIR = "your_input_directory"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
COMMUNITY_LEVEL = 2


# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")

print(f"Entity df columns: {entity_df.columns}")

entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description',
       'text_unit_ids', 'frequency', 'degree', 'x', 'y'],
      dtype='object')
Entity count: 1320
Relationship count: 2953
Text unit records: 1430


Unnamed: 0,id,text,document_ids,n_tokens
0,edc26110692c720c53dbb8cf9ccb9faa13a5a5a4bcae51...,#entities_id: P453307765. #typology_id_type: 1...,[002b0bdfbc351d5ad7eedaaf3837bbddcb653e1f4157e...,96
1,a4a0fb6768e03c3890e9cfa1d893f93862b89429b11491...,#entities_id: IN0010328049. #typology_id_type:...,[003b9817b2df66da8060e809827e6b2a0e730ae94da2d...,159
2,ed5ed488725aeb71d61236bc128ac24abecacfaae95e1f...,#entities_id: P582673405. #typology_id_type: 1...,[008168ec3677047bbd5305a80b7442d051781bee95d5d...,97
3,683a17f9319ee084962a444efa461b5463b5a0c5fe2a6f...,#entities_id: FR908565575. #typology_id_type: ...,[009281cf16e16082d5e152adb57678dc5fac9c0041a77...,148
4,e6175801e5fb7cfcd7dc6d9bd9ba001d403bd35bc94993...,#bvd_id_number: CZ01139720. #methodology_id: 6...,[00dcc74cb38c7aae10bcc8a2809d96fe41c723feb653b...,166


In [None]:
# api_key = os.environ["GRAPHRAG_API_KEY"]
# llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
# embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

api_key = "your_api_key"
llm_model = "gpt-4o-mini"
embedding_model = "text-embedding-3-small"

chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

token_encoder = tiktoken.encoding_for_model(llm_model)

embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)

text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)

In [4]:
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
):
    """Embeds the full content of the community reports and saves the DataFrame with embeddings to the output path."""
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)


report_df = read_community_reports(INPUT_DIR)
reports = read_indexer_reports(
    report_df,
    community_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)
read_indexer_report_embeddings(reports, full_content_embedding_store)

In [5]:
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

context_builder = DRIFTSearchContextBuilder(
    model=chat_model,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=description_embedding_store,
    text_units=text_units,
    token_encoder=token_encoder,
    config=drift_params,
)

search = DRIFTSearch(
    model=chat_model, context_builder=context_builder, token_encoder=token_encoder
)

In [6]:
resp = await search.search("Tell me about Palladia Limited.")

                                             

In [7]:
resp.response

"## Overview of Palladia Limited\n\nPalladia Limited was a private limited company incorporated in the United Kingdom on March 17, 2005, and it was dissolved on February 1, 2011. The company had a limited governance structure, with only one current director and one previous director recorded at the time of its dissolution [Data: Sources (36, 248)].\n\n### Governance Concerns\n\nThe governance dynamics of Palladia Limited raise significant concerns, particularly regarding the nature of its directorship. The presence of only one current and one previous director suggests a lack of diversity in leadership, which can be a risk factor for governance failures. This is compounded by the atypical directorship patterns observed among its directors, where individuals hold multiple directorships across various entities, potentially leading to conflicts of interest [Data: Reports (1)].\n\n### Atypical Directorship Patterns\n\nPalladia Limited has been flagged for atypical directorship patterns, wh

In [None]:
print(resp.context_data)

{'Can you provide more details about the directors of Palladia Limited?': {'reports':   id                                              title  \
0  1  Corporate Governance Dynamics in China and the UK   

                                             content  
0  # Corporate Governance Dynamics in China and t...  , 'entities': Empty DataFrame
Columns: [in_context]
Index: [], 'sources':       id                                               text
0    248  #bvd_id_number: GB05396445. #methodology_id: 1...
1     36  #entities_id: GB05396445. #typology_id_type: 0...
2    724  #bvd_id_number: GB04263411. #methodology_id: 1...
3    939  #entities_id: GB04263411. #typology_id_type: 0...
4    577  #bvd_id_number: IE449986. #methodology_id: 1 (...
5    728  #bvd_id_number: GB08191636. #methodology_id: 1...
6    932  #bvd_id_number: ZA200820210523. #methodology_i...
7     23  #bvd_id_number: NZ9429050646036. #methodology_...
8    703  #bvd_id_number: PA437068RPP. #methodology_id: ...
9     70  #e

: 