## Prepare Data

In [None]:
import logging
import os

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from setting.db import db_manager
from knowledge_graph.knowledge import KnowledgeBuilder
from knowledge_graph.graph_builder import KnowledgeGraphBuilder

# LLM client shared by both builders below.
# NOTE(review): the two arguments look like a provider name and a model tag — confirm against LLMInterface.
llm_client = LLMInterface("ollama", "qwen3:32b-fp16")
# Session factory for the database named by GRAPH_DATABASE_URI.
# os.getenv returns None when the variable is unset, so None is passed through in that case.
session_factory = db_manager.get_session_factory(os.getenv("GRAPH_DATABASE_URI"))
# Both builders receive the same LLM client, embedding function and session factory.
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding, session_factory)
graph_builder = KnowledgeGraphBuilder(llm_client, get_text_embedding, session_factory)

# Initialize logging module with a basic configuration for console output
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(filename)s:%(lineno)d: %(message)s'
)
# Module-level logger used by the graph-building cell further down.
logger = logging.getLogger(__name__)

In [None]:
import json
import os
import hashlib

# Documentation categories to ingest. Each entry is compared for exact equality
# with a document's 'category' field and is appended to the topic-name prefix
# when the per-topic document lists are built below.
categories = [
    'tidbcloud/API/API Overview',
    'tidbcloud/About TiDB Cloud',
    'tidbcloud/Billing',
    'tidbcloud/Data Service (Beta)',
    'tidbcloud/Develop Applications/Connect to TiDB Cloud',
    'tidbcloud/Develop Applications/Development Reference',
    'tidbcloud/Develop Applications/Quick Start',
    'tidbcloud/Develop Applications/Third-Party Support',
    'tidbcloud/Develop Applications/overview',
    'tidbcloud/Disaster Recovery',
    'tidbcloud/Explore Data',
    'tidbcloud/FAQs',
    'tidbcloud/Get Started',
    'tidbcloud/Integrations',
    'tidbcloud/Integrations/Terraform',
    'tidbcloud/Integrations/Vercel',
    'tidbcloud/Integrations/Zapier',
    'tidbcloud/Maintenance Notification',
    'tidbcloud/Manage Cluster/Delete a TiDB Cluster',
    'tidbcloud/Manage Cluster/Manage TiDB Cloud Dedicated Clusters',
    'tidbcloud/Manage Cluster/Manage TiDB Cloud Serverless Clusters',
    'tidbcloud/Manage Cluster/Monitor and Alert',
    'tidbcloud/Manage Cluster/Plan Your Cluster',
    'tidbcloud/Manage Cluster/TiDB CLOUD Tune Performance',
    'tidbcloud/Manage Cluster/Upgrade a TiDB Cluster',
    'tidbcloud/Manage Cluster/Use an HTAP Cluster with TiFlash',
    'tidbcloud/Migrate or Import Data/Import Data into TiDB Cloud',
    'tidbcloud/Migrate or Import Data/Migrate Data into TiDB Cloud',
    'tidbcloud/Migrate or Import Data/Overview',
    'tidbcloud/Migrate or Import Data/Reference',
    # NOTE(review): 'Batch Processin' looks truncated ("Batch Processing"?) — confirm it
    # matches the category value in the TOC JSON before changing it.
    'tidbcloud/Reference/Batch Processin',
    'tidbcloud/Reference/Benchmarks',
    'tidbcloud/Reference/CLI',
    'tidbcloud/Reference/DDL Execution Principles and Best Practices',
    'tidbcloud/Reference/Glossary',
    'tidbcloud/Reference/Resource Manager',
    'tidbcloud/Reference/Server Status Variables',
    'tidbcloud/Reference/Storage Engines',
    'tidbcloud/Reference/Support',
    'tidbcloud/Reference/Table Filter',
    'tidbcloud/Reference/TiDB Cloud Limitations',
    'tidbcloud/Reference/TiDB Cluster Architecture',
    'tidbcloud/Reference/TiDB Distributed eXecution Framework (DXF)',
    'tidbcloud/Reference/Troubleshoot Inconsistency Between Data and Indexes',
    'tidbcloud/Reference/URI Formats of External Storage Services',
    'tidbcloud/Release Notes',
    'tidbcloud/Security/Audit Management',
    'tidbcloud/Security/Data Access Control',
    'tidbcloud/Security/Database Access Control',
    'tidbcloud/Security/Identity Access Control',
    'tidbcloud/Security/Network Access Control',
    'tidbcloud/Stream Data',
    'tidbcloud/TiDB Cloud Partner Web Console',
    'tidbcloud/Vector Search (Beta)/Changelogs',
    'tidbcloud/Vector Search (Beta)/Get Started',
    'tidbcloud/Vector Search (Beta)/Improve Performance',
    'tidbcloud/Vector Search (Beta)/Integrations',
    'tidbcloud/Vector Search (Beta)/Limitations',
    'tidbcloud/Vector Search (Beta)/Overview',
    'tidbcloud/Vector Search (Beta)/Reference'
]

# Define the path to the JSON configuration file
config_file_path = '/Users/ian/Work/docs/toc_files_for_tidb_cloud.json'

# Parsed TOC entries. Stays an empty list when loading fails, so the grouping
# code that iterates over it still runs (and simply finds no documents).
loaded_docs = []
# True only when the file was opened and parsed without error; lets the summary
# below tell "loaded but empty" apart from "failed to load".
config_loaded = False

# Read the JSON configuration file. Failures are reported on stdout instead of
# raised, so the notebook cell always completes.
try:
    with open(config_file_path, 'r', encoding='utf-8') as f:
        loaded_docs = json.load(f)
except FileNotFoundError:
    print(f"Error: Configuration file not found at '{config_file_path}'")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from file '{config_file_path}'. Check file format.")
except Exception as e:
    # Deliberate best-effort catch-all (permissions, decoding, ...).
    print(f"An unexpected error occurred while reading the file: {e}")
else:
    config_loaded = True
    print(f"Successfully loaded configuration from: {config_file_path}")

if loaded_docs:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])
elif config_loaded:
    # Only claim the file is empty when it was actually read; a failed load has
    # already printed its own error message above.
    print("\nConfiguration file is empty.")


# Group the loaded TOC entries into one topic per category. Each topic maps to a
# list of document descriptors; within a category a given web_view_link is kept
# only the first time it is seen.
tidb_product_docs = {}
for category in categories:
    topic_name = "TiDBCloud Product Documentation - " + category
    # Register the (initially empty) list first so every category gets a key,
    # even when no document belongs to it.
    topic_entries = tidb_product_docs[topic_name] = []
    seen_topic_ids = set()
    for entry in loaded_docs:
        if entry['category'] != category:
            continue
        topic_id = f"{category}-{entry['web_view_link']}"
        if topic_id in seen_topic_ids:
            # This link was already recorded for the current category.
            continue
        seen_topic_ids.add(topic_id)
        descriptor = {
            'topic_name': topic_name,
            'path': entry['path'],  # required
            'doc_link': entry['web_view_link'],  # required
            'category': category,
            'updated_at': entry['modified_time'],
            'mime_type': entry['mime_type'],
            'version': "2025-07-07",
        }
        topic_entries.append(descriptor)
    print(f"Category: {topic_name}, Number of documents: {len(topic_entries)}")

## Upload Data

In [None]:
# Step 1: run knowledge extraction for every document of every topic and collect
# the successfully processed sources per topic in `topic_docs`.
print("step 1: upload docs to knowledge base")
topic_docs = {}
for topic_name in tidb_product_docs:
    print("uploading docs for topic: ", topic_name)
    docs = tidb_product_docs[topic_name]
    # Keyed by source_id, so a source returned more than once is stored only once.
    uploaded_docs = {}
    for doc in docs:
        file_path = doc['path']
        try:
            res = kb_builder.extract_knowledge(
                file_path,
                doc
            )
            if res['status'] == 'success':
                uploaded_docs[res['source_id']] = {
                    "source_id": res['source_id'],
                    "source_name": res['source_name'],
                    "source_content": res['source_content'],
                    "source_link": res['source_link'],
                    "source_attributes": res['source_attributes']
                }
            else:
                # Report the failure through logging, like the handler below.
                # print() does not accept exc_info (it raises TypeError), and no
                # exception is active here, so no traceback is requested.
                logging.error("process index %s failed, %s", file_path, res['error'])

        except Exception as e:
            # Best-effort: one failing document must not abort the whole upload.
            logging.error(f"process index {file_path} failed, {e}", exc_info=True)

    topic_docs[topic_name] = list(uploaded_docs.values())

# Bare expression: shown as the notebook cell output.
topic_docs

## Batch Build Graph 

This section assumes that the source data has already been uploaded.

In [None]:
from collections import defaultdict
from knowledge_graph.models import SourceData
from setting.db import db_manager

def get_documents_by_topic(database_uri, topic_docs_config):
    """Fetch stored SourceData for the configured documents, grouped by topic.

    Args:
        database_uri: Database connection URI handed to
            ``db_manager.get_session_factory``.
        topic_docs_config: Mapping of topic name to a list of document
            configs. Every config must carry a ``'doc_link'`` entry, which is
            matched against ``SourceData.link``.

    Returns:
        A plain dict mapping topic name to a list of document dicts
        (``source_id``, ``source_name``, ``source_content``, ``source_link``,
        ``source_attributes``, ``topic_name``). Topics for which no document
        was found are not present in the result.
    """
    from sqlalchemy.orm import joinedload

    result_topic_docs = defaultdict(list)
    session_factory = db_manager.get_session_factory(database_uri)

    # Step 1: gather every distinct link across all topics so the database is
    # queried once instead of once per document.
    wanted_links = {
        entry['doc_link']
        for entries in topic_docs_config.values()
        for entry in entries
    }
    print(f"Total unique document links to query: {len(wanted_links)}")

    # Step 2: one batched query with the content store eagerly loaded.
    with session_factory() as db:
        records = (
            db.query(SourceData)
            .options(joinedload(SourceData.content_store))
            .filter(SourceData.link.in_(list(wanted_links)))
            .all()
        )
        print(f"Found {len(records)} SourceData records in database")

        # Copy everything needed into plain dicts while the session is still
        # open, to avoid DetachedInstanceError after it closes.
        link_to_doc_data = {
            record.link: {
                "source_id": record.id,
                "source_name": record.name,
                "source_content": record.effective_content,
                "source_link": record.link,
                "source_attributes": record.attributes,
            }
            for record in records
        }

    # Step 3: distribute the fetched data back onto the topics, in memory.
    for topic_name, entries in topic_docs_config.items():
        for entry in entries:
            doc_link = entry['doc_link']
            doc_data = link_to_doc_data.get(doc_link)
            if not doc_data:
                print(f"Warning: No SourceData found for doc_link: {doc_link}")
                continue
            result_topic_docs[topic_name].append(
                {**doc_data, "topic_name": topic_name}
            )

    return dict(result_topic_docs)

# Resolve the stored documents for every topic configured in tidb_product_docs,
# using the database named by the GRAPH_DATABASE_URI environment variable.
database_uri = os.environ.get("GRAPH_DATABASE_URI")
all_topic_docs = get_documents_by_topic(database_uri, tidb_product_docs)

# Summarise how many documents were found per topic.
print("Available topics:")
for topic_label, topic_documents in all_topic_docs.items():
    print(f"  {topic_label}: {len(topic_documents)} documents")

In [None]:
# Topic names in ascending order; the bare expression shows them as cell output.
topic_names = sorted(all_topic_docs.keys())
topic_names

In [None]:
# Build, then enhance, the knowledge graph topic by topic.
# Only the topic named in `target_topic` is processed; all others are skipped.
target_topic = "TiDBCloud Product Documentation - tidbcloud/Vector Search (Beta)/Reference"
topic_names = sorted(all_topic_docs, reverse=True)
for i, topic_name in enumerate(topic_names):
    if topic_name != target_topic:
        continue
    # Disabled toggle kept from the original cell.
    # NOTE(review): presumably used to process only every third topic per run — confirm before re-enabling.
    # if i % 3 != 0:
    #    continue
    topic_docs = all_topic_docs[topic_name]
    logger.info("processing topic: %s, number of docs: %d", topic_name, len(topic_docs))
    try:
        result = graph_builder.build_knowledge_graph(
            topic_name,
            topic_docs
        )

        logger.info("\n=== Memory Knowledge Graph Construction Results ===")
        logger.info("Topic: %s", result['topic_name'])
        logger.info("Documents processed: %s", result['documents_processed'])
        logger.info("Documents failed: %s", result['documents_failed'])
        logger.info("Cognitive maps generated: %s", result['cognitive_maps_generated'])
        logger.info("Triplets extracted: %s", result['triplets_extracted'])
        logger.info("Total entities created: %s", result['entities_created'])
        logger.info("Total relationships created: %s", result['relationships_created'])

        # Print global blueprint information
        blueprint_info = result.get("global_blueprint", {})
        logger.info("\nGlobal Blueprint:")
        logger.info(
            "  - Processing instructions: %s",
            blueprint_info.get('processing_instructions', ''),
        )
        logger.info(
            "  - Processing items: %s",
            blueprint_info.get('processing_items', {}),
        )

        logger.info("\n🎉 Memory knowledge graph construction completed successfully!")

    except Exception as e:
        # A failed build (or a missing key in its result) skips enhancement for this topic.
        logger.error("Failed to build knowledge graph: %s", e, exc_info=True)
        continue

    try:
        result = graph_builder.enhance_knowledge_graph(
            topic_name,
            topic_docs,
        )

    except Exception as e:
        logger.error("Failed to enhance knowledge graph: %s", e, exc_info=True)
        continue

    logger.info("enhance knowledge graph result: %s", result)

## Query Graph

### Vector Similarity based Search

In [None]:
from knowledge_graph.query import search_relationships_by_vector_similarity, query_topic_graph

# Retrieve relationships similar to the question and flatten them into a text
# context with an "Entities" section and a "Relationships" section.
query = "Where are li ming now?"
res = search_relationships_by_vector_similarity(query, similarity_threshold=0.2, top_k=20)
context = ""
entities = set()  # a set, so an entity mentioned by several rows appears once
relationships = []

for _, row in res.iterrows():
    source_name = row['source_entity']
    entities.add(f"{source_name} {row['source_entity_description']}")
    target_name = row['target_entity']
    entities.add(f"{target_name} {row['target_entity_description']}")
    relationships.append(f"{source_name} {row['relationship_desc']} {target_name}")

entity_section = "\n".join(entities)
relationship_section = "\n".join(relationships)
context = f"Entities:\n{entity_section}\n\nRelationships:\n{relationship_section}"

print(context)

In [None]:
from llm.factory import LLMInterface

# Ask a second LLM client to answer `query` using the retrieved `context`.
# Note: this rebinds the name `llm_client`; objects that were constructed with
# the earlier client keep the instance they were given.
llm_client = LLMInterface("bedrock", "us.anthropic.claude-3-7-sonnet-20250219-v1:0")
prompt = f"""Given the following context
<context>
{context}
</context>
answer the question: {query}
"""
response = llm_client.generate(prompt)
print(response)