## Prepare Data

In [None]:
import logging
import os

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from setting.db import db_manager
from knowledge_graph.knowledge import KnowledgeBuilder
from knowledge_graph.graph_builder import KnowledgeGraphBuilder

# Shared objects used by the cells below: the LLM client, a database session
# factory built from the GRAPH_DATABASE_URI environment variable, and the two
# builders that sit on top of them.
llm_client = LLMInterface("ollama", "qwen3:32b-fp16")
graph_database_uri = os.getenv("GRAPH_DATABASE_URI")
session_factory = db_manager.get_session_factory(graph_database_uri)
kb_builder = KnowledgeBuilder(session_factory)
graph_builder = KnowledgeGraphBuilder(
    llm_client,
    get_text_embedding,
    session_factory,
)

# Basic logging configuration: INFO level, each record prefixed with its
# timestamp, level, and originating file/line.
log_format = '[%(asctime)s] %(levelname)s - %(filename)s:%(lineno)d: %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)
logger = logging.getLogger(__name__)

In [None]:
import json
import os
import hashlib

# Documentation category paths, in table-of-contents order. Each entry is
# matched against a document's `category` field when grouping documents.
categories = [
    "tidb/About TiDB Self-Managed",
    "tidb/Get Started",
    "tidb/Develop/overview",
    "tidb/Develop/Quick Start",
    "tidb/Develop/Example Applications",
    "tidb/Develop/Connect to TiDB",
    "tidb/Develop/Design Database Schema",
    "tidb/Develop/Write Data",
    "tidb/Develop/Read Data",
    "tidb/Develop/Vector Search",
    "tidb/Develop/Transaction",
    "tidb/Develop/Optimize",
    "tidb/Develop/Troubleshoot",
    "tidb/Develop/Reference",
    "tidb/Develop/Cloud Native Development Environment",
    "tidb/Develop/Third-Party Support",
    "tidb/Deploy",
    "tidb/Migrate",
    "tidb/Stream Data",
    "tidb/Maintain/Security",
    "tidb/Maintain/Upgrade",
    "tidb/Maintain/Scale",
    "tidb/Maintain/Backup and Restore",
    "tidb/Maintain/Cluster Disaster Recovery (DR)",
    "tidb/Maintain/Resource Manager",
    "tidb/Maintain/Configure Time Zone",
    "tidb/Maintain/Daily Checklist",
    "tidb/Maintain/Maintain TiFlash",
    "tidb/Maintain/Maintain TiDB Using TiUP",
    "tidb/Maintain/Modify Configuration Dynamically",
    "tidb/Maintain/Online Unsafe Recovery",
    "tidb/Maintain/Replicate Data Between Primary and Secondary Clusters",
    "tidb/Monitor and Alert",
    "tidb/Troubleshoot",
    "tidb/Performance Tuning",
    "tidb/Tutorials",
    "tidb/TiDB Tools/overview",
    "tidb/TiDB Tools/TiUP",
    "tidb/TiDB Tools/TiDB Operator",
    "tidb/TiDB Tools/TiDB Data Migration",
    "tidb/TiDB Tools/TiDB Lightning",
    "tidb/TiDB Tools/Dumpling",
    "tidb/TiDB Tools/PingCAP Clinic Diagnostic Service",
    "tidb/TiDB Tools/TiSpark",
    "tidb/TiDB Tools/sync-diff-inspector",
    "tidb/TiDB Tools/TiProxy",
    "tidb/Reference/Cluster Architecture",
    "tidb/Reference/Storage Engine - TiKV",
    "tidb/Reference/Storage Engine - TiFlash",
    "tidb/Reference/TiDB Distributed eXecution Framework (DXF)",
    "tidb/Reference/System Variables",
    "tidb/Reference/Configuration File Parameters",
    "tidb/Reference/CLI",
    "tidb/Reference/Command Line Flags",
    "tidb/Reference/Key Monitoring Metrics",
    "tidb/Reference/Privileges",
    "tidb/Reference/SQL",
    "tidb/Reference/Telemetry",
    "tidb/Reference/Error Codes",
    "tidb/Reference/Table Filter",
    "tidb/Reference/Schedule Replicas by Topology Labels",
    "tidb/Reference/URI Formats of External Storage Services",
    "tidb/FAQs",
    "tidb/Release Notes",
    "tidb/Glossary",
]

# Path of the JSON configuration file to read
config_file_path = '/Users/ian/Work/docs/toc_files.json'

# Holds the parsed file contents; stays an empty list if reading or parsing fails
loaded_docs = []

# Each failure mode is reported on stdout instead of aborting the cell
try:
    with open(config_file_path, mode='r', encoding='utf-8') as config_file:
        loaded_docs = json.load(config_file)
    print(f"Successfully loaded configuration from: {config_file_path}")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from file '{config_file_path}'. Check file format.")
except FileNotFoundError:
    print(f"Error: Configuration file not found at '{config_file_path}'")
except Exception as err:
    print(f"An unexpected error occurred while reading the file: {err}")

# Print the first entry as a sample, or a notice when nothing was loaded
if len(loaded_docs) == 0:
    print("\nConfiguration file is empty.")
else:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])


# Bucket the loaded documents per category. Each bucket is keyed by a topic
# name built from the category path; "tidb/Reference/SQL" is skipped.
tidb_product_docs = {}
for category in categories:
    if category != "tidb/Reference/SQL":
        topic_name = f"TiDB Product Documentation - {category}"
        matched = tidb_product_docs[topic_name] = []
        for doc in loaded_docs:
            # Containment test against the document's own category value
            if category not in doc['category']:
                continue
            matched.append(dict(
                topic_name=topic_name,
                path=doc['path'],  # required
                doc_link=doc['web_view_link'],  # required
                category=category,
                updated_at=doc['modified_time'],
                mime_type=doc['mime_type'],
                version=doc['version'],
            ))
        print(f"Category: {topic_name}, Number of documents: {len(matched)}")

## Upload Data

In [None]:
print("step 1: upload docs to knowledge base")

# topic name -> list of successfully uploaded document records
topic_docs = {}
for topic_name, docs in tidb_product_docs.items():
    print("uploading docs for topic: ", topic_name)
    # Keyed by source_id so a source returned more than once is kept only once
    uploaded_docs = {}
    for doc in docs:
        file_path = doc['path']
        try:
            res = kb_builder.extract_knowledge(
                file_path,
                doc
            )
            if res['status'] == 'success':
                uploaded_docs[res['source_id']] = {
                    "source_id": res['source_id'],
                    "source_name": res['source_name'],
                    "source_content": res['source_content'],
                    "source_link": res['source_link'],
                    "topic_name": topic_name,
                    "source_attributes": res['source_attributes']
                }
            else:
                # print() does not accept exc_info, so the previous call raised
                # TypeError and the real failure reason was never shown. There
                # is no active exception here, so no traceback is requested.
                logging.error(f"process index {file_path} failed, {res.get('error')}")

        except Exception as e:
            # Best effort: log the failure with its traceback and move on to
            # the next document.
            logging.error(f"process index {file_path} failed, {e}", exc_info=True)

    topic_docs[topic_name] = list(uploaded_docs.values())

# Bare expression: rendered as the notebook cell's output
topic_docs

## Batch Build Graph 

This section assumes that the source data has already been uploaded.

In [None]:
from collections import defaultdict
from knowledge_graph.models import SourceData
from setting.db import db_manager

def get_documents_by_topic(database_uri):
    """
    Group every stored source document under its topic name.

    Opens a session for ``database_uri``, loads all ``SourceData`` rows, and
    buckets them by the ``topic_name`` entry of each row's ``attributes``.
    Rows whose attributes are empty, are not a dict, or hold no truthy
    ``topic_name`` are skipped.

    Returns a plain dict mapping each topic name to a list of document dicts
    with the keys ``source_id``, ``source_name``, ``source_content``,
    ``source_link``, ``topic_name`` and ``source_attributes``.
    """
    grouped = defaultdict(list)
    make_session = db_manager.get_session_factory(database_uri)
    with make_session() as session:
        for record in session.query(SourceData).all():
            attrs = record.attributes
            # Only non-empty, dict-shaped attributes can carry a topic name
            if not (attrs and isinstance(attrs, dict)):
                continue

            topic = attrs.get('topic_name')
            if not topic:
                continue

            grouped[topic].append({
                "source_id": record.id,
                "source_name": record.name,
                "source_content": record.effective_content,
                "source_link": record.link,
                "topic_name": topic,
                "source_attributes": attrs,
            })

    # Hand back a regular dict so missing topics raise KeyError for callers
    return dict(grouped)

# Load every stored document, grouped by topic name
database_uri = os.getenv("GRAPH_DATABASE_URI")
all_topic_docs = get_documents_by_topic(database_uri)

# Summarize how many documents each topic holds
print("Available topics:")
for topic in all_topic_docs:
    docs = all_topic_docs[topic]
    print(f"  {topic}: {len(docs)} documents")

In [None]:
# Only every SHARD_COUNT-th topic (by position in the sorted list), starting
# at offset SHARD_INDEX, is processed by this cell.
# NOTE(review): presumably this lets several runs split the topics between
# them — confirm, and adjust both values to process a different slice.
SHARD_COUNT = 6
SHARD_INDEX = 4

# (log label, result key) pairs reported after a successful build
BUILD_SUMMARY_FIELDS = (
    ("Topic", "topic_name"),
    ("Documents processed", "documents_processed"),
    ("Documents failed", "documents_failed"),
    ("Cognitive maps generated", "cognitive_maps_generated"),
    ("Triplets extracted", "triplets_extracted"),
    ("Total entities created", "entities_created"),
    ("Total relationships created", "relationships_created"),
)

topic_names = sorted(all_topic_docs)
for i, topic_name in enumerate(topic_names):
    if i % SHARD_COUNT != SHARD_INDEX:
        continue
    topic_docs = all_topic_docs[topic_name]
    logger.info("processing topic: %s, number of docs: %d", topic_name, len(topic_docs))

    # Step 1: build the graph. A failure is logged and the topic is skipped,
    # so enhancement below only runs on a successfully built graph.
    try:
        result = graph_builder.build_knowledge_graph(
            topic_name,
            topic_docs
        )

        logger.info("\n=== Memory Knowledge Graph Construction Results ===")
        for label, key in BUILD_SUMMARY_FIELDS:
            logger.info("%s: %s", label, result[key])

        # The blueprint section is optional in the result, hence .get()
        blueprint_info = result.get("global_blueprint", {})
        logger.info("\nGlobal Blueprint:")
        logger.info(
            "  - Processing instructions: %s",
            blueprint_info.get('processing_instructions', ''),
        )
        logger.info(
            "  - Processing items: %s",
            blueprint_info.get('processing_items', {}),
        )

        logger.info("\n🎉 Memory knowledge graph construction completed successfully!")

    except Exception as e:
        logger.error("Failed to build knowledge graph: %s", e, exc_info=True)
        continue

    # Step 2: enhance the graph that was just built for this topic
    try:
        result = graph_builder.enhance_knowledge_graph(
            topic_name,
            topic_docs,
        )

    except Exception as e:
        logger.error("Failed to enhance knowledge graph: %s", e, exc_info=True)
        continue

    logger.info("enhance knowledge graph result: %s", result)

## RESTful API Example

In [None]:
# Pick one client's documents and show the topic they belong to.
# NOTE(review): `client_docs` and `client_name` are not defined in the cells
# visible here; both must already exist in the kernel. The commented-out line
# below shows an example value for `client_name`.
# client_name = "Postman - Collections"
docs = client_docs[client_name]
# The topic name is read from the first document only; assuming `docs` is a
# list, this raises IndexError when the client has no documents.
topic_name = docs[0]['topic_name']
print(topic_name)
# Bare expression: rendered as the notebook cell's output
docs


In [None]:
import requests

# Clients that are skipped by this processing run
SKIPPED_CLIENTS = {
    "Visa - AI Program Control Plane",
    "Visa",
    "Visa - Fast Data initiative",
    "Visa VASPD",
}

# Endpoint and database URI are identical for every client, so they are
# resolved once instead of on every iteration.
url = "http://192.168.206.252:23333/api/v1/knowledge/trigger-processing"
database_uri = os.getenv("GRAPH_DATABASE_URI")

for client_name in client_docs:
    if client_name in SKIPPED_CLIENTS:
        continue

    docs = client_docs[client_name]
    if not docs:
        print(f"No docs for {client_name}")
        continue

    # The topic name is taken from the client's first document
    topic_name = docs[0]['topic_name']
    print(topic_name)

    # Call the trigger-processing API to start processing all uploaded
    # documents for the topic
    data = {
        "topic_name": topic_name,
        "database_uri": database_uri
    }

    response = requests.post(url, data=data)
    print(response.status_code)
    try:
        print(response.json())
    except ValueError:
        # The body was not valid JSON; show it raw and continue with the
        # next client instead of aborting the whole batch.
        print(response.text)

In [None]:
import requests
from contextlib import ExitStack

url = "http://192.168.206.252:23333/api/v1/knowledge/upload"

# One link per document, in the same order as the uploaded files
links = [doc["doc_link"] for doc in docs]

data = {
    'links': links,
    'topic_name': topic_name,
    'database_uri': os.getenv("GRAPH_DATABASE_URI")
}

# ExitStack closes every opened file when the block ends, including when
# opening a later file or the POST itself raises. Previously the handles were
# never closed.
with ExitStack() as stack:
    files = []
    for doc in docs:
        handle = stack.enter_context(open(doc["path"], 'rb'))
        # NOTE(review): every file is sent as application/pdf regardless of
        # its actual type — confirm this is what the service expects.
        files.append(('files', (doc["path"].split('/')[-1], handle, 'application/pdf')))

    response = requests.post(url, files=files, data=data)

print(response.status_code)
print(response.json())


In [None]:
import requests

# Call the trigger-processing API to start processing all uploaded documents
# for the current topic
url = "http://192.168.206.252:23333/api/v1/knowledge/trigger-processing"
database_uri = os.getenv("GRAPH_DATABASE_URI")
data = dict(topic_name=topic_name, database_uri=database_uri)

response = requests.post(url, data=data)

# Status code first, then the decoded JSON body
print(response.status_code)
print(response.json())

## Query Graph

### Vector Similarity-Based Search

In [None]:
from knowledge_graph.query import search_relationships_by_vector_similarity, query_topic_graph

query = "Where are li ming now?"
res = search_relationships_by_vector_similarity(query, similarity_threshold=0.2, top_k=20)

# One "<entity> <description>" line per relationship endpoint, plus one line
# per relationship, collected in result order.
entity_lines = []
relationships = []

for _, row in res.iterrows():
    entity_lines.append(f"{row['source_entity']} {row['source_entity_description']}")
    entity_lines.append(f"{row['target_entity']} {row['target_entity_description']}")
    relationships.append(f"{row['source_entity']} {row['relationship_desc']} {row['target_entity']}")

# dict.fromkeys removes duplicates while keeping first-seen order. Iterating
# a set of strings can change order between interpreter runs (hash
# randomization), which made the generated context non-reproducible.
entities = list(dict.fromkeys(entity_lines))

context = "Entities:\n" + "\n".join(entities) + "\n\nRelationships:\n" + "\n".join(relationships)

print(context)

In [None]:
from llm.factory import LLMInterface

# Rebind llm_client to the client used for answering the question
llm_client = LLMInterface("bedrock", "us.anthropic.claude-3-7-sonnet-20250219-v1:0")

# Wrap the retrieved graph context in <context> tags, followed by the question
prompt = (
    "Given the following context\n"
    "<context>\n"
    f"{context}\n"
    "</context>\n"
    f"answer the question: {query}\n"
)
response = llm_client.generate(prompt)
print(response)