## Prepare Data

In [None]:
import logging
import os

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from setting.db import db_manager
from knowledge_graph.knowledge import KnowledgeBuilder
from knowledge_graph.graph_builder import KnowledgeGraphBuilder

# Shared objects used by the cells below.
# NOTE(review): the two arguments are presumably the LLM provider and the model name — confirm against LLMInterface.
llm_client = LLMInterface("ollama", "qwen3:32b-fp16")
kb_builder = KnowledgeBuilder()
# The database location is read from the GRAPH_DATABASE_URI environment variable;
# os.getenv returns None when the variable is not set.
session_factory = db_manager.get_session_factory(os.getenv("GRAPH_DATABASE_URI"))
# The graph builder receives the LLM client, the embedding function and the session factory created above.
graph_builder = KnowledgeGraphBuilder(llm_client, get_text_embedding, session_factory)

# Initialize the logging module with a basic configuration for console output.
# Each record shows timestamp, level, source file and line number before the message.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(filename)s:%(lineno)d: %(message)s'
)
# Logger used by the cells below.
logger = logging.getLogger(__name__)

In [None]:
import json
import os
import hashlib


# SHA-256 hex digests of file contents that must not be ingested.
# A document whose content digest appears in this list is skipped when the
# documents are grouped per client further down in this cell.
# NOTE(review): judging by the name these are unfilled template files — confirm.
template_file_hashes = [
    "271bde1405442b4a2bb826eca673063e70833a4d02343e15ea9e46a00a218159",
    "0b9400fec9aa5109d977699a65ce8dffce1290ee3f8ac2da4610078aa09d1d4b",
    "9bf37d8935ab264a2308401cb9999e3d0d9d67a16c7f8adbb6a878ea607db9b4",
    "c56bbc8c5345418191c9cf1081b592c368616c1f5d958712b45e2f0525a43352",
    "c0cfa9e5942b48e58dc77226235dbac34c2d59ba0d3167434ecd682feb60e282",
    "1c78dd8dd3f984f77bfc6f65d12fc7a949077f601b0a3995bd524c7f618d1407",
    "641c93edb9df6d0325034bddd6ab1367be909616947798fde277425e441f26c9",
    "d790176265d81503bdf5fe1a5070bb7507cb8b943801556c8aaa640629d27457",
    "a57716c4f1806672cd79b92460fca79072faa5cc4840a933f688b4933756363d",
    "6f4d6cf3926f3d098a8fd532787bff39e009451d42c29a4c25f9bfb79f48349b",
    "0199cc4c09034b898dc8178d6cc4e97c4c1b0b99125e3396ca71040e2ce0df2f",
    "8d382b2c935d733a0f0391b53c53ec34b96955b99f873a3d05af08452563ef05",
    "f483e811a5ac4c15517ff5e660bfe17846d9daaa39863fb908d98b05a10bc069",
    "57b42e65d8f935bec6dfc5e2f7ce9a00d58dcff06771003548bb627aeac56071",
    "666010328f420ccbe6d383f62aa9db77a926bb06bf409811771ed9af3bf94da6",
    "b3152523500b0e17540e5be3a2fa7c1bb4c2c48b4c5a926d8f6de71e7bbde1d0"
]

# Path to the JSON metadata file describing the documents to ingest
config_file_path = 'docs/pdf_metadata.json'

# Parsed contents of the metadata file; stays an empty list when loading fails
loaded_docs = []
# Records whether the file was actually read and parsed, so that a failed load
# is not reported as an empty configuration file further down.
config_loaded = False

# Read the JSON configuration file. Failures are reported and leave loaded_docs
# empty instead of aborting the cell.
try:
    with open(config_file_path, 'r', encoding='utf-8') as f:
        loaded_docs = json.load(f)
except FileNotFoundError:
    print(f"Error: Configuration file not found at '{config_file_path}'")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from file '{config_file_path}'. Check file format.")
except Exception as e:
    print(f"An unexpected error occurred while reading the file: {e}")
else:
    config_loaded = True
    print(f"Successfully loaded configuration from: {config_file_path}")

if loaded_docs:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])
elif config_loaded:
    # Only claim the file is empty when it was really read; a load error has
    # already been reported above.
    print("\nConfiguration file is empty.")


# Group the loaded documents per client, leaving out files whose content hash
# is listed in template_file_hashes.

# Set membership is O(1); the original list is left untouched.
template_hash_set = set(template_file_hashes)

client_name_list = {doc['client_name'] for doc in loaded_docs}
# Every client gets an entry, even when all of its documents are filtered out,
# so later cells can still look the client up and see an empty list.
client_docs = {client_name: [] for client_name in client_name_list}

# Single pass over the documents: each file is read and hashed exactly once,
# instead of once per client.
for doc in loaded_docs:
    with open(doc['path'], "rb") as f:
        content_hash = hashlib.sha256(f.read()).hexdigest()
    if content_hash in template_hash_set:
        continue
    client_docs[doc['client_name']].append({
        'path': doc['path'],  # required
        'doc_link': doc['web_view_link'],  # required
        'topic_name': f"Customer Tracking for {doc['client_name']}",
        'client_name': doc['client_name'],
        'created_at': doc['created_time'],
        'updated_at': doc['modified_time'],
        'mime_type': doc['mime_type']
    })

for client_name in client_name_list:
    print(f"Client: {client_name}, Number of documents: {len(client_docs[client_name])}")


## Upload Data and Build Graph

In [None]:
# Step 1: extract knowledge from every document of one client and collect the
# successfully processed sources in topic_docs, keyed by source id.
client_name = "OpenAI"
docs = client_docs[client_name]
topic_name = f"Customer Tracking for {client_name}"

print("step 1: upload docs to knowledge base")
topic_docs = {}
for doc in docs:
    file_path = doc['path']
    try:
        res = kb_builder.extract_knowledge(
            file_path,
            doc
        )
        if res['status'] == 'success':
            topic_docs[res['source_id']] = {
                "source_id": res['source_id'],
                "source_name": res['source_name'],
                "source_content": res['source_content'],
                "source_link": res['source_link'],
            }
        else:
            # Report the failure through the logger. print() does not accept
            # exc_info, so the previous call raised TypeError and hid the real
            # error message. No exception is active here, hence no exc_info.
            logger.error("process index %s failed, %s", file_path, res.get('error'))

    except Exception as e:
        # Best effort: one failing document must not stop the remaining ones.
        logger.error("process index %s failed, %s", file_path, e, exc_info=True)

# Trailing expression so the notebook displays the collected sources
topic_docs

In [None]:
# Step 2: build the knowledge graph from the sources collected in topic_docs.
# NOTE(review): this topic name ("Customer <client> Tracking") differs from the
# "Customer Tracking for <client>" form stored as topic_name on the uploaded
# documents — confirm that the difference is intended.
topic_name = f"Customer {client_name} Tracking"

print("step 2: add to graph")
uploaded_sources = list(topic_docs.values())
result = graph_builder.build_knowledge_graph(topic_name, uploaded_sources)
print(result)

## Batch Build Graph 

This section assumes that the source data has already been uploaded.

In [None]:
from collections import defaultdict
from knowledge_graph.models import SourceData
from setting.db import db_manager

def get_documents_by_topic(database_uri):
    """
    Load every SourceData row and bucket the rows by topic.

    The topic is read from the ``topic_name`` key of each row's ``attributes``.
    Rows whose attributes are empty, are not a dict, or carry no truthy
    ``topic_name`` are left out.

    Args:
        database_uri: Passed to ``db_manager.get_session_factory`` to obtain
            the session factory used for the query.

    Returns:
        A plain dict mapping each topic name to a list of document dicts with
        the keys ``source_id``, ``source_name``, ``source_content``,
        ``source_link``, ``topic_name`` and ``source_attributes``.
    """
    grouped = {}
    make_session = db_manager.get_session_factory(database_uri)

    with make_session() as db:
        for record in db.query(SourceData).all():
            attrs = record.attributes
            # Only dict-shaped, non-empty attributes can name a topic
            if not (attrs and isinstance(attrs, dict)):
                continue

            topic = attrs.get('topic_name')
            if not topic:
                continue

            grouped.setdefault(topic, []).append({
                "source_id": record.id,
                "source_name": record.name,
                "source_content": record.effective_content,
                "source_link": record.link,
                "topic_name": topic,
                "source_attributes": attrs,
            })

    return grouped

# Get all documents grouped by topic.
# The database location is read from the GRAPH_DATABASE_URI environment variable.
database_uri = os.getenv("GRAPH_DATABASE_URI")
all_topic_docs = get_documents_by_topic(database_uri)

# Display available topics: one line per topic with its document count.
# Note that the loop rebinds the notebook-level names `topic` and `docs`.
print("Available topics:")
for topic, docs in all_topic_docs.items():
    print(f"  {topic}: {len(docs)} documents")

In [None]:
# Build and then enhance the knowledge graph for a slice of the known topics.
#
# Only topics whose position in the sorted topic list satisfies
# position % SHARD_COUNT == SHARD_INDEX are processed by this cell.
# NOTE(review): presumably this lets several notebook runs split the topics
# between them — adjust the two values below per run.
SHARD_COUNT = 6
SHARD_INDEX = 4

# Sorting makes the topic -> shard assignment stable between runs
topic_names = sorted(all_topic_docs)
for i, topic_name in enumerate(topic_names):
    if i % SHARD_COUNT != SHARD_INDEX:
        continue
    topic_docs = all_topic_docs[topic_name]
    logger.info("processing topic: %s, number of docs: %d", topic_name, len(topic_docs))
    try:
        result = graph_builder.build_knowledge_graph(
            topic_name,
            topic_docs
        )

        logger.info("\n=== Memory Knowledge Graph Construction Results ===")
        logger.info("Topic: %s", result['topic_name'])
        logger.info("Documents processed: %s", result['documents_processed'])
        logger.info("Documents failed: %s", result['documents_failed'])
        logger.info("Cognitive maps generated: %s", result['cognitive_maps_generated'])
        logger.info("Triplets extracted: %s", result['triplets_extracted'])
        logger.info("Total entities created: %s", result['entities_created'])
        logger.info("Total relationships created: %s", result['relationships_created'])

        # Log global blueprint information; missing keys fall back to empty values
        blueprint_info = result.get("global_blueprint", {})
        logger.info("\nGlobal Blueprint:")
        logger.info(
            "  - Processing instructions: %s",
            blueprint_info.get('processing_instructions', '')
        )
        logger.info(
            "  - Processing items: %s",
            blueprint_info.get('processing_items', {})
        )

        logger.info("\n🎉 Memory knowledge graph construction completed successfully!")

    except Exception as e:
        # A failing topic is logged and skipped so the remaining topics still run
        logger.error("Failed to build knowledge graph: %s", e, exc_info=True)
        continue

    # Enhancement is attempted only after a successful build
    try:
        result = graph_builder.enhance_knowledge_graph(
            topic_name,
            topic_docs,
        )

    except Exception as e:
        logger.error("Failed to enhance knowledge graph: %s", e, exc_info=True)
        continue

    logger.info("enhance knowledge graph result: %s", result)

## RESTful API Example

In [None]:
# Select the documents of the client currently held in `client_name`.
# To inspect a different client, uncomment and edit the line below:
# client_name = "Postman - Collections"
docs = client_docs[client_name]
# The topic name is taken from the first document, so this raises IndexError
# when the client has no documents.
topic_name = docs[0]['topic_name']
print(topic_name)
# Trailing expression so the notebook displays the selected documents
docs


In [None]:
import requests

# Call the trigger-processing API once per client to start processing all
# documents already uploaded for that client's topic.
url = "http://192.168.206.252:23333/api/v1/knowledge/trigger-processing"
database_uri = os.getenv("GRAPH_DATABASE_URI")

# Clients that are left out of this batch run
skipped_clients = {
    "Visa - AI Program Control Plane",
    "Visa",
    "Visa - Fast Data initiative",
    "Visa VASPD",
}

for client_name in client_docs:
    if client_name in skipped_clients:
        continue

    docs = client_docs[client_name]
    if not docs:
        print(f"No docs for {client_name}")
        continue

    # All documents of a client share one topic name; use the first one
    topic_name = docs[0]['topic_name']
    print(topic_name)

    data = {
        "topic_name": topic_name,
        "database_uri": database_uri
    }

    # NOTE(review): no timeout is passed, so a hanging server blocks this loop
    # indefinitely — consider adding one once the expected duration is known.
    response = requests.post(url, data=data)
    print(response.status_code)
    try:
        print(response.json())
    except ValueError:
        # Non-JSON body (e.g. an error page): show the raw text and carry on
        # with the remaining clients instead of aborting the whole batch.
        print(response.text)

In [None]:
import requests

# Upload all documents of the currently selected client in one multipart
# request, together with their links, the topic name and the database URI.
from contextlib import ExitStack

url = "http://192.168.206.252:23333/api/v1/knowledge/upload"

links = [doc["doc_link"] for doc in docs]

data = {
    'links': links,
    'topic_name': topic_name,
    'database_uri': os.getenv("GRAPH_DATABASE_URI")
}

# ExitStack closes every opened file once the request has finished, also when
# opening a later file or the request itself raises.
with ExitStack() as stack:
    files = []
    for doc in docs:
        file_handle = stack.enter_context(open(doc["path"], 'rb'))
        # Each part is sent under the 'files' field with the bare file name
        files.append(('files', (os.path.basename(doc["path"]), file_handle, 'application/pdf')))

    response = requests.post(url, files=files, data=data)

print(response.status_code)
print(response.json())


In [None]:
import requests

# Ask the service to start processing every document already uploaded for the
# topic currently held in `topic_name`.
url = "http://192.168.206.252:23333/api/v1/knowledge/trigger-processing"
database_uri = os.getenv("GRAPH_DATABASE_URI")

# Form fields sent with the request
data = dict(topic_name=topic_name, database_uri=database_uri)

response = requests.post(url, data=data)
print(response.status_code)
print(response.json())

## Query Graph

### Vector Similarity-Based Search

In [None]:
from knowledge_graph.query import search_relationships_by_vector_similarity, query_topic_graph

# Retrieve relationships similar to the query and turn them into a text
# context made of an entity list followed by a relationship list.
query = "Where are li ming now?"
res = search_relationships_by_vector_similarity(query, similarity_threshold=0.2, top_k=20)

entity_lines = []
relationships = []

# res is iterated row by row; the row index itself is not needed
for _, row in res.iterrows():
    entity_lines.append(f"{row['source_entity']} {row['source_entity_description']}")
    entity_lines.append(f"{row['target_entity']} {row['target_entity_description']}")
    relationships.append(f"{row['source_entity']} {row['relationship_desc']} {row['target_entity']}")

# dict.fromkeys drops duplicates while keeping first-seen order, so the
# generated context is the same on every run (iterating a set of strings is
# not order-stable between interpreter runs).
entities = list(dict.fromkeys(entity_lines))

context = "Entities:\n" + "\n".join(entities) + "\n\nRelationships:\n" + "\n".join(relationships)

print(context)

In [None]:
from llm.factory import LLMInterface

# Answer the query with an LLM, using the retrieved graph context as input.
# Note: this rebinds the notebook-level name `llm_client`.
llm_client = LLMInterface("bedrock", "us.anthropic.claude-3-7-sonnet-20250219-v1:0")

# The prompt wraps the context in <context> tags and ends with the question
prompt = f"""Given the following context
<context>
{context}
</context>
answer the question: {query}
"""
response = llm_client.generate(prompt)
print(response)