In [1]:
from setting.db import SessionLocal

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from knowledge_graph.knowledge import KnowledgeBuilder


# Provider key and Bedrock inference-profile ARN handed to the LLM client.
LLM_PROVIDER = "bedrock"
BEDROCK_INFERENCE_PROFILE_ARN = "arn:aws:bedrock:us-east-1:841162690310:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0"

llm_client = LLMInterface(LLM_PROVIDER, BEDROCK_INFERENCE_PROFILE_ARN)
# The builder receives the LLM client together with the embedding function.
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding)

In [3]:
# Mind-map file to index, plus the version/link metadata passed along with it.
bo_tree_file = "docs/bo_tree/business_operation_knowledge_tree.mm"
bo_tree_metadata = {
    "doc_version": 1.0,
    "doc_link": "https://pingcap.feishu.cn/wiki/FYsKwV2p4iDrAxkEpPyc2o7enBb#mindmap",
}

kb = kb_builder.extract_knowledge_index(bo_tree_file, bo_tree_metadata)
kb

'I\'ve analyzed the provided documents related to "Compensation Metrics" and will create a knowledge graph based on the content. Let me begin with my analysis:\n\n```json\n{\n  "entities": [\n    {\n      "name": "Compensation Metrics",\n      "definition": "Specific business measurements used to calculate, track, and evaluate sales compensation performance. These metrics include Brand New Cloud Customer, High-Quality OP, SKA/KA segmentation, ATR (Available to Renew), and Net ARR, which are used to determine quota attainment, commission rates, and bonus calculations within the PingCAP compensation structure."\n    },\n    {\n      "name": "Brand New Cloud Customer",\n      "definition": "A customer classification metric defined as an organization with no Cloud Payment before FY25, with specific exceptions for whitelisted customers. To qualify as a Cloud Customer, the organization must achieve a Quarterly Ending ARR of at least $100K. This metric is used in the Cloud First Accelerator p

In [4]:
# print() renders the line breaks that the bare-expression output shows as escaped "\n".
print(kb)

I've analyzed the provided documents related to "Compensation Metrics" and will create a knowledge graph based on the content. Let me begin with my analysis:

```json
{
  "entities": [
    {
      "name": "Compensation Metrics",
      "definition": "Specific business measurements used to calculate, track, and evaluate sales compensation performance. These metrics include Brand New Cloud Customer, High-Quality OP, SKA/KA segmentation, ATR (Available to Renew), and Net ARR, which are used to determine quota attainment, commission rates, and bonus calculations within the PingCAP compensation structure."
    },
    {
      "name": "Brand New Cloud Customer",
      "definition": "A customer classification metric defined as an organization with no Cloud Payment before FY25, with specific exceptions for whitelisted customers. To qualify as a Cloud Customer, the organization must achieve a Quarterly Ending ARR of at least $100K. This metric is used in the Cloud First Accelerator program, which

In [None]:
import copy

from knowledge_graph.models import SourceData

def get_node_attributes(node):
    """Return the names of ``node``'s direct children, in order.

    Every child must be a leaf (have no children of its own).

    Raises:
        ValueError: if any child has children.
    """
    leaves = list(node.children)

    # Validate first, then collect: a single nested child rejects the whole node.
    if any(len(leaf.children) > 0 for leaf in leaves):
        raise ValueError("Reference node should be the leaf node without any children")

    return [leaf.name for leaf in leaves]


def depth_traversal(root, current_path:list):
    """Walk the tree below ``root`` and collect its attribute nodes.

    A node named "Reference", "Definition" or "Annotation" is an attribute
    node: the names of its children are collected with
    ``get_node_attributes`` and the walk does not descend any further.
    Every other node is descended into, with its own name appended to the
    path handed to its children.

    Args:
        root: Node exposing ``name`` and ``children``.
        current_path: Names of the ancestors of ``root``. Never mutated.

    Returns:
        A list of dicts. Each dict has a "path" key (a copy of
        ``current_path``) and exactly one of "references", "definition" or
        "annotation" holding the list of child names. The list is empty when
        the subtree contains no attribute node or when a "Reference" node is
        malformed.

    Raises:
        ValueError: propagated from ``get_node_attributes`` for a malformed
            "Definition" or "Annotation" node (only "Reference" failures are
            swallowed).
    """
    if root.name == "Reference":
        try:
            references = get_node_attributes(root)
        except Exception as e:
            print(f"Fail to get reference, skip it {e}")
            # Really skip the node: falling through to the generic recursion
            # below would descend into the malformed "Reference" subtree.
            return []
        return [{
            "path": copy.deepcopy(current_path),
            "references": references,
        }]
    elif root.name == "Definition":
        return [{
            "path": copy.deepcopy(current_path),
            "definition": get_node_attributes(root),
        }]
    elif root.name == "Annotation":
        return [{
            "path": copy.deepcopy(current_path),
            "annotation": get_node_attributes(root),
        }]

    # Structural node: extend the path once and share it with every child.
    # This is safe because callees only copy the path, never mutate it.
    child_path = current_path + [root.name]
    all_paths = []
    for child in root.children:
        all_paths.extend(depth_traversal(child, child_path))

    return all_paths


# Merge the traversal results by path: entries that share the same "->"-joined
# path are folded into one dict, later keys overwriting earlier ones.
all_knowledges = {}
for tree_root in kb.indexes:
    for entry in depth_traversal(tree_root, []):
        joined_path = "->".join(entry['path'])
        all_knowledges.setdefault(joined_path, {}).update(entry)

all_knowledges

In [7]:
# Prompt template for entity/relationship extraction. It is filled in with
# str.format(knowledge=..., reference_documents=...); the braces of the JSON
# example are doubled ({{ }}) so that format() emits them literally.
prompt = """You are an expert knowledge graph architect. Your task is to analyze the provided 'knowledge' object and its referenced document content ('reference_documents') to create concept node entities and their relationships for a knowledge graph.

**Inputs:**

1.  **`knowledge` Object:** Contains the `path` (representing the topic's context or hierarchy) and `references` (names of source documents).
    {knowledge}

2.  **`reference_documents` List:** A list of objects, each containing the `id`, `name`, `link`, `version`, and `content` of the documents referenced in the `knowledge` object.
    {reference_documents}

**Task:**

Based *strictly* on the information found within the `content` of the provided `reference_documents` that correspond to the `knowledge` object's topic (indicated by its `path`), generate concept node entities and their relationships.

1.  **Analyze Complexity:** Evaluate the information related to the `knowledge['path']` topic within the document `content`.
    * Consider a topic SIMPLE if: it can be fully explained in 1-2 paragraphs, has a single clear definition, and doesn't contain distinct subtopics.
    * Consider a topic COMPLEX if: it requires extensive explanation, contains multiple aspects or dimensions, has hierarchical components, or is discussed from different perspectives in the documents.

2.  **For Each Entity:**
    * **Generate a `name`:** Create a concise, descriptive, and accurate name for the concept represented by the entity. Use terminology found in the documents or derived logically from the `knowledge['path']` and content.
    * **Generate a `definition`:** Write a professional, clear, detailed, coherent, and logically structured definition (or description) for the entity. This definition MUST:
        * Be derived *exclusively* from the provided `content` of the relevant `reference_documents`. Do not infer information or use external knowledge.
        * Accurately synthesize the relevant information from the source(s).
        * Explain the concept thoroughly.
        * Include explanations of domain-specific terms within the definition when necessary.

3.  **Information Prioritization Guidelines:**
    * Prioritize more recent versions of documents when available.
    * Look for consensus across multiple sources.
    * If conflicting information exists, note the discrepancy in the definition and present the most supported view.

4.  **Validation Requirements:**
    * Ensure all key points from the source documents are represented.
    * Verify that no information contradicts the source material.
    * Include only information that is explicitly stated or directly implied in the source documents.

5.  **Relationship Identification and Definition:**
    * Identify meaningful relationships between entities based on the document content.
    * For each relationship:
        * Determine the source and target entities.
        * Assign an appropriate relationship type that best describes the connection.
        * Create a detailed definition explaining the nature of the relationship.
    * Common relationship types include (but are not limited to):
        * Hierarchical (is_part_of, contains, belongs_to)
        * Causal (causes, results_in, depends_on)
        * Functional (interacts_with, supports, enables)
        * Temporal (precedes, follows, occurs_during)
        * Comparative (is_similar_to, differs_from)
    * Relationship definitions should:
        * Be as detailed and professional as entity definitions
        * Explain the specific nature of how entities interact or relate
        * Be derived exclusively from the source documents
        * Include directional clarity (how source affects target and vice versa)
    * Ensure all relationships are explicitly supported by information in the source documents.

6.  **Edge Case Handling:**
    * If the referenced documents contain insufficient information:
        * Create a minimal entity with the available information.
        * Note in the definition that the information is limited based on the provided documents.
    * If the topic is not clearly addressed in the documents:
        * Create an entity based on any relevant information that can be found.
        * Indicate areas where more information would be beneficial.
    * If relationships are implied but not explicitly stated:
        * Only create relationships with reasonable confidence based on the text.
        * Note the level of certainty in the relationship definition.

**Output Format:**

Provide the output as a JSON object with two main sections:

1. **`entities`**: A list of objects, each representing a concept node with:
   * `name`: (String) The generated name for the entity.
   * `definition`: (String) The generated professional definition for the entity.

2. **`relationships`**: A list of objects, each representing a relationship between entities with:
   * `source_entity`: (String) The name of the source entity.
   * `target_entity`: (String) The name of the target entity.
   * `type`: (String) A concise label describing the type of relationship (e.g., "is_part_of", "depends_on", "influences").
   * `definition`: (String) A detailed, professional description of the relationship, explaining how the source entity relates to the target entity, based strictly on the provided document content.

**Example Output Structure:**

```json
{{
  "entities": [
    {{
      "name": "Entity Name 1",
      "definition": "Detailed, professional definition for Entity 1 based strictly on the provided document content..."
    }},
    {{
      "name": "Entity Name 2",
      "definition": "Detailed, professional definition for Entity 2 based strictly on the provided document content..."
    }}
  ],
  "relationships": [
    {{
      "source_entity": "Entity Name 1",
      "target_entity": "Entity Name 2",
      "type": "is_component_of",
      "definition": "Detailed explanation of how Entity 1 functions as a component of Entity 2, including specific interactions and dependencies described in the source documents..."
    }}
  ]
}}
```
Remember: Quality over quantity. It's better to have fewer well-defined entities and relationships than many superficial ones. Your entities and relationships should represent distinct, meaningful concepts that would be valuable in a knowledge graph. All information must be derived exclusively from the provided documents.
"""

In [None]:
# Every document name cited by at least one knowledge entry.
reference_source_names = {
    ref_name
    for entry in all_knowledges.values()
    for ref_name in entry.get('references', [])
}

# Fetch all cited documents in a single query.
with SessionLocal() as db:
    sources = db.query(SourceData).where(SourceData.name.in_(list(reference_source_names))).all()

# Index the fetched rows by document name as plain dicts.
source_map = {
    src.name: {
        "id": src.id,
        "name": src.name,
        "link": src.link,
        "version": src.version,
        "content": src.content
    }
    for src in sources
}

for knowledge in all_knowledges.values():
    cited_names = knowledge.get('references', [])
    invalid_references = [name for name in cited_names if name not in source_map]
    valid_references = [source_map[name] for name in cited_names if name in source_map]

    # An entry is skipped as soon as one of its cited documents is missing.
    if len(invalid_references) > 0:
        print(f"skip knowledge {knowledge}, caused by lack of references {invalid_references}")
        continue

    print(valid_references)
    print(knowledge)

    input_prompt = prompt.format(knowledge=knowledge, reference_documents=valid_references)
    response = llm_client.generate(input_prompt)
    print(response)
    # Stop after the first entry that reaches the LLM.
    break

## Load Validation Dataset

In [None]:
import pandas as pd
import os

# Base path (without extension) of the FAQ validation set. The spreadsheet is
# read from "<base>.xlsx" and its pickle cache lives at "<base>.pkl".
faq_file = "docs/dataset/AI_BOT_Testing"
faq_cache_file = f"{faq_file}.pkl"

if os.path.exists(faq_cache_file):
    faq_df = pd.read_pickle(faq_cache_file)
else:
    faq_df = pd.read_excel(f"{faq_file}.xlsx")
    # Keep the rows from position 2 onward and the first five columns, then
    # give those columns fixed names.
    faq_df = faq_df.iloc[2:, :5].reset_index(drop=True)
    faq_df.columns = ["提问者", "Questions", "AI Answers", "✔️ or ✖️", "Tree Index"]
    # Write to the exact path the existence check above looks for; saving
    # under the extension-less name would mean the cache is never found.
    faq_df.to_pickle(faq_cache_file)

faq_df

In [None]:
# Spot-check a single cell: the 'Tree Index' value of the row labelled 1.
print(faq_df.at[1, 'Tree Index'])

In [None]:
import json

from utils.json_utils import extract_json
from index_craft.prompts.index_gen import get_question_index_prompt, get_index_reference_prompt

# Sub-question results that were not matched or carry no usable index path.
issues = []

# NOTE(review): `tree_dict` is not defined in any cell visible here; it has to
# exist in the kernel before this cell is run.
for index, row in faq_df.iterrows():
    tree_index = row['Tree Index']
    print(type(tree_index), tree_index)
    # Only rows whose 'Tree Index' is an empty string are processed; non-string
    # cells and non-empty strings are skipped.
    # NOTE(review): this also skips NaN/None cells - confirm that is intended.
    if not isinstance(tree_index, str) or len(tree_index) > 0:
        continue

    print("-"*100)
    print("Question: ", row['Questions'])
    # Use a dedicated name: rebinding `prompt` would overwrite the extraction
    # template that the reference-resolution cell calls `.format` on.
    question_prompt = get_question_index_prompt(row['Questions'], tree_dict)
    response = llm_client.generate(question_prompt)
    json_str = extract_json(response)
    json_obj = json.loads(json_str)
    index_paths = []
    for i, index_obj in enumerate(json_obj):
        print(f"Index {i}:")
        print(f" - subquestion: {index_obj['subquestion']}")
        print(f" - reasoning: {index_obj['reasoning']}")
        print(f" - matched: {index_obj['matched']}")
        if index_obj['matched'] and 'index_path' in index_obj and len(index_obj['index_path']) > 0:
            path = " -> ".join(index_obj['index_path'])
            print(" -", path)
            index_paths.append(path)
        else:
            issues.append(index_obj)
    print("\n")
    # `.at` stores the list object itself in the single cell; `.loc` treats a
    # list-like value as several values to assign, which fails or unwraps it.
    faq_df.at[index, 'Tree Index'] = index_paths

In [6]:
# Persist the frame to "<faq_file>.pkl", the same path the loading cell checks with os.path.exists.
faq_df.to_pickle(f"{faq_file}.pkl")