In [1]:
from setting.db import SessionLocal

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from knowledge_graph.knowledge import KnowledgeBuilder


# Provider name and model identifier passed to LLMInterface; kept as named
# constants so the model can be swapped in one place.
LLM_PROVIDER = "bedrock"
LLM_MODEL_ID = "arn:aws:bedrock:us-east-1:841162690310:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0"

llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL_ID)
# The builder receives both the LLM client and the text-embedding function.
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding)

In [2]:
# Mind-map file to index, plus the metadata dict handed to the builder with it.
mindmap_path = "docs/bo_tree/business_operation_knowledge_tree.mm"
mindmap_metadata = {
    "doc_version": 1.0,
    "doc_link": "https://pingcap.feishu.cn/wiki/FYsKwV2p4iDrAxkEpPyc2o7enBb#mindmap",
}

kb = kb_builder.extract_knowledge_index(mindmap_path, mindmap_metadata)
# Display the extracted index tree as the cell output.
kb.indexes

[Index(name='Compensation', children=[Index(name='What is Compensation Metrics?', children=[Index(name='Reference', children=[Index(name='FY25 WW Compensation Plan, Sales (base version)', children=[]), Index(name='FY25 Compensation Related Business Metrics (APAC)', children=[])])]), Index(name='What is Compensation Plan?', children=[Index(name='What is compensation plan structure?', children=[Index(name='Reference', children=[Index(name='FY25 WW Compensation Plan, Sales (base version)', children=[]), Index(name='FY25 APAC WW Compensation Plan, Presales (base version)', children=[])])]), Index(name='What will happen if windfall is triggered?', children=[]), Index(name='How different kind of product contributes to Compensation?', children=[Index(name='Reference', children=[Index(name='FY25 WW Compensation Plan, Sales (base version)', children=[]), Index(name='PingCAP FY25 Global SPIFF Program - MySQL Family Capture', children=[])])])]), Index(name='What is SPIFF?', children=[Index(name='

In [3]:
import copy

from knowledge_graph.models import SourceData

def get_node_attributes(node):
    """Return the names of ``node``'s direct children, in order.

    Every child must be a leaf (its own ``children`` must be empty).

    Args:
        node: object exposing ``children``; each child exposes ``name`` and
            ``children``.

    Returns:
        list of the children's ``name`` values.

    Raises:
        ValueError: if any child has children of its own.
    """
    if any(len(leaf.children) > 0 for leaf in node.children):
        raise ValueError("Reference node should be the leaf node without any children")

    return [leaf.name for leaf in node.children]


def depth_traversal(root, current_path:list):
    """Collect the attribute nodes found under ``root`` with a depth-first walk.

    A node named "Reference", "Definition" or "Annotation" is treated as an
    attribute holder: the names of its children are read with
    ``get_node_attributes`` and returned under the key "references",
    "definition" or "annotation" respectively, together with the path of
    ancestor names leading to it (the attribute node's own name is not part
    of the path). Any other node is descended into, with its name appended to
    the path handed to its children.

    Args:
        root: node exposing ``name`` and ``children``.
        current_path: names of the ancestors of ``root``, outermost first.
            The list is never mutated.

    Returns:
        A list of dicts, each holding "path" plus exactly one attribute key.
        A "Reference" node whose attributes cannot be read contributes
        nothing (a message is printed instead).

    Raises:
        Whatever ``get_node_attributes`` raises for a "Definition" or
        "Annotation" node; only "Reference" failures are swallowed.
    """
    if root.name == "Reference":
        try:
            references = get_node_attributes(root)
        except Exception as e:
            print(f"Fail to get reference, skip it {e}")
            # Really skip the node: falling through would walk the malformed
            # subtree and emit entries whose path contains "Reference".
            return []
        # Reuse the value computed above instead of reading the node twice.
        return [{
            "path": list(current_path),
            "references": references,
        }]
    if root.name == "Definition":
        return [{
            "path": list(current_path),
            "definition": get_node_attributes(root),
        }]
    if root.name == "Annotation":
        return [{
            "path": list(current_path),
            "annotation": get_node_attributes(root),
        }]

    all_paths = []
    for child in root.children:
        # Concatenation builds a fresh list per child, so siblings never
        # share (or mutate) the same path object.
        all_paths.extend(depth_traversal(child, current_path + [root.name]))

    return all_paths


# Fold the traversal results of every top-level index into one dict keyed by
# the "->"-joined path. Entries that share a path are merged into the same
# record, later keys overwriting earlier ones.
all_knowledges = {}
for index in kb.indexes:
    for entry in depth_traversal(index, []):
        path_str = "->".join(entry['path'])
        all_knowledges.setdefault(path_str, {}).update(entry)

all_knowledges

{'Compensation->What is Compensation Metrics?': {'path': ['Compensation',
   'What is Compensation Metrics?'],
  'references': ['FY25 WW Compensation Plan, Sales (base version)',
   'FY25 Compensation Related Business Metrics (APAC)']},
 'Compensation->What is Compensation Plan?->What is compensation plan structure?': {'path': ['Compensation',
   'What is Compensation Plan?',
   'What is compensation plan structure?'],
  'references': ['FY25 WW Compensation Plan, Sales (base version)',
   'FY25 APAC WW Compensation Plan, Presales (base version)']},
 'Compensation->What is Compensation Plan?->How different kind of product contributes to Compensation?': {'path': ['Compensation',
   'What is Compensation Plan?',
   'How different kind of product contributes to Compensation?'],
  'references': ['FY25 WW Compensation Plan, Sales (base version)',
   'PingCAP FY25 Global SPIFF Program - MySQL Family Capture']},
 'Compensation->What is SPIFF?': {'path': ['Compensation', 'What is SPIFF?'],
  'r

In [13]:
# Template for turning one knowledge entry plus its source documents into
# concept entities. Filled with str.format(knowledge=..., reference_documents=...);
# the doubled braces in the JSON example are format escapes for literal braces.
prompt = """You are an expert knowledge graph architect. Your task is to analyze the provided 'knowledge' object and its referenced document content ('reference_documents') to create one or more concept node entities for a knowledge graph.

**Inputs:**

1.  **`knowledge` Object:** Contains the `path` (representing the topic's context or hierarchy) and `references` (names of source documents).
    {knowledge}

2.  **`reference_documents` List:** A list of objects, each containing the `id`, `name`, `link`, `version`, and `content` of the documents referenced in the `knowledge` object.
    {reference_documents}

**Task:**

Based *strictly* on the information found within the `content` of the provided `reference_documents` that correspond to the `knowledge` object's topic (indicated by its `path`), generate concept node entities.

1.  **Analyze Complexity:** Evaluate the information related to the `knowledge['path']` topic within the document `content`.
    * If the topic is relatively simple and can be represented by a single, cohesive concept, create **one** entity.
    * If the topic is complex, multifaceted, or discussed in terms of different aspects or hierarchical levels within the documents, break it down into **multiple** sub-knowledge entities. Each entity should represent a distinct aspect or level of the original topic, such that combining them provides a comprehensive view.

2.  **For Each Entity:**
    * **Generate a `name`:** Create a concise, descriptive, and accurate name for the concept represented by the entity. Use terminology found in the documents or derived logically from the `knowledge['path']` and content.
    * **Generate a `definition`:** Write a professional, clear, detailed, coherent, and logically structured definition (or description) for the entity. This definition MUST:
        * Be derived *exclusively* from the provided `content` of the relevant `reference_documents`. Do not infer information or use external knowledge.
        * Accurately synthesize the relevant information from the source(s).
        * Explain the concept thoroughly.

**Output Format:**

Provide the output as a JSON list, where each element is an object representing an entity with the following keys:

* `name`: (String) The generated name for the entity.
* `definition`: (String) The generated professional definition for the entity.

**Example Output Structure:**

```json
[
  {{
    "name": "Entity Name 1",
    "definition": "Detailed, professional definition for Entity 1 based strictly on the provided document content..."
  }},
  {{
    "name": "Entity Name 2 (if applicable)",
    "definition": "Detailed, professional definition for Entity 2 based strictly on the provided document content..."
  }}
]
```
"""

In [None]:
# Distinct document names cited by any knowledge entry.
reference_source_names = {
    name
    for entry in all_knowledges.values()
    for name in entry.get('references', [])
}

# Fetch the matching SourceData rows in one query.
with SessionLocal() as db:
    sources = db.query(SourceData).where(SourceData.name.in_(list(reference_source_names))).all()

# Plain-dict snapshot of each fetched row, keyed by document name.
source_map = {
    source.name: {
        "id": source.id,
        "name": source.name,
        "link": source.link,
        "version": source.version,
        "content": source.content
    }
    for source in sources
}

for knowledge in all_knowledges.values():
    cited_names = knowledge.get('references', [])
    # Split the cited names into those found in the database and those missing.
    invalid_references = [name for name in cited_names if name not in source_map]
    valid_references = [source_map[name] for name in cited_names if name in source_map]

    if invalid_references:
        print(f"skip knowledge {knowledge}, caused by lack of references {invalid_references}")
        continue

    print(valid_references)
    print(knowledge)

    input_prompt = prompt.format(knowledge=knowledge, reference_documents=valid_references)
    response = llm_client.generate(input_prompt)
    print(response)
    # Only the first entry that reaches the LLM is processed.
    break

[{'id': 'e320602d-0790-40fe-9922-a45b7e7491df', 'name': 'FY25 WW Compensation Plan, Sales (base version)', 'link': 'https://drive.google.com/file/d/1VTEpzzoBtorcai1l-orCWL0R2r7CCpXf/view', 'version': 'fy25_v1.0', 'content': "# Compensation Plan FY25 (April 1, 2024-March 31, 2025)\n\n\n## 1 Overview of Your Compensation Plan\nName: XXX\nJob Role: Sales\nRegion: NA / EMEA / Japan / APAC\nEffective Period: April 1, 2024-March 31, 2025\n\n\n### Bucket 1 - New & Expansion ACV\n\nQuota: $BBBBB\n\n** The below table is a sales commission structure for New & Expansion ACV, with a Quota: $BBBBB , setting different commission rates based on quota attainment and providing additional rewards for acquiring new cloud customers.**\n| - | Quota Attainment Range | Personal Commission Rate (PCR)  |\n| --- | --- | --- |\n| Base Rate | [0, 100%]  | C% |\n| Acceleration Rate - 1 | (100%, 200%] | C%*1.5 |\n| Acceleration Rate - 2 | (200%, 300%] | C%*2  |\n| Base Rate | (300%, ∞) | C% |\n| Cloud First Accele

## Load Validation Dataset

In [None]:
import pandas as pd
import os

# Base path (no extension) of the validation dataset; the spreadsheet lives at
# "<base>.xlsx" and its pickled cache at "<base>.pkl".
faq_file = "docs/dataset/AI_BOT_Testing"
faq_cache_path = f"{faq_file}.pkl"

if os.path.exists(faq_cache_path):
    # NOTE: unpickling executes arbitrary code; only load a cache file that
    # this notebook wrote itself.
    faq_df = pd.read_pickle(faq_cache_path)
else:
    faq_df = pd.read_excel(f"{faq_file}.xlsx")
    # Drop the first two rows and keep the first five columns, then rename them.
    faq_df = faq_df.iloc[2:, :5].reset_index(drop=True)
    faq_df.columns = ["提问者", "Questions", "AI Answers", "✔️ or ✖️", "Tree Index"]
    # Write to the same ".pkl" path the existence check reads; writing to the
    # bare base path meant the cache was never found on the next run.
    faq_df.to_pickle(faq_cache_path)

faq_df

In [None]:
# Show the "Tree Index" cell of the row labelled 1 as a quick sanity check.
sample_tree_index = faq_df.at[1, 'Tree Index']
print(sample_tree_index)

In [None]:
import json

from utils.json_utils import extract_json
from index_craft.prompts.index_gen import get_question_index_prompt, get_index_reference_prompt

# Sub-questions the LLM could not match to an index path, collected for review.
issues = []

for index, row in faq_df.iterrows():
    tree_index = row['Tree Index']
    print(type(tree_index), tree_index)
    # NOTE(review): this filter only lets through rows whose "Tree Index" is an
    # empty string; non-string cells and non-empty strings are skipped.
    # Confirm that rows with a missing (non-string) value are meant to be skipped.
    if not isinstance(tree_index, str) or len(tree_index) > 0:
        continue

    print("-"*100)
    print("Question: ", row['Questions'])
    # NOTE(review): tree_dict is not defined in any visible cell; it must exist
    # in the kernel before this cell runs.
    # Use a dedicated name so the global `prompt` template is not overwritten.
    question_prompt = get_question_index_prompt(row['Questions'], tree_dict)
    response = llm_client.generate(question_prompt)
    json_str = extract_json(response)
    json_obj = json.loads(json_str)
    index_paths = []
    for i, index_obj in enumerate(json_obj):
        print(f"Index {i}:")
        print(f" - subquestion: {index_obj['subquestion']}")
        print(f" - reasoning: {index_obj['reasoning']}")
        print(f" - matched: {index_obj['matched']}")
        if index_obj['matched'] and 'index_path' in index_obj and len(index_obj['index_path']) > 0:
            path = " -> ".join(index_obj['index_path'])
            print(" -", path)
            index_paths.append(path)
        else:
            issues.append(index_obj)
    print("\n")
    # .at stores the whole list as a single cell value; .loc treats a list as
    # a sequence to align against the indexer and can raise or unwrap it.
    faq_df.at[index, 'Tree Index'] = index_paths

In [6]:
# Persist the frame next to the spreadsheet, using the ".pkl" path that the
# loading cell checks for.
faq_df.to_pickle(faq_file + ".pkl")