In [1]:
from setting.db import SessionLocal

from llm.factory import LLMInterface
from llm.embedding import get_text_embedding
from knowledge_graph.knowledge import KnowledgeBuilder


# Provider and model identifier handed to LLMInterface. The model id is a
# Bedrock inference-profile ARN for Claude 3.7 Sonnet.
LLM_PROVIDER = "bedrock"
LLM_MODEL_ID = (
    "arn:aws:bedrock:us-east-1:841162690310:inference-profile/"
    "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
)
# Alternative model id kept for reference: "us.deepseek.r1-v1:0"

# The knowledge builder receives both the LLM client and the embedding function.
llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL_ID)
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding)

In [2]:
# First argument: path of the mind-map file to index.
knowledge_tree_path = "docs/bo_tree/business_operation_knowledge_tree.mm"

# Second argument: document metadata (version and link to the source wiki page).
knowledge_tree_meta = {
    "doc_version": 1.0,
    "doc_link": "https://pingcap.feishu.cn/wiki/FYsKwV2p4iDrAxkEpPyc2o7enBb#mindmap",
}

kb = kb_builder.extract_knowledge_index(knowledge_tree_path, knowledge_tree_meta)

# Bare last expression so the notebook renders the result.
kb

all knowledge {'Compensation->What is Compensation Metrics?': {'path': ['Compensation', 'What is Compensation Metrics?'], 'references': ['FY25 WW Compensation Plan, Sales (base version)', 'FY25 Compensation Related Business Metrics (APAC)']}, 'Compensation->What is Compensation Plan?->What is compensation plan structure?': {'path': ['Compensation', 'What is Compensation Plan?', 'What is compensation plan structure?'], 'references': ['FY25 WW Compensation Plan, Sales (base version)', 'FY25 APAC WW Compensation Plan, Presales (base version)']}, 'Compensation->What is Compensation Plan?->How different kind of product contributes to Compensation?': {'path': ['Compensation', 'What is Compensation Plan?', 'How different kind of product contributes to Compensation?'], 'references': ['FY25 WW Compensation Plan, Sales (base version)', 'PingCAP FY25 Global SPIFF Program - MySQL Family Capture']}, 'Compensation->What is SPIFF?': {'path': ['Compensation', 'What is SPIFF?'], 'references': ['PingCAP

['# Compensation Metrics Analysis\n\nAfter analyzing the provided documents, I\'ll create concept node entities and relationships for the knowledge graph based on the topic "What is Compensation Metrics?" from the path.\n\n## Complexity Assessment\n\nThe topic is COMPLEX as it:\n- Covers multiple interconnected business metrics that affect compensation\n- Contains hierarchical components (different buckets and calculation methods)\n- Has multiple dimensions (quota attainment, ACV calculations, customer types)\n- Is discussed across different sections of the documents with varying details\n\n```json\n{\n  "entities": [\n    {\n      "name": "Compensation Metrics",\n      "definition": "Compensation Metrics are key business measurements used to calculate sales compensation and determine quota attainment. These metrics include Brand New Cloud Customer, High-Quality OP (On-Premises), SKA/KA (Strategic Key Account/Key Account) classification, ATR (Available to Renew), and Net ARR. These met

## Load Validation Dataset

In [None]:
import pandas as pd
import os

# Base path (without extension) shared by the source spreadsheet (.xlsx) and
# its pickle cache (.pkl).
faq_file = "docs/dataset/AI_BOT_Testing"
faq_cache_file = f"{faq_file}.pkl"

if os.path.exists(faq_cache_file):
    # A cached frame exists: load it instead of re-parsing the spreadsheet.
    faq_df = pd.read_pickle(faq_cache_file)
else:
    faq_df = pd.read_excel(f"{faq_file}.xlsx")
    # Keep rows from position 2 onward and only the first five columns,
    # then renumber the index from 0.
    faq_df = faq_df.iloc[2:, :5].reset_index(drop=True)
    faq_df.columns = ["提问者", "Questions", "AI Answers", "✔️ or ✖️", "Tree Index"]
    # Write the cache to the exact path the existence check above looks for.
    # It used to be written to `faq_file` without the ".pkl" suffix, so the
    # cache produced here was never found on the next run.
    faq_df.to_pickle(faq_cache_file)

faq_df

In [None]:
# Spot-check the "Tree Index" value stored for the row labelled 1.
tree_index_sample = faq_df.at[1, 'Tree Index']
print(tree_index_sample)

In [None]:
import json

from utils.json_utils import extract_json
from index_craft.prompts.index_gen import get_question_index_prompt, get_index_reference_prompt

def _has_tree_index(value):
    """Return True when a row already carries a non-empty Tree Index.

    A non-blank string or a non-empty list/tuple counts as "already indexed".
    Anything else (None, NaN, blank string, empty list) counts as missing.
    """
    if isinstance(value, str):
        return bool(value.strip())
    if isinstance(value, (list, tuple)):
        return len(value) > 0
    return False


# NOTE(review): `tree_dict` is not defined in any cell visible in this
# notebook; it must exist in the kernel before this cell runs. Confirm where
# it is supposed to come from (possibly derived from `kb`).

# Sub-questions the LLM could not match to an index path, collected for review.
issues = []

# Cells in this column will hold Python lists, which needs an object dtype.
faq_df['Tree Index'] = faq_df['Tree Index'].astype(object)

for index, row in faq_df.iterrows():
    # Only rows WITHOUT an index are sent to the LLM. The previous condition
    # skipped every non-string value (None/NaN) before reaching its own
    # `is not None` test, so rows with a missing index were never processed.
    if _has_tree_index(row['Tree Index']):
        continue

    print("-"*100)
    print("Question: ", row['Questions'])
    prompt = get_question_index_prompt(row['Questions'], tree_dict)
    response = llm_client.generate(prompt)
    json_str = extract_json(response)
    json_obj = json.loads(json_str)
    index_paths = []
    for i, index_obj in enumerate(json_obj):
        # `.get` keeps a single malformed LLM item from aborting the whole run.
        print(f"Index {i}:")
        print(f" - subquestion: {index_obj.get('subquestion')}")
        print(f" - reasoning: {index_obj.get('reasoning')}")
        print(f" - matched: {index_obj.get('matched')}")
        if index_obj.get('matched') and index_obj.get('index_path'):
            path = " -> ".join(index_obj['index_path'])
            print(" -", path)
            index_paths.append(path)
        else:
            issues.append(index_obj)
    print("\n")
    # `.at` stores the list as one cell value; `.loc` would treat the list as
    # an iterable to align against the selection instead of a single value.
    faq_df.at[index, 'Tree Index'] = index_paths

In [6]:
# Persist the frame, including any generated Tree Index values, to the pickle cache.
faq_cache_path = faq_file + ".pkl"
faq_df.to_pickle(faq_cache_path)