In [1]:
from index_craft.loader import parse_freemind
from index_craft.index import normalize_tree
from llm.factory import LLMInterface

llm_client = LLMInterface("bedrock", "us.deepseek.r1-v1:0")

tree = parse_freemind("docs/bo_tree/bo_kb_tree.mm")
tree_dict = normalize_tree(tree)
tree_dict


{'entry': 'AI Bot Knowledge Base Tree Copy',
 'children': {'Compensation': {'entry': 'Compensation',
   'children': {'What is Compensation Metrics?': {'entry': 'What is Compensation Metrics?',
     'children': {'Reference': {'entry': 'Reference',
       'children': {'WW Compensation Plan FY25, Sales (base version).md': {'entry': 'WW Compensation Plan FY25, Sales (base version).md',
         'children': {}},
        'FY25 Compensation Related Business Metrics (APAC).md': {'entry': 'FY25 Compensation Related Business Metrics (APAC).md',
         'children': {}}}}}},
    'What is Compensation Plan?': {'entry': 'What is Compensation Plan?',
     'children': {'What is compensation plan structure?': {'entry': 'What is compensation plan structure?',
       'children': {'Reference': {'entry': 'Reference',
         'children': {'WW Compensation Plan FY25, Sales (base version).md': {'entry': 'WW Compensation Plan FY25, Sales (base version).md',
           'children': {}},
          'WW Compensat

In [2]:
import pandas as pd
import os

faq_file = "docs/dataset/AI_BOT_Testing"

if os.path.exists(f"{faq_file}.pkl"):
    faq_df = pd.read_pickle(f"{faq_file}.pkl")
else:
    faq_df = pd.read_excel(f"{faq_file}.xlsx")
    faq_df = faq_df.iloc[2:, :5].reset_index(drop=True)
    faq_df.columns = ["提问者", "Questions", "AI Answers", "✔️ or ✖️", "Tree Index"]
    faq_df.to_pickle(faq_file)

faq_df

Unnamed: 0,提问者,Questions,AI Answers,✔️ or ✖️,Tree Index
0,@Dexter Deng,1 Is Restaurant industry the ICP?,"Yes, the restaurant industry is included in th...",✔️,SFDC -> Account -> What is the definition of C...
1,,"2 I signed a CP with our customer, which was a...",For a one-year Cloud Commitment Plan (CP) with...,✖️,Compensation -> What is SPIFF? -> What is the ...
2,,3 My account signed a $300000 CP last year 202...,Your account's baseline ARR for 2025 will be d...,✔️,Key Business Metrics -> ARR -> How to calculat...
3,,4 What are the differences of compensation pla...,The compensation plans for APAC and Japan have...,✖️,Compensation -> What is Compensation Plan? -> ...
4,,5 What are the differences between baseline AR...,The baseline ARR is determined by the greater ...,✔️,Key Business Metrics -> ARR -> What is ARR? ->...
5,,6 What are the differences between SKA/KA defi...,The differences between the SKA/KA definitions...,✔️,SFDC -> Account -> What is the definition of C...
6,,7 What are the differences between Revenue and...,The differences between Revenue and Collection...,✔️,Key Business Metrics -> Revenue -> What is Rev...
7,,8 What should I do to apply for credits for in...,"To apply for credits for internal use, follow ...",✔️,Ops Portal -> Credits -> How to apply credits?...
8,,9 My customer is Holla with ending ARR=30000. ...,"Yes, Holla is considered a Brand New Cloud Cus...",✖️,SFDC -> Account -> Brand New Cloud Customer ->...
9,,10 I am a user in China and could not download...,If you are in China and unable to download the...,✔️,Others -> How to download salesforce in China?


In [3]:
print(faq_df.at[1, 'Tree Index'])

Compensation -> What is SPIFF? -> What is the Cloud Commitment Plan in SPIFF?
Compensation -> How to calculate compensation? -> How does ACV factor into SPIFF bonus calculations?


In [9]:
import json

from utils.json_utils import extract_json
from index_craft.prompts.index_gen import get_question_index_prompt, get_index_reference_prompt

issues = []

for index, row in faq_df.iterrows():
    print(type(row['Tree Index']), row['Tree Index'])
    if not isinstance(row['Tree Index'], str) or (row['Tree Index'] is not None and len(row['Tree Index']) > 0):
        continue

    print("-"*100)
    print("Question: ", row['Questions'])
    prompt = get_question_index_prompt(row['Questions'], tree_dict)
    response = llm_client.generate(prompt)
    json_str = extract_json(response)
    json_obj = json.loads(json_str)
    index_paths = []
    for i, index_obj in enumerate(json_obj):
        print(f"Index {i}:")
        print(f" - subquestion: {index_obj['subquestion']}")
        print(f" - reasoning: {index_obj['reasoning']}")
        print(f" - matched: {index_obj['matched']}")
        if index_obj['matched'] and 'index_path' in index_obj and len(index_obj['index_path']) > 0:
            path = " -> ".join(index_obj['index_path'])
            print(" -", path)
            index_paths.append(path)
            
        else:
            issues.append(index_obj)
    print("\n")
    faq_df.loc[index, 'Tree Index'] = index_paths

<class 'str'> SFDC -> Account -> What is the definition of Customer ICP/Industry? -> Which industry are included in the ICP list?
<class 'str'> Compensation -> What is SPIFF? -> What is the Cloud Commitment Plan in SPIFF?
Compensation -> How to calculate compensation? -> How does ACV factor into SPIFF bonus calculations?
<class 'str'> Key Business Metrics -> ARR -> How to calculate ARR? -> How is ARR calculated for a contract starting in a future fiscal period?
Key Business Metrics -> ARR -> What is ARR? -> What affect baseline ARR?
<class 'str'> Compensation -> What is Compensation Plan? ->  What is compensation plan structure?
<class 'str'> Key Business Metrics -> ARR -> What is ARR? -> What is baseline ARR?
Key Business Metrics -> ARR -> What is ARR? -> What is starting ARR?
<class 'str'> SFDC -> Account -> What is the definition of Customer Segmentation (SKA/KA)?
<class 'str'> Key Business Metrics -> Revenue -> What is Revenue?
Key Business Metrics -> Collection -> What is Collecti

In [6]:
faq_df.to_pickle(f"{faq_file}.pkl")