In [1]:
from llm.factory import LLMInterface
from setting.db import SessionLocal
from llm.embedding import get_text_embedding

# Alternative model kept for reference:
# llm_client = LLMInterface("bedrock", "us.deepseek.r1-v1:0")

# The two positional arguments handed to LLMInterface, named so they are easy
# to spot and change. The second one is a Bedrock inference-profile ARN.
LLM_PROVIDER = "bedrock"
LLM_MODEL = "arn:aws:bedrock:us-east-1:841162690310:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0"

llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL)

In [None]:
import json
import os

# Define the path to the JSON configuration file
config_file_path = 'docs/business_operations/business_operations_docs.json'


def load_doc_configs(path):
    """Read the JSON document list stored at ``path``.

    Args:
        path: Location of the JSON configuration file.

    Returns:
        A ``(docs, error)`` tuple. ``docs`` is the decoded list (empty when
        loading failed) and ``error`` is ``None`` on success or a
        human-readable message describing why nothing could be loaded.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return [], f"Error: Configuration file not found at '{path}'"
    except json.JSONDecodeError:
        return [], f"Error: Could not decode JSON from file '{path}'. Check file format."
    except Exception as e:
        # Best-effort: any other read problem is reported instead of raised.
        return [], f"An unexpected error occurred while reading the file: {e}"

    # Later cells enumerate the result and index each entry by key, so a
    # non-list root (e.g. a single JSON object) is rejected here rather than
    # failing later with a confusing KeyError.
    if not isinstance(data, list):
        return [], (
            f"Error: Expected a JSON list in '{path}', "
            f"got {type(data).__name__}."
        )
    return data, None


# Variable to store the loaded data (stays an empty list when loading fails)
loaded_docs, load_error = load_doc_configs(config_file_path)

if load_error is not None:
    # A failed load is reported as such, not as an "empty" configuration.
    print(load_error)
    print("\nNo documents were loaded.")
else:
    print(f"Successfully loaded configuration from: {config_file_path}")
    if loaded_docs:
        print("\nExample: Accessing first document data:")
        print(loaded_docs[0])
    else:
        print("\nConfiguration file is empty.")

In [3]:
import time

from graph.docbuilder import DocBuilder

# Builder constructed from the LLM client and the text-embedding function; the
# processing loop calls its split_markdown_by_heading method once per document.
doc_builder = DocBuilder(llm_client, get_text_embedding)
# Indices into loaded_docs that were processed without raising. The processing
# loop skips these, so that cell can be re-run to retry only the failures.
success_index = []

In [6]:
import logging

# Pauses (in seconds) between documents. The longer wait after a failure
# presumably gives a rate limit time to recover -- TODO confirm the intent.
SUCCESS_DELAY_SECONDS = 60
FAILURE_DELAY_SECONDS = 120

# Only documents that have not succeeded in an earlier run of this cell.
# Failed documents are not retried automatically; re-run the cell to retry.
pending_docs = [
    (index, doc)
    for index, doc in enumerate(loaded_docs)
    if index not in success_index
]

for position, (index, doc) in enumerate(pending_docs, start=1):
    print(doc['path'])
    try:
        doc_builder.split_markdown_by_heading(
            doc['path'],
            doc['metadata']
        )
        success_index.append(index)
        delay = SUCCESS_DELAY_SECONDS
    except Exception as e:
        # Log with traceback and keep going so one bad document does not
        # abort the whole batch.
        logging.error("process index %s failed, %s", index, e, exc_info=True)
        delay = FAILURE_DELAY_SECONDS

    # Waiting is only useful when another document follows.
    if position < len(pending_docs):
        time.sleep(delay)

In [3]:
import pandas as pd

from graph.models import KnowledgeBlock, SourceData
from setting.db import SessionLocal
from llm.embedding import get_text_embedding


# NOTE(review): query_vector is not referenced anywhere else in this cell;
# kept because later cells may use it.
query_vector = get_text_embedding("How to update protobuf message definition")

# Columns to fetch for every knowledge block together with its source document.
# The version is read from SourceData rather than KnowledgeBlock.source_version.
block_columns = (
    KnowledgeBlock.name.label("block_name"),  # aliased to avoid a clash with the source name
    KnowledgeBlock.context,
    KnowledgeBlock.content,
    KnowledgeBlock.source_id,
    KnowledgeBlock.position_in_source,
    SourceData.name.label("source_name"),
    SourceData.link.label("source_link"),
    SourceData.version.label("source_version"),
    SourceData.data_type.label("source_data_type"),
)

with SessionLocal() as db:
    # Inner join: each block is paired with the source row its source_id points to.
    results = (
        db.query(*block_columns)
        .join(SourceData, KnowledgeBlock.source_id == SourceData.id)
        .all()
    )

df = pd.DataFrame(results)
df

Unnamed: 0,block_name,context,content,source_id,position_in_source,source_name,source_link,source_version,source_data_type
0,Apply in Salesforce​,This chunk contains the main introduction to t...,# PoC Application Process (for Global Business...,110c5619-294a-46b2-b9b6-4d507f2e2c49,1,PoC Application Process (for Global Business),https://pingcap.feishu.cn/wiki/wikcnGT0rhH3H8C...,general_v1.0,document
1,FY25 Customer Segmentation,This chunk contains the introduction and main ...,# FY25 SKA/KA Segmentation - Definition\n\nDra...,83babacb-1711-4624-8411-d1f59db53dfb,0,FY25 SKA and KA Segmentation - Definition,https://pingcap.feishu.cn/wiki/VOxbwhAJiiXwdnk...,fy25_v1.0,document
2,02 New Account Creation,This chunk is part of a Salesforce Updates doc...,"# Salesforce Updates Sharing\n\nVince Yao, fro...",d70340f5-a9df-49b5-9287-45028ac8bdd7,3,APAC Salesforce Updates Sharing_Sep 2024,https://docs.google.com/presentation/d/1OtryBj...,general_v1.0,document
3,SKA/KA Sales Incentive for FY26,This chunk contains the sales incentive struct...,# FY26 SKA/KA Segmentation\n\n## SKA/KA Sales ...,3148cc78-9027-4ee1-90fb-b488c94249db,2,FY26 SKA and KA Segmentation,https://pingcap.feishu.cn/wiki/VbQ4wSE6ri3OL3k...,fy26_v1.0,document
4,SPIFF - OP Multi-Year Contract,This is the final SPIFF program in the documen...,# PingCAP FY25 Global SPIFF Program List\n\n##...,6e40b377-9c51-4665-88a7-968ec2cb3101,4,FY25 Global SPIFF Program List,https://pingcap.feishu.cn/wiki/L6BcwjWfdirsBXk...,fy25_v1.0,document
...,...,...,...,...,...,...,...,...,...
58,SPIFF - SKA/KA Focus - Customer Testimony,This chunk details the second SPIFF program (C...,# PingCAP FY25 Global SPIFF Program List\n\n##...,6e40b377-9c51-4665-88a7-968ec2cb3101,1,FY25 Global SPIFF Program List,https://pingcap.feishu.cn/wiki/L6BcwjWfdirsBXk...,fy25_v1.0,document
59,2 OTE & Quota,This chunk outlines the OTE (On-Target Earning...,"# Compensation Plan FY25 (April 1, 2024-March ...",e320602d-0790-40fe-9922-a45b7e7491df,1,"FY25 WW Compensation Plan, Sales (base version)",https://drive.google.com/file/d/1VTEpzzoBtorca...,fy25_v1.0,document
60,Account Activation,This chunk covers the first half of the docume...,# Salesforce User Account Management\n\n## Acc...,a91eeb9d-67b3-46b1-9949-96b4d9122c14,0,Salesforce User Account Management,https://pingcap.feishu.cn/wiki/HAWWwWntGiyl76k...,general_v1.0,document
61,TiDB Cloud Credits Request for Internal Users,This document provides a complete guide for Pi...,## TiDB Cloud Credits Request for Internal Use...,3afaf526-10e2-4f8d-bf91-5f87f81ba221,0,TiDB Cloud Credits Request for Internal Users,hhttps://pingcap.feishu.cn/docs/doccnUQF5yyyH0...,general_v1.0,document


In [7]:
# Group the knowledge blocks by the link of the source document they belong to.
# Each entry keeps the source name plus a list of its blocks.
contexual_docs = {}

for _, block in df.iterrows():
    # The first block seen for a link creates the entry; later ones reuse it.
    doc_entry = contexual_docs.setdefault(
        block['source_link'],
        {
            "source_name": block['source_name'],
            "content": [],
        },
    )
    doc_entry['content'].append({
        "position": block['position_in_source'],
        "context": block['context'],
        "content": block['content'],
    })

# Number of distinct source documents.
len(contexual_docs)


22

In [11]:
import os
import re

def sanitize_filename(name):
    """Turn a document name into a ``.txt`` filename.

    The characters ``\\ / * ? : " < > |`` are dropped, every space becomes an
    underscore, and the ``.txt`` extension is appended.
    """
    without_invalid = re.sub(r'[\\/*?:"<>|]', "", name)
    underscored = "_".join(without_invalid.split(" "))
    return f"{underscored}.txt"

# Define the output directory
output_dir = "contexual_docs"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Maps each path written so far to the source link that produced it, so that
# two sources whose names sanitize to the same filename are reported instead
# of silently replacing one another.
written_paths = {}

# Write one text file per source document
for source_link, data in contexual_docs.items():
    source_name = data['source_name']
    content_list = data['content']

    # Sanitize the source name for the filename
    filename = sanitize_filename(source_name)
    filepath = os.path.join(output_dir, filename)

    if filepath in written_paths:
        print(
            f"Warning: {filepath} was already written for "
            f"'{written_paths[filepath]}'; overwriting with '{source_link}'"
        )

    # Order the blocks by their position in the source. This sorts the list
    # stored in contexual_docs in place.
    content_list.sort(key=lambda x: x['position'])

    # Each block is emitted as its <context> tag followed by the block content
    file_content_parts = [
        f"<context>{item['context']}</context>\n{item['content']}"
        for item in content_list
    ]

    # Separate consecutive blocks with a line of six '#' characters
    file_content = "\n######\n".join(file_content_parts)

    # Write the content to the file; a failure is reported and the loop
    # continues with the next document.
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)
        written_paths[filepath] = source_link
        print(f"Successfully wrote: {filepath}")
    except (OSError, UnicodeError) as e:
        print(f"Error writing file {filepath}: {e}")

print(f"\nFinished writing files to the '{output_dir}' directory.")

Successfully wrote: contexual_docs/PoC_Application_Process_(for_Global_Business).txt
Successfully wrote: contexual_docs/FY25_SKA_and_KA_Segmentation_-_Definition.txt
Successfully wrote: contexual_docs/APAC_Salesforce_Updates_Sharing_Sep_2024.txt
Successfully wrote: contexual_docs/FY26_SKA_and_KA_Segmentation.txt
Successfully wrote: contexual_docs/FY25_Global_SPIFF_Program_List.txt
Successfully wrote: contexual_docs/FY25_WW_Compensation_Plan,_Sales_(base_version).txt
Successfully wrote: contexual_docs/Account_Creation_Approval_Process_for_Sales.txt
Successfully wrote: contexual_docs/FY26_Net_ARR_Reference.txt
Successfully wrote: contexual_docs/FY25_APAC_Compensation_Related_Business_Metrics_(APAC).txt
Successfully wrote: contexual_docs/FY25_APAC_WW_Compensation_Plan,_Presales_(base_version).txt
Successfully wrote: contexual_docs/FY25_Global_SPIFF_Program_for_SKA_and_KA_Focus_-_Expansion.txt
Successfully wrote: contexual_docs/Financial_Metrics_Definition_and_Calculation.txt
Successfully 