In [1]:
from llm.factory import LLMInterface
from setting.db import SessionLocal
from llm.embedding import get_text_embedding

# Alternative model, kept for quick switching:
# llm_client = LLMInterface("bedrock", "us.deepseek.r1-v1:0")

# Provider and model identifier used to build the shared LLM client.
LLM_PROVIDER = "bedrock"
LLM_MODEL_ID = "arn:aws:bedrock:us-east-1:841162690310:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0"

llm_client = LLMInterface(LLM_PROVIDER, LLM_MODEL_ID)

In [None]:
import json
import os

# Path to the JSON configuration file that lists the documents to process
config_file_path = 'docs/business_operations/business_operations_docs.json'


def load_docs_config(path):
    """Load the list of document entries from a JSON configuration file.

    Args:
        path: Location of the JSON file. Its top-level value must be a list.

    Returns:
        The parsed list of document entries, or an empty list when the file
        is missing, unreadable, not valid JSON, or not a JSON list. The
        reason for a failure is printed rather than raised so the notebook
        can keep running.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            docs = json.load(f)
    except FileNotFoundError:
        print(f"Error: Configuration file not found at '{path}'")
        return []
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from file '{path}'. Check file format.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")
        return []

    # Later code indexes and enumerates the result, so anything other than a
    # list (e.g. a JSON object) is reported here instead of failing later.
    if not isinstance(docs, list):
        print(f"Error: Expected a JSON list in '{path}', got {type(docs).__name__}.")
        return []

    print(f"Successfully loaded configuration from: {path}")
    return docs


# Loaded document entries (empty when loading failed or the file has no entries)
loaded_docs = load_docs_config(config_file_path)

if loaded_docs:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])
else:
    # Covers both a failed load (reported above) and a genuinely empty list.
    print("\nNo documents loaded: the configuration could not be read or contains no entries.")

In [3]:
import time

from knowledge_graph.knowledge import KnowledgeBuilder

# Builder that extracts knowledge blocks, wired to the LLM client and the
# text-embedding function.
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding)

# Indices of documents already processed successfully; the processing loop
# appends to it and skips any index found here.
success_index = list()

In [6]:
import logging

# Process every loaded document that has not already succeeded. Indices are
# recorded in success_index so re-running this cell only retries failures.
for position, entry in enumerate(loaded_docs):
    if position not in success_index:
        print(entry['path'])
        try:
            kb_builder.extract_knowledge_blocks(entry['path'], entry['metadata'])
            success_index.append(position)
            # Pause between documents (presumably to stay under service rate limits).
            time.sleep(60)
        except Exception as err:
            # Log with traceback and back off longer before the next document.
            logging.error(f"process index {position} failed, {err}", exc_info=True)
            time.sleep(120)

## Write contextual information into Docs

In [None]:
import pandas as pd

from knowledge_graph.models import KnowledgeBlock, SourceData
from setting.db import SessionLocal
from llm.embedding import get_text_embedding


# Embed a sample question. The vector is not used by the query below; it is
# kept because other cells may reference it.
query_vector = get_text_embedding("How to update protobuf message definition")

# Columns to fetch: block fields plus the owning source's fields. Labels give
# each column a distinct name (both tables have a `name` column). The version
# comes from SourceData rather than KnowledgeBlock.source_version.
selected_columns = [
    KnowledgeBlock.name.label("block_name"),
    KnowledgeBlock.context,
    KnowledgeBlock.content,
    KnowledgeBlock.source_id,
    KnowledgeBlock.position_in_source,
    SourceData.name.label("source_name"),
    SourceData.link.label("source_link"),
    SourceData.version.label("source_version"),
    SourceData.data_type.label("source_data_type"),
]

with SessionLocal() as db:
    # Join each knowledge block to its source row and fetch everything.
    joined_query = db.query(*selected_columns).join(
        SourceData, KnowledgeBlock.source_id == SourceData.id
    )
    results = joined_query.all()

df = pd.DataFrame(results)
df

In [None]:
# Group knowledge blocks by the link of the source they came from.
# Shape: {source_link: {"source_name": ..., "content": [block, ...]}}
contexual_docs = {}

for _, record in df.iterrows():
    link = record['source_link']
    # Create the per-source entry on first sight, then reuse it.
    source_entry = contexual_docs.setdefault(
        link,
        {
            "source_name": record['source_name'],
            "content": []
        },
    )
    block = {
        "position": record['position_in_source'],
        "context": record['context'],
        "content": record['content']
    }
    source_entry['content'].append(block)

# Number of distinct sources
len(contexual_docs)


In [None]:
import os
import re

# Function to sanitize filenames
def sanitize_filename(name):
    """Turn a display name into a ``.txt`` filename.

    The characters ``\\ / * ? : " < > |`` are removed and spaces are replaced
    with underscores.

    Args:
        name: The name to convert. ``None`` is treated as an empty name and
            other non-string values are converted with ``str()``.

    Returns:
        The sanitized name with ``.txt`` appended. If nothing is left after
        sanitizing (empty, ``None``, or only removed characters), returns
        ``"untitled.txt"`` rather than the hidden-looking ``".txt"``.
    """
    text = "" if name is None else str(name)
    # Remove invalid characters
    text = re.sub(r'[\\/*?:"<>|]', "", text)
    # Replace spaces with underscores
    text = text.replace(" ", "_")
    if not text:
        text = "untitled"
    return text + ".txt"

# Define the output directory
output_dir = "contexual_docs"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Filenames already produced in this run. Documents are keyed by source link
# but files are named after the source name, so two links that share a name
# would otherwise silently overwrite each other's file.
used_filenames = set()

# Write one text file per source, with its blocks in source order
for source_link, data in contexual_docs.items():
    source_name = data['source_name']
    content_list = data['content']

    # Sanitize the source name for the filename; add a numeric suffix on collision
    filename = sanitize_filename(source_name)
    if filename in used_filenames:
        stem, ext = os.path.splitext(filename)
        suffix = 2
        while f"{stem}_{suffix}{ext}" in used_filenames:
            suffix += 1
        filename = f"{stem}_{suffix}{ext}"
        print(f"Warning: source name '{source_name}' is shared by several sources; "
              f"writing '{source_link}' to {filename}")
    used_filenames.add(filename)
    filepath = os.path.join(output_dir, filename)

    # Sort the content list by position (in place, so contexual_docs stays ordered too)
    content_list.sort(key=lambda x: x['position'])

    # Each block is its context wrapped in <context> tags, then its content
    file_content_parts = [
        f"<context>{item['context']}</context>\n{item['content']}"
        for item in content_list
    ]

    # Join blocks with a '######' separator line
    file_content = "\n######\n".join(file_content_parts)

    # Write the content to the file; a failure is reported and the loop continues
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f"Successfully wrote: {filepath}")
    except Exception as e:
        print(f"Error writing file {filepath}: {e}")

print(f"\nFinished writing files to the '{output_dir}' directory.")