In [1]:
from llm.factory import LLMInterface
from setting.db import SessionLocal
from llm.embedding import get_text_embedding

# Alternative model, currently disabled:
# llm_client = LLMInterface("bedrock", "us.deepseek.r1-v1:0")
# Shared LLM client; a later cell passes it to KnowledgeBuilder.
# NOTE(review): the two arguments look like (provider, model id) - confirm against llm.factory.
llm_client = LLMInterface("bedrock", "us.anthropic.claude-3-7-sonnet-20250219-v1:0")

In [None]:
import json
import os

# Path to the JSON file listing the documents to process. The extraction loop
# reads doc['path'] and doc['metadata'] from each entry, so the file must hold
# a JSON array of objects.
config_file_path = 'docs/business_operations/business_operations_docs.json'

# Always a list after this cell runs: the parsed entries on success, [] otherwise.
loaded_docs = []
# Distinguishes "file loaded but empty" from "file could not be loaded" below.
config_loaded = False

try:
    with open(config_file_path, 'r', encoding='utf-8') as f:
        parsed_docs = json.load(f)
except FileNotFoundError:
    print(f"Error: Configuration file not found at '{config_file_path}'")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from file '{config_file_path}'. Check file format.")
except Exception as e:
    print(f"An unexpected error occurred while reading the file: {e}")
else:
    if isinstance(parsed_docs, list):
        loaded_docs = parsed_docs
        config_loaded = True
        print(f"Successfully loaded configuration from: {config_file_path}")
    else:
        # A top-level object or scalar would break the positional access below
        # and the per-document loop, so reject it here with a clear message.
        print(
            f"Error: Expected a JSON array in '{config_file_path}', "
            f"got {type(parsed_docs).__name__}."
        )

if loaded_docs:
    print("\nExample: Accessing first document data:")
    print(loaded_docs[0])
elif config_loaded:
    print("\nConfiguration file is empty.")
else:
    # Do not claim the file is empty when it was never read successfully.
    print("\nNo documents loaded.")

In [3]:
import time

from knowledge_graph.knowledge import KnowledgeBuilder

# Positions (within loaded_docs) of documents already processed successfully;
# the extraction loop skips them. Re-running this cell resets the list.
success_index = []
# Builder whose extract_knowledge_blocks method is called once per document;
# it is constructed from the LLM client and the text-embedding function.
kb_builder = KnowledgeBuilder(llm_client, get_text_embedding)

In [6]:
import logging

# Process every configured document that has not succeeded yet. The cell can be
# re-run after failures: positions recorded in success_index are skipped.
for doc_position, doc_entry in enumerate(loaded_docs):
    already_done = doc_position in success_index
    if not already_done:
        print(doc_entry['path'])
        try:
            kb_builder.extract_knowledge_blocks(doc_entry['path'], doc_entry['metadata'])
        except Exception as err:
            # Log with traceback, then wait longer before moving to the next document.
            logging.error(f"process index {doc_position} failed, {err}", exc_info=True)
            time.sleep(120)
        else:
            success_index.append(doc_position)
            # Pause between documents after a successful extraction.
            time.sleep(60)

## Write contextual information into Docs

In [None]:
import pandas as pd

from knowledge_graph.models import KnowledgeBlock, SourceData
from setting.db import SessionLocal
from llm.embedding import get_text_embedding


# NOTE(review): this embedding is computed but not referenced by the query below.
query_vector = get_text_embedding("How to update protobuf message definition")

# Columns fetched for each knowledge block together with its source document.
# Labels avoid collisions between the two tables' `name` columns; the version
# is taken from SourceData rather than from KnowledgeBlock.
selected_columns = [
    KnowledgeBlock.name.label("block_name"),
    KnowledgeBlock.context,
    KnowledgeBlock.content,
    KnowledgeBlock.source_id,
    KnowledgeBlock.position_in_source,
    SourceData.name.label("source_name"),
    SourceData.link.label("source_link"),
    SourceData.version.label("source_version"),
    SourceData.data_type.label("source_data_type"),
]
source_join_condition = KnowledgeBlock.source_id == SourceData.id

with SessionLocal() as db:
    block_query = db.query(*selected_columns).join(SourceData, source_join_condition)
    results = block_query.all()

df = pd.DataFrame(results)
df

In [None]:
# Group the knowledge blocks by the link of their source document.
# Each entry holds the source's name/link/version plus the list of its blocks.
contexual_docs = {}

for _, block_row in df.iterrows():
    link = block_row['source_link']
    doc_entry = contexual_docs.setdefault(link, {
        "source_name": block_row['source_name'],
        "source_link": link,
        "source_version": block_row['source_version'],
        "content": [],
    })
    block_record = {
        "position": block_row['position_in_source'],
        "context": block_row['context'],
        "content": block_row['content'],
    }
    doc_entry['content'].append(block_record)

# Number of distinct source documents.
len(contexual_docs)


In [None]:
import os
import re

# Function to sanitize filenames
def sanitize_filename(name):
    """Turn a document name into a ``.txt`` filename.

    The characters ``\\ / * ? : " < > |`` are removed; spaces and all other
    characters are kept as they are.

    Args:
        name: Document name. ``None`` is treated like an empty name, and
            non-string values are converted with ``str()``.

    Returns:
        str: The cleaned name followed by ``.txt``. If nothing but whitespace
        is left after cleaning, ``"untitled.txt"`` is returned so the result is
        never the bare hidden file ``.txt``.
    """
    raw_name = "" if name is None else str(name)
    # Remove invalid characters
    cleaned = re.sub(r'[\\/*?:"<>|]', "", raw_name)
    if not cleaned.strip():
        cleaned = "untitled"
    return cleaned + ".txt"

# Define the output directory
output_dir = "contexual_docs"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Filenames produced so far in this run. Documents are keyed by link but named
# after their source name, so two links sharing a name would otherwise write to
# the same path and the later one would silently overwrite the earlier one.
used_filenames = set()

# Write one text file per source document.
for data in contexual_docs.values():
    source_name = data['source_name']
    source_link = data['source_link']
    source_version = data['source_version']

    # Sanitize the source name for the filename and make it unique in this run
    filename = sanitize_filename(source_name)
    if filename in used_filenames:
        stem, extension = os.path.splitext(filename)
        suffix = 2
        while f"{stem}_{suffix}{extension}" in used_filenames:
            suffix += 1
        filename = f"{stem}_{suffix}{extension}"
    used_filenames.add(filename)
    filepath = os.path.join(output_dir, filename)

    file_info = f"<doc_link>[{source_name}]({source_link})</doc_link>\n<doc_version>{source_version}</doc_version>"

    # Order the blocks by their position in the source. sorted() returns a new
    # list, so the entries held in contexual_docs are left untouched.
    content_list = sorted(data['content'], key=lambda x: x['position'])

    # Every block repeats the document header, followed by its context and content.
    file_content_parts = []
    for item in content_list:
        context_str = f"<context>{item['context']}</context>"
        content_str = item['content']
        file_content_parts.append(f"{file_info}\n\n{context_str}\n\n{content_str}")

    # Separate consecutive blocks with a '######' delimiter line
    file_content = "\n######\n".join(file_content_parts)

    # Write the content to the file; a failure is reported and the loop continues
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f"Successfully wrote: {filepath}")
    except Exception as e:
        print(f"Error writing file {filepath}: {e}")

print(f"\nFinished writing files to the '{output_dir}' directory.")

## Write Graph into files


In [None]:
import pandas as pd
from sqlalchemy import text

from setting.db import SessionLocal

# Both frames stay None if loading fails, so later cells can tell that the
# load did not succeed.
graph_entity_df = None
graph_relationship_df = None

# Concepts with the source documents linked through 'SOURCE_OF' relationships.
# The LEFT JOINs keep concepts without such a relationship; their source
# columns are NULL.
entity_query = """SELECT
  c.name AS concept_name,
  c.definition AS concept_definition,
  c.version AS concept_version,
  s.name AS source_name,
  s.version AS source_version,
  s.link AS source_link
FROM concepts c
LEFT JOIN relationships r ON r.source_id = c.id AND r.relationship_type = 'SOURCE_OF'
LEFT JOIN source_data s ON r.target_id = s.id;
"""

# Relationships whose two endpoints are both of type 'Concept', with the name
# and definition of each endpoint.
relationships_query = """select 
    l.name as source_name,
    l.definition as source_definition,
    rel.relationship_desc as relationship_description,
    r.name as target_name,
    r.definition as target_definition
from relationships rel
left join concepts l on l.id = rel.source_id
left join concepts r on r.id = rel.target_id
where rel.source_type = 'Concept' and rel.target_type = 'Concept';
"""

# The query strings need no database access, so a single session is opened
# only around the two reads.
try:
    with SessionLocal() as db:
        # db.bind is passed to pandas as the connectable to run the SQL on
        graph_entity_df = pd.read_sql_query(sql=text(entity_query), con=db.bind)
        graph_relationship_df = pd.read_sql_query(sql=text(relationships_query), con=db.bind)

    print("Graph data loaded successfully from database using raw SQL.")
except Exception as e:
    print(f"An error occurred: {e}")


In [None]:
# Display the concept/source rows returned by the entity query (None if the load failed).
graph_entity_df

In [None]:
# Display the concept-to-concept relationship rows (None if the load failed).
graph_relationship_df

In [14]:
def _null_to_none(value):
    """Return None for a missing value (None/NaN), otherwise the value unchanged."""
    return None if pd.isna(value) else value


def _concept_key(name, definition):
    """Build the key that identifies a concept by its name and definition.

    A tuple is used instead of string concatenation so that a missing
    definition does not raise a TypeError and so that ("ab", "c") and
    ("a", "bc") remain different concepts.
    """
    return (_null_to_none(name), _null_to_none(definition))


# concept key -> concept record with its de-duplicated sources (keyed by link).
concepts_map = {}

for _, row in graph_entity_df.iterrows():
    concept_key = _concept_key(row['concept_name'], row['concept_definition'])
    concept = concepts_map.setdefault(concept_key, {
        "name": row['concept_name'],
        "definition": row['concept_definition'],
        "concept_version": row["concept_version"],
        "sources": {},
    })

    # Rows where both source name and link are missing (a concept with no
    # source attached) must not create a placeholder source entry.
    if pd.isna(row['source_link']) and pd.isna(row['source_name']):
        continue

    source_key = row['source_link']
    # setdefault keeps the first entry seen for a given link.
    concept["sources"].setdefault(source_key, {
        "name": row['source_name'],
        'link': row['source_link'],
        "source_version": row['source_version'],
    })

# concept key -> every relationship in which the concept is source or target.
relationship_map = {}
for _, row in graph_relationship_df.iterrows():
    source_key = _concept_key(row['source_name'], row['source_definition'])
    target_key = _concept_key(row['target_name'], row['target_definition'])
    edge = {
        "source": row['source_name'],
        "relationship": row['relationship_description'],
        "target": row['target_name'],
    }

    relationship_map.setdefault(source_key, []).append(edge)
    # A relationship from a concept to itself is listed once, not twice.
    if target_key != source_key:
        relationship_map.setdefault(target_key, []).append(dict(edge))

# Attach the relationships; concepts without any get no 'relationships' key.
for key, concept in concepts_map.items():
    if key in relationship_map:
        concept['relationships'] = relationship_map[key]

In [None]:
# One row per concept; the columns come from the keys of the concept records.
concept_records = list(concepts_map.values())
graph_df = pd.DataFrame(concept_records)
graph_df

In [None]:
import pandas as pd

def write_graph_to_markdown(df, filename="graph_output.md"):
    """
    Writes the graph data from a DataFrame to a markdown file.

    Each row becomes one section: a level-2 heading with the concept name,
    followed by level-3 "concept definition", "relationships" and "references"
    subsections. Sections are separated by a "######" line.

    Rows are counted by position, so the DataFrame may use any index
    (filtered, re-ordered or labelled), not only the default 0..n-1 index.

    Errors (for example an unwritable path) are caught and reported with
    print(); the function does not raise them.

    Args:
        df (pd.DataFrame): DataFrame containing graph entities.
                           Expected columns: 'name', 'definition', 'concept_version',
                           'sources' (dict), 'relationships' (list of dicts).
                           Missing columns fall back to placeholder text.
        filename (str): The name of the output markdown file. Default is "graph_output.md".
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            total_rows = len(df)
            for position, (_, row) in enumerate(df.iterrows()):
                # Fall back to placeholders when a column is absent
                concept_name = row.get('name', f'Unnamed Entity {position + 1}')
                concept_definition = row.get('definition', 'No definition provided.')
                concept_version = row.get('concept_version', 'N/A')
                relationships = row.get('relationships', [])
                sources = row.get('sources', {})

                # Entity Name (Level 2 Heading)
                f.write(f"## {concept_name}\n\n")

                # Definition Section (Level 3 Heading)
                f.write("### concept definition\n\n")
                f.write(f"{concept_definition}\n\n")

                # Concept Version
                f.write(f"concept version: {concept_version}\n\n")

                # Relationships Section (Level 3 Heading)
                f.write("### relationships\n\n")
                # A row without relationships holds a non-list value (e.g. NaN),
                # which the isinstance check routes to the placeholder text.
                if isinstance(relationships, list) and relationships:
                    for rel in relationships:
                        # Use .get() for safer access to dictionary keys
                        source = rel.get('source', 'Unknown Source')
                        relationship = rel.get('relationship', 'Unknown Relationship')
                        target = rel.get('target', 'Unknown Target')
                        f.write(f"- {source} -> {relationship} -> {target}\n")
                    f.write("\n")  # Add a newline after the list
                else:
                    f.write("No relationships defined.\n\n")

                # References Section (Level 3 Heading)
                f.write("### references\n\n")
                if isinstance(sources, dict) and sources:
                    # Each value describes one source; non-dict values are skipped
                    for src_info in sources.values():
                        if isinstance(src_info, dict):
                            # Use .get() for safer access to dictionary keys
                            name = src_info.get('name', 'Unknown Source Name')
                            link = src_info.get('link', '#')  # Default to '#' if no link
                            version = src_info.get('source_version', 'N/A')
                            f.write(f"- [{name}]({link}). version: {version}\n")
                    f.write("\n")  # Add a newline after the list
                else:
                    f.write("No sources defined.\n\n")

                # Add a separator between entries for readability, except for the last one
                if position < total_rows - 1:
                    f.write("######\n\n")

        print(f"Successfully wrote graph data to {filename}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Export the assembled concept graph; the content is markdown-formatted although the file has a .txt extension.
write_graph_to_markdown(graph_df, "business_operation_graph.txt")