In [2]:
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.schema import HumanMessage
from langchain_ollama import ChatOllama
from lightrag import LightRAG, QueryParam
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.utils import EmbeddingFunc
import re, os
import sys

In [None]:
WORKING_DIR = "/home/tttung/Khiem/thesis/lv2_install_software_on_windows" 
#llm = ChatOllama(model = 'phi4', temperature=0)

import nest_asyncio
nest_asyncio.apply()

rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,
    llm_model_name='phi4',
    llm_model_max_async=6,
    llm_model_max_token_size=32768*2,
    llm_model_kwargs={"host": "http://localhost:11434", "options": {"num_ctx": 32768*2}},
    embedding_func=EmbeddingFunc(
        embedding_dim=768,
        max_token_size=8192,
        func=lambda texts: ollama_embed(
            texts, embed_model="nomic-embed-text", host="http://localhost:11434"
        ),
    ),
)



  from .autonotebook import tqdm as notebook_tqdm
INFO:nano-vectordb:Load (17, 768) data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': '/home/tttung/Khiem/thesis/lv2_install_software_on_windows/vdb_entities.json'} 17 data
INFO:nano-vectordb:Load (19, 768) data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': '/home/tttung/Khiem/thesis/lv2_install_software_on_windows/vdb_relationships.json'} 19 data
INFO:nano-vectordb:Load (2, 768) data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': '/home/tttung/Khiem/thesis/lv2_install_software_on_windows/vdb_chunks.json'} 2 data
INFO:lightrag:Loaded document status storage with 1 records


INFO:lightrag:Storage Initialization completed!


In [3]:
def get_txt_file_from_dir(directory):
    """Finds the first `.txt` file in the given directory."""
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            return os.path.join(directory, file)
    return None 


document_path = get_txt_file_from_dir(WORKING_DIR)
if document_path:
    with open(document_path, "r", encoding="utf-8") as f:
        rag.insert(f.read())


INFO:lightrag:No new unique documents were found.
INFO:lightrag:All documents have been processed or are duplicates


## Step 1

In [9]:
def identity_steps():
    num_of_steps = rag.query("How many steps in the document, what are they? just give the number of steps and list name of steps with its number, no details, don't give any explantion or summary", 
                             param=QueryParam(mode="global"))

    
    few_shot_examples = [
        {
            "input": "Step 1: Prepare Ingredients\nStep 2: Cook Meat\nStep 3: Serve Dish",
            "output": """
                subgraph cluster_1 {{ label="Prepare Ingredients" }}
                subgraph cluster_2 {{ label="Cook Meat" }}
                subgraph cluster_3 {{ label="Serve Dish" }}
            """
        },
        {
            "input": "Preheat Oven\nMix Batter\nBake Cake",
            "output": """
                subgraph cluster_1 {{ label="Preheat Oven" }}
                subgraph cluster_2 {{ label="Mix Batter" }}
                subgraph cluster_3 {{ label="Bake Cake" }}
            """
        }
    ]

    example_template = """
    Input steps:
    {input}

    Output DOT code:
    {output}
    """

    example_prompt = PromptTemplate(
        input_variables=["input", "output"],
        template=example_template
    )
    
    cot_instructions = """
    You are tasked with converting a list of procedural steps into DOT code for a flowchart. 
    Each step should become a subgraph with a descriptive label. Follow these steps:

    1. Read the list of steps provided.
    2. For each step, create a subgraph in DOT syntax (e.g., 'subgraph cluster_X {{ label="Step Name" }}').
    3. Number the subgraphs sequentially (cluster_1, cluster_2, etc.).
    4. Ensure the output is valid DOT code that can be rendered as a flowchart, output is dot code, no more details needed, no explanation or irrlevant part or description.

    Now, process the following steps and output the DOT code:
    {steps}
    """
    
    few_shot_prompt = FewShotPromptTemplate(
    examples=few_shot_examples,
    example_prompt=example_prompt,
    prefix="Here are some examples of converting steps to DOT code:\n",
    suffix=cot_instructions,
    input_variables=["steps"],
    example_separator="\n---\n"
    )
    
    # Use num_of_steps directly as the input
    final_prompt = few_shot_prompt.format(steps=num_of_steps)
    llm = ChatOllama(model = 'deepseek-r1:14b', temperature=0.0)
    
    dot_code_output = llm.invoke(final_prompt)
    print("raw output \n : ",dot_code_output.content)
    #dot_code_output_step1 = dot_code_output.content
    dot_code_output_step1 = re.sub(r"<think>.*?</think>", "", dot_code_output.content, flags=re.DOTALL).strip()
    
    ## extract dot ``` code ```
    #dot_code_output_step1 = re.search(r"```(.*?)```", dot_code_output_step1, re.DOTALL).group(1)
    ## remove "dot" keyword
    dot_code_output_step1 = re.sub(r"dot", "", dot_code_output_step1)
    
    # Print the result
   # print("Generated DOT code:\n", dot_code_output_step1)
    step_lines = num_of_steps.strip().split("\n")[1:]  # Skip intro line
    step_names = [line.strip() for line in step_lines if line.strip()][:-1]

    # remove the last element 
    
    return step_names,num_of_steps, dot_code_output_step1


In [10]:
steps, num_of_steps,dot_code_output_step1  = identity_steps()
print("steps",steps)
print("num_of_steps",num_of_steps)
print("dot_code_output_step1",dot_code_output_step1)

INFO:lightrag:Non-embedding cached hit(mode:global type:query)


raw output 
 :  <think>

Step 1: Download the Software  
Step 2: Locate the Downloaded File  
Step 3: Run the Installer  
Step 4: Choose Installation Type  
Step 5: Select Installation Settings  
Step 6: Begin the Installation  
Step 7: Handle Installation Errors  
Step 8: Complete the Installation  
Step 9: Verify That the Software Works

Output DOT code:

subgraph cluster_1 { label="Download the Software" }
subgraph cluster_2 { label="Locate the Downloaded File" }
subgraph cluster_3 { label="Run the Installer" }
subgraph cluster_4 { label="Choose Installation Type" }
subgraph cluster_5 { label="Select Installation Settings" }
subgraph cluster_6 { label="Begin the Installation" }
subgraph cluster_7 { label="Handle Installation Errors" }
subgraph cluster_8 { label="Complete the Installation" }
subgraph cluster_9 { label="Verify That the Software Works" }
steps ['Okay, I need to figure out how many steps are in the document and list them by number without any explanations. The user prov

In [6]:
def identify_steps_relations(num_of_steps, dot_code_output_step1):
    query_step4 = (
    f"Document contains these steps:\n{num_of_steps}\n"
    "Identify relationships between steps EXACTLY in this format:\n"
    "From: <X> To: <Y> Label: \"<DESCRIPTIVE_RELATIONSHIP>\"\n"
    "Where:\n"
    "- X and Y are ONLY existing step numbers from the list above\n"
    "- Labels MUST include specific conditions/context from the document\n"
    "- Use natural language explanations in quotes\n"
    "- Common patterns should include:\n"
    "  * \"if [condition]\" for conditional flows\n"
    "  * \"requires [dependency]\" for dependencies\n"
    "  * \"followed by [specific action]\" for sequences\n"
    "  * \"branches when [condition]\" for decisions\n"
    "  * \"merges after [event]\" for convergence points\n\n"
    "Examples:\n"
    "From: 1 To: 4 Label: \"if stateful behavior required\"\n"
    "From: 3 To: 9 Label: \"requires service discovery completion\"\n"
    "From: 4 To: 5 Label: \"when external exposure needed\"\n"
    "From: 6 To: 7 Label: \"followed by TLS security setup\"\n"
    "From: 2 To: 3 Label: \"branches on authentication failure\"\n"
    "From: 5 To: 6 Label: \"after successful health check\""
)
    inter_step_relations = rag.query(query_step4, param=QueryParam(mode="mix"))
    #print("Step 4 - Raw inter-step relationships:\n", inter_step_relations)

    few_shot_examples = [
    {
        "input": (
            "Steps:\n"
            "1. Check water level\n"
            "2. Boil the water\n"
            "Relationships:\n"
            'From: 1 To: 2 Label: "after verifying minimum level"'
        ),
        "output": (
            'cluster_1 -> cluster_2 [label="after verifying minimum level"];'
        )
    },
    {
        "input": (
            "Steps:\n"
            "1. Boil the water\n"
            "2. Check temperature\n"
            "3. Add ingredients\n"
            "4. Combine components\n"
            "Relationships:\n"
            'From: 1 To: 2 Label: "until reaching 100°C"\n'
            'From: 2 To: 3 Label: "if temperature maintained"\n'
            'From: 2,3 To: 4 Label: "merge cooked elements"'
        ),
        "output": (
            'cluster_1 -> cluster_2 [label="until reaching 100°C"];\n'
            'cluster_2 -> cluster_3 [label="if temperature maintained"];\n'
            'cluster_2, cluster_3 -> cluster_4 [label="merge cooked elements"];'
        )
    },
    {
        "input": (
            "Steps:\n"
            "1. Prepare ingredients\n"
            "2. Marinate meat\n"
            "3. Boil vegetables\n"
            "Relationships:\n"
            'From: 1 To: 2 Label: "requires pre-cut components"\n'
            'From: 1 To: 3 Label: "parallel cooking process"'
        ),
        "output": (
            'cluster_1 -> cluster_2 [label="requires pre-cut components"];\n'
            'cluster_1 -> cluster_3 [label="parallel cooking process"];'
        )
    },
    {
        "input": (
            "Steps:\n"
            "1. Sear protein\n"
            "2. Simmer sauce\n"
            "3. Plate dish\n"
            "Relationships:\n"
            'From: 1 To: 3 Label: "when caramelized crust forms"\n'
            'From: 2 To: 3 Label: "after reducing by 50%"'
        ),
        "output": (
            'cluster_1 -> cluster_3 [label="when caramelized crust forms"];\n'
            'cluster_2 -> cluster_3 [label="after reducing by 50%"];'
        )
    },
    {
        "input": (
            "Steps:\n"
            "1. Initialize system\n"
            "2. Check dependencies\n"
            "3. Start services\n"
            "Relationships:\n"
            'From: 1 To: 2 Label: "with configuration loaded"\n'
            'From: 2 To: 3 Label: "if all checks pass"\n'
            'From: 2 To: 4 Label: "when missing components"'
        ),
        "output": (
            'cluster_1 -> cluster_2 [label="with configuration loaded"];\n'
            'cluster_2 -> cluster_3 [label="if all checks pass"];\n'
            'cluster_2 -> cluster_4 [label="when missing components"];'
        )
    }
]

    example_template = """
    Input steps and relationships:
    {input}

    Output DOT edges between subgraphs:
    {output}
    """

    example_prompt = PromptTemplate(
        input_variables=["input", "output"],
        template=example_template
    )

    # Chain-of-thought instructions
    cot_instructions = """
    You are generating DOT code edges to connect subgraphs representing procedural steps, using the Step 1 DOT code for context. 
    Follow these rules:
    1. Read the list of steps, Step 1 DOT code (subgraph titles), and relationships in the format 'From: <step_number> To: <step_number> Label: <relationship>'.
    2. For each relationship, create an edge between subgraphs:
    - 'cluster_X -> cluster_Y [label="Enhanced Relationship"];'
    3. Use the Step 1 DOT code to ensure cluster labels match the steps (e.g., cluster_1 for step 1).
    4. Enhance the relationship label based on context and type, using the subgraph titles for specificity(relationship lables could be used  with synonyms, act accordingly):
    - 'followed by' -> e.g., 'followed by to <cluster_Y label>'
    - 'if' -> e.g., 'if <cluster_X condition>'
    - 'depends on' -> e.g., 'depends on <cluster_X completion>'
    - 'branches to' -> e.g., 'branches to <cluster_Y action>'
    - 'merges from' -> e.g., 'merges from <cluster_X result>'
    - 'parallel' -> e.g., 'parallel with <cluster_Y label>'
    - 'synchronize' -> e.g., 'synchronize with <cluster_X label>'
    - 'except' -> e.g., 'except if <cluster_X condition>'
    - 'trigger' -> e.g., 'triggers <cluster_Y label>'
    4. Format labels as: "<Relation Type>: <Brief Reason>"
    5. Output one edge per line.
    6. Keep the dot syntax (cluster_X -> cluster_Y [label="Relationship"];) and ensure the labels are in double quotes.
    6.Generate DOT edges following ALL rules, no more details needed, no explanation or irrlevant part or description

    Process these steps, Step 1 DOT code, and relationships, and output the DOT edges:
    {relations}
    """
    few_shot_prompt = FewShotPromptTemplate(
    examples=few_shot_examples,
    example_prompt=example_prompt,
    prefix="Here are examples of converting inter-step relationships to DOT edges with Step 1 DOT code:\n",
    suffix=cot_instructions,
    input_variables=["relations"],
    example_separator="\n---\n"
)
    step4_input = f"Steps:\n{num_of_steps}\nStep 1 DOT code:\n{dot_code_output_step1}\nRelationships:\n{inter_step_relations}"
    final_prompt = few_shot_prompt.format(relations=step4_input)
    llm = ChatOllama(model = 'deepseek-r1:14b', temperature=0)
    dot_code_output_step4 = llm.invoke(final_prompt)
   # dot_code_output_step4 =  dot_code_output_step4.content
    #print("Step 4 - Generated inter-subgraph edges:\n", dot_code_output_step4)
    # exxtract dot code
    dot_code_output_step4 = re.sub(r"<think>.*?</think>", "", dot_code_output_step4.content, flags=re.DOTALL).strip()
    #dot_code_output_step4 = re.search(r"```(.*?)```", dot_code_output_step4, re.DOTALL).group(1)
    #dot_code_output_step4 =  dot_code_output_step4.content
    return dot_code_output_step4


In [7]:
dot_code_output_step4 = identify_steps_relations(num_of_steps, dot_code_output_step1)
print("Relation among the steps:\n", dot_code_output_step4)

INFO:lightrag:Non-embedding cached missed(mode:mix type:query)
INFO:lightrag:Non-embedding cached hit(mode:mix type:keywords)
INFO:lightrag:Query nodes: Download Software, Locate Downloaded File, Run Installer, Choose Installation Type, Select Installation Settings, Begin Installation, Handle Installation Errors, Complete Installation, Verify Software Functionality, top_k: 60, cosine: 0.2
INFO:lightrag:Query edges: Software installation process, Step identification, Sequential logic, top_k: 60, cosine: 0.2
INFO:lightrag:Global query uses 27 entites, 19 relations, 2 chunks
INFO:lightrag:Local query uses 17 entites, 9 relations, 2 chunks


Relation among the steps:
 <think>

cluster_1 -> cluster_2 [label="followed by locating file"];
cluster_2 -> cluster_3 [label="followed by launching installer"];
cluster_3 -> cluster_4 [label="followed by choosing installation type"];
cluster_4 -> cluster_5 [label="when custom settings desired"];
cluster_5 -> cluster_6 [label="after configuring settings, start installation"];
cluster_6 -> cluster_7 [label="followed by handling any errors during installation"];
cluster_7 -> cluster_8 [label="followed by completing the installation process upon error resolution"];
cluster_8 -> cluster_9 [label="followed by verifying software functionality"];


## Step 2

In [10]:
def identify_entities(step_names):
    step_details = {}

    for step in step_names:
        query_step2 = (
            f"For the step '{step}' in the document, identify: "
            "1. The actors list all the actors (who or what performs the action, if specified, if not, just use the pronouce 'you' or imply the person, actor, whom, what do the actions), keep the same actor if the same actor is used in the same step "
            "2. The main action, procedure, or event, etc "
            "3. The entities (nouns/objects/things involved, excluding the actor), "
            "4. Relevant info (conditions, details, or constraints). "
            "Return the response in this format:\n"
            "Actor: <actor>\nAction: <action>\nEntities: <entity1>, <entity2>, ...\nRelevant Info: <info>"
        )
        details = rag.query(query_step2, param=QueryParam(mode="global"))
        step_details[step] = details
      #  print(f"Step 2 - Details for '{step}':\n", details)
    example_template = """
    Input step and details:
    {input}

    Output DOT code:
    {output}
    """

    example_prompt = PromptTemplate(
        input_variables=["input", "output"],
        template=example_template
    )

    # Chain-of-thought instructions with actor
    cot_instructions = """
    You are converting a procedural step and its details into DOT code for a flowchart. 
    Each step is a subgraph containing nodes for actor, action, entities, and relevant info. Follow these rules:
    1. Read the step name and its details (Actor, Action, Entities, Relevant Info).
    2. Create a subgraph with the step name as the label: 'subgraph cluster_X {{ label="Step Name"; }}'.
    3. Inside the subgraph, add nodes:
    - If an actor is specified (not 'None'): 'actor_X [label="Actor"];', if not defined, use  'You'.
    - For the action: 'action_X [label="Action"];'
    - For each entity: 'entity_Y [label="Entity"];'
    - For relevant info (if not empty): 'info_Z [label="Info"];'
    4. Use sequential cluster numbering (cluster_1, cluster_2, etc.) based on step order.
    5. Use unique node IDs within each subgraph (e.g., actor_1, action_1, entity_1, info_1 for cluster_1).
    6. Ensure double quotes around all label text and semicolons after each node.
    7. Indent nodes for readability, but keep the subgraph on multiple lines.
    8. Only output the dotcode, no other details

    Process this step and its details and output the DOT code:
    {step_details}
    """
    few_shot_examples = [
    {
        "input": (
            "Step 1: Brian and Stewie cook meat\n"
            "Actor1: Brian\n"
            "Actor2: Stewie\n"
            "Action: Cooks\n"
            "Entities: Meat\n"
            "Relevant Info: On medium heat"
        ),
        "output": """
            'subgraph cluster_1 {{ label="Brian and Stewie cook meat";\n'
            '  actor_1 [label="Brian"];\n'
            '  actor_2 [label="Stewie"];\n'
            '  action_1 [label="Cooks"];\n'
            '  entity_1 [label="Meat"];\n'
            '  info_1 [label="On medium heat"];\n'
            '}}'
        """
    },
    {
        "input": (
            "Step 2: Boil the water\n"
            "Actor: You\n"
            "Action: Boil\n"
            "Entities: Water\n"
            "Relevant Info: High heat"
        ),
        "output": """
            'subgraph cluster_2 {{ label="Boil the water";\n'
            '  actor_2 [label="You"];\n'
            '  action_2 [label="Boil"];\n'
            '  entity_2 [label="Water"];\n'
            '  info_2 [label="High heat"];\n'
            '}}'
        """
    }
]
    # Combine into few-shot prompt
    few_shot_prompt = FewShotPromptTemplate(
        examples=few_shot_examples,
        example_prompt=example_prompt,
        prefix="Here are examples of converting step details to DOT code with nodes:\n",
        suffix=cot_instructions,
        input_variables=["step_details"],
        example_separator="\n---\n"
    )
    dot_code_output_step2 = []
    for i, step in enumerate(step_names, 1):
        step_input = f"{step}\n{step_details[step]}"
        final_prompt = few_shot_prompt.format(step_details=step_input)
        llm = ChatOllama(model = 'phi4', temperature=0.0)
        
        dot_code_output = llm.invoke(final_prompt)
        #dot_code_output = dot_code_output.content
        dot_code_output= re.sub(r"<think[^>]*>.*?</think>", "", dot_code_output.content, flags=re.DOTALL | re.IGNORECASE).strip()
        dot_code_output_step2.append(dot_code_output)
        #print(f"Step 2 - Generated DOT code for '{step}':\n", dot_code_output)
    
    return step_details


In [11]:
step_details= identify_entities(steps)
print("step_details\n   ",step_details)

INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)
INFO:lightrag:Non-embedding cached hit(mode:global type:query)


step_details
