# CBMC-Compatible Harness Generation System - Part 3: Node Functions (First Half)

This part implements the first set of node functions for our LangGraph workflow: frontend processing, code embedding, memory-leak analysis, and the junction node that hands each vulnerable function on to the harness generator.

In [1]:
# Node 1: Frontend - Initial processing of source code
def frontend_node(state: HarnessGenerationState):
    """Locate the source code to analyze and acknowledge it.

    If ``state.source_code`` is empty, the conversation is scanned from the
    newest message backwards for a ``HumanMessage`` containing a fenced
    (triple-backtick) code block; the body of the first fence that matches
    is used as the source code.

    Args:
        state: Current workflow state. Read via attribute access
            (``state.source_code``, ``state.messages``), matching how the
            other node functions in this file read the state.
            NOTE(review): this assumes the state type supports attribute
            access (e.g. a Pydantic model) -- confirm against its definition.

    Returns:
        A partial state update. When no source code can be found, only a
        ``messages`` entry asking the user to supply code. Otherwise a
        ``messages`` entry confirming receipt plus the ``source_code`` itself.
    """
    import re

    # Work on a local instead of assigning to the incoming state; the
    # returned dict carries the update.
    source_code = state.source_code

    if not source_code:
        # Newest message first, so the most recently pasted snippet wins.
        for message in reversed(state.messages):
            if not isinstance(message, HumanMessage) or "```" not in message.content:
                continue
            # Capture everything between the opening fence (with an optional
            # language tag) and the closing fence.
            match = re.search(r'```(?:\w+)?\n(.+?)\n```', message.content, re.DOTALL)
            if match:
                source_code = match.group(1)
                break

    # Nothing usable was found: ask the user rather than continuing.
    if not source_code:
        return {
            "messages": [AIMessage(content="Please provide the source code you'd like to analyze for memory leaks.")]
        }

    # Proceed to the next step with the source code.
    return {
        "messages": [AIMessage(content=f"Received source code ({len(source_code)} characters). Proceeding with code embedding and analysis.")],
        "source_code": source_code
    }

In [2]:
# Node 2: Code Embedding System
def code_embedding_node(state: HarnessGenerationState):
    """Run the embedding tool on the source code and report the outcome.

    Passes ``state.source_code`` to ``embed_code``, records the result on
    ``state.embeddings``, and returns it together with a status message.

    Args:
        state: Current workflow state; ``source_code`` is read from it.

    Returns:
        A partial state update with a ``messages`` entry reporting how many
        functions were found and the raw ``embeddings`` result.
        NOTE(review): the result is assumed to be a mapping with a
        ``'functions'`` entry that supports ``len()`` -- confirm against
        ``embed_code``.
    """
    embedding_result = embed_code(state.source_code)

    # Mirror the result onto the state object as well as returning it.
    state.embeddings = embedding_result

    function_count = len(embedding_result['functions'])
    status_message = AIMessage(
        content=f"Source code embedded successfully. Found {function_count} functions."
    )

    update = {"messages": [status_message]}
    update["embeddings"] = embedding_result
    return update

In [3]:
# Node 3: Analyzer (LLM Agent)
def analyzer_node(state: HarnessGenerationState):
    """Identify functions that may leak memory, using ChromaDB metadata and an LLM.

    Steps:
      1. Fetch functions from ``code_collection`` whose metadata says they
         call malloc, split into "no free" (flagged as potential leaks) and
         "has free" (included for deeper inspection).
      2. Send all of them to ``llm_with_tools`` in a single system prompt.
      3. Derive the list of vulnerable functions from the response, trying in
         order: the arguments of any ``query_pattern_db`` tool calls, function
         names mentioned in the response text, and finally the preliminary
         "malloc without free" flag.

    Args:
        state: Current workflow state. It is not read here; the functions
            come from the module-level ``code_collection``.

    Returns:
        A partial state update with a summary ``messages`` entry and the
        ``vulnerable_functions`` list (function ids, in discovery order).
    """
    # Chroma accepts exactly one top-level key in a ``where`` filter, so
    # several metadata conditions must be combined explicitly with ``$and``.
    potential_leaks = code_collection.get(
        where={"$and": [{"has_malloc": True}, {"has_free": False}]}
    )

    # Functions that both allocate and free can still leak on some paths.
    functions_with_both = code_collection.get(
        where={"$and": [{"has_malloc": True}, {"has_free": True}]}
    )

    # Function id -> details presented to the LLM. Potential leaks go in
    # first so their flag is never overwritten by the second result set.
    all_functions = {}

    for func_id, document in zip(potential_leaks["ids"], potential_leaks["documents"]):
        all_functions[func_id] = {
            "full_text": document,
            "potential_leak": True
        }

    for func_id, document in zip(functions_with_both["ids"], functions_with_both["documents"]):
        if func_id not in all_functions:
            all_functions[func_id] = {
                "full_text": document,
                "potential_leak": False
            }

    analyzer_prompt = f"""
    You are a specialized code analyzer focused on identifying potential memory leaks in C code.
    Analyze the following functions and identify those that might have memory leak vulnerabilities.
    Look for patterns like:
    1. Malloc without corresponding free
    2. Conditional frees that might not execute
    3. Error paths that don't properly clean up resources
    4. Nested allocations with incomplete cleanup
    
    Here are the functions to analyze, with preliminary analysis from our system:
    {json.dumps(all_functions, indent=2)}
    
    Some functions are already flagged with 'potential_leak: true' based on initial analysis.
    Please verify these and check other functions for more complex memory leak patterns.
    
    For each suspicious function, use the query_pattern_db tool to find matching memory leak patterns.
    
    Output a list of function names that might have memory leaks, with a brief explanation for each.
    """

    # Call the LLM with the analyzer prompt
    response = llm_with_tools.invoke(
        [
            SystemMessage(content=analyzer_prompt)
        ]
    )

    vulnerable_functions = []

    # 1) Tool calls: the tools are not executed here. A function counts as
    #    suspicious if its id appears in the arguments of a
    #    ``query_pattern_db`` call requested by the model.
    if hasattr(response, "tool_calls") and response.tool_calls:
        for tool_call in response.tool_calls:
            if tool_call["name"] != "query_pattern_db":
                continue
            query_text = str(tool_call["args"])
            for func_name in all_functions:
                if func_name in query_text and func_name not in vulnerable_functions:
                    vulnerable_functions.append(func_name)

    # 2) Response text: case-insensitive mention of a function id.
    if not vulnerable_functions:
        content = response.content
        # Message content may be a list of content blocks rather than a
        # plain string; stringify so the substring search cannot fail.
        if not isinstance(content, str):
            content = str(content)
        content_lower = content.lower()
        vulnerable_functions = [
            func_name
            for func_name in all_functions
            if func_name.lower() in content_lower
        ]

    # 3) Fallback: trust the preliminary "malloc without free" flag.
    if not vulnerable_functions:
        vulnerable_functions = [
            func_name
            for func_name, func_info in all_functions.items()
            if func_info.get("potential_leak", False)
        ]

    return {
        "messages": [AIMessage(content=f"Analysis complete. Identified {len(vulnerable_functions)} potentially vulnerable functions: {', '.join(vulnerable_functions)}")],
        "vulnerable_functions": vulnerable_functions
    }

In [4]:
# Node 4: Junction - Routes each vulnerable function to the generator
def junction_node(state: HarnessGenerationState):
    """Announce the hand-off from analysis to harness generation.

    The state is neither read nor modified; the only update returned is a
    status message. A fuller implementation might set up parallel processing
    or build a queue of functions to process at this point.

    Args:
        state: Current workflow state (unused).

    Returns:
        A partial state update containing a single ``messages`` entry.
    """
    notice = AIMessage(
        content="Preparing to generate harnesses for each vulnerable function."
    )
    return {"messages": [notice]}