# CBMC-Compatible Harness Generation System - Part 1: Setup and ChromaDB Integration

This part handles the initial setup, imports, and ChromaDB configuration for our memory leak detection system.

In [1]:
# Import necessary libraries
from langgraph.graph import MessagesState
from langchain_core.messages import ToolMessage, HumanMessage, SystemMessage, AIMessage
from langchain_core.tools import tool
from typing_extensions import Literal
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display
import json
import os
from typing import Dict, List, Any
import chromadb
from chromadb.utils import embedding_functions
import numpy as np

In [2]:
# Set up the LLM
from langchain_anthropic import ChatAnthropic

# Model identifier handed to ChatAnthropic; kept in one place so it is easy to change.
LLM_MODEL_NAME = "claude-3-7-sonnet-latest"
llm = ChatAnthropic(model=LLM_MODEL_NAME)

# Set up ChromaDB client
chroma_client = chromadb.Client()

# Embedding function shared by both collections (SentenceTransformer "all-MiniLM-L6-v2")
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Create (or reuse) the code collection.
# get_or_create_collection keeps this cell safe to re-run in a live kernel:
# create_collection raises when a collection with the same name already exists.
code_collection = chroma_client.get_or_create_collection(
    name="code_embeddings",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"}  # use cosine distance in the HNSW index
)

# Create (or reuse) the pattern collection for known memory leak patterns
pattern_collection = chroma_client.get_or_create_collection(
    name="pattern_embeddings",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"}  # same distance metric as code_embeddings
)

# Initialize pattern collection with known memory leak patterns.
# upsert (rather than add) makes this seeding step idempotent: re-running the
# cell rewrites the same three fixed IDs instead of colliding with the entries
# that are already stored.
# ids, documents and metadatas are parallel lists: entry i of each list
# describes the same pattern.
pattern_collection.upsert(
    ids=["pattern1", "pattern2", "pattern3"],
    # Text that gets embedded for each pattern
    documents=[
        "Allocation without corresponding deallocation (malloc without free)",
        "Nested malloc calls with potential for partial free",
        "Conditional free that might not execute in all paths"
    ],
    # Structured details stored alongside each pattern
    metadatas=[
        {
            "name": "malloc_without_free",
            "description": "Allocation without corresponding deallocation",
            "severity": "high",
            "verification_strategy": "Check all execution paths for memory deallocation"
        },
        {
            "name": "nested_malloc",
            "description": "Nested malloc calls with potential for partial free",
            "severity": "medium",
            "verification_strategy": "Ensure all allocations are freed in all execution paths"
        },
        {
            "name": "conditional_free",
            "description": "Conditional free that might not execute",
            "severity": "medium",
            "verification_strategy": "Verify all conditions that lead to memory release"
        }
    ]
)

## Define the State

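We'll extend LangGraph's `MessagesState` with the extra fields our pipeline needs: the source code under analysis, its embeddings, the list of vulnerable functions, the generated harnesses, and the CBMC results.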

In [3]:
# Define a custom state that extends MessagesState
class HarnessGenerationState(MessagesState):
    """Graph state for harness generation: chat messages plus pipeline data.

    ``MessagesState`` is a ``TypedDict``, so at runtime a state is a plain
    ``dict``. Fields are read with ``state["key"]``, class-level default
    values are never applied, and properties or methods declared on this class
    cannot be reached from a state value. The fields are therefore declared as
    annotations only; a key is absent until the graph input or a node writes
    it. Use ``is_harness_generation_done(state)`` for the completion check.
    """

    source_code: str  # source code to analyse
    embeddings: Dict  # embedding data; structure is not fixed here
    vulnerable_functions: List[str]  # presumably function names flagged for verification
    harnesses: Dict[str, str]  # presumably function name -> generated harness text
    cbmc_results: Dict[str, str]  # keyed by the entries of vulnerable_functions


def is_harness_generation_done(state: HarnessGenerationState) -> bool:
    """Return True once every vulnerable function has a CBMC result.

    The state must list at least one vulnerable function, and each listed
    function must appear as a key in ``cbmc_results``. Keys that have not been
    written to the state yet are treated as empty.

    Args:
        state: The current graph state (a plain mapping at runtime).

    Returns:
        True when ``vulnerable_functions`` is non-empty and fully covered by
        ``cbmc_results``; False otherwise.
    """
    # .get() with a fallback: a TypedDict state has no defaults, so either key
    # may still be missing early in the run.
    vulnerable = state.get("vulnerable_functions") or []
    results = state.get("cbmc_results") or {}
    return len(vulnerable) > 0 and all(func in results for func in vulnerable)