In [2]:
# === IMPORTS ===
import json
import re
import zlib
import base64
import requests
import operator
import os
import pandas as pd
import logging
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Annotated, List, TypedDict, Optional, Dict, Any, Tuple
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from sentence_transformers import SentenceTransformer
from langgraph.graph import StateGraph, START, END
from langgraph.store.memory import InMemoryStore

# === LOGGING CONFIGURATION ===
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [None]:
# === CONSTANTS ===

class NodeNames(str, Enum):
    """Enum for node names to avoid string literals."""
    RETRIEVE = "retrieve"
    DECOMPOSE = "decompose"
    GENERATE = "generate"
    SYNTAX_CHECK = "syntax_check"
    CRITIC = "critic"
    SUMMARIZE = "summarize"
    REFLECT = "reflect"

# === STRUCTURED OUTPUT MODELS ===

class GraphInput(TypedDict):
    requirements: str


class GraphOutput(TypedDict):
    final_diagram: str
    iterations: int
    logic_valid: bool


class CritiqueError(BaseModel):
    """Model for a single critique error."""
    type: str = Field(description="Type of error")
    description: str = Field(description="Detailed description of the error")


class CritiqueResponse(BaseModel):
    """Structured output from the CRITIC node."""
    is_valid: bool = Field(description="Whether the diagram is valid")
    errors: List[CritiqueError] = Field(default_factory=list, description="List of errors found")
    warnings: List[str] = Field(default_factory=list, description="List of warnings")
    missing_concepts: List[str] = Field(default_factory=list, description="Concepts from requirements not in diagram")


class SummaryResponse(BaseModel):
    """Structured output from the SUMMARIZER node."""
    is_complete: bool = Field(description="Whether all issues are resolved")
    fixed: List[str] = Field(default_factory=list, description="Issues that were fixed")
    unresolved: List[str] = Field(default_factory=list, description="Issues still present")
    message: str = Field(description="Brief status summary")


class ComparisonResponse(BaseModel):
    """Structured output from the COMPARATOR (in REFLECT node)."""
    is_better: bool = Field(description="Whether the new diagram is better than the old")
    reasoning: str = Field(description="Explanation of the decision")
    recommendation: str = Field(description="Either 'keep_new' or 'rollback_to_old'")


class PromptConstants:
    """Centralized prompt templates for better maintainability."""
    
    DECOMPOSER_SYSTEM = """
    You are a Software Architect. Extract the structural elements from these requirements for a UML Class Diagram.
    
    Identify ONLY what is EXPLICITLY mentioned:
    - **Classes**: Main entities explicitly named in requirements (e.g., User, Order, Product)
    - **Attributes**: Data fields explicitly mentioned with types (e.g., name: String, price: Double)
    - **Potential Relationships**: Direct interactions explicitly stated (e.g., "Customer places Order")
    
    CRITICAL RULES:
    - Extract ONLY what is explicitly mentioned - do NOT infer or add entities
    - Do NOT create system-level or manager classes. Focus on the internal components.
    - Do NOT add classes for "the system" or "the application"
    - Extract only data attributes, NOT methods or operations.
    - If something isn't clearly mentioned, DON'T include it
    """
    
    GENERATOR_SYSTEM = """
    You are a UML Design Expert. Create a PlantUML class diagram from the design plan and requirements.
    
    Design principles:
    - Use inheritance when entities share common attributes
    - Choose relationship types: composition (*--) for ownership, association (-->) for references
    - Add cardinality on BOTH sides with quotes: ClassA "1" --> "*" ClassB
    - One relationship per class pair - choose the strongest if multiple apply
    - Connect relationships to parent classes, not subclasses (children inherit them)
    - Every class must connect to at least one other class
    
    CRITICAL CONSTRAINTS:
    - Create ONLY classes explicitly mentioned in the design plan - NO extras
    - Do NOT create system-level classes
    - Do NOT add "helpful" classes that weren't asked for
    - Include ONLY attributes explicitly mentioned in requirements/plan
    - STRICTLY NO METHODS - classes should only have attributes.
    
    Output valid PlantUML with:
    - class ClassName { attributes }
    - Proper arrows: <|-- (inheritance), *-- (composition), --> (association)
    - NO methods (no parentheses)
    """
    
    CRITIC_SYSTEM = """
    You are a UML Validator. Check if the diagram matches the requirements.
    
    Validate:
    - **Completeness**: Are all key entities from requirements present?
    - **Attributes**: Do classes have the right data fields?
    - **Relationships**: Are the right connections present?
    - **Structure**: No duplicate relationships, no isolated classes
    - **Cardinality**: Are multiplicities specified on both ends?
    - **Unnecessary Additions**: Are there EXTRA classes not mentioned in requirements?
    - **No Methods**: Are there any methods defined? (There should be NONE)
    
    Mark is_valid=false if:
    - Missing key entities from requirements
    - Missing critical relationships
    - Missing important attributes
    - Cardinality missing or clearly wrong (mark as error, not just warning)
    - Structural issues (isolated classes, duplicate relationships)
    - EXTRA classes that weren't in requirements
    - System-level or controller classes that represent "the whole system"
    - Any methods or operations are present in the classes

    Only mark is_valid=true when the diagram properly models all requirements WITHOUT extras.
    
    Return your response in the specified structured format.
    """
    
    SUMMARIZER_SYSTEM = """
    Compare the current critique with previous ones and identify progress.
    
    Analyze what has been fixed and what remains unresolved.
    Set is_complete=true only if no errors remain.
    
    Return your response in the specified structured format.
    """
    
    REFLECTOR_SYSTEM = """
    You are a Senior Software Engineer. Your task is to fix the UML Class Diagram provided. 

    Guidelines:
    - PRESERVE everything that's working (review the "fixed" list carefully)
    - ONLY fix the specific "unresolved" issues mentioned
    - DO NOT reorganize, refactor, or "improve" working parts
    - DO NOT remove classes or relationships that aren't explicitly wrong
    - DO NOT add new classes unless explicitly required by unresolved issues
    - DO NOT add system-level classes 
    - REMOVE any methods if they were accidentally added
    - If unsure, prefer keeping the existing structure
    - Make the smallest possible change to address each issue
    
    CRITICAL: This is a FIX operation, not a redesign. Change as little as possible.
    
    Output ONLY the corrected PlantUML diagram with minimal changes.
    """
    
    COMPARATOR_SYSTEM = """
    You are a UML Quality Assessor. Compare two versions of a UML diagram.
    
    Your task:
    1. Compare the OLD diagram with the NEW diagram
    2. Determine if the NEW version is STRICTLY BETTER than the OLD version
    3. Consider:
       - Did it fix the reported issues?
       - Did it preserve what was working?
       - Did it introduce new problems?
       - Is the structure still coherent?
    
    IMPORTANT: Only return is_better=true if the new version is clearly an improvement.
    If unsure, return is_better=false.
    Set recommendation to 'keep_new' or 'rollback_to_old' accordingly.
    
    Return your response in the specified structured format.
    """


@dataclass
class SystemConfig:
    """Configuration for the UML generation system."""
    lmstudio_base_url: str = "http://localhost:1234/v1"
    model_name: str = "qwen2.5-coder-14b-instruct"
    embedder_model: str = "all-MiniLM-L6-v2"
    shots_json_path: str = "../data/complete_shots.json"
    plantuml_host: str = "http://localhost:8080"
    max_iterations: int = 6
    max_tokens_decompose: int = 1024
    max_tokens_generate: int = 2048
    max_tokens_critique: int = 2048
    max_tokens_summarize: int = 512
    max_tokens_reflect: int = 2048
    max_tokens_compare: int = 1024
    temperature_base: float = 0.8
    temperature_reflect: float = 0.5
    num_few_shots: int = 2
    request_timeout: int = 5  # For PlantUML server
    llm_timeout: int = 120  # 2 minutes for LLM operations


def extract_plantuml(text: str) -> str:
    """
    Extract PlantUML code from markdown blocks or raw text.
    
    Args:
        text: Text containing PlantUML code
        
    Returns:
        Extracted PlantUML code or empty string
    """
    if not text:
        return ""
    
    # Try to extract from ```plantuml ... ```
    fence_match = re.search(r"```\s*plantuml\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fence_match:
        return fence_match.group(1).strip()
    
    # Try to extract from @startuml ... @enduml
    tag_match = re.search(r"@startuml.*?@enduml", text, re.DOTALL | re.IGNORECASE)
    if tag_match:
        return tag_match.group(0).strip()
    
    return text.strip()


# === STATE DEFINITION ===

class AgentState(TypedDict):
    """
    Shared state for the LangGraph workflow.
    
    The 'history' field uses operator.add annotation to automatically
    append new critiques instead of replacing the entire list.
    """
    requirements: str
    plan: Optional[str]
    examples: List[Dict[str, str]]
    current_diagram: Optional[str]
    best_diagram: Optional[str]  # Track best diagram seen so far
    best_quality_score: float  # Quality metric (0-1) for best diagram
    history: Annotated[List[Dict[str, Any]], operator.add]
    summary: Optional[str]
    syntax_valid: bool
    logic_valid: bool
    iterations: int
    error_message: Optional[str]

In [4]:
# === LLM ENGINE ===

def create_llm(config: Optional[SystemConfig] = None, temperature: Optional[float] = None) -> ChatOpenAI:
    """
    Create a ChatOpenAI instance configured for LMStudio.
    
    LMStudio provides an OpenAI-compatible API, making it easy to use
    with LangChain's standard interfaces.
    
    Args:
        config: Optional system configuration
        temperature: Optional temperature override
        
    Returns:
        Configured ChatOpenAI instance
    """
    cfg = config or SystemConfig()
    temp = temperature if temperature is not None else cfg.temperature_base
    logger.info(f"Connecting to LMStudio at {cfg.lmstudio_base_url}")
    logger.info(f"Using model: {cfg.model_name} (temp={temp})")
    
    return ChatOpenAI(
        base_url=cfg.lmstudio_base_url,
        api_key="lm-studio",  
        model=cfg.model_name,
        temperature=temp,
        timeout=cfg.llm_timeout 
    )

In [5]:
# === PLANTUML TOOL ===

@dataclass
class PlantUMLResult:
    """Result from PlantUML validation."""
    is_valid: bool
    error: Optional[str] = None
    url: Optional[str] = None
    svg_url: Optional[str] = None


class PlantUMLTool:
    """
    Tool for validating and rendering PlantUML diagrams.
    
    This class interfaces with a PlantUML server to check syntax
    and generate diagram URLs.
    """
    
    def __init__(self, host: str = "http://localhost:8080"):
        """
        Initialize PlantUML tool.
        
        Args:
            host: PlantUML server host URL
        """
        self.host = host
        logger.info(f"PlantUML tool initialized with host: {host}")

    def _extract_plantuml(self, text: str) -> str:
        """Extract PlantUML code from mixed text."""
        return extract_plantuml(text)

    def _encode_plantuml(self, plantuml_code: str) -> str:
        """
        Encode PlantUML code for URL using deflate compression.
        
        Args:
            plantuml_code: Raw PlantUML code
            
        Returns:
            URL-safe encoded string
        """
        code = plantuml_code.strip()
        
        # Ensure proper tags
        if not code.startswith("@startuml"): 
            code = f"@startuml\n{code}"
        if not code.endswith("@enduml"): 
            code = f"{code}\n@enduml"
        
        # Compress and encode
        compressed = zlib.compress(code.encode('utf-8'))[2:-4]
        encoded = base64.b64encode(compressed).translate(
            bytes.maketrans(
                b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",
                b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_"
            )
        ).decode('utf-8')
        
        return encoded

    def get_diagram_url(self, plantuml_code: str, format: str = "png") -> str:
        """
        Generate a viewable URL for the PlantUML diagram.
        
        Args:
            plantuml_code: PlantUML diagram code
            format: Output format (png, svg, etc.)
            
        Returns:
            URL to view the diagram
        """
        diagram_code = self._extract_plantuml(plantuml_code)
        encoded = self._encode_plantuml(diagram_code)
        return f"{self.host}/{format}/{encoded}"

    def check_syntax(self, plantuml_code: str, timeout: int = 5) -> PlantUMLResult:
        """
        Validate PlantUML syntax by attempting to render.
        
        Args:
            plantuml_code: PlantUML code to validate
            timeout: Request timeout in seconds
            
        Returns:
            PlantUMLResult with validation status
        """
        logger.info("Validating PlantUML syntax")
        
        try:
            diagram_code = self._extract_plantuml(plantuml_code)
            encoded = self._encode_plantuml(diagram_code)
            url = f"{self.host}/png/{encoded}"
            
            response = requests.get(url, timeout=timeout)
            logger.debug(f"Server response status: {response.status_code}")
            
            if response.status_code == 200:
                # Check if response is actually a PNG
                if response.content[:4] == b'\x89PNG':
                    logger.info("✓ Syntax validation passed")
                    return PlantUMLResult(
                        is_valid=True,
                        url=f"{self.host}/png/{encoded}",
                        svg_url=f"{self.host}/svg/{encoded}"
                    )
                else:
                    error_text = response.text[:500] if response.text else "Unknown error"
                    logger.warning(f"Syntax error detected: {error_text}")
                    return PlantUMLResult(
                        is_valid=False,
                        error=f"PlantUML Syntax Error: {error_text}"
                    )
            else:
                error_text = response.text[:500] if response.text else "No details"
                logger.error(f"Server error {response.status_code}: {error_text}")
                return PlantUMLResult(
                    is_valid=False,
                    error=f"Server returned {response.status_code}: {error_text}"
                )
        
        except requests.exceptions.ConnectionError:
            error_msg = f"Cannot connect to PlantUML server at {self.host}"
            logger.error(error_msg)
            return PlantUMLResult(is_valid=False, error=error_msg)
            
        except requests.exceptions.Timeout:
            error_msg = f"Request timeout after {timeout}s"
            logger.error(error_msg)
            return PlantUMLResult(is_valid=False, error=error_msg)
            
        except Exception as e:
            error_msg = f"Unexpected error: {str(e)}"
            logger.error(error_msg)
            return PlantUMLResult(is_valid=False, error=error_msg)


In [6]:
# === LONG-TERM MEMORY MANAGER ===

class MemoryManager:
    """
    Manages long-term memory for UML diagram generation.
    
    Stores successfully validated diagrams with their metadata for future retrieval.
    Supports semantic search to find similar past solutions.
    
    Note: Diagrams are NOT automatically added. Use save_diagram() manually
    after reviewing and approving the generated output.
    """
    
    def __init__(
        self,
        embedder: SentenceTransformer,
        store: Optional[InMemoryStore] = None,
        shots_json_path: Optional[str] = None
    ):
        """
        Initialize memory manager.
        
        Args:
            embedder: SentenceTransformer model for semantic search
            store: Optional InMemoryStore instance (creates new if None)
        """
        self.embedder = embedder
        self._count = 0  # Track number of stored diagrams
        
        # Create embedding function wrapper for InMemoryStore
        def embed_func(texts: List[str]) -> List[List[float]]:
            embeddings = self.embedder.encode(texts, convert_to_tensor=False)
            return embeddings.tolist()
        
        # Initialize store with embedding function
        self.store = store or InMemoryStore(
            index={
                "embed": embed_func,
                "dims": 384  # all-MiniLM-L6-v2 embedding dimensions
            }
        )
        
        logger.info("MemoryManager initialized with InMemoryStore")

        if shots_json_path:
            self._load_initial_shots(shots_json_path)
    

    def _load_initial_shots(self, json_path: str):
        """Internal method to load static examples into the memory store."""
        if not os.path.exists(json_path):
            logger.warning(f"Shots file not found at '{json_path}'.")
            return
            
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                shots = json.load(f)
            
            for shot in shots:
                # Store the decomposition and CoT in metadata so the prompt 
                # can benefit from the full Chain of Thought
                metadata = {
                    "plan": shot.get("subgoal_decomposition"),
                    "reasoning": shot.get("chain_of_thought"),
                    "is_static": True,
                    "title": shot.get("title")
                }
                self.save_diagram(
                    requirements=shot["requirements"],
                    diagram=shot["solution_plantuml"],
                    metadata=metadata
                )
            logger.info(f"✓ Pre-loaded {len(shots)} static shots into MemoryManager")
        except Exception as e:
            logger.error(f"Failed to load initial shots: {e}")

    def save_diagram(
        self,
        requirements: str,
        diagram: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Manually save a validated diagram to long-term memory.
        
        Call this ONLY after reviewing and approving the generated diagram.
        
        Args:
            requirements: Original requirements text
            diagram: PlantUML diagram code
            metadata: Optional metadata (iterations, validation status, etc.)
            
        Returns:
            Key of the stored memory
        """
        timestamp = datetime.now().isoformat()
        key = f"diagram_{timestamp}"
        
        memory_data = {
            "requirements": requirements,
            "diagram": diagram,
            "timestamp": timestamp,
            "metadata": metadata or {}
        }
        
        # Store in the "validated_diagrams" namespace
        self.store.put(
            namespace=("uml_memory", "validated"),
            key=key,
            value=memory_data
        )
        
        self._count += 1
        logger.info(f"✓ Diagram saved to long-term memory: {key}")
        return key
    
    def retrieve_similar_diagrams(
        self,
        requirements: str,
        limit: int = 2
    ) -> List[Dict[str, Any]]:
        """
        Retrieve similar diagrams from long-term memory.
        
        Args:
            requirements: Requirements to search for
            limit: Maximum number of results
            
        Returns:
            List of similar diagram memories
        """
        try:
            # If no items saved yet, return empty
            if self._count == 0:
                logger.info("No diagrams in memory yet")
                return []
            
            # Use search with namespace tuple as first positional argument
            results = self.store.search(
                ("uml_memory", "validated"),
                query=requirements,
                limit=limit
            )
            
            diagrams = [item.value for item in results]
            logger.info(f"Retrieved {len(diagrams)} similar diagrams from memory")
            return diagrams
            
        except Exception as e:
            logger.warning(f"Memory retrieval failed: {e}")
            return []
    
    def clear_memory(self) -> None:
        """
        Clear all memories from the store.
        
        WARNING: This is irreversible!
        """
        logger.warning("Clearing all memories from long-term storage")
        # InMemoryStore doesn't have a direct clear method for namespace,
        # so we recreate the store
        def embed_func(texts: List[str]) -> List[List[float]]:
            embeddings = self.embedder.encode(texts, convert_to_tensor=False)
            return embeddings.tolist()
        
        self.store = InMemoryStore(
            index={
                "embed": embed_func,
                "dims": 384
            }
        )
        self._count = 0
        logger.info("Memory cleared")


In [7]:
# === UML NODES (AGENTS) ===

class UMLNodes:
    """
    Collection of agent nodes for the UML generation workflow.
    
    Each method represents a node in the LangGraph workflow and
    follows the pattern of taking AgentState and returning a dict
    with state updates.
    """
    
    def __init__(
        self,
        llm: ChatOpenAI,
        plantuml_tool: PlantUMLTool,
        memory_manager: Optional['MemoryManager'] = None,
        config: Optional[SystemConfig] = None
    ):
        """
        Initialize UML nodes with required dependencies.
        
        Args:
            llm: LangChain ChatOpenAI instance
            plantuml_tool: Tool for PlantUML validation
            memory_manager: long-term memory manager
            config: Optional system configuration
        """
        self.llm = llm
        self.plantuml_tool = plantuml_tool
        self.memory_manager = memory_manager
        self.config = config or SystemConfig()
        logger.info("UMLNodes initialized")

    def retrieve(self, state: AgentState) -> Dict[str, Any]:
        """
        Retrieve relevant few-shot examples based on requirements.
        
        Now augmented with long-term memory: retrieves both static examples
        and learned diagrams from past successful generations.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'examples' key containing formatted shots
        """
        logger.info(f"--- NODE: {NodeNames.RETRIEVE.upper()} ---")
        
        try:
            # Search for the most relevant examples (both pre-loaded and learned)
            # We use the config to determine how many examples to inject
            memories = self.memory_manager.retrieve_similar_diagrams(
                state["requirements"],
                limit=self.config.num_few_shots
            )
            
            formatted_shots = []
            for mem in memories:
                # 1. User Message: The Requirements
                formatted_shots.append({
                    "role": "user",
                    "content": f"Requirements:\n{mem['requirements']}"
                })
                
                # 2. Assistant Message: Plan + Reasoning + Diagram
                # We pull the CoT steps from metadata if they exist
                meta = mem.get("metadata", {})
                plan = meta.get("plan", "No plan available.")
                reasoning = meta.get("reasoning", "No reasoning available.")
                
                assistant_content = (
                    f"1. DESIGN PLAN:\n{plan}\n\n"
                    f"2. DESIGN REASONING:\n{reasoning}\n\n"
                    f"3. PLANTUML DIAGRAM:\n```plantuml\n{mem['diagram']}\n```"
                )
                
                formatted_shots.append({
                    "role": "assistant",
                    "content": assistant_content
                })
            
            logger.info(f"Retrieved {len(memories)} relevant examples from unified memory")
            return {"examples": formatted_shots}
        except Exception as e:
            logger.error(f"Retrieval failed: {e}")
            return {"examples": []}

    def decompose(self, state: AgentState) -> Dict[str, Any]:
        """
        Decompose requirements into structural building blocks.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'plan' key containing decomposition
        """
        logger.info(f"--- NODE: {NodeNames.DECOMPOSE.upper()} ---")
        
        messages = [
            SystemMessage(content=PromptConstants.DECOMPOSER_SYSTEM),
            HumanMessage(content=f"REQUIREMENTS:\n{state['requirements']}")
        ]
        
        try:
            response = self.llm.invoke(
                messages,
                max_tokens=self.config.max_tokens_decompose
            )
            logger.info("Decomposition completed")
            return {"plan": response.content}
            
        except Exception as e:
            logger.error(f"Decomposition failed: {e}")
            return {"plan": f"Error: {str(e)}"}

    def generate(self, state: AgentState) -> Dict[str, Any]:
        """
        Generate PlantUML diagram using chain-of-thought reasoning.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'current_diagram' and 'iterations' updates
        """
        logger.info(f"--- NODE: {NodeNames.GENERATE.upper()} ---")
        
        messages = [SystemMessage(content=PromptConstants.GENERATOR_SYSTEM)]
        
        # Add few-shot examples if available
        if state.get("examples"):
            for example in state["examples"]:
                if example["role"] == "user":
                    messages.append(HumanMessage(content=example["content"]))
                else:
                    messages.append(AIMessage(content=example["content"]))
            logger.debug(f"Added {len(state['examples'])} example messages")
            
        user_content = f"""
        === ORIGINAL REQUIREMENTS ===
        {state['requirements']}

        === DESIGN PLAN ===
        {state['plan']}

        Follow the examples above exactly. Output your response in three parts:
        1. DESIGN PLAN: (Briefly refine the plan for implementation)
        2. DESIGN REASONING: (Explain your choice of relationships and cardinality)
        3. PLANTUML DIAGRAM: (The code block)
        """
        
        messages.append(HumanMessage(content=user_content))
        
        try:
            response = self.llm.invoke(
                messages,
                max_tokens=self.config.max_tokens_generate
            )
            diagram = extract_plantuml(response.content)
            
            logger.info(f"Generation completed (iteration {state['iterations'] + 1})")
            return {
                "current_diagram": diagram,
                "iterations": state["iterations"] + 1
            }
            
        except Exception as e:
            logger.error(f"Generation failed: {e}")
            return {
                "current_diagram": f"Error: {str(e)}",
                "iterations": state["iterations"] + 1
            }

    def syntax_check(self, state: AgentState) -> Dict[str, Any]:
        """
        Validate PlantUML syntax through server.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'syntax_valid' and optional 'error_message'
        """
        logger.info(f"--- NODE: {NodeNames.SYNTAX_CHECK.upper()} ---")
        
        try:
            result = self.plantuml_tool.check_syntax(
                state["current_diagram"],
                timeout=self.config.request_timeout
            )
            
            if result.is_valid:
                logger.info(f"✓ Syntax valid. View at: {result.url}")
            else:
                logger.warning(f"✗ Syntax error: {result.error}")
            
            return {
                "syntax_valid": result.is_valid,
                "error_message": result.error if not result.is_valid else None
            }
            
        except Exception as e:
            logger.error(f"Syntax check failed: {e}")
            return {
                "syntax_valid": False,
                "error_message": f"Syntax check error: {str(e)}"
            }

    def critic(self, state: AgentState) -> Dict[str, Any]:
        """
        Perform logical validation of the UML diagram.
        
        Uses structured output with Pydantic model for reliable JSON parsing.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'logic_valid' and 'history' updates
        """
        logger.info(f"--- NODE: {NodeNames.CRITIC.upper()} ---")
        
        plantuml_only = extract_plantuml(state["current_diagram"])
        user_msg = f"""=== REQUIREMENTS ===
        {state['requirements']}

        === DIAGRAM ===
        {plantuml_only}

        Audit the diagram thoroughly."""
        
        messages = [
            SystemMessage(content=PromptConstants.CRITIC_SYSTEM),
            HumanMessage(content=user_msg)
        ]
        
        try:
            # Use structured output for reliable JSON parsing
            structured_llm = self.llm.with_structured_output(CritiqueResponse)
            critique_response: CritiqueResponse = structured_llm.invoke(messages)
            
            # Convert Pydantic model to dict for history
            critique = {
                "is_valid": critique_response.is_valid,
                "errors": [{"type": err.type, "description": err.description} 
                          for err in critique_response.errors],
                "warnings": critique_response.warnings,
                "missing_concepts": critique_response.missing_concepts
            }
            
            is_valid = critique_response.is_valid
            logger.info(f"{'✓' if is_valid else '✗'} Logic validation: {'PASSED' if is_valid else 'FAILED'}")
            
            if not is_valid and critique_response.errors:
                logger.info(f"Found {len(critique_response.errors)} errors")
            
            # Track best diagram (store current if valid)
            updates = {
                "logic_valid": is_valid,
                "history": [critique]  # Appends thanks to operator.add
            }
            
            # Update best diagram if this one is valid
            if is_valid and not state.get("best_diagram"):
                logger.info(f"✓ Storing first valid diagram as best")
                updates["best_diagram"] = state["current_diagram"]
            
            return updates
            
        except Exception as e:
            logger.error(f"Critic failed: {e}")
            return {
                "logic_valid": False,
                "history": [{
                    "is_valid": False,
                    "errors": [{"type": "system", "description": str(e)}],
                    "warnings": [],
                    "missing_concepts": []
                }]
            }

    def summarize_memory(self, state: AgentState) -> Dict[str, Any]:
        """
        Summarize progress by comparing current and previous critiques.
        
        Uses structured output with Pydantic model for reliable JSON parsing.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'summary' key containing JSON string
        """
        logger.info(f"--- NODE: {NodeNames.SUMMARIZE.upper()} ---")
        
        if not state.get("history"):
            logger.info("No history to summarize")
            return {"summary": json.dumps({"is_complete": False, "message": "No history"})}
        
        current_critique = state["history"][-1]
        previous_critiques = state["history"][:-1]
        
        user_prompt = f"""
        CURRENT CRITIQUE (Issues in the latest diagram):
        {json.dumps(current_critique)}
        
        PREVIOUS CRITIQUES (History of past issues):
        {json.dumps(previous_critiques)}
        """
        
        messages = [
            SystemMessage(content=PromptConstants.SUMMARIZER_SYSTEM),
            HumanMessage(content=user_prompt)
        ]
        
        try:
            # Use structured output for reliable JSON parsing
            structured_llm = self.llm.with_structured_output(SummaryResponse)
            summary_response: SummaryResponse = structured_llm.invoke(messages)
            
            # Convert to JSON string for storage
            summary = {
                "is_complete": summary_response.is_complete,
                "fixed": summary_response.fixed,
                "unresolved": summary_response.unresolved,
                "message": summary_response.message
            }
            
            logger.info(f"Summary: {summary_response.message}")
            return {"summary": json.dumps(summary)}
            
        except Exception as e:
            logger.error(f"Summarization failed: {e}")
            return {"summary": json.dumps({
                "is_complete": False,
                "fixed": [],
                "unresolved": [],
                "message": f"Error: {str(e)}"
            })}

    def reflect(self, state: AgentState) -> Dict[str, Any]:
        """
        Fix diagram based on memory summary and error history.
        Uses lower temperature for stability and explicit comparison to prevent regression.
        
        Args:
            state: Current workflow state
            
        Returns:
            Dict with 'current_diagram' and 'iterations' updates
        """
        logger.info(f"--- NODE: {NodeNames.REFLECT.upper()} ---")
        
        # Store old diagram for potential rollback
        old_diagram = state['current_diagram']
        
        # Use summary if available, otherwise use error message or last critique
        context = (
            state.get("summary") or 
            state.get("error_message") or 
            (json.dumps(state["history"][-1]) if state.get("history") else "No context")
        )
        
        user_msg = f"""
        === REQUIREMENTS ===
        {state['requirements']}

        === CURRENT DIAGRAM ===
        {state['current_diagram']}

        === MEMORY SUMMARY ===
        {context}

        Generate the corrected diagram, preserving what is fixed and resolving what is broken.
        """
        
        messages = [
            SystemMessage(content=PromptConstants.REFLECTOR_SYSTEM),
            HumanMessage(content=user_msg)
        ]
        
        try:
            # Create LLM with LOWER temperature for more conservative changes
            llm_reflect = create_llm(self.config, temperature=self.config.temperature_reflect)
            
            response = llm_reflect.invoke(
                messages,
                max_tokens=self.config.max_tokens_reflect
            )
            new_diagram = extract_plantuml(response.content)
            
            logger.info(f"Reflection generated new diagram (temp={self.config.temperature_reflect})")
            
            # Explicit comparison: Ask LLM if new version is better using structured output
            comparison_msg = f"""
            === OLD DIAGRAM ===
            {old_diagram}

            === NEW DIAGRAM ===
            {new_diagram}

            === ISSUES TO FIX ===
            {context}

            Compare these diagrams. Is the NEW version strictly better than the OLD?
            """
            
            comparison_messages = [
                SystemMessage(content=PromptConstants.COMPARATOR_SYSTEM),
                HumanMessage(content=comparison_msg)
            ]
            
            # Use structured output for reliable comparison parsing
            structured_llm = self.llm.with_structured_output(ComparisonResponse)
            comparison_response: ComparisonResponse = structured_llm.invoke(comparison_messages)
            
            if comparison_response.is_better:
                logger.info(f"✓ Comparison: NEW diagram is better - {comparison_response.reasoning}")
                final_diagram = new_diagram
            else:
                logger.warning(f"⚠️  Comparison: Rolling back to OLD diagram - {comparison_response.reasoning}")
                final_diagram = old_diagram
            
            logger.info(f"Reflection completed (iteration {state['iterations'] + 1})")
            
            return {
                "current_diagram": final_diagram,
                "iterations": state["iterations"] + 1
            }
            
        except Exception as e:
            logger.error(f"Reflection failed: {e}")
            return {
                "current_diagram": state["current_diagram"],  # Keep current on error
                "iterations": state["iterations"] + 1
            }

In [8]:
# === GRAPH DEFINITION ===

def create_uml_graph(
    nodes: UMLNodes, 
    config: Optional[SystemConfig] = None
) -> Any:
    """
    Create the LangGraph workflow for UML diagram generation.
    
    This function defines the complete workflow graph including:
    - All agent nodes
    - Entry point
    - Linear and conditional edges
    - Routing logic
    
    Args:
        nodes: UMLNodes instance with all agent methods
        config: Optional system configuration
        
    Returns:
        Compiled LangGraph workflow
    """
    cfg = config or SystemConfig()
    logger.info("Creating UML generation workflow")
    
    workflow = StateGraph(AgentState)

    # Add all nodes
    workflow.add_node(NodeNames.RETRIEVE, nodes.retrieve)
    workflow.add_node(NodeNames.DECOMPOSE, nodes.decompose)
    workflow.add_node(NodeNames.GENERATE, nodes.generate)
    workflow.add_node(NodeNames.SYNTAX_CHECK, nodes.syntax_check)
    workflow.add_node(NodeNames.CRITIC, nodes.critic)
    workflow.add_node(NodeNames.SUMMARIZE, nodes.summarize_memory)
    workflow.add_node(NodeNames.REFLECT, nodes.reflect)
    
    logger.debug("Added 7 nodes to workflow")

    # Entry point
    # workflow.set_entry_point(START)
    
    # Linear flow until first diagram generation
    workflow.add_edge(START, NodeNames.RETRIEVE)
    workflow.add_edge(NodeNames.RETRIEVE, NodeNames.DECOMPOSE)
    workflow.add_edge(NodeNames.DECOMPOSE, NodeNames.GENERATE)
    workflow.add_edge(NodeNames.GENERATE, NodeNames.SYNTAX_CHECK)

    # Conditional routing after syntax check
    def route_after_syntax_check(state: AgentState) -> str:
        """
        Route based on syntax validation results and iteration limits.
        
        Args:
            state: Current workflow state
            
        Returns:
            Next node name
        """
        if state["syntax_valid"]:
            logger.debug("Routing: syntax_check → critic")
            return NodeNames.CRITIC
            
        if state["iterations"] >= cfg.max_iterations:
            logger.warning(f"⚠ Max iterations ({cfg.max_iterations}) reached during syntax check")
            return END
            
        logger.debug("Routing: syntax_check → reflect")
        return NodeNames.REFLECT

    workflow.add_conditional_edges(
        NodeNames.SYNTAX_CHECK, 
        route_after_syntax_check,
        {
            NodeNames.CRITIC: NodeNames.CRITIC,
            NodeNames.REFLECT: NodeNames.REFLECT,
            END: END
        }
    )

    # Conditional routing after logic validation
    def is_logic_valid(state: AgentState) -> str:
        """
        Route based on logic validation and iteration limits.
        
        Args:
            state: Current workflow state
            
        Returns:
            Next node name or END
        """
        if state["logic_valid"]:
            logger.info("✓ Diagram validated successfully")
            return END
            
        if state["iterations"] >= cfg.max_iterations:
            logger.warning(f"⚠ Max iterations ({cfg.max_iterations}) reached")
            return END
            
        logger.debug("Routing: critic → summarize")
        return NodeNames.SUMMARIZE

    workflow.add_conditional_edges(
        NodeNames.CRITIC, 
        is_logic_valid,
        {
            END: END,
            NodeNames.SUMMARIZE: NodeNames.SUMMARIZE
        }
    )
    
    # After summarizing, always reflect
    workflow.add_edge(NodeNames.SUMMARIZE, NodeNames.REFLECT)
    
    # After reflecting, check syntax again
    workflow.add_edge(NodeNames.REFLECT, NodeNames.SYNTAX_CHECK)
    
    logger.info("Workflow graph created successfully")
    return workflow.compile()


def create_initial_state(requirements: str) -> AgentState:
    """
    Create an initial state for the workflow.
    
    Args:
        requirements: Software requirements text
        
    Returns:
        Initial AgentState dictionary
    """
    return {
        "requirements": requirements,
        "plan": None,
        "examples": [],
        "current_diagram": None,
        "best_diagram": None,
        "best_quality_score": 0.0,
        "history": [],
        "summary": None,
        "syntax_valid": False,
        "logic_valid": False,
        "iterations": 0,
        "error_message": None
    }

In [None]:
# === LOAD TEST DATA ===

def load_test_exercises(json_path: str = "../data/test_exercises.json") -> List[Dict[str, Any]]:
    """
    Load test exercises from JSON file.
    
    Args:
        json_path: Path to test exercises JSON
        
    Returns:
        List of exercise dictionaries
        
    Raises:
        FileNotFoundError: If file doesn't exist
        json.JSONDecodeError: If JSON is invalid
    """
    logger.info(f"Loading test exercises from {json_path}")
    
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Test exercises file not found: {json_path}")
    
    with open(json_path, "r", encoding="utf-8") as f:
        exercises = json.load(f)
    
    logger.info(f"Loaded {len(exercises)} test exercises")
    return exercises

# Load exercises
try:
    test_exercises = load_test_exercises()
    print(f"✓ Loaded {len(test_exercises)} test exercises")
    print(f"\nExample exercise preview:")
    print(f"Requirements: {test_exercises[0]['requirements'][:200]}...")
except Exception as e:
    logger.error(f"Failed to load test exercises: {e}")
    test_exercises = []


2025-12-28 19:54:11,484 - __main__ - INFO - Loading test exercises from data/test_exercises.json
2025-12-28 19:54:11,486 - __main__ - INFO - Loaded 8 test exercises


✓ Loaded 8 test exercises

Example exercise preview:
Requirements: An e-commerce platform manages products, customers, and orders.
Each product has a unique SKU, name, description, price, and stock quantity.
Products can be categorized, and a product can belong to mu...


In [10]:
# === SYSTEM INITIALIZATION ===

def initialize_system(
    config: Optional[SystemConfig] = None,
    enable_long_term_memory: bool = True
) -> Tuple[UMLNodes, Any, SystemConfig, Optional[MemoryManager]]:
    """
    Initialize all system components.
    
    Args:
        config: Optional system configuration
        enable_long_term_memory: Whether to enable long-term memory
        
    Returns:
        Tuple of (nodes, compiled_workflow, config, memory_manager)
    """
    cfg = config or SystemConfig()
    logger.info("="*60)
    logger.info("INITIALIZING UML GENERATION SYSTEM")
    logger.info("="*60)
    
    try:
        # Initialize components
        logger.info("Creating LLM connection...")
        llm = create_llm(cfg)
        
        logger.info("Initializing PlantUML tool...")
        puml_tool = PlantUMLTool(cfg.plantuml_host)
        
        # Initialize long-term memory if enabled
        memory_mgr = None
        if enable_long_term_memory:
            logger.info("Initializing long-term memory...")
            memory_mgr = MemoryManager(embedder=SentenceTransformer('all-MiniLM-L6-v2'))
            logger.info("✓ Long-term memory enabled")
        else:
            logger.info("⚠ Long-term memory disabled")
        
        # Build workflow
        logger.info("Building LangGraph workflow...")
        nodes = UMLNodes(llm, puml_tool, memory_mgr, cfg)
        app = create_uml_graph(nodes, cfg)
        
        logger.info("="*60)
        logger.info("✓ SYSTEM INITIALIZED SUCCESSFULLY")
        logger.info("="*60)
        
        return nodes, app, cfg, memory_mgr
        
    except Exception as e:
        logger.error(f"System initialization failed: {e}")
        raise


config = SystemConfig(
    max_iterations=6,
    num_few_shots=3,
    temperature_base=0.8,      # Higher temp for initial creativity
    temperature_reflect=0.5    # Lower temp for stable refinement
)

nodes, app, config, memory_manager = initialize_system(config, enable_long_term_memory=True)
print("\n✓ System ready for diagram generation")
print(f"✓ Long-term memory: {'ENABLED' if memory_manager else 'DISABLED'}")
print(f"Temperature strategy: Base={config.temperature_base}, Reflect={config.temperature_reflect}")


2025-12-28 19:54:11,500 - __main__ - INFO - INITIALIZING UML GENERATION SYSTEM
2025-12-28 19:54:11,501 - __main__ - INFO - Creating LLM connection...
2025-12-28 19:54:11,501 - __main__ - INFO - Connecting to LMStudio at http://localhost:1234/v1
2025-12-28 19:54:11,502 - __main__ - INFO - Using model: qwen2.5-coder-14b-instruct (temp=0.8)
2025-12-28 19:54:11,617 - __main__ - INFO - Initializing PlantUML tool...
2025-12-28 19:54:11,617 - __main__ - INFO - PlantUML tool initialized with host: http://localhost:8080
2025-12-28 19:54:11,617 - __main__ - INFO - Initializing long-term memory...
2025-12-28 19:54:11,627 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2025-12-28 19:54:11,628 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-12-28 19:54:11,805 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Tempora


✓ System ready for diagram generation
✓ Long-term memory: ENABLED
Temperature strategy: Base=0.8, Reflect=0.5


In [12]:
# === RUN SINGLE TEST ===

def run_single_test(
    app: Any,
    requirements: str,
    exercise_name: str = "Test Exercise"
) -> AgentState:
    """
    Run the workflow on a single exercise.
    
    Args:
        app: Compiled LangGraph workflow
        requirements: Software requirements text
        exercise_name: Name for logging purposes
        
    Returns:
        Final workflow state
    """
    logger.info("="*60)
    logger.info(f"RUNNING: {exercise_name}")
    logger.info("="*60)
    logger.info(f"Requirements preview: {requirements[:150]}...")
    
    initial_state = create_initial_state(requirements)
    
    try:
        final_output = app.invoke(initial_state, config={"recursion_limit": 50})
        
        logger.info("="*60)
        logger.info("WORKFLOW COMPLETED")
        logger.info("="*60)
        logger.info(f"Iterations: {final_output['iterations']}")
        logger.info(f"Syntax Valid: {final_output['syntax_valid']}")
        logger.info(f"Logic Valid: {final_output['logic_valid']}")
        
        # Use best diagram if we have one and final is not valid
        if final_output.get('best_diagram') and not final_output['logic_valid']:
            if final_output['best_diagram'] != final_output['current_diagram']:
                logger.info("⚠️  Using BEST diagram instead of final (prevented regression)")
                final_output['current_diagram'] = final_output['best_diagram']
        
        return final_output
        
    except Exception as e:
        logger.error(f"Workflow execution failed: {e}")
        raise


# Select and run a test exercise
test_idx = 2
requirements = test_exercises[test_idx]["requirements"]

final_output = run_single_test(
    app, 
    requirements, 
    f"Exercise {test_idx + 1}"
)

# Display results
print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)
print(f"Iterations: {final_output['iterations']}")
print(f"Syntax Valid: {final_output['syntax_valid']}")
print(f"Logic Valid: {final_output['logic_valid']}")

if final_output['current_diagram']:
    puml_tool = PlantUMLTool(config.plantuml_host)
    diagram_url = puml_tool.get_diagram_url(final_output['current_diagram'])
    print(f"\nDiagram URL: {diagram_url}")
    
    print("\nGenerated Diagram:")
    print(final_output['current_diagram'])

2025-12-28 19:54:46,183 - __main__ - INFO - RUNNING: Exercise 3
2025-12-28 19:54:46,184 - __main__ - INFO - Requirements preview: A library system manages books, members, and loans.
Books have an ISBN, title, author, publisher, publication year, and availability status.
Members h...
2025-12-28 19:54:46,186 - __main__ - INFO - --- NODE: RETRIEVE ---
2025-12-28 19:54:46,187 - __main__ - INFO - No diagrams in memory yet
2025-12-28 19:54:46,187 - __main__ - INFO - Retrieved 0 relevant examples from unified memory
2025-12-28 19:54:46,188 - __main__ - INFO - --- NODE: DECOMPOSE ---
2025-12-28 19:55:07,450 - httpx - INFO - HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-28 19:55:07,462 - __main__ - INFO - Decomposition completed
2025-12-28 19:55:07,463 - __main__ - INFO - --- NODE: GENERATE ---
2025-12-28 19:55:52,402 - httpx - INFO - HTTP Request: POST http://localhost:1234/v1/chat/completions "HTTP/1.1 200 OK"
2025-12-28 19:55:52,404 - __main__ - INFO 

KeyboardInterrupt: 

In [1]:
# === EVALUATION UTILITIES ===

@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics."""
    precision: float
    recall: float
    f1: float
    
    def __str__(self) -> str:
        return f"P={self.precision:.2f}, R={self.recall:.2f}, F1={self.f1:.2f}"


class PlantUMLParser:
    """
    Parser for extracting structured information from PlantUML diagrams.
    
    Extracts classes, attributes, and relationships from PlantUML code
    for evaluation purposes.
    """
    
    def __init__(self, plantuml_code: str):
        """
        Initialize parser with PlantUML code.
        
        Args:
            plantuml_code: PlantUML diagram code
        """
        self.plantuml_code = plantuml_code
        self.classes: Dict[str, Dict[str, List[str]]] = {}
        self.relationships: List[Dict[str, Any]] = []
        self.parse()
    
    def parse(self) -> None:
        """Parse the PlantUML code."""
        try:
            self._extract_classes()
            self._extract_relationships()
            logger.debug(f"Parsed {len(self.classes)} classes and {len(self.relationships)} relationships")
        except Exception as e:
            logger.error(f"Parsing failed: {e}")
    
    def _extract_classes(self) -> None:
        """Extract class definitions and their attributes."""
        class_pattern = r'class\s+(\w+)\s*\{([^}]*)\}'
        matches = re.finditer(class_pattern, self.plantuml_code, re.MULTILINE | re.DOTALL)
        
        for match in matches:
            class_name = match.group(1)
            class_body = match.group(2)
            
            attributes = []
            for line in class_body.strip().split('\n'):
                line = line.strip()
                if line and not line.startswith('--'):
                    attributes.append(line)
            
            self.classes[class_name] = {'attributes': attributes}
    
    def _extract_relationships(self) -> None:
        """Extract relationships between classes."""
        patterns = {
            'generalization': r'(\w+)\s*(?:"([^"]*)")?\s*<\|--\s*(?:"([^"]*)")?\s*(\w+)',
            'composition': r'(\w+)\s*(?:"([^"]*)")?\s*\*--\s*(?:"([^"]*)")?\s*(\w+)',
            'aggregation': r'(\w+)\s*(?:"([^"]*)")?\s*o--\s*(?:"([^"]*)")?\s*(\w+)',
            'association': r'(\w+)\s*(?:"([^"]*)")?\s*-->\s*(?:"([^"]*)")?\s*(\w+)',
        }
        
        for rel_type, pattern in patterns.items():
            for match in re.finditer(pattern, self.plantuml_code):
                self.relationships.append({
                    'type': rel_type,
                    'source': match.group(1),
                    'target': match.group(4),
                    'cardinality_source': match.group(2),
                    'cardinality_target': match.group(3)
                })


class DiagramEvaluator:
    """
    Evaluator for comparing generated diagrams against gold standards.
    
    Computes precision, recall, and F1 scores for classes, attributes,
    and relationships.
    """
    
    def __init__(self, gold_plantuml: str, pred_plantuml: str):
        """
        Initialize evaluator with gold and predicted diagrams.
        
        Args:
            gold_plantuml: Gold standard PlantUML code
            pred_plantuml: Predicted PlantUML code
        """
        self.gold_parser = PlantUMLParser(gold_plantuml)
        self.pred_parser = PlantUMLParser(pred_plantuml)
    
    def _normalize_attr(self, attr_str: str) -> str:
        """Normalize attribute strings for comparison."""
        return attr_str.split(':')[0].strip().lower()
    
    def _normalize_rel_type(self, rel_type: str) -> str:
        """Normalize relationship types."""
        mapping = {
            '<|--': 'INHERITANCE',
            '*--': 'COMPOSITION',
            'o--': 'AGGREGATION',
            '--': 'ASSOCIATION',
            '-->': 'ASSOCIATION'
        }
        return mapping.get(rel_type, 'ASSOCIATION')
    
    def _calculate_metrics(
        self, 
        gold_set: set, 
        pred_set: set
    ) -> EvaluationMetrics:
        """
        Calculate precision, recall, and F1 scores.
        
        Args:
            gold_set: Set of gold standard elements
            pred_set: Set of predicted elements
            
        Returns:
            EvaluationMetrics object
        """
        tp = len(gold_set.intersection(pred_set))
        fp = len(pred_set - gold_set)
        fn = len(gold_set - pred_set)
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        return EvaluationMetrics(
            precision=round(precision, 2),
            recall=round(recall, 2),
            f1=round(f1, 2)
        )
    
    def get_metrics(self) -> Dict[str, EvaluationMetrics]:
        """
        Get all evaluation metrics.
        
        Returns:
            Dictionary with metrics for classes, attributes, and relationships
        """
        # Classes comparison
        gold_classes = {c.lower() for c in self.gold_parser.classes.keys()}
        pred_classes = {c.lower() for c in self.pred_parser.classes.keys()}
        
        # Attributes comparison
        gold_attrs = set()
        for cls, info in self.gold_parser.classes.items():
            for attr in info['attributes']:
                gold_attrs.add((cls.lower(), self._normalize_attr(attr)))
        
        pred_attrs = set()
        for cls, info in self.pred_parser.classes.items():
            for attr in info['attributes']:
                pred_attrs.add((cls.lower(), self._normalize_attr(attr)))
        
        # Relationships comparison
        gold_rels = {
            (r['source'].lower(), r['target'].lower(), self._normalize_rel_type(r['type']))
            for r in self.gold_parser.relationships
        }
        pred_rels = {
            (r['source'].lower(), r['target'].lower(), self._normalize_rel_type(r['type']))
            for r in self.pred_parser.relationships
        }
        
        return {
            "classes": self._calculate_metrics(gold_classes, pred_classes),
            "attributes": self._calculate_metrics(gold_attrs, pred_attrs),
            "relationships": self._calculate_metrics(gold_rels, pred_rels)
        }


def evaluate_diagram(
    gold_standard: str,
    generated_diagram: str
) -> Dict[str, EvaluationMetrics]:
    """
    Evaluate a generated diagram against gold standard.
    
    Args:
        gold_standard: Gold standard PlantUML code
        generated_diagram: Generated PlantUML code
        
    Returns:
        Dictionary of evaluation metrics
    """
    evaluator = DiagramEvaluator(gold_standard, generated_diagram)
    return evaluator.get_metrics()


NameError: name 'dataclass' is not defined

In [None]:
# === EVALUATE SINGLE TEST ===

# Evaluate against gold standard
gold_standard = test_exercises[test_idx]["solution_plantuml"]
generated_diagram = final_output["current_diagram"]

metrics = evaluate_diagram(gold_standard, generated_diagram)

print("="*60)
print("EVALUATION METRICS")
print("="*60)
print(f"\nClasses:       {metrics['classes']}")
print(f"Attributes:    {metrics['attributes']}")
print(f"Relationships: {metrics['relationships']}")

avg_f1 = (
    metrics['classes'].f1 + 
    metrics['attributes'].f1 + 
    metrics['relationships'].f1
) / 3

print(f"\n{'='*60}")
print(f"OVERALL F1 SCORE: {avg_f1:.2f}")
print(f"{'='*60}")


EVALUATION METRICS

Classes:       P=0.86, R=0.86, F1=0.86
Attributes:    P=0.86, R=0.82, F1=0.84
Relationships: P=0.14, R=0.50, F1=0.22

OVERALL F1 SCORE: 0.64


In [None]:
# === MANUAL MEMORY CURATION ===

def save_to_memory(
    memory_manager: MemoryManager,
    requirements: str,
    diagram: str,
    final_output: AgentState
) -> str:
    """
    Manually save a validated diagram to long-term memory.
    
    Call this AFTER reviewing the generated diagram and deciding it's good
    enough to be used as a learning example for future generations.
    
    Args:
        memory_manager: MemoryManager instance
        requirements: Original requirements
        diagram: Generated PlantUML diagram
        final_output: Final workflow state with metadata
        
    Returns:
        Key of the saved memory
    """
    metadata = {
        "iterations": final_output["iterations"],
        "syntax_valid": final_output["syntax_valid"],
        "logic_valid": final_output["logic_valid"],
        "saved_at": datetime.now().isoformat()
    }
    
    key = memory_manager.save_diagram(requirements, diagram, metadata)
    print(f"✓ Diagram saved to long-term memory: {key}")
    return key


# Example: Save current diagram to memory if you're satisfied
# Uncomment and run this after reviewing the results above:
#
# if final_output['logic_valid'] and avg_f1 >= 0.80:
#     save_to_memory(
#         memory_manager,
#         requirements,
#         final_output['current_diagram'],
#         final_output
#     )
#     print(f"\n✓ High-quality diagram saved to memory (F1={avg_f1:.2f})")
# else:
#     print(f"\n⚠ Diagram not saved (F1={avg_f1:.2f}, Valid={final_output['logic_valid']})")


In [None]:
# === BATCH EVALUATION ===

@dataclass
class BatchResult:
    """Result from a single exercise in batch evaluation."""
    exercise_num: int
    success: bool
    iterations: int = 0
    syntax_valid: bool = False
    logic_valid: bool = False
    metrics: Optional[Dict[str, EvaluationMetrics]] = None
    diagram_url: Optional[str] = None
    error: Optional[str] = None
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for DataFrame creation."""
        if self.metrics:
            return {
                "exercise": self.exercise_num,
                "success": self.success,
                "iterations": self.iterations,
                "syntax_valid": self.syntax_valid,
                "logic_valid": self.logic_valid,
                "class_f1": self.metrics['classes'].f1,
                "attr_f1": self.metrics['attributes'].f1,
                "rel_f1": self.metrics['relationships'].f1,
                "diagram_url": self.diagram_url
            }
        return {
            "exercise": self.exercise_num,
            "success": self.success,
            "error": self.error
        }


def evaluate_batch(
    app: Any,
    test_exercises: List[Dict[str, Any]],
    puml_tool: PlantUMLTool,
    max_exercises: Optional[int] = None
) -> pd.DataFrame:
    """
    Run batch evaluation on multiple exercises.
    
    Args:
        app: Compiled LangGraph workflow
        test_exercises: List of exercise dictionaries
        puml_tool: PlantUML tool for URL generation
        max_exercises: Optional limit on number of exercises
        
    Returns:
        DataFrame with evaluation results
    """
    exercises_to_test = test_exercises[:max_exercises] if max_exercises else test_exercises
    results = []
    
    logger.info("="*60)
    logger.info(f"BATCH EVALUATION: {len(exercises_to_test)} exercises")
    logger.info("="*60)
    
    for i, exercise in enumerate(exercises_to_test):
        logger.info(f"\n--- Exercise {i+1}/{len(exercises_to_test)} ---")
        
        try:
            # Run workflow
            requirements = exercise["requirements"]
            final_output = run_single_test(app, requirements, f"Exercise {i+1}")
            
            # Evaluate
            gold_standard = exercise["solution_plantuml"]
            generated_diagram = final_output["current_diagram"]
            metrics = evaluate_diagram(gold_standard, generated_diagram)
            
            result = BatchResult(
                exercise_num=i + 1,
                success=True,
                iterations=final_output["iterations"],
                syntax_valid=final_output["syntax_valid"],
                logic_valid=final_output["logic_valid"],
                metrics=metrics,
                diagram_url=puml_tool.get_diagram_url(generated_diagram)
            )
            
            logger.info(f"✓ Exercise {i+1}: F1 = {metrics['classes'].f1:.2f} / "
                       f"{metrics['attributes'].f1:.2f} / {metrics['relationships'].f1:.2f}")
            
        except Exception as e:
            logger.error(f"✗ Exercise {i+1} failed: {e}")
            result = BatchResult(
                exercise_num=i + 1,
                success=False,
                error=str(e)
            )
        
        results.append(result.to_dict())
    
    df = pd.DataFrame(results)
    logger.info("\n" + "="*60)
    logger.info("BATCH EVALUATION COMPLETE")
    logger.info("="*60)
    
    return df


# Example: Run on first 3 exercises (uncomment to execute)
# df_results = evaluate_batch(app, test_exercises, puml_tool, max_exercises=3)
# 
# print("\n" + "="*60)
# print("BATCH EVALUATION SUMMARY")
# print("="*60)
# successful = df_results[df_results['success'] == True]
# if not successful.empty:
#     print(successful[['exercise', 'class_f1', 'attr_f1', 'rel_f1']].to_string(index=False))
#     avg_f1 = successful[['class_f1', 'attr_f1', 'rel_f1']].mean().mean()
#     print(f"\nAverage F1: {avg_f1:.2f}")
# else:
#     print("No successful evaluations")
