In [37]:
import logging
import json
import random
import pydantic
from pydantic import BaseModel, ValidationError
from typing import Optional, List, Dict, Any
from colorama import Fore
from haystack import component, Pipeline
from haystack.dataclasses import ChatMessage
from haystack.components.builders import ChatPromptBuilder
# Note: haystack_integrations must be installed to use OllamaChatGenerator
from haystack_integrations.components.generators.ollama import OllamaChatGenerator

In [38]:
#1b Select the preferred data structure and define a list field named "tasks"

from typing import List
from pydantic import BaseModel

class Tasks(BaseModel):
    name: str


class TasksData(BaseModel):
    tasks: List[str]

In [39]:
json_schema = TasksData.model_json_schema()
SCHEMA_STRING = json.dumps(json_schema)


In [40]:
#1c

@component
class OutputValidator:
    def __init__(self, pydantic_model: pydantic.BaseModel):
        self.pydantic_model = pydantic_model
        self.iteration_counter = 0

    # Define the component output
    @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])
    def run(self, replies: List[ChatMessage]):

        self.iteration_counter += 1

        ## Try to parse the LLM's reply ##
        # If the LLM's reply is a valid object, return `"valid_replies"`
        try:
            output_dict = json.loads(replies[0].text)
            self.pydantic_model.parse_obj(output_dict)
            print(
                Fore.GREEN
                + f"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {replies[0]}"
            )
            return {"valid_replies": replies}

        # If the LLM's reply is corrupted or not valid, return "invalid_replies" and the "error_message" for LLM to try again
        except (ValueError, ValidationError) as e:
            print(
                Fore.RED
                + f"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\n"
                f"Output from LLM:\n {replies[0]} \n"
                f"Error from OutputValidator: {e}"
            )
            return {"invalid_replies": replies, "error_message": str(e)}


In [41]:
output_validator = OutputValidator(pydantic_model=TasksData)

In [42]:
#1d

from haystack.components.builders import ChatPromptBuilder
prompt = [
    ChatMessage.from_user(
        """
Extract all **task names** from the BPMN description provided in the passage: {{passage}}.

Return the extracted task names as a single JSON object that adheres strictly to the following schema. Only return the actual task instances without including the schema definition itself.

The required JSON schema is:
{{json_schema}}

For example, if the schema is `{"tasks": ["task1", "task2", ...]}` and the BPMN description contains "Approve Claim" and "Notify Customer", your output must be: `{"tasks": ["Approve Claim", "Notify Customer"]}`.

Make sure your response is a valid JSON dict and not a list.
{% if invalid_replies and error_message %}
  You already created the following output in a previous attempt: {{invalid_replies}}
  However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}
  Correct the output and try again. Just return the corrected output without any extra explanations.
{% endif %}
"""
    )
]

prompt_builder = ChatPromptBuilder(template=prompt, required_variables=["json_schema", "passage"])



In [43]:
#1e

from haystack_integrations.components.generators.ollama import OllamaChatGenerator
chat_generator = OllamaChatGenerator(model="llama3.2:3b",
url = "http://localhost:11434",
timeout = 30*60,
generation_kwargs={
"num_ctx": 4096,
"temperature": 0.9,
})

In [44]:
@component
class MockChatGenerator:
    @component.output_types(replies=List[ChatMessage])
    def run(self, messages: List[ChatMessage]):
        # Very naive: just returns a fixed valid JSON for demo purposes
        reply_text = json.dumps({
            "tasks": [
                {"name": "Task A"},
                {"name": "Task B"}
            ]
        })
        return {"replies": [ChatMessage.from_assistant(reply_text)]}


In [45]:
from haystack import Pipeline

# 1. Define your three inputs (BPMN descriptions)
# Assuming you are defining these variables in your Python script or notebook

bpmn_text1_data = {
    "passage": """BPMN Model 1: 
        The process initiates when a request is submitted, leading immediately to the Initial Screening of the request details. The system then automatically Send Acknowledgment to the customer confirming receipt. Following this, the agent proceeds to Identify Task Type (e.g., bug, feature request, or maintenance). If the description is incomplete, the agent will Request Clarification from the submitter. Once the necessary information is gathered, a qualified manager will Assign Engineer based on project needs and skill set. The assigned engineer's first responsibility is to Create Solution Plan, which outlines the necessary steps and resources. Once the plan is approved, the engineer moves on to Implement Solution. Throughout the development phase, automated systems continually Monitor Progress and resource usage. Before the solution is deployed, a separate team must perform a rigorous Quality Assurance Check. The final required step is securing Final Customer Sign-off from the client, and only then does the process officially end."""
}

bpmn_text2_data = {
    "passage": """BPMN Model 2:
       The process initiates when a request is submitted, leading immediately to the "Initial Screening" of the request details. The system then automatically "Send Acknowledgment" to the customer confirming receipt. Following this, the agent proceeds to "Identify Task Type" (e.g., bug, feature request, or maintenance). If the description is incomplete, the agent will "Request Clarification" from the submitter. Once the necessary information is gathered, a qualified manager will "Assign Engineer" based on project needs and skill set. The assigned engineer's first responsibility is to "Create Solution Plan", which outlines the necessary steps and resources. Once the plan is approved, the engineer moves on to "Implement Solution". Throughout the development phase, automated systems continually "Monitor Progress" and resource usage. Before the solution is deployed, a separate team must perform a rigorous "Quality Assurance Check". The final required step is securing "Final Customer Sign-off" from the client, and only then does the process officially end"""
}

bpmn_text3_data = {
    "passage": """BPMN Model 3:
        The process begins with "Receive Order", then "Check Inventory",
        then "Fulfill Order", and finally "Send Invoice" before it ends."""
}

# ‚≠êÔ∏è Combine them into the single iterable list expected by the loop:
bpmn_texts = [
    bpmn_text1_data,
    bpmn_text2_data,
    bpmn_text3_data
]

pipeline = Pipeline() 

pipeline.add_component(instance=prompt_builder, name="prompt_builder")
pipeline.add_component(instance=chat_generator , name="llm")
pipeline.add_component(instance=output_validator, name="output_validator")

pipeline.connect("prompt_builder.prompt", "llm.messages")
pipeline.connect("llm.replies", "output_validator")
# The other connections are REMOVED because the 'while' loop handles the retry data manually.


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f607818a650>
üöÖ Components
  - prompt_builder: ChatPromptBuilder
  - llm: OllamaChatGenerator
  - output_validator: OutputValidator
üõ§Ô∏è Connections
  - prompt_builder.prompt -> llm.messages (list[ChatMessage])
  - llm.replies -> output_validator.replies (List[ChatMessage])

In [46]:
import os
# Assuming bpmn_texts, pipeline, and SCHEMA_STRING are defined and accessible

MAX_RERUNS = 3 
all_results = []
ground_truth = [] 

print("Starting Haystack Pipeline Execution...")

# --- CORRECTED: Execution Loop ---
# The loop iterates over each dictionary in bpmn_texts. 
# We call the iteration variable 'bpmn_text_data' for clarity.
for i, bpmn_text_data in enumerate(bpmn_texts):
    
    # --- 1. Model Name and Logging (Fixed Logic) ---
    # Extract model name from the passage for logging.
    passage_text = bpmn_text_data["passage"]
    
    # Try to extract "BPMN Model X" or default to "Model Y"
    model_name_match = passage_text.split(":", 1)[0].strip()
    model_name = model_name_match if model_name_match.startswith("BPMN Model") else f"Model {i+1}"
    
    print(f"\nüöÄ Running Pipeline for {model_name} (Item {i+1}/{len(bpmn_texts)})...")
    
    # --- 2. Initialize Variables for THIS Run ---
    # Initialization MUST occur inside the loop for each new model.
    input_data = {
        "passage": passage_text,
        "json_schema": SCHEMA_STRING,
        "invalid_replies": None,
        "error_message": None
    }
    
    attempt = 0
    successful_run = False
    run_output = None
    
    # --- 3. Start Retry Loop (Applied to THIS Model) ---
    while not successful_run and attempt < MAX_RERUNS:
        try:
            # Run the pipeline with the current state of input_data
            run_output = pipeline.run(data={"prompt_builder": input_data})
            
            validator_output = run_output["output_validator"]
            
            # Check for successful run via the 'result' key
            if validator_output.get("result"):
                 successful_run = True
                 print(f"‚úÖ Success on attempt {attempt + 1}.")
                 all_results.append(validator_output.get("result"))
            else:
                 # --- Manual Update for Retry Inputs ---
                 # Pass the error information back to prompt_builder for the next attempt
                 input_data["invalid_replies"] = validator_output.get("invalid_replies")
                 input_data["error_message"] = validator_output.get("error_message")
                 # --- END FIX ---
                 
                 attempt += 1
                 print(f"‚ö†Ô∏è Run failed (validation error). Retrying... (Attempt {attempt + 1}/{MAX_RERUNS})")
                 
        except Exception as e:
            # Catch unexpected exceptions (e.g., LLM connection or server errors)
            attempt += 1
            print(f"‚ùå Run failed with unexpected error: {e}. Retrying... (Attempt {attempt + 1}/{MAX_RERUNS})")
            # For unexpected errors, reset the retry inputs
            input_data["invalid_replies"] = None
            input_data["error_message"] = f"An unexpected error occurred: {e}"
            
    # --- 4. Final Result Recording for THIS Model ---
    if not successful_run:
        all_results.append(f"Failed to produce valid output after {MAX_RERUNS} attempts for {model_name}.")

# --- Verification Setup (Outside the loop) ---
print("\n--- Manual Verification Setup ---")
print("All Run Results:", all_results)
print("\nYou must now manually check each result against the ground truth for each BPMN model.")
print("This involves calculating **Precision** and **Recall** for the extracted task names.")


Starting Haystack Pipeline Execution...

üöÄ Running Pipeline for BPMN Model 1 (Item 1/3)...
‚ùå Run failed with unexpected error: The following component failed to run:
Component name: 'llm'
Component type: 'OllamaChatGenerator'
Error: model "llama3.2:3b" not found, try pulling it first (status code: 404). Retrying... (Attempt 2/3)
‚ùå Run failed with unexpected error: The following component failed to run:
Component name: 'llm'
Component type: 'OllamaChatGenerator'
Error: model "llama3.2:3b" not found, try pulling it first (status code: 404). Retrying... (Attempt 3/3)
‚ùå Run failed with unexpected error: The following component failed to run:
Component name: 'llm'
Component type: 'OllamaChatGenerator'
Error: model "llama3.2:3b" not found, try pulling it first (status code: 404). Retrying... (Attempt 4/3)

üöÄ Running Pipeline for BPMN Model 2 (Item 2/3)...
‚ùå Run failed with unexpected error: The following component failed to run:
Component name: 'llm'
Component type: 'OllamaChat

In [47]:
import json
from typing import List, Dict, Any, Optional

# --- 1. Define Ground Truth Data ---

# Ground truth for BPMN Model 1 & 2 (10 tasks)
gt_model_1_2 = [
    "Initial Screening",
    "Send Acknowledgment",
    "Identify Task Type",
    "Request Clarification",
    "Assign Engineer",
    "Create Solution Plan",
    "Implement Solution",
    "Monitor Progress",
    "Quality Assurance Check",
    "Final Customer Sign-off"
]

# Ground truth for BPMN Model 3 (4 tasks)
gt_model_3 = [
    "Receive Order",
    "Check Inventory",
    "Fulfill Order",
    "Send Invoice"
]

# Combined ground truth list, matching the order of execution
GROUND_TRUTH = [
    gt_model_1_2,  # Corresponds to Model 1 output
    gt_model_1_2,  # Corresponds to Model 2 output
    gt_model_3     # Corresponds to Model 3 output
]

# --- 2. Define Placeholder Results (Demonstration Data) ---

# NOTE: Replace this PLACEHOLDER_ALL_RESULTS list with your actual 'all_results' 
# list from your successful pipeline execution.
PLACEHOLDER_ALL_RESULTS = [
    # Model 1: Perfect match (10 tasks)
    {'tasks': [{'name': t} for t in GROUND_TRUTH[0]]},

    # Model 2: Misses "Request Clarification" and "Final Customer Sign-off", adds "Wrong Task"
    # (FP=1, FN=2, TP=8)
    {'tasks': [
        {'name': 'Initial Screening'}, 
        {'name': 'Identify Task Type'}, 
        {'name': 'Assign Engineer'}, 
        {'name': 'Monitor Progress'}, 
        {'name': 'Send Acknowledgment'}, 
        {'name': 'Create Solution Plan'}, 
        {'name': 'Implement Solution'}, 
        {'name': 'Quality Assurance Check'}, 
        {'name': 'Wrong Task Added'}
    ]},

    # Model 3: Misses 1 task ("Send Invoice") (FP=0, FN=1, TP=3)
    {'tasks': [
        {'name': 'Receive Order'}, 
        {'name': 'Check Inventory'}, 
        {'name': 'Fulfill Order'}
    ]}
]


# --- 3. Calculation Function ---

def calculate_precision_recall(
    predicted_output: Dict[str, Any], 
    ground_truth_tasks: List[str], 
    case_sensitive: bool = False
) -> Dict[str, Optional[float]]:
    """
    Calculates precision and recall based on extracted task names.
    """
    # 1. Extract predicted tasks from the LLM output structure
    if 'tasks' not in predicted_output or not isinstance(predicted_output['tasks'], list):
        print(f"Warning: Invalid output structure: {predicted_output}. Cannot score.")
        return {"precision": None, "recall": None, "TP": 0, "FP": 0, "FN": 0}
        
    predicted_tasks = [t.get('name', '').strip() for t in predicted_output['tasks'] if t.get('name')]

    # 2. Normalize case for comparison
    if not case_sensitive:
        predicted_set = {t.lower() for t in predicted_tasks}
        ground_set = {t.lower() for t in ground_truth_tasks}
    else:
        predicted_set = set(predicted_tasks)
        ground_set = set(ground_truth_tasks)

    # 3. Calculate metrics
    TP = len(predicted_set.intersection(ground_set))
    FP = len(predicted_set.difference(ground_set))
    FN = len(ground_set.difference(predicted_set))

    # 4. Calculate Precision and Recall
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    return {
        "precision": round(precision, 4), 
        "recall": round(recall, 4), 
        "TP": TP, 
        "FP": FP, 
        "FN": FN
    }


# --- 4. Run Calculation and Display Results ---

# Choose the data source: use PLACEHOLDER_ALL_RESULTS for demonstration
# If your pipeline runs successfully, use: all_results_to_score = all_results 
all_results_to_score = PLACEHOLDER_ALL_RESULTS

print("\n--- Precision and Recall Scoring Results ---")
print("Note: Scoring assumes extracted task names are case-insensitive matches.")

scoring_results = []

for i, (result, truth) in enumerate(zip(all_results_to_score, GROUND_TRUTH)):
    
    # Skip models that failed to produce structured output
    if isinstance(result, str):
        scoring_results.append({
            "Model": f"Model {i+1}",
            "Status": result
        })
        continue

    scores = calculate_precision_recall(result, truth, case_sensitive=False)
    
    scoring_results.append({
        "Model": f"BPMN Model {i+1}",
        "Precision": scores['precision'],
        "Recall": scores['recall'],
        "TP": scores['TP'],
        "FP": scores['FP'],
        "FN": scores['FN'],
        "Extracted Tasks Count": len(result.get('tasks', [])),
        "Ground Truth Count": len(truth)
    })

# Display results in a structured format
for score_data in scoring_results:
    if 'Status' in score_data:
        print(f"\n{score_data['Model']}: {score_data['Status']}")
    else:
        print(f"\n{score_data['Model']}")
        print(f"  > Precision: {score_data['Precision']}")
        print(f"  > Recall:    {score_data['Recall']}")
        print(f"  > Metrics:   TP={score_data['TP']}, FP={score_data['FP']}, FN={score_data['FN']}")
        print(f"  > Counts:    Extracted={score_data['Extracted Tasks Count']}, Ground Truth={score_data['Ground Truth Count']}")


--- Precision and Recall Scoring Results ---
Note: Scoring assumes extracted task names are case-insensitive matches.

BPMN Model 1
  > Precision: 1.0
  > Recall:    1.0
  > Metrics:   TP=10, FP=0, FN=0
  > Counts:    Extracted=10, Ground Truth=10

BPMN Model 2
  > Precision: 0.8889
  > Recall:    0.8
  > Metrics:   TP=8, FP=1, FN=2
  > Counts:    Extracted=9, Ground Truth=10

BPMN Model 3
  > Precision: 1.0
  > Recall:    0.75
  > Metrics:   TP=3, FP=0, FN=1
  > Counts:    Extracted=3, Ground Truth=4


In [48]:
valid_reply = run_output["output_validator"]["result"]["tasks"]
#valid_json = json.loads(valid_reply)
print(valid_reply)


TypeError: 'NoneType' object is not subscriptable