In [8]:
import pandas as pd
import json
import re
from typing import List, Dict, Any, Optional
from haystack import Pipeline, Document, component
from haystack.components.builders import ChatPromptBuilder
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore 
from haystack.dataclasses import ChatMessage
# OllamaChatGenerator lives in a separate integration package that must be installed first (pip install ollama-haystack)
from haystack_integrations.components.generators.ollama import OllamaChatGenerator

In [9]:
# 3a - Document store with complete BPMN models used as few-shot examples
#
# Each example pairs a full BPMN XML model with a reference description.
# The descriptions wrap every task name in <bpmn:task> tags, spelled exactly
# as the `name` attribute in the XML. The retrieved examples are rendered
# into the prompt, so they must demonstrate the tagged output format the
# system prompt requires instead of contradicting it.

bpmn_examples = [
    {
        "xml": """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="Process_1">
    <startEvent id="StartEvent_1"/>
    <task id="T1" name="Check Credit"/>
    <task id="T2" name="Approve Loan"/>
    <endEvent id="EndEvent_1"/>
  </process>
</definitions>""",
        "description": "This is a loan approval process. The process begins with <bpmn:task>Check Credit</bpmn:task> to evaluate the customer's credit score. If the score is sufficient, it continues with <bpmn:task>Approve Loan</bpmn:task>."
    },
    {
        "xml": """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="Process_2">
    <startEvent id="Start"/>
    <task id="T3" name="Process Order"/>
    <task id="T4" name="Ship Product"/>
    <endEvent id="End"/>
  </process>
</definitions>""",
        "description": "This is an order fulfillment process. The process begins with <bpmn:task>Process Order</bpmn:task> for the received order. Once processing is complete, it continues with <bpmn:task>Ship Product</bpmn:task> to deliver the final product to the customer."
    }
]

# A fresh store is created on every run of this cell.
document_store = InMemoryDocumentStore()
documents = [
    Document(
        content=ex["xml"],  # Store the full XML; this is the text BM25 searches
        meta={
            "description": ex["description"],  # rendered into the prompt as the example answer
            "id": f"doc_{i}",
            "type": "bpmn_with_description"
        }
    )
    for i, ex in enumerate(bpmn_examples)
]
document_store.write_documents(documents)

retriever = InMemoryBM25Retriever(document_store=document_store)

In [10]:
# System turn: the fixed rules for how a BPMN process must be described,
# including the mandatory <bpmn:task> tagging of every task name.
system_instructions = ChatMessage.from_system("""
You are an expert BPMN model analyst. Your task is to analyze the user-provided 
BPMN XML model and create a clear, natural language description of the process.

CRITICAL REQUIREMENTS:
1. You MUST wrap EVERY task name in <bpmn:task> tags exactly like this: <bpmn:task>Task Name</bpmn:task>
2. Example: "The process begins with <bpmn:task>Collect Customer Information</bpmn:task>"
3. DO NOT use any other tags or formatting
4. DO NOT include "through the" or "using the" before tags - just use the tags directly
5. Task names MUST match exactly what's in the BPMN XML

BAD EXAMPLE: "through the <bpmn:task>Collect Customer Information</bpmn:task>"
GOOD EXAMPLE: "<bpmn:task>Collect Customer Information</bpmn:task>"

Your description should:
1. Start with "This is a [type] process."
2. Identify ALL tasks in order using <bpmn:task> tags
3. Explain the flow and purpose
4. Be comprehensive but concise

Remember: EVERY task name MUST be wrapped in <bpmn:task> tags!
    """)

# User turn: a Jinja template. `examples` is the list of retrieved documents
# (XML in `content`, reference text in `meta.description`); `query_bpmn` is
# the model to describe.
user_request = ChatMessage.from_user("""
--- Example BPMN Models (for reference) ---
{% for example in examples %}
Example {{loop.index}}:
BPMN XML:
{{example.content}}

Description:
{{example.meta.description}}
{% endfor %}
------------------------------------------------------

Now, analyze the following BPMN XML model and create a description for it:
{{query_bpmn}}

CRITICAL: Wrap EVERY task name in <bpmn:task> tags exactly like this: <bpmn:task>Task Name</bpmn:task>
    """)

prompt_template = [system_instructions, user_request]

# Both template variables are declared as required.
prompt_builder = ChatPromptBuilder(
    template=prompt_template,
    required_variables=["query_bpmn", "examples"],
)

In [11]:


# 2c. Generator: the LLM that writes the process description.
# NOTE: Adjust the model name and server URL to match your Ollama setup.
ollama_settings = {
    "model": "llama3.1:8b",
    "url": "http://localhost:11434",
    "timeout": 30 * 60,  # presumably seconds (i.e. 30 minutes) - confirm against the Ollama integration docs
    "generation_kwargs": {"temperature": 0.3},
}
chat_generator = OllamaChatGenerator(**ollama_settings)

In [12]:
# Pipeline that receives all components
bpmn_pipeline = Pipeline()

# Register each component under the name used in the connections below.
for component_name, component_instance in (
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("generator", chat_generator),
):
    bpmn_pipeline.add_component(instance=component_instance, name=component_name)

# Connect components
# 1. The documents returned by the retriever become the prompt's 'examples'
bpmn_pipeline.connect("retriever.documents", "prompt_builder.examples")
# 2. The rendered prompt is passed to the generator as its chat messages
#    (kept as the last expression so the cell displays the wired pipeline)
bpmn_pipeline.connect("prompt_builder.prompt", "generator.messages")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f2783db87d0>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - prompt_builder: ChatPromptBuilder
  - generator: OllamaChatGenerator
🛤️ Connections
  - retriever.documents -> prompt_builder.examples (list[Document])
  - prompt_builder.prompt -> generator.messages (list[ChatMessage])

In [13]:
# Captures the `name` attribute of BPMN task elements: plain <task>, a
# namespace-prefixed form such as <bpmn:task>, and task subtypes such as
# <userTask> or <bpmn:serviceTask>.
_BPMN_TASK_NAME_RE = re.compile(r'<(?:\w+:)?\w*[Tt]ask\b[^>]*?\sname="([^"]*)"')

# Tag patterns tried in order on the generated text; the first one that yields
# any task wins. Bare <task> tags are only a fallback for <bpmn:task>.
_DESCRIPTION_TAG_PATTERNS = (
    r'<bpmn:task>([^<]+)</bpmn:task>',
    r'<task>([^<]+)</task>',
)


def _normalize_task_name(name):
    """Return the comparison key for a task name: whitespace collapsed, lower-cased."""
    return " ".join(name.split()).lower()


def _unique_task_names(names):
    """Return whitespace-cleaned names, dropping blanks and case-insensitive repeats (order kept)."""
    seen = set()
    unique = []
    for name in names:
        cleaned = " ".join(name.split())
        key = cleaned.lower()
        if key and key not in seen:
            seen.add(key)
            unique.append(cleaned)
    return unique


def _extract_tagged_tasks(description):
    """Return the unique task names wrapped in task tags inside ``description``."""
    for pattern in _DESCRIPTION_TAG_PATTERNS:
        tagged = _unique_task_names(re.findall(pattern, description, re.IGNORECASE))
        if tagged:
            return tagged
    return []


def run_pipeline_with_validation(model_xml, model_name, return_detailed_output=False):
    """Describe a BPMN model with the pipeline and score the tagged task names.

    The function runs the module-level ``bpmn_pipeline`` on ``model_xml``,
    prints the generated description, extracts the task names the description
    marks with ``<bpmn:task>`` tags (falling back to bare ``<task>`` tags, then
    to a plain-text search for the model's own task names) and compares them
    with the task names found in the XML.

    All name comparisons use one normalisation (whitespace collapsed,
    case-insensitive), and extracted names are de-duplicated, so the returned
    lists always agree with the returned counts.

    Args:
        model_xml: BPMN XML of the model to describe.
        model_name: Label used in the printed report.
        return_detailed_output: Currently unused; accepted for backward
            compatibility with existing callers.

    Returns:
        dict with keys ``description``, ``tasks_found``, ``actual_tasks``,
        ``precision``, ``recall``, ``f1_score``, ``tp``, ``fp``, ``fn``,
        ``coverage`` (identical to recall), ``tag_count``, ``matched_count``,
        ``missing_count``, ``matched_tasks_list``, ``missing_tasks_list`` and
        ``false_positive_list``.
    """

    print(f"\n{'='*60}")
    print(f"MODEL: {model_name}")
    print(f"{'='*60}")

    print("Input BPMN Model (truncated):")
    xml_lines = model_xml.split('\n')
    for line in xml_lines[:5]:
        print(f"  {line}")
    if len(xml_lines) > 5:
        print("  ...")

    # Extract actual tasks from XML
    actual_tasks = _BPMN_TASK_NAME_RE.findall(model_xml)
    print(f"\nActual tasks in model ({len(actual_tasks)} total):")
    for i, task in enumerate(actual_tasks, 1):
        print(f"  {i:2}. {task}")

    # Run pipeline: the XML is both the retrieval query and the model to describe
    result = bpmn_pipeline.run({
        "retriever": {"query": model_xml, "top_k": 2},
        "prompt_builder": {"query_bpmn": model_xml}
    })

    # Guard against a reply without text so the regex searches below cannot fail
    generated_description = result["generator"]["replies"][0].text or ""

    print("\nGenerated Description:")
    print("-" * 40)
    print(generated_description)
    print("-" * 40)

    tasks_in_description = _extract_tagged_tasks(generated_description)

    # If no tasks found with tags, try to find task names in text
    if not tasks_in_description:
        print("\n‚ö†Ô∏è WARNING: No <bpmn:task> tags found in output!")
        print("Searching for task names in plain text...")

        # Only the model's own names are searched for here, so this fallback
        # can never produce false positives.
        description_key = _normalize_task_name(generated_description)
        tasks_in_description = [
            task for task in _unique_task_names(actual_tasks)
            if _normalize_task_name(task) in description_key
        ]

    print(f"\nTasks identified in description ({len(tasks_in_description)} found):")
    if tasks_in_description:
        for i, task in enumerate(tasks_in_description, 1):
            print(f"  {i:2}. {task}")
    else:
        print("  No tasks identified")

    # Calculate precision, recall, and F1-score on normalised name sets
    described_tasks_lower = {_normalize_task_name(t) for t in tasks_in_description}
    actual_tasks_lower = {_normalize_task_name(t) for t in actual_tasks}
    described_tasks_lower.discard("")
    actual_tasks_lower.discard("")

    matched_tasks = described_tasks_lower & actual_tasks_lower
    false_positive_tasks = described_tasks_lower - actual_tasks_lower
    missing_tasks = actual_tasks_lower - described_tasks_lower

    tp = len(matched_tasks)         # tasks correctly identified
    fp = len(false_positive_tasks)  # tasks identified but not in the model
    fn = len(missing_tasks)         # model tasks not identified

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f"\nüìä PERFORMANCE METRICS:")
    print(f"  True Positives (TP): {tp}")
    print(f"  False Positives (FP): {fp}")
    print(f"  False Negatives (FN): {fn}")
    print(f"  Precision: {precision:.4f} ({precision:.2%})")
    print(f"  Recall: {recall:.4f} ({recall:.2%})")
    print(f"  F1-Score: {f1_score:.4f}")

    # Task coverage is the same quantity as recall
    task_coverage = recall

    if matched_tasks:
        print(f"  Correctly identified tasks ({len(matched_tasks)}):")
        for task in sorted(matched_tasks):
            print(f"    ‚úì {task}")

    if missing_tasks:
        print(f"  Missing tasks ({len(missing_tasks)}):")
        for task in sorted(missing_tasks):
            print(f"    ‚úó {task}")

    if false_positive_tasks:
        print(f"  False positive tasks ({len(false_positive_tasks)}):")
        for task in sorted(false_positive_tasks):
            print(f"    ‚ö†Ô∏è {task}")

    # Check tag usage: counts raw opening tags, repeats included
    tag_count = len(re.findall(r'<bpmn:task>', generated_description, re.IGNORECASE))
    print(f"\nüîç TAG ANALYSIS:")
    print(f"  Found {tag_count} <bpmn:task> tags in output")
    print(f"  Expected {len(actual_tasks)} tags (one per task)")

    if tag_count > 0:
        tag_usage_percentage = (tag_count / len(actual_tasks)) * 100 if len(actual_tasks) > 0 else 0
        print(f"  Tag usage: {tag_usage_percentage:.1f}% of tasks have tags")

    return {
        'description': generated_description,
        'tasks_found': tasks_in_description,
        'actual_tasks': actual_tasks,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'coverage': task_coverage,
        'tag_count': tag_count,
        'matched_count': len(matched_tasks),
        'missing_count': len(missing_tasks),
        'matched_tasks_list': [t for t in tasks_in_description if _normalize_task_name(t) in matched_tasks],
        'missing_tasks_list': [t for t in _unique_task_names(actual_tasks) if _normalize_task_name(t) in missing_tasks],
        'false_positive_list': [t for t in tasks_in_description if _normalize_task_name(t) in false_positive_tasks]
    }


# Run the validation over the two hand-written test models
section_rule = "=" * 80
print(section_rule)
print("RUNNING PIPELINE WITH VALIDATION FOR 2 MODELS")
print(section_rule)

test_models = [
    {
        "name": "Customer Onboarding Process",
        "xml": """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="OnboardingProcess">
    <startEvent id="Start"/>
    <task id="T1" name="Collect Customer Information"/>
    <task id="T2" name="Verify Identity Documents"/>
    <task id="T3" name="Perform Background Check"/>
    <task id="T4" name="Assess Credit Score"/>
    <task id="T5" name="Review Application Form"/>
    <task id="T6" name="Validate Bank Details"/>
    <task id="T7" name="Create Customer Account"/>
    <task id="T8" name="Assign Account Manager"/>
    <task id="T9" name="Send Welcome Package"/>
    <task id="T10" name="Schedule Orientation Call"/>
    <endEvent id="End"/>
  </process>
</definitions>"""
    },
    {
        "name": "Order Fulfillment Process",
        "xml": """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="FulfillmentProcess">
    <startEvent id="StartEvent_1"/>
    <task id="T1" name="Receive Customer Order"/>
    <task id="T2" name="Validate Payment"/>
    <task id="T3" name="Check Inventory Availability"/>
    <task id="T4" name="Allocate Stock Items"/>
    <task id="T5" name="Prepare Shipping Label"/>
    <task id="T6" name="Pick Items from Warehouse"/>
    <task id="T7" name="Package Products"/>
    <task id="T8" name="Apply Quality Check"/>
    <task id="T9" name="Schedule Courier Pickup"/>
    <task id="T10" name="Update Order Status"/>
    <endEvent id="EndEvent_1"/>
  </process>
</definitions>"""
    }
]

model_total = len(test_models)
print(f"\nStarting pipeline execution for {model_total} models...")
print("Models to process: " + ", ".join(entry['name'] for entry in test_models))

# One record per model: either the validation result or a zeroed placeholder
# carrying an 'error' key, so later report sections can tell them apart.
results = []
for position, model in enumerate(test_models, start=1):
    print(f"\n{section_rule}")
    print(f"PROCESSING MODEL {position}/{model_total}: {model['name']}")
    print(f"{section_rule}")

    try:
        record = {'name': model['name']}
        record.update(run_pipeline_with_validation(model['xml'], model['name']))
        results.append(record)
        print(f"‚úì Successfully processed model {position}")
    except Exception as e:
        # Best effort: a failing model must not abort the remaining ones.
        failure_text = str(e)
        print(f"‚úó Error processing model {position}: {failure_text}")
        results.append({
            'name': model['name'],
            'description': f"ERROR: {failure_text}",
            'tasks_found': [],
            'actual_tasks': [],
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0,
            'tp': 0,
            'fp': 0,
            'fn': 0,
            'coverage': 0.0,
            'tag_count': 0,
            'matched_count': 0,
            'missing_count': 0,
            'error': failure_text,
            'matched_tasks_list': [],
            'missing_tasks_list': [],
            'false_positive_list': []
        })

# ================================================================================
# NEW: SEPARATE OUTPUT FOR FOUND TASKS FROM BOTH MODELS
# ================================================================================

print("\n" + "="*80)
print("SEPARATE TASK OUTPUT FOR EACH MODEL")
print("="*80)

for i, result in enumerate(results, 1):
    print(f"\n{'='*60}")
    print(f"MODEL {i}: {result['name']}")
    print(f"{'='*60}")

    # Failed runs only carry placeholder values; report the error and move on.
    if 'error' in result:
        print(f"  ERROR: {result['error']}")
        continue

    # Built once per model. Names are compared stripped and lower-cased, the
    # same way the metrics are computed, so a found task that counts as a
    # true positive is never flagged as a false positive here.
    actual_task_keys = {t.strip().lower() for t in result['actual_tasks']}

    print(f"\nüìã ALL FOUND TASKS ({len(result['tasks_found'])} total):")
    if result['tasks_found']:
        for j, task in enumerate(result['tasks_found'], 1):
            # Mark if it's correct or false positive
            if task.strip().lower() in actual_task_keys:
                marker = "‚úì"
            else:
                marker = "‚ö†Ô∏è"
            print(f"  {j:2}. {marker} {task}")
    else:
        print("  No tasks found")

    print(f"\n‚úÖ CORRECTLY IDENTIFIED TASKS ({result['matched_count']}):")
    if result['matched_tasks_list']:
        for j, task in enumerate(result['matched_tasks_list'], 1):
            print(f"  {j:2}. {task}")
    else:
        print("  No correct matches")

    print(f"\n‚ùå MISSING TASKS ({result['missing_count']}):")
    if result['missing_tasks_list']:
        for j, task in enumerate(result['missing_tasks_list'], 1):
            print(f"  {j:2}. {task}")
    else:
        print("  No missing tasks (all found!)")

    print(f"\n‚ö†Ô∏è FALSE POSITIVE TASKS ({len(result['false_positive_list'])}):")
    if result['false_positive_list']:
        for j, task in enumerate(result['false_positive_list'], 1):
            print(f"  {j:2}. {task}")
    else:
        print("  No false positives")

    print(f"\nüìä SUMMARY FOR THIS MODEL:")
    print(f"  ‚Ä¢ Total actual tasks: {len(result['actual_tasks'])}")
    print(f"  ‚Ä¢ Total tasks found: {len(result['tasks_found'])}")
    print(f"  ‚Ä¢ Correct matches: {result['matched_count']}")
    print(f"  ‚Ä¢ Missing: {result['missing_count']}")
    print(f"  ‚Ä¢ False positives: {len(result['false_positive_list'])}")
    print(f"  ‚Ä¢ Precision: {result['precision']:.2%}")
    print(f"  ‚Ä¢ Recall: {result['recall']:.2%}")
    print(f"  ‚Ä¢ F1-Score: {result['f1_score']:.4f}")

# ================================================================================
# COMPARISON OF TASKS FOUND IN BOTH MODELS
# ================================================================================

print("\n" + "="*80)
print("COMPARISON OF TASKS FOUND IN BOTH MODELS")
print("="*80)

# One row per successfully processed model; failed runs are left out.
comparison_data = [
    {
        'Model': outcome['name'],
        'Actual Tasks': len(outcome['actual_tasks']),
        'Found Tasks': len(outcome['tasks_found']),
        'Correct Tasks': outcome['matched_count'],
        'Missing Tasks': outcome['missing_count'],
        'False Positives': len(outcome['false_positive_list']),
        'Precision': f"{outcome['precision']:.2%}",
        'Recall': f"{outcome['recall']:.2%}",
        'F1-Score': f"{outcome['f1_score']:.4f}"
    }
    for outcome in results
    if 'error' not in outcome
]

if comparison_data:
    # (header title, row key, column width) for every table column
    table_columns = (
        ('Model', 'Model', 35),
        ('Actual', 'Actual Tasks', 8),
        ('Found', 'Found Tasks', 8),
        ('Correct', 'Correct Tasks', 8),
        ('Missing', 'Missing Tasks', 8),
        ('FP', 'False Positives', 8),
        ('Precision', 'Precision', 12),
        ('Recall', 'Recall', 12),
        ('F1', 'F1-Score', 10),
    )

    print("\n" + " ".join(f"{title:<{width}}" for title, _, width in table_columns))
    print("-" * 120)

    for row in comparison_data:
        cells = dict(row)
        # Names longer than 30 characters are cut to 27 plus an ellipsis
        if len(cells['Model']) > 30:
            cells['Model'] = cells['Model'][:27] + "..."
        print(" ".join(f"{cells[key]:<{width}}" for _, key, width in table_columns))

# ================================================================================
# DETAILED SIDE-BY-SIDE COMPARISON
# ================================================================================

print("\n" + "="*80)
print("SIDE-BY-SIDE TASK COMPARISON")
print("="*80)

# Show actual vs found tasks for each model
for i, result in enumerate(results, 1):
    # Failed runs have no task lists worth comparing.
    if 'error' in result:
        continue

    print(f"\n{'‚îÄ' * 40}")
    print(f"MODEL {i}: {result['name']}")
    print(f"{'‚îÄ' * 40}")

    print(f"{'ACTUAL TASKS':<30} {'FOUND/STATUS':<30}")
    print(f"{'‚îÄ' * 30} {'‚îÄ' * 30}")

    # Look-up of found tasks by comparison key. Keys are stripped and
    # lower-cased, the same way the metrics are computed, so a name with
    # stray whitespace that counts as a match is not reported as missing.
    found_tasks_lower = {t.strip().lower(): t.strip() for t in result['tasks_found']}

    for j, actual_task in enumerate(result['actual_tasks'], 1):
        found_version = found_tasks_lower.get(actual_task.strip().lower())
        if found_version is not None:
            status = f"‚úì Found as: {found_version}"
        else:
            status = "‚úó MISSING"

        print(f"{j:2}. {actual_task:<26} {status:<30}")

    # Show any extra found tasks (false positives)
    found_actual_tasks_lower = {t.strip().lower() for t in result['actual_tasks']}
    false_positives = [
        t for t in result['tasks_found']
        if t.strip().lower() not in found_actual_tasks_lower
    ]

    if false_positives:
        print(f"\nExtra found tasks (false positives):")
        for fp_task in false_positives:
            print(f"  ‚ö†Ô∏è {fp_task}")

# ================================================================================
# OVERALL SUMMARY (keeping existing summary with enhancements)
# ================================================================================

print("\n" + "="*80)
print("COMPREHENSIVE SUMMARY")
print("="*80)

print(f"\n{'Model':<35} {'Tasks':<8} {'Found':<8} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Status':<12}")
print("-" * 100)

# F1 floors checked from best to worst; anything below the last one is POOR.
f1_grades = ((0.9, "EXCELLENT"), (0.7, "GOOD"), (0.5, "FAIR"))

for summary in results:
    # Names longer than 30 characters are cut to 27 plus an ellipsis
    label = summary['name']
    if len(label) > 30:
        label = label[:27] + "..."

    # Errors take priority, then a perfect score, then the F1 grade
    if 'error' in summary:
        status = "ERROR"
    elif summary['precision'] == 1.0 and summary['recall'] == 1.0:
        status = "PERFECT"
    else:
        status = next(
            (grade for floor, grade in f1_grades if summary['f1_score'] >= floor),
            "POOR",
        )

    row_cells = [
        f"{label:<35}",
        f"{len(summary['actual_tasks']):<8}",
        f"{len(summary['tasks_found']):<8}",
        f"{format(summary['precision'], '.4f'):<12}",
        f"{format(summary['recall'], '.4f'):<12}",
        f"{format(summary['f1_score'], '.4f'):<12}",
        f"{status:<12}",
    ]
    print(" ".join(row_cells))

# ================================================================================
# OVERALL PERFORMANCE ANALYSIS
# ================================================================================

print("\n" + "="*80)
print("OVERALL PERFORMANCE ANALYSIS")
print("="*80)

# Aggregate over the runs that succeeded. A failed model only carries zeroed
# placeholder values, so it is excluded (and reported) instead of silently
# suppressing the whole analysis.
successful_results = [r for r in results if 'error' not in r]
failed_count = len(results) - len(successful_results)

if failed_count:
    print(f"\nNote: {failed_count} model(s) failed and are excluded from the aggregate metrics.")

if successful_results:
    # Micro-averaged metrics (pool all predictions)
    total_tp = sum(r['tp'] for r in successful_results)
    total_fp = sum(r['fp'] for r in successful_results)
    total_fn = sum(r['fn'] for r in successful_results)
    total_actual_tasks = sum(len(r['actual_tasks']) for r in successful_results)
    total_predicted_tasks = sum(len(r['tasks_found']) for r in successful_results)

    # Micro-averaged precision, recall, F1 (guarded against empty denominators)
    micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0.0

    # Macro-averaged metrics (average of individual model metrics)
    macro_precision = sum(r['precision'] for r in successful_results) / len(successful_results)
    macro_recall = sum(r['recall'] for r in successful_results) / len(successful_results)
    macro_f1 = sum(r['f1_score'] for r in successful_results) / len(successful_results)

    print(f"\nüìà MICRO-AVERAGED METRICS (pooled):")
    print(f"  ‚Ä¢ Total TP across all models: {total_tp}")
    print(f"  ‚Ä¢ Total FP across all models: {total_fp}")
    print(f"  ‚Ä¢ Total FN across all models: {total_fn}")
    print(f"  ‚Ä¢ Micro Precision: {micro_precision:.4f} ({micro_precision:.2%})")
    print(f"  ‚Ä¢ Micro Recall: {micro_recall:.4f} ({micro_recall:.2%})")
    print(f"  ‚Ä¢ Micro F1-Score: {micro_f1:.4f}")

    print(f"\nüìä MACRO-AVERAGED METRICS (average):")
    print(f"  ‚Ä¢ Macro Precision: {macro_precision:.4f} ({macro_precision:.2%})")
    print(f"  ‚Ä¢ Macro Recall: {macro_recall:.4f} ({macro_recall:.2%})")
    print(f"  ‚Ä¢ Macro F1-Score: {macro_f1:.4f}")

    print(f"\nüìã SUMMARY STATISTICS:")
    print(f"  ‚Ä¢ Total models evaluated: {len(successful_results)}")
    print(f"  ‚Ä¢ Total actual tasks: {total_actual_tasks}")
    print(f"  ‚Ä¢ Total predicted tasks: {total_predicted_tasks}")
    # Reported as TP over actual tasks; undefined when no model has tasks
    if total_actual_tasks > 0:
        print(f"  ‚Ä¢ Overall accuracy (exact matches): {(total_tp / total_actual_tasks):.2%}")
    else:
        print("  ‚Ä¢ Overall accuracy: N/A")
else:
    print("\nNo successfully processed models - aggregate metrics unavailable.")

print("\n" + "="*80)
print("PIPELINE EXECUTION COMPLETE FOR ALL MODELS ‚úÖ")
print("="*80)

RUNNING PIPELINE WITH VALIDATION FOR 2 MODELS

Starting pipeline execution for 2 models...
Models to process: Customer Onboarding Process, Order Fulfillment Process

PROCESSING MODEL 1/2: Customer Onboarding Process

MODEL: Customer Onboarding Process
Input BPMN Model (truncated):
  <?xml version="1.0" encoding="UTF-8"?>
  <definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
    <process id="OnboardingProcess">
      <startEvent id="Start"/>
      <task id="T1" name="Collect Customer Information"/>
  ...

Actual tasks in model (10 total):
   1. Collect Customer Information
   2. Verify Identity Documents
   3. Perform Background Check
   4. Assess Credit Score
   5. Review Application Form
   6. Validate Bank Details
   7. Create Customer Account
   8. Assign Account Manager
   9. Send Welcome Package
  10. Schedule Orientation Call

Generated Description:
----------------------------------------
This is a customer onboarding process. First, collect customer information. N