In [1]:
# File: BPMN_Annotated_Pipeline.ipynb
#  Exercise 4.3 - BPMN to Annotated Text Generation Pipeline
#  Pipeline that uses BPMN-annotated-text pairs in document store

import pandas as pd
import json
import re
from typing import List, Dict, Any, Optional
from haystack import Pipeline, Document, component
from haystack.components.builders import ChatPromptBuilder
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore 
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.ollama import OllamaChatGenerator

In [2]:
# Cell 2: Create Document Store with BPMN-Annotated-Text Pairs

# Updated BPMN examples with annotated text containing <bpmn:task> tags
def _linear_bpmn_xml(process_id, task_names, start_id, end_id):
    """Render a minimal BPMN document: a start event, the tasks in order, an end event.

    Tasks receive sequential ids T1, T2, ... following their position in
    ``task_names``. The result has no trailing newline.
    """
    task_lines = [
        f'    <task id="T{number}" name="{task_name}"/>'
        for number, task_name in enumerate(task_names, start=1)
    ]
    return "\n".join(
        [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">',
            f'  <process id="{process_id}">',
            f'    <startEvent id="{start_id}"/>',
            *task_lines,
            f'    <endEvent id="{end_id}"/>',
            '  </process>',
            '</definitions>',
        ]
    )


# Each pair couples a BPMN model ("xml") with a description ("annotated_text")
# in which every task name is wrapped in <bpmn:task>...</bpmn:task>.
bpmn_annotated_examples = [
    {
        "xml": _linear_bpmn_xml(
            "LoanApprovalProcess",
            ["Check Credit Score", "Verify Income", "Approve Loan"],
            start_id="StartEvent_1",
            end_id="EndEvent_1",
        ),
        "annotated_text": (
            "This is a loan approval process. "
            "The process begins with <bpmn:task>Check Credit Score</bpmn:task>, "
            "followed by <bpmn:task>Verify Income</bpmn:task>. "
            "Finally, the system performs <bpmn:task>Approve Loan</bpmn:task> "
            "to complete the process."
        ),
    },
    {
        "xml": _linear_bpmn_xml(
            "OrderProcessing",
            ["Receive Order", "Process Payment", "Pack Items", "Ship Order"],
            start_id="Start",
            end_id="End",
        ),
        "annotated_text": (
            "This is an order processing workflow. "
            "First, <bpmn:task>Receive Order</bpmn:task> from the customer. "
            "Then <bpmn:task>Process Payment</bpmn:task> for the order. "
            "Next, <bpmn:task>Pack Items</bpmn:task> for shipping. "
            "Finally, <bpmn:task>Ship Order</bpmn:task> to the customer."
        ),
    },
    {
        "xml": _linear_bpmn_xml(
            "EmployeeOnboarding",
            ["Collect Documents", "Setup Workstation", "Provide Training", "Assign Mentor"],
            start_id="StartEvent_1",
            end_id="EndEvent_1",
        ),
        "annotated_text": (
            "Employee onboarding process starts with <bpmn:task>Collect Documents</bpmn:task>. "
            "Then <bpmn:task>Setup Workstation</bpmn:task> for the new employee. "
            "After that, <bpmn:task>Provide Training</bpmn:task> on company policies. "
            "Finally, <bpmn:task>Assign Mentor</bpmn:task> to guide the employee."
        ),
    },
    {
        "xml": _linear_bpmn_xml(
            "IncidentManagement",
            [
                "Log Incident",
                "Analyze Impact",
                "Implement Fix",
                "Verify Resolution",
                "Update Documentation",
            ],
            start_id="Start",
            end_id="End",
        ),
        "annotated_text": (
            "Incident management workflow begins with <bpmn:task>Log Incident</bpmn:task>. "
            "Then <bpmn:task>Analyze Impact</bpmn:task> of the issue. "
            "Next, <bpmn:task>Implement Fix</bpmn:task> to resolve the problem. "
            "After that, <bpmn:task>Verify Resolution</bpmn:task> to ensure the fix works. "
            "Finally, <bpmn:task>Update Documentation</bpmn:task> with the solution."
        ),
    },
]

# In-memory store that will hold one Document per BPMN/annotated-text pair.
document_store = InMemoryDocumentStore()

# The BPMN XML is the document content (this is what the retriever searches);
# the annotated description and bookkeeping fields travel along as metadata.
documents = []
for idx, pair in enumerate(bpmn_annotated_examples):
    bpmn_xml = pair["xml"]
    documents.append(
        Document(
            content=bpmn_xml,
            meta={
                "annotated_text": pair["annotated_text"],
                "id": f"doc_{idx}",
                "type": "bpmn_annotated_pair",
                # Number of <task ... name="..."> elements found in the XML.
                "task_count": len(re.findall(r'<task[^>]*name="([^"]*)"', bpmn_xml)),
            },
        )
    )

# Persist the documents in the store.
document_store.write_documents(documents)

# Short summary based on the first document.
first_doc = documents[0]
first_annotation = first_doc.meta['annotated_text']

print(f"‚úì Document store created with {len(documents)} BPMN-annotated-text pairs")
print("Document store statistics:")
print(f"  - Total documents: {len(document_store.filter_documents())}")
print(f"  - Sample document 1: {first_doc.meta['id']}")
print(f"  - Sample annotated text: {first_annotation[:100]}...")
print(f"  - Task tags in annotated text: {'<bpmn:task>' in first_annotation}")

‚úì Document store created with 4 BPMN-annotated-text pairs
Document store statistics:
  - Total documents: 4
  - Sample document 1: doc_0
  - Sample annotated text: This is a loan approval process. The process begins with <bpmn:task>Check Credit Score</bpmn:task>, ...
  - Task tags in annotated text: True


In [3]:
# Cell 3: Create Retriever and Verify Document Store

# BM25 retriever over the example store created in the previous cell.
retriever = InMemoryBM25Retriever(document_store=document_store)

# Sanity check: every stored pair should tag as many tasks as its XML declares.
banner = "=" * 60
print("üîç VERIFYING DOCUMENT STORE CONTENTS:")
print(banner)

documents_in_store = document_store.filter_documents()
for position, stored_doc in enumerate(documents_in_store, start=1):
    doc_label = stored_doc.meta.get('id', 'N/A')
    annotation_preview = stored_doc.meta.get('annotated_text', 'N/A')[:80]

    print(f"\nDocument {position} (ID: {doc_label}):")
    print(f"  Content preview: {stored_doc.content[:80]}...")
    print(f"  Annotated text: {annotation_preview}...")

    # Named <task> elements declared in the BPMN XML.
    xml_task_names = re.findall(r'<task[^>]*name="([^"]*)"', stored_doc.content)
    print(f"  Tasks in BPMN: {len(xml_task_names)}")

    # Non-empty <bpmn:task>...</bpmn:task> spans in the annotated description.
    tagged_spans = re.findall(
        r'<bpmn:task>[^<]+</bpmn:task>',
        stored_doc.meta.get('annotated_text', ''),
    )
    tagged_total = len(tagged_spans)
    print(f"  Task tags in text: {tagged_total}")

    # Report whether both counts agree.
    verdict = (
        "  ‚úì Task counts match!"
        if len(xml_task_names) == tagged_total
        else "  ‚ö†Ô∏è Task counts don't match!"
    )
    print(verdict)

print("\n" + banner)
print("‚úÖ Document store verification complete")

üîç VERIFYING DOCUMENT STORE CONTENTS:

Document 1 (ID: doc_0):
  Content preview: <?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/sp...
  Annotated text: This is a loan approval process. The process begins with <bpmn:task>Check Credit...
  Tasks in BPMN: 3
  Task tags in text: 3
  ‚úì Task counts match!

Document 2 (ID: doc_1):
  Content preview: <?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/sp...
  Annotated text: This is an order processing workflow. First, <bpmn:task>Receive Order</bpmn:task...
  Tasks in BPMN: 4
  Task tags in text: 4
  ‚úì Task counts match!

Document 3 (ID: doc_2):
  Content preview: <?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/sp...
  Annotated text: Employee onboarding process starts with <bpmn:task>Collect Documents</bpmn:task>...
  Tasks in BPMN: 4
  Task tags in text: 4
  ‚úì Task counts match!

Document 4 (ID: doc_3):
  Content preview: <?xml version="1.0" encod

In [4]:
# Cell 4: Create Enhanced Prompt Template for Annotated Text Generation

# System message: fixed instructions on how task names must be tagged.
_SYSTEM_INSTRUCTIONS = """
You are an expert BPMN process analyst. Your task is to analyze BPMN XML models and generate 
natural language descriptions with proper annotations.

CRITICAL REQUIREMENTS:
1. You MUST wrap EVERY task name in <bpmn:task> tags exactly like this: <bpmn:task>Task Name</bpmn:task>
2. NEVER use any other tags or formatting for task names
3. Task names MUST match exactly what's in the BPMN XML
4. Your description should be clear, concise, and follow the logical flow

FORMAT EXAMPLE:
"This is a [process type] process. It begins with <bpmn:task>First Task</bpmn:task>, 
followed by <bpmn:task>Second Task</bpmn:task>. Finally, <bpmn:task>Third Task</bpmn:task> 
completes the workflow."

BAD EXAMPLES:
- "First, check credit score" (missing tags)
- "<task>Check Credit Score</task>" (wrong tag format)
- "[task]Check Credit Score[/task]" (wrong format)

Your output MUST include <bpmn:task> tags for EVERY task mentioned.
    """

# User message: Jinja template that lists the retrieved examples (`examples`)
# followed by the model to describe (`query_bpmn`).
_USER_TEMPLATE = """
--- EXAMPLES OF BPMN MODELS WITH ANNOTATED DESCRIPTIONS ---
{% for example in examples %}
Example {{loop.index}}:
BPMN XML:
{{example.content}}

Annotated Description:
{{example.meta.annotated_text}}
{% endfor %}
-----------------------------------------------------------

Now, analyze the following BPMN XML model and create an annotated description for it:
{{query_bpmn}}

IMPORTANT: Wrap EVERY task name in <bpmn:task> tags like this: <bpmn:task>Task Name</bpmn:task>
    """

prompt_template = [
    ChatMessage.from_system(_SYSTEM_INSTRUCTIONS),
    ChatMessage.from_user(_USER_TEMPLATE),
]

# Both template variables are declared as required.
prompt_builder = ChatPromptBuilder(
    template=prompt_template,
    required_variables=["query_bpmn", "examples"],
)

print("‚úì Prompt template created for annotated text generation")
print(f"Required variables: {prompt_builder.required_variables}")

‚úì Prompt template created for annotated text generation
Required variables: ['query_bpmn', 'examples']


In [5]:
# Cell 5: Create Generator Component

# Chat generator backed by a local Ollama server.
#
# generation_kwargs are forwarded to Ollama as model options. Ollama has no
# "max_tokens" option (that name belongs to OpenAI-style APIs), so the previous
# setting never limited the output length. The Ollama option that caps the
# number of generated tokens is "num_predict".
chat_generator = OllamaChatGenerator(
    model="llama3.1:8b",
    url="http://localhost:11434",
    timeout=30*60,  # 30 minutes, expressed in seconds
    generation_kwargs={
        "temperature": 0.3,
        "num_predict": 500,
    }
)

print("‚úì Chat generator configured")
print(f"Model: {chat_generator.model}")
print(f"URL: {chat_generator.url}")

‚úì Chat generator configured
Model: llama3.1:8b
URL: http://localhost:11434


In [6]:
# Cell 6: Build the Pipeline

# Assemble retriever -> prompt builder -> generator into one pipeline.
# NOTE(review): Haystack appears to reject a component instance that already
# belongs to another Pipeline, so re-running this cell presumably requires
# re-running the cells that create the components first -- confirm.
bpmn_pipeline = Pipeline()

# Register each stage under the name used for run-time inputs and connections.
for stage_name, stage in (
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("generator", chat_generator),
):
    bpmn_pipeline.add_component(instance=stage, name=stage_name)

# Wire outputs to inputs: retrieved documents become the prompt's examples,
# and the rendered prompt becomes the generator's messages.
for sender, receiver in (
    ("retriever.documents", "prompt_builder.examples"),
    ("prompt_builder.prompt", "generator.messages"),
):
    bpmn_pipeline.connect(sender, receiver)

for summary_line in (
    "‚úÖ Pipeline built successfully!",
    "\nPipeline Components:",
    "  - retriever: InMemoryBM25Retriever",
    "  - prompt_builder: ChatPromptBuilder",
    "  - generator: OllamaChatGenerator",
    "\nConnections:",
    "  - retriever.documents ‚Üí prompt_builder.examples",
    "  - prompt_builder.prompt ‚Üí generator.messages",
):
    print(summary_line)

‚úÖ Pipeline built successfully!

Pipeline Components:
  - retriever: InMemoryBM25Retriever
  - prompt_builder: ChatPromptBuilder
  - generator: OllamaChatGenerator

Connections:
  - retriever.documents ‚Üí prompt_builder.examples
  - prompt_builder.prompt ‚Üí generator.messages


In [7]:
# Cell 7: Test Function with Enhanced Task Extraction

# Task elements in BPMN XML: <task>, <userTask>, <serviceTask>, ... with an
# optional namespace prefix (e.g. <bpmn:userTask>). Group 1 is the value of the
# element's `name` attribute.
_BPMN_TASK_NAME_RE = re.compile(r'<(?:[\w.-]+:)?(?:task|\w+Task)\b[^>]*?\sname="([^"]*)"')

# Annotation format the model is instructed to emit: <bpmn:task>Task Name</bpmn:task>.
_ANNOTATED_TASK_RE = re.compile(r'<bpmn:task>([^<]+)</bpmn:task>')


def _xml_task_names(model_xml: str) -> List[str]:
    """Return the names of all task elements in the BPMN XML, in document order."""
    return _BPMN_TASK_NAME_RE.findall(model_xml)


def _tagged_task_names(annotated_text: str) -> List[str]:
    """Return the whitespace-trimmed content of every <bpmn:task> tag, in order of appearance."""
    return [name.strip() for name in _ANNOTATED_TASK_RE.findall(annotated_text)]


def _task_metrics(found_tasks: List[str], actual_tasks: List[str]) -> Dict[str, Any]:
    """Score found vs. actual task names as case-insensitive, whitespace-trimmed sets."""
    found = {t.lower().strip() for t in found_tasks}
    actual = {t.lower().strip() for t in actual_tasks}

    matched = found & actual
    missing = actual - found

    tp = len(matched)
    fp = len(found - actual)
    fn = len(missing)

    # Each ratio falls back to 0.0 when its denominator would be zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        # Sorted so the report and the returned lists are deterministic.
        'matched_tasks': sorted(matched),
        'missing_tasks': sorted(missing),
    }


def run_annotated_pipeline(model_xml: str, model_name: str, top_k: int = 2) -> Dict[str, Any]:
    """Generate an annotated description for one BPMN model and score its task tags.

    Runs the module-level ``bpmn_pipeline`` with ``model_xml`` as both the
    retrieval query and the model to describe, prints a report, and compares the
    task names found inside ``<bpmn:task>`` tags of the generated text with the
    task names declared in the XML (case-insensitive, as sets).

    Args:
        model_xml: BPMN XML of the model to describe.
        model_name: Label used in the printed report and returned under 'name'.
        top_k: Number of example documents requested from the retriever.

    Returns:
        A dict with the keys 'name', 'annotated_text', 'found_tasks',
        'actual_tasks', 'precision', 'recall', 'f1_score', 'tp', 'fp', 'fn',
        'tag_count', 'matched_tasks' and 'missing_tasks'. 'found_tasks' holds
        the trimmed tag contents in order of appearance (duplicates kept);
        'matched_tasks' and 'missing_tasks' are sorted lists of lower-cased names.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"MODEL: {model_name}")
    print(rule)

    # Ground truth: task names declared in the XML.
    actual_tasks = _xml_task_names(model_xml)
    print(f"\nüìã Actual tasks in BPMN model ({len(actual_tasks)}):")
    for i, task in enumerate(actual_tasks, 1):
        print(f"  {i:2}. {task}")

    print("\nüöÄ Running pipeline...")
    result = bpmn_pipeline.run({
        "retriever": {"query": model_xml, "top_k": top_k},
        "prompt_builder": {"query_bpmn": model_xml}
    })

    # An empty reply list or a reply without text is scored as "no tasks found"
    # instead of failing with IndexError / TypeError further down.
    replies = result["generator"]["replies"]
    annotated_text = (replies[0].text or "") if replies else ""

    print("\n‚úÖ GENERATED ANNOTATED TEXT:")
    print("-" * 50)
    print(annotated_text)
    print("-" * 50)

    # One pattern is enough: [^<]+ already captures surrounding whitespace,
    # which the helper strips from each name.
    found_tasks = _tagged_task_names(annotated_text)

    print(f"\nüîç TASKS EXTRACTED FROM ANNOTATED TEXT ({len(found_tasks)} found):")
    if found_tasks:
        for i, task in enumerate(found_tasks, 1):
            print(f"  {i:2}. {task}")
    else:
        print("  No tasks found with <bpmn:task> tags!")

        # Diagnostic only: show whether the model used some other markup instead.
        print("\n‚ö†Ô∏è DEBUG: Checking for alternative formatting...")
        patterns_to_check = [
            ('<task> tags', r'<task>([^<]+)</task>'),
            ('Bold text', r'\*\*([^*]+)\*\*'),
            ('Quoted text', r'"([^"]+)"'),
            ('Code format', r'`([^`]+)`'),
        ]
        for pattern_name, pattern in patterns_to_check:
            matches = re.findall(pattern, annotated_text)
            if matches:
                print(f"  Found with {pattern_name}: {matches}")

    metrics = _task_metrics(found_tasks, actual_tasks)
    matched_tasks = metrics['matched_tasks']
    missing_tasks = metrics['missing_tasks']

    # Number of opening <bpmn:task> tags, whether or not they are well-formed.
    tag_count = len(re.findall(r'<bpmn:task>', annotated_text))

    print("\nüìä PERFORMANCE METRICS:")
    print(f"  ‚Ä¢ True Positives (TP): {metrics['tp']}")
    print(f"  ‚Ä¢ False Positives (FP): {metrics['fp']}")
    print(f"  ‚Ä¢ False Negatives (FN): {metrics['fn']}")
    print(f"  ‚Ä¢ Precision: {metrics['precision']:.4f} ({metrics['precision']:.2%})")
    print(f"  ‚Ä¢ Recall: {metrics['recall']:.4f} ({metrics['recall']:.2%})")
    print(f"  ‚Ä¢ F1-Score: {metrics['f1_score']:.4f}")
    print(f"  ‚Ä¢ <bpmn:task> tags used: {tag_count}")

    if matched_tasks:
        print(f"\n‚úÖ CORRECTLY IDENTIFIED TASKS ({len(matched_tasks)}):")
        for task in matched_tasks:
            print(f"    ‚úì {task}")

    if missing_tasks:
        print(f"\n‚ùå MISSING TASKS ({len(missing_tasks)}):")
        for task in missing_tasks:
            print(f"    ‚úó {task}")

    return {
        'name': model_name,
        'annotated_text': annotated_text,
        'found_tasks': found_tasks,
        'actual_tasks': actual_tasks,
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1_score': metrics['f1_score'],
        'tp': metrics['tp'],
        'fp': metrics['fp'],
        'fn': metrics['fn'],
        'tag_count': tag_count,
        'matched_tasks': matched_tasks,
        'missing_tasks': missing_tasks
    }

In [8]:
# Cell 8: Test with Multiple BPMN Models

# Define test BPMN models (without descriptions)
# BPMN XML of the three held-out models; none of them has a reference description.
_COMPLAINT_PROCESS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="ComplaintProcess">
    <startEvent id="Start"/>
    <task id="T1" name="Receive Complaint"/>
    <task id="T2" name="Investigate Issue"/>
    <task id="T3" name="Escalate if Needed"/>
    <task id="T4" name="Provide Solution"/>
    <task id="T5" name="Follow Up"/>
    <endEvent id="End"/>
  </process>
</definitions>"""

_INVOICE_PROCESS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="InvoiceProcess">
    <startEvent id="StartEvent_1"/>
    <task id="T1" name="Receive Invoice"/>
    <task id="T2" name="Validate Invoice"/>
    <task id="T3" name="Approve Payment"/>
    <task id="T4" name="Process Payment"/>
    <task id="T5" name="Update Records"/>
    <task id="T6" name="Send Confirmation"/>
    <endEvent id="EndEvent_1"/>
  </process>
</definitions>"""

_DEPLOYMENT_PROCESS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<definitions xmlns="http://www.omg.org/spec/BPMN/20100524/MODEL">
  <process id="DeploymentProcess">
    <startEvent id="Start"/>
    <task id="T1" name="Plan Deployment"/>
    <task id="T2" name="Prepare Environment"/>
    <task id="T3" name="Deploy Code"/>
    <task id="T4" name="Run Tests"/>
    <task id="T5" name="Monitor Performance"/>
    <task id="T6" name="Document Changes"/>
    <endEvent id="End"/>
  </process>
</definitions>"""

# One entry per model: a display name plus the BPMN XML fed to the pipeline.
test_models = [
    {"name": "Customer Complaint Process", "xml": _COMPLAINT_PROCESS_XML},
    {"name": "Invoice Processing", "xml": _INVOICE_PROCESS_XML},
    {"name": "Software Deployment", "xml": _DEPLOYMENT_PROCESS_XML},
]

# Run every test model through the pipeline and collect one result dict per model.
suite_rule = "=" * 80
print(suite_rule)
print("RUNNING ANNOTATED TEXT GENERATION PIPELINE")
print(suite_rule)
print(f"Testing {len(test_models)} BPMN models...\n")

results = []
for test_no, spec in enumerate(test_models, start=1):
    print(f"\n{suite_rule}")
    print(f"TEST {test_no}/{len(test_models)}: {spec['name']}")
    print(suite_rule)

    try:
        result = run_annotated_pipeline(spec['xml'], spec['name'])
    except Exception as e:
        # Best effort: a failing model is reported and recorded with zero
        # scores so the remaining models still run.
        print(f"\n‚ùå Error processing model {test_no}: {e}")
        results.append({
            'name': spec['name'],
            'error': str(e),
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0
        })
    else:
        results.append(result)
        print(f"\n‚úÖ Model {test_no} processed successfully")

RUNNING ANNOTATED TEXT GENERATION PIPELINE
Testing 3 BPMN models...


TEST 1/3: Customer Complaint Process

MODEL: Customer Complaint Process

üìã Actual tasks in BPMN model (5):
   1. Receive Complaint
   2. Investigate Issue
   3. Escalate if Needed
   4. Provide Solution
   5. Follow Up

üöÄ Running pipeline...

‚úÖ GENERATED ANNOTATED TEXT:
--------------------------------------------------
Here is the annotated description for the given BPMN XML model:

This is a complaint processing workflow. First, <bpmn:task>Receive Complaint</bpmn:task> from the customer. Then <bpmn:task>Investigate Issue</bpmn:task> to identify the root cause of the problem. Next, <bpmn:task>Escalate if Needed</bpmn:task> to involve higher management or external parties if necessary. After that, <bpmn:task>Provide Solution</bpmn:task> to resolve the customer's issue. Finally, <bpmn:task>Follow Up</bpmn:task> with the customer to ensure they are satisfied with the resolution.
--------------------------------