## GPU Configuration

In [1]:
import torch

# Check if CUDA is available
def check_cuda():
    """Print whether CUDA is usable and, when it is, basic details of the GPU setup."""
    cuda_ready = torch.cuda.is_available()
    print("CUDA Available:", cuda_ready)

    # Guard clause: nothing more to report when running on CPU only.
    if not cuda_ready:
        print("CUDA is not available. Running on CPU.")
        return

    print("GPU Name:", torch.cuda.get_device_name(0))
    print("CUDA Version:", torch.version.cuda)
    print("Number of GPUs:", torch.cuda.device_count())
    print("Current Device:", torch.cuda.current_device())

# Test simple computation on GPU
def gpu_computation_test():
    """Run a 1000x1000 matrix multiplication on the GPU as a smoke test.

    Prints a skip message and returns early when CUDA is not available.
    Otherwise multiplies two random matrices on the CUDA device and prints
    the shape of the result once the computation has actually finished.

    Returns:
        None
    """
    if not torch.cuda.is_available():
        print("Skipping computation test as CUDA is not available.")
        return

    device = torch.device("cuda")
    A = torch.randn(1000, 1000, device=device)
    B = torch.randn(1000, 1000, device=device)

    print("Running matrix multiplication on GPU...")
    C = torch.matmul(A, B)
    # CUDA kernels are launched asynchronously: without this barrier the
    # success message could be printed before the matmul has completed, and a
    # failing kernel would only surface later in an unrelated cell.
    torch.cuda.synchronize(device)
    print("Computation successful! Output shape:", C.shape)
    
# Run both checks: report the CUDA setup, then run the GPU matmul smoke test
check_cuda()
gpu_computation_test()


CUDA Available: True
GPU Name: NVIDIA GeForce RTX 4070 Laptop GPU
CUDA Version: 12.0
Number of GPUs: 1
Current Device: 0
Running matrix multiplication on GPU...
Computation successful! Output shape: torch.Size([1000, 1000])


# Importing Dataset and Initial Inspection

In [1]:
from datasets import load_dataset
import pandas as pd

# Load the BPMN-IT dataset by its repository id; the progress output below
# shows train, validation and test splits being generated.
dataset = load_dataset("gcelikmasat-work/BPMN-IT-Dataset")

train_dataset.jsonl:  19%|#9        | 21.0M/110M [00:00<?, ?B/s]

val_dataset.jsonl:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

test_dataset.jsonl:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23912 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2986 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2990 [00:00<?, ? examples/s]

In [2]:
# Show the train split's feature names and row count
print(dataset['train'])

Dataset({
    features: ['id', 'file_name', 'category', 'instruction', 'input', 'output'],
    num_rows: 23912
})


In [7]:
# Convert the train split to a pandas DataFrame for inspection
df = dataset['train'].to_pandas()

In [8]:
# Preview the first rows of the train split
df.head()

Unnamed: 0,id,file_name,category,instruction,input,output
0,0,accounts_receivable_process_0.gv,accounts_receivable_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following description is about the a...,digraph accounts_receivable_process_0 {\n\tgra...
1,1,accounts_receivable_process_1.gv,accounts_receivable_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe text below is about the accounts rec...,digraph accounts_receivable_process_1 {\n\tgra...
2,2,accounts_receivable_process_10.gv,accounts_receivable_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following text is about the accounts...,digraph accounts_receivable_process_10 {\n\tgr...
3,3,accounts_receivable_process_100.gv,accounts_receivable_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following text is about the accounts...,digraph accounts_receivable_process_100 {\n\tg...
4,4,accounts_receivable_process_1000.gv,accounts_receivable_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe text below is about the accounts rec...,digraph accounts_receivable_process_1000 {\n\t...


In [11]:
# List the distinct process categories in order of first appearance
df['category'].unique()

array(['accounts_receivable_process', 'account_payable_process',
       'budget_preparation_process', 'churn_rate_prevention_process',
       'client_onboarding_process_for_a_marketing_agency',
       'content_promotion_process',
       'customer_support_process_for_the_ticket_management',
       'employee_onboarding_process', 'final_grades_submission_process',
       'loan_application_process', 'order_fulfillment_process',
       'process_for_optimizing_a_process', 'project_management_process',
       'purchase_order_workflow',
       'startup_due_diligence_for_a_venture_capitalist'], dtype=object)

In [13]:
# Count the distinct process categories
df['category'].nunique()

15

In [15]:
# Rows per category; the output below shows that not every category has 1600 rows
df.groupby('category').size()

category
account_payable_process                               1600
accounts_receivable_process                           1600
budget_preparation_process                            1600
churn_rate_prevention_process                         1570
client_onboarding_process_for_a_marketing_agency      1600
content_promotion_process                             1600
customer_support_process_for_the_ticket_management    1600
employee_onboarding_process                           1600
final_grades_submission_process                       1600
loan_application_process                              1600
order_fulfillment_process                             1600
process_for_optimizing_a_process                      1558
project_management_process                            1600
purchase_order_workflow                               1600
startup_due_diligence_for_a_venture_capitalist        1584
dtype: int64

# Explorative Preprocessing Attempt using Input Description Classifier (see 7.3)
During dataset preprocessing, it was assumed that labelling specific spans in the textual descriptions would allow more accurate User Stories and BDD scenarios to be generated. The bpmn-information-extraction-v2 model was therefore applied, with the resulting columns also being fed into the o3 model. However, trial and error showed that these columns only increased hallucination, so they were eventually dropped, leaving only the input description and DOT output in the final cases. Because of this initially hypothesized use, the columns still appear in the upcoming data cleaning steps.

## Importing Transformer Model

In [17]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Token-classification pipeline backed by the bpmn-information-extraction-v2
# model; `pipe` is called on each description in extract_tokens_from_row below.
pipe = pipeline("token-classification", model="jtlicardo/bpmn-information-extraction-v2")

Device set to use cuda:0


In [None]:
def extract_tokens_from_row(row, classifier=None):
    """Classify the tokens of ``row["input"]`` and group the words per label.

    Args:
        row: Mapping with an ``"input"`` key holding the text to classify.
        classifier: Optional callable that takes the text and returns a list
            of dicts with ``"entity"`` and ``"word"`` keys (and optionally
            ``"index"``). Defaults to the module-level ``pipe``.

    Returns:
        dict mapping each label to a space-joined string of the words tagged
        with it. Every label in ``required_labels`` is always present; a label
        for which no token was returned gets the placeholder string ``"O"``.
    """
    # Labels that must always be present in the returned dictionary
    required_labels = ['B-TASK_INFO', 'I-TASK', 'O', 'B-TASK', 'I-PROCESS_INFO', 'B-PROCESS_INFO', 'I-TASK_INFO', 'I-AGENT', 'B-AGENT', 'I-CONDITION', 'B-CONDITION']

    if classifier is None:
        classifier = pipe  # module-level token-classification pipeline

    # Classify the description text of this row
    text = row["input"]
    results = classifier(text)

    # label -> list of words, in the order the labels are first encountered
    categorized_tokens = {}
    prev_entity = None
    prev_index = None

    for result in results:
        entity = result["entity"]
        word = result["word"]
        index = result.get("index")

        # A leading "##" marks a WordPiece continuation of the previous piece
        is_continuation = word.startswith("##")
        piece = word.replace("##", "")

        words = categorized_tokens.setdefault(entity, [])

        # Only glue the piece onto the previous word when that word is really
        # its predecessor: same label and (when positions are reported)
        # directly adjacent. Previously every piece was joined with a space,
        # which split words apart (e.g. "record ing").
        directly_follows = prev_entity == entity and (
            prev_index is None or index is None or index == prev_index + 1
        )
        if is_continuation and words and directly_follows:
            words[-1] += piece
        else:
            words.append(piece)

        prev_entity, prev_index = entity, index

    # Join the words that belong to the same label
    categorized_tokens = {k: " ".join(v) for k, v in categorized_tokens.items()}

    # Fill in labels that did not occur so the returned keys are stable
    for label in required_labels:
        categorized_tokens.setdefault(label, "O")

    return categorized_tokens

In [None]:
# Apply token classification to the entire dataset without converting to pandas.
# batched=False calls extract_tokens_from_row with one row at a time, and
# load_from_cache_file=False asks for recomputation instead of a cached result.
dataset = dataset.map(extract_tokens_from_row, batched=False, load_from_cache_file=False)

In [None]:
# Inspect the dataset after the token-classification columns were added
print(dataset)

In [None]:
# Keep only the train split for the export below
train_dataset = dataset['train']

In [None]:
# Persist the enriched train split. The file name matches the one read back in
# the Data Cleaning section and the v2 extraction model used above; the old
# "modelv1" name left that later read pointing at a file this notebook never wrote.
train_dataset.to_csv('train_dataset_modelv2.csv')

# Data Cleaning

First, the initial sentence is removed. This sentence simply follows the format "The following description is about {category}". However, the category is not always reflective of the process, so the first sentence might throw off an LLM. Afterwards, we check what is left. Some initial scoping already suggested that the second sentence always starts with "it". Code is added to verify whether this holds for all samples.

In [26]:
import pandas as pd
import re

def remove_initial_sentence(text):
    """Strip everything up to and including the first period, plus the whitespace after it.

    Text without any period is returned unchanged.
    """
    # DOTALL lets the lazy prefix span the leading newlines of the description.
    return re.sub(r'^.*?\.\s*', '', text, count=1, flags=re.DOTALL)

# Function to extract the first word (lowercased) from a text.
def get_first_word(text):
    """Return the first whitespace-delimited word of ``text`` in lowercase, or None if there is none."""
    words = text.split()
    return words[0].lower() if words else None

# Read the exported train split (adjust the file path as necessary)
df = pd.read_csv("train_dataset_modelv2.csv")

# Drop the leading boilerplate sentence from every description
df['input'] = df['input'].apply(remove_initial_sentence)

# Record the lowercased first word of what remains
df['first_word'] = df['input'].apply(get_first_word)

# Boolean mask of the rows whose cleaned text starts with "it"
starts_with_it = df['first_word'] == "it"
all_start_with_it = starts_with_it.all()

print("Do all rows' inputs start with 'it'? :", all_start_with_it)

# Show a sample of the rows that break the pattern, if there are any
non_it_rows = df[~starts_with_it]
if non_it_rows.empty:
    print("All rows start with 'it'.")
else:
    print("Rows not starting with 'it':")
    print(non_it_rows[['id', 'first_word', 'input']].head())

Do all rows' inputs start with 'it'? : True
All rows start with 'it'.


All rows start with "it", so to remove ambiguity, the leading "it" is replaced by "The process".

In [27]:
def replace_initial_it(text):
    """Swap a standalone "it" at the very start of ``text`` (any casing) for "The process"."""
    # ^ anchors to the start of the text; \b keeps words such as "item" untouched.
    leading_it = re.compile(r'^(it)\b', re.IGNORECASE)
    return leading_it.sub("The process", text)

# Replace the initial "it" with "The process" in every description
# (overwrites the 'input' column of df)
df['input'] = df['input'].apply(replace_initial_it)

# Display a few examples to verify the replacement
print(df['input'].head())


0    The process starts with recording the buyer's ...
1    The process begins when you record the buyer's...
2    The process begins when you record the buyer's...
3    The process starts with recording the buyer's ...
4    The process starts with recording the buyer's ...
Name: input, dtype: object


In [28]:
# Summarize columns, non-null counts and dtypes after the text cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23912 entries, 0 to 23911
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              23912 non-null  int64 
 1   file_name       23912 non-null  object
 2   category        23912 non-null  object
 3   instruction     23912 non-null  object
 4   input           23912 non-null  object
 5   output          23912 non-null  object
 6   I-PROCESS_INFO  23912 non-null  object
 7   B-TASK          23912 non-null  object
 8   I-TASK          23912 non-null  object
 9   B-TASK_INFO     23912 non-null  object
 10  I-TASK_INFO     23912 non-null  object
 11  B-PROCESS_INFO  23912 non-null  object
 12  O               23912 non-null  object
 13  I-AGENT         23912 non-null  object
 14  B-AGENT         23912 non-null  object
 15  I-CONDITION     23912 non-null  object
 16  B-CONDITION     23912 non-null  object
 17  first_word      23912 non-null  object
dtypes: int

Finally, the "instruction", "file_name", "O" and "first_word" columns are dropped.

In [29]:
# Drop the columns that are no longer needed. Reassigning instead of using
# inplace=True, together with errors="ignore", keeps this cell re-runnable:
# a second run is a no-op rather than a KeyError for already-removed columns.
df = df.drop(columns=["file_name", "instruction", "O", "first_word"], errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23912 entries, 0 to 23911
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              23912 non-null  int64 
 1   category        23912 non-null  object
 2   input           23912 non-null  object
 3   output          23912 non-null  object
 4   I-PROCESS_INFO  23912 non-null  object
 5   B-TASK          23912 non-null  object
 6   I-TASK          23912 non-null  object
 7   B-TASK_INFO     23912 non-null  object
 8   I-TASK_INFO     23912 non-null  object
 9   B-PROCESS_INFO  23912 non-null  object
 10  I-AGENT         23912 non-null  object
 11  B-AGENT         23912 non-null  object
 12  I-CONDITION     23912 non-null  object
 13  B-CONDITION     23912 non-null  object
dtypes: int64(1), object(13)
memory usage: 2.6+ MB


# Case Creation

In [16]:
# Define the mapping of categories to each overarching process.
# Note: budget_preparation_process, final_grades_submission_process and
# startup_due_diligence_for_a_venture_capitalist are not part of any group.
financial_categories = [
    "accounts_receivable_process",
    "account_payable_process",
    "loan_application_process",
    "purchase_order_workflow",
    "order_fulfillment_process",
]

marketing_categories = [
    "client_onboarding_process_for_a_marketing_agency",
    "churn_rate_prevention_process",
    "content_promotion_process",
    "employee_onboarding_process"
]

internal_categories = [
    "customer_support_process_for_the_ticket_management",
    "process_for_optimizing_a_process",
    "project_management_process",
]

# Inclusive (start, end) positional row ranges of each category in the train
# frame, following the per-category row counts shown earlier (23912 rows in
# total, so the last valid position is 23911).
category_ranges = {
    "accounts_receivable_process": (0, 1599),
    "account_payable_process": (1600, 3199),
    "budget_preparation_process": (3200, 4799),
    "churn_rate_prevention_process": (4800, 6369),
    "client_onboarding_process_for_a_marketing_agency": (6370, 7969),
    "content_promotion_process": (7970, 9569),
    "customer_support_process_for_the_ticket_management": (9570, 11169),
    "employee_onboarding_process": (11170, 12769),
    "final_grades_submission_process": (12770, 14369),
    "loan_application_process": (14370, 15969),
    "order_fulfillment_process": (15970, 17569),
    "process_for_optimizing_a_process": (17570, 19127),
    "project_management_process": (19128, 20727),
    "purchase_order_workflow": (20728, 22327),
    # 1584 rows in this category: the range ends at the last row of the frame
    # (the previous end of 23927 pointed past it).
    "startup_due_diligence_for_a_venture_capitalist": (22328, 23911),
}

In [17]:
def create_alternating_df(master_df, category_ranges, categories):
    """Interleave the rows of several categories, one row per category per case.

    For case ``i`` the output contains the ``i``-th row of every category's
    positional slice, in the order given by ``categories``. The number of
    cases is the size of the smallest slice, so all categories stay aligned.

    Args:
        master_df: DataFrame whose rows are addressed by position.
        category_ranges: Mapping of category name to an inclusive
            ``(start, end)`` pair of positional row indices in ``master_df``.
        categories: Category names to interleave, in output order.

    Returns:
        A new DataFrame with the columns of ``master_df`` plus
        ``source_category`` (the category the row was taken for) and ``case``
        (the zero-based case number), with a fresh 0..n-1 index. An empty
        ``categories`` yields an empty frame with those columns.

    Raises:
        KeyError: If a category is missing from ``category_ranges``.
        IndexError: If a range points past the end of ``master_df``.
    """
    categories = list(categories)
    starts = [category_ranges[cat][0] for cat in categories]

    # Smallest slice size decides how many complete cases exist; `default`
    # avoids the ValueError that min() raises on an empty category list.
    min_cases = min(
        (category_ranges[cat][1] - category_ranges[cat][0] + 1 for cat in categories),
        default=0,
    )
    min_cases = max(min_cases, 0)

    # Case-major order: all categories of case 0, then all of case 1, ...
    positions = [start + case for case in range(min_cases) for start in starts]

    # One positional selection instead of a per-row iloc/copy loop: the column
    # dtypes are kept as they are and no intermediate Series objects are built.
    alternating_df = master_df.iloc[positions].copy()
    alternating_df['source_category'] = categories * min_cases
    alternating_df['case'] = [case for case in range(min_cases) for _ in categories]

    return alternating_df.reset_index(drop=True)

In [18]:
# Build one interleaved frame per overarching process group.
# NOTE(review): the outputs displayed below still contain the file_name and
# instruction columns and the un-cleaned input text, so these frames appear to
# have been built from a pre-cleaning df - re-run after Data Cleaning to confirm.
df_financial, df_marketing, df_internal = (
    create_alternating_df(df, category_ranges, group)
    for group in (financial_categories, marketing_categories, internal_categories)
)

In [19]:
# Preview the interleaved marketing frame: one row per category, then the next case
df_marketing.head()

Unnamed: 0,id,file_name,category,instruction,input,output,source_category,case
0,6370,client_onboarding_process_for_a_marketing_agen...,client_onboarding_process_for_a_marketing_agency,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following description is about the c...,digraph client_onboarding_process_for_a_market...,client_onboarding_process_for_a_marketing_agency,0
1,4800,churn_rate_prevention_process_0.gv,churn_rate_prevention_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following description is about the c...,digraph churn_rate_prevention_process_0 {\n\tg...,churn_rate_prevention_process,0
2,7970,content_promotion_process_0.gv,content_promotion_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following text is about the content ...,digraph content_promotion_process_0 {\n\tgraph...,content_promotion_process,0
3,11170,employee_onboarding_process_0.gv,employee_onboarding_process,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe text below is about the employee onb...,digraph employee_onboarding_process_0 {\n\tgra...,employee_onboarding_process,0
4,6371,client_onboarding_process_for_a_marketing_agen...,client_onboarding_process_for_a_marketing_agency,You are an expert in BPMN modeling and DOT lan...,\n\n\nThe following description is about the c...,digraph client_onboarding_process_for_a_market...,client_onboarding_process_for_a_marketing_agency,1


In [20]:
# Non-null count per column of the marketing frame
df_marketing.count()

id                 6280
file_name          6280
category           6280
instruction        6280
input              6280
output             6280
source_category    6280
case               6280
dtype: int64

In [23]:
# Number of complete cases: rows divided by categories per case. Computed from
# the data instead of numbers copied by hand from the previous output.
print(len(df_marketing) / len(marketing_categories))

1570.0


In [21]:
# Non-null count per column of the financial frame
df_financial.count()

id                 8000
file_name          8000
category           8000
instruction        8000
input              8000
output             8000
source_category    8000
case               8000
dtype: int64

In [24]:
# Number of complete cases: rows divided by categories per case. Computed from
# the data instead of numbers copied by hand from the previous output.
print(len(df_financial) / len(financial_categories))

1600.0


In [25]:
# Non-null count per column of the internal-operations frame
df_internal.count()

id                 4674
file_name          4674
category           4674
instruction        4674
input              4674
output             4674
source_category    4674
case               4674
dtype: int64

In [26]:
# Number of complete cases: rows divided by categories per case. Computed from
# the data instead of numbers copied by hand from the previous output.
print(len(df_internal) / len(internal_categories))

1558.0


In [34]:
import os

# Each prompt group maps to (interleaved DataFrame, rows per tuple)
datasets = {
    "Financial_Transactions_&_Procurement": (df_financial, 5),
    "Customer_Engagement_&_Marketing": (df_marketing, 4),
    "Internal_Operations_&_Process_Management": (df_internal, 3),
}

# Make sure the export directory exists
export_dir = "tuple_prompts"
os.makedirs(export_dir, exist_ok=True)

# Write one text file per tuple of consecutive rows in every group
for name, (df_cat, tuple_size) in datasets.items():
    title = name.replace('_', ' ')
    num_tuples = len(df_cat) // tuple_size

    for tuple_no in range(1, num_tuples + 1):
        # Rows are interleaved, so each consecutive chunk holds one row per category
        chunk = df_cat.iloc[(tuple_no - 1) * tuple_size : tuple_no * tuple_size]

        # Collect the pieces of the prompt and join them once at the end
        parts = [f"### {title} — Tuple {tuple_no}\n\n"]
        for process_no, (_, row) in enumerate(chunk.iterrows(), start=1):
            parts.append(f"---\nProcess {process_no}\n")
            parts.append(f"Input:\n{row['input']}\n\n")
            parts.append(f"Output:\n{row['output']}\n\n")

        filepath = os.path.join(export_dir, f"{name}_tuple_{tuple_no}.txt")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

In [35]:
import shutil

# Bundle the exported prompt files into a single zip archive
folder_to_zip = 'tuple_prompts'
output_filename = 'tuple_prompts_zip'  # the ".zip" extension is appended by make_archive

shutil.make_archive(base_name=output_filename, format='zip', root_dir=folder_to_zip)

'/home/jorick/tuple_prompts_zip.zip'