In [2]:
# --------------------------
# Force usage of GPU 3 before any TF import
# --------------------------
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Force only GPU 3

# --------------------------
# Imports
# --------------------------
import json
import gc
import numpy as np
import pandas as pd
import pm4py
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.mixed_precision import set_global_policy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import keras_tuner as kt

# --------------------------
# Initialize environment
# --------------------------
tf.random.set_seed(42)
np.random.seed(42)
set_global_policy("mixed_float16")
tf.config.experimental.enable_op_determinism()

# --------------------------
# Detect GPU and set memory growth
# --------------------------
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"‚úÖ GPU(s) detected: {[gpu.name for gpu in gpus]}")
    except RuntimeError as e:
        print(f"‚ö†Ô∏è GPU memory growth setup failed: {e}")
else:
    print("‚ö†Ô∏è No GPU detected, running on CPU")

# Use only the single visible GPU (GPU 3) with OneDeviceStrategy
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
print(f"‚úÖ Number of GPUs available in strategy: {strategy.num_replicas_in_sync}")

# --------------------------
# Load event log
# --------------------------
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)
    return event_log

event_log = import_xes("BPI Challenge 2018.xes")
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
df = df.sort_values(by=['org:resource', 'time:timestamp'])

# --------------------------
# Sequence creation
# --------------------------
def create_activity_sequences(df, prefix_length):
    sequences, next_activities, resources = [], [], []
    for resource, resource_df in df.groupby('org:resource'):
        activities = resource_df['concept:name'].values
        if len(activities) >= prefix_length + 1:
            sequences.append(activities[:prefix_length])
            next_activities.append(activities[prefix_length])
            resources.append(resource)
    sequences_df = pd.DataFrame(sequences, columns=[f"activity_{i+1}" for i in range(prefix_length)])
    sequences_df['next_activity'] = next_activities
    sequences_df['org:resource'] = resources
    return sequences_df

# --------------------------
# Oversample rare classes proportionally
# --------------------------
def oversample_proportional(X, y):
    counts = pd.Series(y).value_counts()
    max_count = counts.max()
    X_resampled, y_resampled = [], []
    for cls in counts.index:
        cls_mask = (y == cls)
        X_cls, y_cls = X[cls_mask], y[cls_mask]
        n_repeat = int(np.ceil(max_count / len(y_cls)))
        X_resampled.append(np.tile(X_cls, (n_repeat, 1)))
        y_resampled.append(np.tile(y_cls, n_repeat))
    X_bal = np.vstack(X_resampled)
    y_bal = np.hstack(y_resampled)
    return X_bal, y_bal

# --------------------------
# Transformer model builder
# --------------------------
def build_transformer_model(hp, prefix_length, num_classes):
    d_model = hp.Choice("d_model", [32, 64])
    num_heads = hp.Choice("num_heads", [2, 4])
    num_layers = hp.Int("num_layers", 1, 2)
    dropout = 0.1

    inputs = layers.Input(shape=(prefix_length,), dtype=tf.int32)
    x = layers.Embedding(input_dim=num_classes, output_dim=d_model)(inputs)
    positions = tf.range(start=0, limit=prefix_length, delta=1)
    pos_encoding = layers.Embedding(input_dim=prefix_length, output_dim=d_model)(positions)
    x = layers.Lambda(lambda x: x + pos_encoding)(x)

    for _ in range(num_layers):
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn_output)
        ffn = layers.Dense(d_model*4, activation="relu")(x)
        ffn = layers.Dense(d_model)(ffn)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ffn)
        x = layers.Dropout(dropout)(x)

    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes, activation="softmax", dtype="float32")(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

# --------------------------
# Experiment Runner
# --------------------------
def run_experiment(prefix_length):
    print(f"\nüöÄ Running Transformer experiment: sequence length = {prefix_length}")

    sequences_df = create_activity_sequences(df, prefix_length).reset_index(drop=True)

    label_encoder = LabelEncoder()
    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    all_activities = sequences_df[activity_cols + ['next_activity']].values.flatten()
    label_encoder.fit(all_activities)
    for col in activity_cols + ['next_activity']:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    X = sequences_df[activity_cols].values.astype(np.int32)
    y = sequences_df["next_activity"].values.astype(np.int32)
    X, y = oversample_proportional(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    num_classes = len(label_encoder.classes_)

    with strategy.scope():
        tuner = kt.RandomSearch(
            lambda hp: build_transformer_model(hp, prefix_length, num_classes),
            objective='val_accuracy',
            max_trials=15,
            executions_per_trial=1,
            directory='gpu_tuner',
            project_name=f'transformer_seq_{prefix_length}',
            overwrite=True
        )

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    batch_size = 32

    while True:
        try:
            tuner.search(
                X_train, y_train,
                validation_split=0.2,
                epochs=50,
                batch_size=batch_size,
                callbacks=[early_stop],
                verbose=1
            )
            break
        except tf.errors.ResourceExhaustedError:
            print(f"‚ö†Ô∏è Resource exhausted at batch_size={batch_size}. Reducing batch size.")
            batch_size = max(1, batch_size // 2)
            tf.keras.backend.clear_session()
            gc.collect()

    best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
    best_model = tuner.get_best_models(num_models=1)[0]

    y_pred = np.argmax(best_model.predict(X_test, batch_size=1, verbose=0), axis=1)
    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, average="weighted", zero_division=0)),
        "f1_score": float(f1_score(y_test, y_pred, average="weighted", zero_division=0)),
    }

    print("\nüìä Evaluation Results:")
    for k, v in metrics.items():
        print(f"{k.capitalize()}: {v:.4f}")

    os.makedirs("results/BPIC2018/Transformer model/Baseline encoding", exist_ok=True)
    out_path = f"results/BPIC2018/Transformer model/Baseline encoding/transformer_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump({"sequence_length": prefix_length, "best_hyperparameters": best_hp.values, "metrics": metrics}, f, indent=4)
    print(f"üíæ Saved results to {out_path}")

# --------------------------
# Run all experiments sequentially
# --------------------------
sequence_lengths = [2500]
for seq_len in sequence_lengths:
    run_experiment(seq_len)

print("\nüéâ All experiments completed!")


Trial 8 Complete [00h 07m 09s]
val_accuracy: 0.8303571343421936

Best val_accuracy So Far: 0.9285714030265808
Total elapsed time: 00h 56m 38s


  saveable.load_own_variables(weights_store.get(inner_path))



üìä Evaluation Results:
Accuracy: 0.8643
Precision: 0.8688
Recall: 0.8643
F1_score: 0.8600
üíæ Saved results to results/BPIC2018/Transformer model/Baseline encoding/transformer_seq_2500.json

üéâ All experiments completed!


In [None]:
# Experiment 2

import os
import json
import gc
import numpy as np
import pandas as pd
import pm4py
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.mixed_precision import set_global_policy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import keras_tuner as kt

# --------------------------
# Initialize environment
# --------------------------
tf.config.experimental.enable_op_determinism()
tf.random.set_seed(42)
np.random.seed(42)
set_global_policy("mixed_float16")

# Use available GPU(s)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    print(f"‚úÖ Number of GPUs available: {len(gpus)}")
else:
    print("‚ö†Ô∏è No GPU found, using CPU.")

# --------------------------
# Load event log
# --------------------------
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    df = pm4py.convert_to_dataframe(log)
    df = df[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    df = df.sort_values(by=['org:resource', 'time:timestamp']).reset_index(drop=True)
    return df

event_log = import_xes("BPI Challenge 2018.xes")

# --------------------------
# Resource-Activity diversity matrix
# --------------------------
def create_diversity_matrix(log):
    activity_counts = log.pivot_table(
        index='org:resource',
        columns='concept:name',
        aggfunc='size',
        fill_value=0
    ).reset_index()
    # Convert counts to binary (resource has done activity or not)
    binary_matrix = activity_counts.copy()
    binary_matrix.iloc[:, 1:] = (binary_matrix.iloc[:, 1:] > 0).astype(int)
    return binary_matrix

ra_matrix = create_diversity_matrix(event_log)

# --------------------------
# Sequence creation
# --------------------------
def create_activity_sequences(df, prefix_length):
    sequences, next_activities, resources = [], [], []
    for resource, group in df.groupby('org:resource'):
        acts = group['concept:name'].values
        if len(acts) >= prefix_length + 1:
            sequences.append(acts[:prefix_length])
            next_activities.append(acts[prefix_length])
            resources.append(resource)
    df_seq = pd.DataFrame(sequences, columns=[f"activity_{i+1}" for i in range(prefix_length)])
    df_seq['next_activity'] = next_activities
    df_seq['org:resource'] = resources
    return df_seq

# --------------------------
# Oversample rare classes
# --------------------------
def oversample_proportional(X, y):
    counts = pd.Series(y).value_counts()
    max_count = counts.max()
    X_resampled, y_resampled = [], []
    for cls in counts.index:
        cls_mask = (y == cls)
        X_cls, y_cls = X[cls_mask], y[cls_mask]
        n_repeat = int(np.ceil(max_count / len(y_cls)))
        X_resampled.append(np.tile(X_cls, (n_repeat, 1)))
        y_resampled.append(np.tile(y_cls, n_repeat))
    X_bal = np.vstack(X_resampled)
    y_bal = np.hstack(y_resampled)
    return X_bal, y_bal

# --------------------------
# Transformer model
# --------------------------
def build_transformer_model(hp, prefix_length, num_classes, num_extra_features):
    d_model = hp.Choice("d_model", [32, 64])
    num_heads = hp.Choice("num_heads", [2, 4])
    num_layers = hp.Int("num_layers", 1, 2)
    dropout = 0.1

    inputs = layers.Input(shape=(prefix_length + num_extra_features,), dtype=tf.float32)

    # Split sequence vs extra RA features
    activity_input = layers.Lambda(lambda x: tf.cast(x[:, :prefix_length], tf.int32))(inputs)
    extra_input = layers.Lambda(lambda x: x[:, prefix_length:])(inputs)

    # Embedding for activity sequence
    x = layers.Embedding(input_dim=num_classes, output_dim=d_model)(activity_input)
    
    # Positional encoding
    positions = tf.range(start=0, limit=prefix_length, delta=1)
    pos_encoding = layers.Embedding(input_dim=prefix_length, output_dim=d_model)(positions)
    x = x + pos_encoding

    # Transformer blocks
    for _ in range(num_layers):
        attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn)
        ffn = layers.Dense(d_model*4, activation="relu")(x)
        ffn = layers.Dense(d_model)(ffn)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ffn)
        x = layers.Dropout(dropout)(x)

    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Concatenate()([x, extra_input])

    outputs = layers.Dense(num_classes, activation="softmax", dtype="float32")(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

# --------------------------
# Run experiment
# --------------------------
def run_experiment(prefix_length, batch_size=32):
    print(f"\nüöÄ Running experiment for sequence length = {prefix_length}")

    seq_df = create_activity_sequences(event_log, prefix_length)

    # Merge RA binary features
    ra_filtered = ra_matrix[ra_matrix['org:resource'].isin(seq_df['org:resource'])].reset_index(drop=True)
    merged_df = pd.concat([seq_df.reset_index(drop=True), ra_filtered.iloc[:, 1:]], axis=1)

    # Encode activities
    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']
    le = LabelEncoder()
    all_acts = merged_df[activity_cols].values.flatten()
    le.fit(all_acts)
    for col in activity_cols:
        merged_df[col] = le.transform(merged_df[col])

    # Prepare X and y
    extra_cols = ra_filtered.columns[1:].tolist()
    X = merged_df[[f"activity_{i+1}" for i in range(prefix_length)] + extra_cols].values.astype(np.float32)
    y = merged_df['next_activity'].values.astype(np.int32)

    # Oversample
    X, y = oversample_proportional(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    num_classes = len(le.classes_)
    num_extra_features = len(extra_cols)

    # Hyperparameter tuning
    tuner = kt.RandomSearch(
        lambda hp: build_transformer_model(hp, prefix_length, num_classes, num_extra_features),
        objective="val_accuracy",
        max_trials=10,
        executions_per_trial=1,
        directory="gpu_tuner",
        project_name=f"transformer_seq_{prefix_length}",
        overwrite=True
    )

    early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

    tuner.search(
        X_train, y_train,
        validation_split=0.2,
        epochs=50,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=1
    )

    best_model = tuner.get_best_models(num_models=1)[0]

    # Evaluation
    y_pred = np.argmax(best_model.predict(X_test, batch_size=batch_size, verbose=0), axis=1)
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
        "recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
        "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0)
    }

    print("\nüìä Evaluation Results:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    # Save results
    os.makedirs("results/BPIC2018/Transformer model/SCap", exist_ok=True)
    out_file = f"results/BPIC2018/Transformer model/SCap/transformer_seq_{prefix_length}.json"
    with open(out_file, "w") as f:
        json.dump({
            "sequence_length": prefix_length,
            "best_hyperparameters": tuner.get_best_hyperparameters()[0].values,
            "metrics": metrics
        }, f, indent=4)
    print(f"üíæ Results saved to {out_file}")

# --------------------------
# Run experiments
# --------------------------
sequence_lengths = [1000, 1200, 1400, 1500, 2000, 2500]
for seq_len in sequence_lengths:
    run_experiment(seq_len, batch_size=32)

print("\nüéâ All experiments completed!")


Trial 6 Complete [00h 02m 34s]
val_accuracy: 0.8831169009208679

Best val_accuracy So Far: 0.9350649118423462
Total elapsed time: 00h 25m 41s

Search: Running Trial #7

Value             |Best Value So Far |Hyperparameter
32                |64                |d_model
4                 |4                 |num_heads
2                 |2                 |num_layers

Epoch 1/50


2025-10-27 20:29:19.205490: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m20/20[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 403ms/step - accuracy: 0.1382 - loss: 3.4352

2025-10-27 20:29:30.694841: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m20/20[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m13s[0m 461ms/step - accuracy: 0.2251 - loss: 3.1396 - val_accuracy: 0.3636 - val_loss: 2.6934
Epoch 2/50
[1m20/20[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 448ms/step - accuracy: 0.4454 - loss: 2.4474 - val_accuracy: 0.4221 - val_loss: 2.3733
Epoch 3/50
[1m20/20[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 447ms/step - accuracy: 0.5008 - loss: 2.2328 - val_accuracy: 0.5260 - val_loss: 2.2163
Epoch 4/50
[1m20/20[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 443ms/step - accuracy: 0.5726 - loss: 2.1020 - val_accuracy: 0.5260 - val_loss: 2.1037
Epoch 5/50
[1m20/20[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 446ms/step - accuracy: 0.5791 - loss: 2.0030 - val_accuracy: 0.6104 - val_loss: 2.0090
