In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Force GPU 3

import json
import gc
import numpy as np
import pandas as pd
import pm4py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def import_xes(file_path):
    """Load an XES event log and return its events ordered per resource.

    Args:
        file_path: Path of the XES file handed to ``pm4py.read_xes``.

    Returns:
        A DataFrame restricted to the case id, activity, resource and
        timestamp columns, sorted by resource and then timestamp, with a
        fresh 0..n-1 index.
    """
    log = pm4py.read_xes(file_path)
    df = pm4py.convert_to_dataframe(log)
    df = df[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    # Reset the index after sorting so row labels follow the sorted order,
    # consistent with the other import_xes copies in this notebook.
    df = df.sort_values(by=['org:resource', 'time:timestamp']).reset_index(drop=True)
    return df

# Load the event log once at module level; run_experiment reads this global `df`.
df = import_xes("BPI_Challenge_2019.xes")


def create_activity_sequences(df, prefix_length):
    """Build one (prefix, next activity) sample per resource.

    Resources with fewer than ``prefix_length + 1`` events are skipped.

    Args:
        df: Event DataFrame with 'org:resource' and 'concept:name' columns;
            row order within each resource defines the activity order.
        prefix_length: Number of leading activities kept as the prefix.

    Returns:
        DataFrame with columns activity_1..activity_<prefix_length>,
        'next_activity' and 'org:resource'.
    """
    samples = []
    for resource, events in df.groupby('org:resource'):
        trace = events['concept:name'].values
        if len(trace) <= prefix_length:
            # Not enough events to supply both a full prefix and a target.
            continue
        samples.append((trace[:prefix_length], trace[prefix_length], resource))

    column_names = [f"activity_{position}" for position in range(1, prefix_length + 1)]
    sequences_df = pd.DataFrame([prefix for prefix, _, _ in samples], columns=column_names)
    sequences_df['next_activity'] = [target for _, target, _ in samples]
    sequences_df['org:resource'] = [owner for _, _, owner in samples]
    return sequences_df

def oversample_proportional(X, y):
    """Oversample every class by whole-array repetition.

    Each class is tiled ``ceil(max_count / class_count)`` times, so every
    class ends up with at least as many rows as the largest class.

    Args:
        X: 2-D feature array, indexable by a boolean mask over rows.
        y: 1-D label array aligned with the rows of ``X``.

    Returns:
        Tuple ``(X_bal, y_bal)`` with classes stacked in descending order
        of their original frequency.
    """
    class_sizes = pd.Series(y).value_counts()
    largest = class_sizes.max()

    feature_parts = []
    label_parts = []
    for label in class_sizes.index:
        selector = (y == label)
        members = X[selector]
        member_labels = y[selector]
        copies = int(np.ceil(largest / len(member_labels)))
        feature_parts.append(np.tile(members, (copies, 1)))
        label_parts.append(np.tile(member_labels, copies))

    return np.vstack(feature_parts), np.hstack(label_parts)


class ActivityTransformer(nn.Module):
    """Transformer-encoder classifier over a single embedded feature vector.

    The whole input vector is projected to ``d_model`` by a linear layer and
    passed through the encoder as a sequence of length one, then classified
    by a final linear layer.
    """

    def __init__(self, num_features, num_classes, d_model=64, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(num_features, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True),
            num_layers=num_layers,
        )
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        token = self.embedding(x).unsqueeze(1)   # add a length-1 sequence axis
        encoded = self.transformer(token)
        pooled = encoded.mean(dim=1)             # average over the sequence axis
        return self.fc(pooled)


def run_experiment(prefix_length, epochs=50, patience=5):
    """Run the baseline-encoding transformer experiment for one prefix length.

    Builds per-resource prefixes from the module-level ``df``, label-encodes
    the activities, oversamples the training split, grid-searches
    ``d_model`` / ``num_heads`` / ``num_layers`` and writes the metrics of the
    best configuration to a JSON file under ``results/``.

    Args:
        prefix_length: Number of leading activities per resource used as features.
        epochs: Maximum number of full-batch training steps per configuration.
        patience: Number of consecutive steps without a lower training loss
            after which a configuration stops early.

    Returns:
        None. Results are printed and written to disk.
    """
    print(f"\nüöÄ Running experiment: sequence length = {prefix_length}")

    sequences_df = create_activity_sequences(df, prefix_length)
    label_encoder = LabelEncoder()
    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    # One encoder for prefix and target activities so both share an id space.
    all_activities = sequences_df[activity_cols + ['next_activity']].values.flatten()
    label_encoder.fit(all_activities)
    for col in activity_cols + ['next_activity']:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    X = sequences_df[activity_cols].values.astype(np.float32)
    y = sequences_df['next_activity'].values.astype(np.int64)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    # Only the training split is oversampled; the test split keeps its class mix.
    X_train, y_train = oversample_proportional(X_train, y_train)

    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)
    # Host copy of the labels, reused by every sklearn metric call below.
    y_true = y_test.cpu().numpy()

    best_model = None
    best_params = None
    best_val_acc = 0.0

    # NOTE(review): configurations are compared on the same test split that is
    # reported below, so the final metrics are optimistically biased. A
    # separate validation split would remove that bias.
    for d_model in [32, 64]:
        for num_heads in [2, 4]:
            for num_layers in [1, 2]:
                model = ActivityTransformer(X_train.shape[1], len(label_encoder.classes_),
                                            d_model=d_model, num_heads=num_heads, num_layers=num_layers).to(device)
                optimizer = optim.Adam(model.parameters(), lr=1e-3)
                criterion = nn.CrossEntropyLoss()
                best_epoch_loss = float('inf')
                patience_counter = 0
                # Reset per configuration so a state from a previous (differently
                # shaped) model can never be loaded into this one.
                best_model_state = None

                for epoch in range(epochs):
                    model.train()
                    optimizer.zero_grad()
                    outputs = model(X_train)
                    loss = criterion(outputs, y_train)
                    loss.backward()
                    optimizer.step()

                    current_loss = loss.item()
                    if current_loss < best_epoch_loss:
                        best_epoch_loss = current_loss
                        patience_counter = 0
                        # state_dict() hands back references to the live tensors;
                        # clone them so later optimizer steps cannot overwrite
                        # the snapshot.
                        best_model_state = {
                            name: tensor.detach().clone()
                            for name, tensor in model.state_dict().items()
                        }
                    else:
                        patience_counter += 1
                        if patience_counter >= patience:
                            break

                # Evaluate the best snapshot (absent only if no finite loss was seen).
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                model.eval()
                with torch.no_grad():
                    y_pred = torch.argmax(model(X_test), dim=1)
                    acc = accuracy_score(y_true, y_pred.cpu().numpy())
                # Always keep the first configuration so best_model is never None,
                # even when every configuration scores 0.0.
                if best_model is None or acc > best_val_acc:
                    best_val_acc = acc
                    best_model = model
                    best_params = {'d_model': d_model, 'num_heads': num_heads, 'num_layers': num_layers}

    best_model.eval()
    with torch.no_grad():
        y_pred = torch.argmax(best_model(X_test), dim=1)
    y_hat = y_pred.cpu().numpy()
    metrics = {
        "accuracy": float(accuracy_score(y_true, y_hat)),
        "precision": float(precision_score(y_true, y_hat, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_true, y_hat, average="weighted", zero_division=0)),
        # zero_division=0 matches precision/recall and silences the warning
        # for classes that are never predicted.
        "f1_score": float(f1_score(y_true, y_hat, average="weighted", zero_division=0))
    }

    print(f"\nüìä Metrics for sequence length {prefix_length}: {metrics}")
    print(f"üõ†Ô∏è Best hyperparameters: {best_params}")

    os.makedirs("results/BPIC2019/Transformer model/Baseline encoding", exist_ok=True)
    out_path = f"results/BPIC2019/Transformer model/Baseline encoding/transformer_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump({"sequence_length": prefix_length, "best_hyperparameters": best_params, "metrics": metrics}, f, indent=4)
    print(f"üíæ Saved results to {out_path}")


# Prefix lengths to evaluate; each call trains and saves results for one length.
sequence_lengths = [100, 150, 200, 300, 400, 500, 600, 700, 800]
for seq_len in sequence_lengths:
    run_experiment(seq_len)


In [None]:
# SCap
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import json
import numpy as np
import pandas as pd
import pm4py
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {device}")

def import_xes(file_path):
    """Read an XES log with pm4py and return its events ordered per resource.

    Keeps only the case id, activity, resource and timestamp columns, sorts
    by resource then timestamp, and renumbers the index from zero.
    """
    kept_columns = ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
    sort_keys = ['org:resource', 'time:timestamp']
    events = pm4py.convert_to_dataframe(pm4py.read_xes(file_path))
    ordered = events[kept_columns].sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)

# Load the event log once at module level; run_experiment reads this global `df`.
df = import_xes("BPI_Challenge_2013_incidents.xes")

def create_ra_matrix(df):
    """Build a binary resource-by-activity matrix.

    Args:
        df: Event DataFrame with 'org:resource' and 'concept:name' columns.

    Returns:
        DataFrame whose first column is 'org:resource' and whose remaining
        columns (one per activity) hold 1 if the resource has at least one
        event with that activity and 0 otherwise.
    """
    presence = df.pivot_table(
        index='org:resource',
        columns='concept:name',
        aggfunc='size',
        fill_value=0,
    ).reset_index()
    # Column 0 is the resource id; every other column is an activity count.
    executed = presence.iloc[:, 1:].gt(0)
    presence.iloc[:, 1:] = executed.astype(int)
    return presence

# Built once from the full log and reused by run_experiment for every prefix length.
ra_matrix = create_ra_matrix(df)

def create_activity_sequences(df, prefix_length):
    """Extract the first ``prefix_length`` activities and the following one per resource.

    Resources with fewer than ``prefix_length + 1`` events are left out.

    Returns:
        DataFrame with columns activity_1..activity_<prefix_length>,
        'next_activity' and 'org:resource'.
    """
    # Insertion-ordered mapping: resource -> its full activity array.
    qualifying = {}
    for resource, events in df.groupby('org:resource'):
        performed = events['concept:name'].values
        if len(performed) > prefix_length:
            qualifying[resource] = performed

    header = [f"activity_{k}" for k in range(1, prefix_length + 1)]
    seq_df = pd.DataFrame(
        [performed[:prefix_length] for performed in qualifying.values()],
        columns=header,
    )
    seq_df['next_activity'] = [performed[prefix_length] for performed in qualifying.values()]
    seq_df['org:resource'] = list(qualifying)
    return seq_df

def oversample_proportional(X, y):
    """Repeat each class until it is at least as large as the biggest class.

    Every class is tiled ``ceil(max_count / class_count)`` times, so the
    result is approximately (not exactly) balanced.

    Args:
        X: 2-D feature array.
        y: 1-D label array aligned with the rows of ``X``.

    Returns:
        ``(X_bal, y_bal)`` with classes stacked in descending order of their
        original frequency.
    """
    frequency = pd.Series(y).value_counts()
    target = frequency.max()

    # One (features, labels) pair per class, already replicated.
    blocks = []
    for label in frequency.index:
        picked = (y == label)
        labels_here = y[picked]
        times = int(np.ceil(target / len(labels_here)))
        blocks.append((np.tile(X[picked], (times, 1)), np.tile(labels_here, times)))

    X_bal = np.vstack([features for features, _ in blocks])
    y_bal = np.hstack([labels for _, labels in blocks])
    return X_bal, y_bal

class ActivityTransformer(nn.Module):
    """Classifier that runs one embedded feature vector through a TransformerEncoder.

    The input features are linearly projected to ``d_model``, treated as a
    sequence of length one, encoded, mean-pooled and mapped to class logits.
    """

    def __init__(self, num_features, num_classes, d_model=64, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(num_features, d_model)
        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(block, num_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of feature vectors."""
        # unsqueeze(1) inserts the length-1 sequence axis the encoder expects.
        hidden = self.transformer(self.embedding(x).unsqueeze(1))
        # Mean over the sequence axis collapses it again before classification.
        return self.fc(hidden.mean(dim=1))


def run_experiment(prefix_length, epochs=50, patience=5):
    """Run the SCap (prefix + resource-activity matrix) experiment for one prefix length.

    Builds per-resource prefixes from the module-level ``df``, joins the
    module-level ``ra_matrix`` columns onto them, label-encodes the
    activities, oversamples the training split, grid-searches
    ``d_model`` / ``num_heads`` / ``num_layers`` and writes the metrics of the
    best configuration to a JSON file under ``results/``.

    Args:
        prefix_length: Number of leading activities per resource used as features.
        epochs: Maximum number of full-batch training steps per configuration.
        patience: Number of consecutive steps without a lower training loss
            after which a configuration stops early.

    Returns:
        None. Results are printed and written to disk.
    """
    print(f"\nüöÄ Running experiment: sequence length = {prefix_length}")

    seq_df = create_activity_sequences(df, prefix_length)

    # Join on the resource key instead of concatenating by row position, so the
    # capability columns cannot be attached to the wrong resource if the two
    # frames ever differ in row order.
    # NOTE(review): ra_matrix is computed from the full log, so it also reflects
    # the target event and everything after the prefix (possible label leakage).
    extra_cols = ra_matrix.columns[1:].tolist()
    merged_df = seq_df.merge(ra_matrix, on='org:resource', how='left', validate='one_to_one')

    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    le = LabelEncoder()
    # One encoder for prefix and target activities so both share an id space.
    all_acts = merged_df[activity_cols + ['next_activity']].values.flatten()
    le.fit(all_acts)
    for col in activity_cols + ['next_activity']:
        merged_df[col] = le.transform(merged_df[col])

    X = merged_df[activity_cols + extra_cols].values.astype(np.float32)
    y = merged_df['next_activity'].values.astype(np.int64)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    # Only the training split is oversampled; the test split keeps its class mix.
    X_train, y_train = oversample_proportional(X_train, y_train)

    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)
    # Host copy of the labels, reused by every sklearn metric call below.
    y_true = y_test.cpu().numpy()

    best_model, best_params, best_val_acc = None, None, 0.0
    # NOTE(review): configurations are compared on the same test split that is
    # reported below, so the final metrics are optimistically biased.
    for d_model in [32, 64]:
        for num_heads in [2, 4]:
            for num_layers in [1, 2]:
                model = ActivityTransformer(X_train.shape[1], len(le.classes_),
                                            d_model=d_model, num_heads=num_heads, num_layers=num_layers).to(device)
                optimizer = optim.Adam(model.parameters(), lr=1e-3)
                criterion = nn.CrossEntropyLoss()
                best_epoch_loss, patience_counter = float('inf'), 0
                # Reset per configuration so a state from a previous (differently
                # shaped) model can never be loaded into this one.
                best_model_state = None

                for epoch in range(epochs):
                    model.train()
                    optimizer.zero_grad()
                    outputs = model(X_train)
                    loss = criterion(outputs, y_train)
                    loss.backward()
                    optimizer.step()

                    current_loss = loss.item()
                    if current_loss < best_epoch_loss:
                        best_epoch_loss = current_loss
                        patience_counter = 0
                        # state_dict() hands back references to the live tensors;
                        # clone them so later optimizer steps cannot overwrite
                        # the snapshot.
                        best_model_state = {
                            name: tensor.detach().clone()
                            for name, tensor in model.state_dict().items()
                        }
                    else:
                        patience_counter += 1
                        if patience_counter >= patience:
                            break

                # Evaluate the best snapshot (absent only if no finite loss was seen).
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                model.eval()
                with torch.no_grad():
                    y_pred = torch.argmax(model(X_test), dim=1)
                    acc = accuracy_score(y_true, y_pred.cpu().numpy())
                # Always keep the first configuration so best_model is never None,
                # even when every configuration scores 0.0.
                if best_model is None or acc > best_val_acc:
                    best_val_acc = acc
                    best_model = model
                    best_params = {'d_model': d_model, 'num_heads': num_heads, 'num_layers': num_layers}

    best_model.eval()
    with torch.no_grad():
        y_pred = torch.argmax(best_model(X_test), dim=1)
    y_hat = y_pred.cpu().numpy()
    metrics = {
        "accuracy": float(accuracy_score(y_true, y_hat)),
        "precision": float(precision_score(y_true, y_hat, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_true, y_hat, average="weighted", zero_division=0)),
        # zero_division=0 matches precision/recall and silences the warning
        # for classes that are never predicted.
        "f1_score": float(f1_score(y_true, y_hat, average="weighted", zero_division=0))
    }

    print(f"\nüìä Metrics for sequence length {prefix_length}: {metrics}")
    print(f"üõ†Ô∏è Best hyperparameters: {best_params}")

    os.makedirs("results/BPIC2013/Transformer model/SCap", exist_ok=True)
    out_path = f"results/BPIC2013/Transformer model/SCap/transformer_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump({"sequence_length": prefix_length, "best_hyperparameters": best_params, "metrics": metrics}, f, indent=4)
    print(f"üíæ Saved results to {out_path}")


# Prefix lengths to evaluate; each call trains and saves results for one length.
sequence_lengths = [10, 20, 30, 40, 50, 75, 100, 125, 150]
for seq_len in sequence_lengths:
    run_experiment(seq_len)


In [None]:
# S2g
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Force GPU 3

import json
import numpy as np
import pandas as pd
import pm4py
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {device}")

def import_xes(file_path):
    """Load an XES file via pm4py as a DataFrame sorted by resource and time.

    Only the case id, activity, resource and timestamp columns are kept;
    the index is renumbered after sorting.
    """
    frame = pm4py.convert_to_dataframe(pm4py.read_xes(file_path))
    frame = frame[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    frame = frame.sort_values(by=['org:resource', 'time:timestamp'])
    return frame.reset_index(drop=True)

# Load the event log once at module level; run_experiment reads this global `df`.
df = import_xes("BPI_Challenge_2013_incidents.xes")

def create_transition_count_features(df, prefix_length):
    """Encode each resource's prefix as a flattened activity-transition count vector.

    For every resource with at least ``prefix_length + 1`` events, counts the
    consecutive activity pairs inside the first ``prefix_length`` events. The
    pair (i, j) is stored at position ``i * n_activities + j``, where indices
    follow the order of ``df['concept:name'].unique()``.

    Args:
        df: Event DataFrame with 'org:resource' and 'concept:name' columns.
        prefix_length: Number of leading events per resource to summarise.

    Returns:
        Tuple ``(sequences_df, all_activities)``: the feature frame (count
        columns followed by 'next_activity' and 'org:resource') and the array
        of unique activities that defines the index order.
    """
    all_activities = df['concept:name'].unique()
    n_activities = len(all_activities)
    position_of = {activity: position for position, activity in enumerate(all_activities)}
    vector_size = n_activities * n_activities

    feature_rows, targets, owners = [], [], []
    for resource, events in df.groupby('org:resource'):
        trace = events['concept:name'].values
        if len(trace) <= prefix_length:
            continue

        counts = np.zeros(vector_size, dtype=np.float32)
        # Pair each prefix event with its successor inside the prefix.
        for source, destination in zip(trace[:prefix_length - 1], trace[1:prefix_length]):
            slot = position_of[source] * n_activities + position_of[destination]
            counts[slot] += 1.0

        feature_rows.append(counts)
        targets.append(trace[prefix_length])
        owners.append(resource)

    sequences_df = pd.DataFrame(feature_rows)
    sequences_df['next_activity'] = targets
    sequences_df['org:resource'] = owners
    return sequences_df, all_activities

def oversample_proportional(X, y):
    """Oversample minority classes by tiling them whole.

    A class with ``c`` rows is repeated ``ceil(max_count / c)`` times, so
    every class ends up with at least ``max_count`` rows.

    Args:
        X: 2-D feature array.
        y: 1-D label array aligned with the rows of ``X``.

    Returns:
        ``(X_bal, y_bal)`` with classes stacked in descending order of their
        original frequency.
    """
    per_class = pd.Series(y).value_counts()
    ceiling = per_class.max()

    def _replicate(label):
        # Select one class and tile its rows and labels the same number of times.
        rows_of_class = (y == label)
        class_labels = y[rows_of_class]
        repeats = int(np.ceil(ceiling / len(class_labels)))
        return np.tile(X[rows_of_class], (repeats, 1)), np.tile(class_labels, repeats)

    replicated = [_replicate(label) for label in per_class.index]
    X_bal = np.vstack([pair[0] for pair in replicated])
    y_bal = np.hstack([pair[1] for pair in replicated])
    return X_bal, y_bal


class ActivityTransformer(nn.Module):
    """Linear embedding, TransformerEncoder and linear head over one feature vector.

    The feature vector is embedded as a single token (sequence length one)
    before being passed to the encoder.
    """

    def __init__(self, num_features, num_classes, d_model=64, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()
        layer_config = dict(d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True)
        self.embedding = nn.Linear(num_features, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config), num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Map a batch of feature vectors to class logits."""
        embedded = self.embedding(x)
        sequence = embedded.unsqueeze(1)        # single-token sequence
        sequence = self.transformer(sequence)
        summary = sequence.mean(dim=1)          # pool over the sequence axis
        logits = self.fc(summary)
        return logits

def run_experiment(prefix_length, epochs=50, patience=5):
    """Run the S2g (transition-count) transformer experiment for one prefix length.

    Builds transition-count features from the module-level ``df``,
    label-encodes the target activity, oversamples the training split,
    grid-searches ``d_model`` / ``num_heads`` / ``num_layers`` and writes the
    metrics of the best configuration to a JSON file under ``results/``.

    Args:
        prefix_length: Number of leading events per resource summarised as features.
        epochs: Maximum number of full-batch training steps per configuration.
        patience: Number of consecutive steps without a lower training loss
            after which a configuration stops early.

    Returns:
        None. Results are printed and written to disk.
    """
    print(f"\nüöÄ Running experiment: sequence length = {prefix_length}")

    sequences_df, all_activities = create_transition_count_features(df, prefix_length)

    # Only the target needs encoding; the features are already numeric counts.
    le = LabelEncoder()
    le.fit(sequences_df['next_activity'])
    sequences_df['next_activity'] = le.transform(sequences_df['next_activity'])

    X = sequences_df.drop(columns=['next_activity', 'org:resource']).values.astype(np.float32)
    y = sequences_df['next_activity'].values.astype(np.int64)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    # Only the training split is oversampled; the test split keeps its class mix.
    X_train, y_train = oversample_proportional(X_train, y_train)

    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)
    # Host copy of the labels, reused by every sklearn metric call below.
    y_true = y_test.cpu().numpy()

    best_model, best_params, best_val_acc = None, None, 0.0
    # NOTE(review): configurations are compared on the same test split that is
    # reported below, so the final metrics are optimistically biased.
    for d_model in [32, 64]:
        for num_heads in [2, 4]:
            for num_layers in [1, 2]:
                model = ActivityTransformer(X_train.shape[1], len(le.classes_),
                                            d_model=d_model, num_heads=num_heads, num_layers=num_layers).to(device)
                optimizer = optim.Adam(model.parameters(), lr=1e-3)
                criterion = nn.CrossEntropyLoss()
                best_epoch_loss, patience_counter = float('inf'), 0
                # Reset per configuration so a state from a previous (differently
                # shaped) model can never be loaded into this one.
                best_model_state = None

                for epoch in range(epochs):
                    model.train()
                    optimizer.zero_grad()
                    outputs = model(X_train)
                    loss = criterion(outputs, y_train)
                    loss.backward()
                    optimizer.step()

                    current_loss = loss.item()
                    if current_loss < best_epoch_loss:
                        best_epoch_loss = current_loss
                        patience_counter = 0
                        # state_dict() hands back references to the live tensors;
                        # clone them so later optimizer steps cannot overwrite
                        # the snapshot.
                        best_model_state = {
                            name: tensor.detach().clone()
                            for name, tensor in model.state_dict().items()
                        }
                    else:
                        patience_counter += 1
                        if patience_counter >= patience:
                            break

                # Evaluate the best snapshot (absent only if no finite loss was seen).
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                model.eval()
                with torch.no_grad():
                    y_pred = torch.argmax(model(X_test), dim=1)
                    acc = accuracy_score(y_true, y_pred.cpu().numpy())
                # Always keep the first configuration so best_model is never None,
                # even when every configuration scores 0.0.
                if best_model is None or acc > best_val_acc:
                    best_val_acc = acc
                    best_model = model
                    best_params = {'d_model': d_model, 'num_heads': num_heads, 'num_layers': num_layers}

    best_model.eval()
    with torch.no_grad():
        y_pred = torch.argmax(best_model(X_test), dim=1)
    y_hat = y_pred.cpu().numpy()
    metrics = {
        "accuracy": float(accuracy_score(y_true, y_hat)),
        "precision": float(precision_score(y_true, y_hat, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_true, y_hat, average="weighted", zero_division=0)),
        # zero_division=0 matches precision/recall and silences the warning
        # for classes that are never predicted.
        "f1_score": float(f1_score(y_true, y_hat, average="weighted", zero_division=0))
    }

    print(f"\nüìä Metrics for sequence length {prefix_length}: {metrics}")
    print(f"üõ†Ô∏è Best hyperparameters: {best_params}")

    os.makedirs("results/BPIC2013/Transformer model/S2g", exist_ok=True)
    out_path = f"results/BPIC2013/Transformer model/S2g/transformer_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump({"sequence_length": prefix_length, "best_hyperparameters": best_params, "metrics": metrics}, f, indent=4)
    print(f"üíæ Saved results to {out_path}")


# Prefix lengths to evaluate; each call trains and saves results for one length.
sequence_lengths = [10, 20, 30, 40, 50, 75, 100, 125, 150]
for seq_len in sequence_lengths:
    run_experiment(seq_len)


In [None]:
# S2gR
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import json, gc, numpy as np, pandas as pd
from collections import defaultdict
import pm4py
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {device}")

def import_xes(file_path):
    """Return the events of an XES log, ordered by resource and timestamp.

    The log is read and converted with pm4py, reduced to the case id,
    activity, resource and timestamp columns, sorted, and re-indexed from zero.
    """
    raw_log = pm4py.read_xes(file_path)
    events = pm4py.convert_to_dataframe(raw_log)
    wanted = ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
    return (
        events[wanted]
        .sort_values(by=['org:resource', 'time:timestamp'])
        .reset_index(drop=True)
    )

# Load the event log once at module level; run_experiment reads this global `df`.
df = import_xes("BPI_Challenge_2013_incidents.xes")

def create_activity_sequences(df, prefix_length):
    """Collect, per resource, its first ``prefix_length`` activities and the next one.

    Resources with fewer than ``prefix_length + 1`` events contribute no row.

    Returns:
        DataFrame with columns activity_1..activity_<prefix_length>,
        'next_activity' and 'org:resource'.
    """
    # (resource, activity array) for every resource long enough to yield a sample.
    eligible = [
        (resource, events['concept:name'].values)
        for resource, events in df.groupby('org:resource')
        if len(events) >= prefix_length + 1
    ]

    prefix_columns = [f"activity_{n}" for n in range(1, prefix_length + 1)]
    sequences_df = pd.DataFrame(
        [trace[:prefix_length] for _, trace in eligible],
        columns=prefix_columns,
    )
    sequences_df['next_activity'] = [trace[prefix_length] for _, trace in eligible]
    sequences_df['org:resource'] = [resource for resource, _ in eligible]
    return sequences_df

def create_transition_and_repeat_features(sequences_df):
    """Append transition-count and run-length features to a prefix frame.

    For every row, counts each ordered pair of consecutive prefix activities
    (one column per pair over the sorted set of activities seen in any
    prefix) and summarises runs of identical consecutive activities by their
    mean length and their number.

    Args:
        sequences_df: Frame with prefix activity columns plus 'next_activity'
            and 'org:resource'.

    Returns:
        The input columns followed by the transition-count columns and
        'avg_run_length' / 'num_runs'.
    """
    meta_columns = ["next_activity", "org:resource"]
    prefix_values = sequences_df.drop(columns=meta_columns).values.flatten()
    vocabulary = sorted(set(prefix_values) - {None})
    pair_order = [(source, target) for source in vocabulary for target in vocabulary]

    count_rows = []
    repeat_rows = []
    for _, record in sequences_df.iterrows():
        trace = record.drop(labels=meta_columns).dropna().tolist()

        # Transition counts between consecutive activities.
        pair_counts = defaultdict(int)
        for source, target in zip(trace, trace[1:]):
            pair_counts[(source, target)] += 1
        count_rows.append([pair_counts.get(pair, 0) for pair in pair_order])

        # Lengths of maximal runs of the same activity.
        runs = []
        streak = 1
        for previous, current in zip(trace, trace[1:]):
            if current == previous:
                streak += 1
            else:
                runs.append(streak)
                streak = 1
        runs.append(streak)
        repeat_rows.append([np.mean(runs), len(runs)])

    transitions_df = pd.DataFrame(count_rows)
    repeat_df = pd.DataFrame(repeat_rows, columns=["avg_run_length", "num_runs"])
    return pd.concat([sequences_df.reset_index(drop=True), transitions_df, repeat_df], axis=1)

def oversample_proportional(X, y):
    """Tile each class so it reaches at least the size of the largest class.

    Args:
        X: 2-D feature array.
        y: 1-D label array aligned with the rows of ``X``.

    Returns:
        ``(X_bal, y_bal)``; classes appear in descending order of original
        frequency, each repeated ``ceil(max_count / class_count)`` times.
    """
    histogram = pd.Series(y).value_counts()
    biggest = histogram.max()

    stacked_features, stacked_labels = [], []
    for class_id in histogram.index:
        in_class = (y == class_id)
        class_features = X[in_class]
        class_labels = y[in_class]
        factor = int(np.ceil(biggest / len(class_labels)))
        stacked_features.append(np.tile(class_features, (factor, 1)))
        stacked_labels.append(np.tile(class_labels, factor))

    X_bal = np.vstack(stacked_features)
    y_bal = np.hstack(stacked_labels)
    return X_bal, y_bal


class ActivityTransformer(nn.Module):
    """Transformer-encoder classifier applied to a single embedded feature vector.

    Input features are projected to ``d_model``, wrapped as a length-one
    sequence, encoded, mean-pooled over the sequence axis and classified.
    """

    def __init__(self, num_features, num_classes, d_model=64, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(num_features, d_model)
        single_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(single_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        out = self.embedding(x)
        out = out.unsqueeze(1)      # add the length-1 sequence axis
        out = self.transformer(out)
        out = out.mean(dim=1)       # pool over the sequence axis
        return self.fc(out)

def run_experiment(prefix_length, epochs=50, patience=5):
    """Run the S2gR (prefix + transitions + repeat features) experiment for one prefix length.

    Builds per-resource prefixes from the module-level ``df``, label-encodes
    the activities, appends transition-count and run-length features,
    oversamples the training split, grid-searches
    ``d_model`` / ``num_heads`` / ``num_layers`` and writes the metrics of the
    best configuration to a JSON file under ``results/``.

    Args:
        prefix_length: Number of leading activities per resource used as features.
        epochs: Maximum number of full-batch training steps per configuration.
        patience: Number of consecutive steps without a lower training loss
            after which a configuration stops early.

    Returns:
        None. Results are printed and written to disk.
    """
    print(f"\nüöÄ Running experiment: sequence length = {prefix_length}")

    sequences_df = create_activity_sequences(df, prefix_length)

    label_encoder = LabelEncoder()
    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    # One encoder for prefix and target activities so both share an id space.
    all_activities = sequences_df[activity_cols + ['next_activity']].values.flatten()
    label_encoder.fit(all_activities)
    for col in activity_cols + ['next_activity']:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    # Derived features are computed on the encoded ids and appended to the prefix columns.
    sequences_df = create_transition_and_repeat_features(sequences_df)
    X = sequences_df.drop(columns=['next_activity', 'org:resource']).values.astype(np.float32)
    y = sequences_df['next_activity'].values.astype(np.int64)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42, shuffle=True)

    # Only the training split is oversampled; the test split keeps its class mix.
    X_train, y_train = oversample_proportional(X_train, y_train)

    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)
    y_test = torch.tensor(y_test, dtype=torch.long).to(device)
    # Host copy of the labels, reused by every sklearn metric call below.
    y_true = y_test.cpu().numpy()

    best_model, best_params, best_val_acc = None, None, 0.0
    # NOTE(review): configurations are compared on the same test split that is
    # reported below, so the final metrics are optimistically biased.
    for d_model in [32, 64]:
        for num_heads in [2, 4]:
            for num_layers in [1, 2]:
                model = ActivityTransformer(X_train.shape[1], len(label_encoder.classes_),
                                            d_model=d_model, num_heads=num_heads, num_layers=num_layers).to(device)
                optimizer = optim.Adam(model.parameters(), lr=1e-3)
                criterion = nn.CrossEntropyLoss()
                best_epoch_loss, patience_counter = float('inf'), 0
                # Reset per configuration so a state from a previous (differently
                # shaped) model can never be loaded into this one.
                best_model_state = None

                for epoch in range(epochs):
                    model.train()
                    optimizer.zero_grad()
                    outputs = model(X_train)
                    loss = criterion(outputs, y_train)
                    loss.backward()
                    optimizer.step()

                    current_loss = loss.item()
                    if current_loss < best_epoch_loss:
                        best_epoch_loss = current_loss
                        patience_counter = 0
                        # state_dict() hands back references to the live tensors;
                        # clone them so later optimizer steps cannot overwrite
                        # the snapshot.
                        best_model_state = {
                            name: tensor.detach().clone()
                            for name, tensor in model.state_dict().items()
                        }
                    else:
                        patience_counter += 1
                        if patience_counter >= patience:
                            break

                # Evaluate the best snapshot (absent only if no finite loss was seen).
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                model.eval()
                with torch.no_grad():
                    y_pred = torch.argmax(model(X_test), dim=1)
                    acc = accuracy_score(y_true, y_pred.cpu().numpy())
                # Always keep the first configuration so best_model is never None,
                # even when every configuration scores 0.0.
                if best_model is None or acc > best_val_acc:
                    best_val_acc = acc
                    best_model = model
                    best_params = {'d_model': d_model, 'num_heads': num_heads, 'num_layers': num_layers}

    best_model.eval()
    with torch.no_grad():
        y_pred = torch.argmax(best_model(X_test), dim=1)
    y_hat = y_pred.cpu().numpy()
    metrics = {
        "accuracy": float(accuracy_score(y_true, y_hat)),
        "precision": float(precision_score(y_true, y_hat, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_true, y_hat, average="weighted", zero_division=0)),
        # zero_division=0 matches precision/recall and silences the warning
        # for classes that are never predicted.
        "f1_score": float(f1_score(y_true, y_hat, average="weighted", zero_division=0))
    }

    print(f"\nüìä Metrics for sequence length {prefix_length}: {metrics}")
    print(f"üõ†Ô∏è Best hyperparameters: {best_params}")

    os.makedirs("results/BPIC2013/Transformer model/S2gR", exist_ok=True)
    out_path = f"results/BPIC2013/Transformer model/S2gR/transformer_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump({"sequence_length": prefix_length, "best_hyperparameters": best_params, "metrics": metrics}, f, indent=4)
    print(f"üíæ Saved results to {out_path}")

# Prefix lengths to evaluate; each call trains and saves results for one length.
sequence_lengths = [10, 20, 30, 40, 50, 75, 100, 125, 150]
for seq_len in sequence_lengths:
    run_experiment(seq_len)


In [9]:
import shutil
import os

# Directory whose contents are packed for download.
folder_path = "models"
# Expected archive location: the directory name with a ".zip" suffix.
zip_path = f"{folder_path}.zip"

# make_archive appends the ".zip" extension itself, so it gets the bare base name.
shutil.make_archive(base_name=folder_path, format='zip', root_dir=folder_path)

print(f"‚úÖ Folder zipped at: {zip_path}")


‚úÖ Folder zipped at: models.zip


In [5]:
# ==============================================================
# Transformer Experiment 4 — Transitions + Repeat Features + GPU (PyTorch)
# Auto-saves model, encoder, and test set for SHAP
# ==============================================================

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import json, numpy as np, pandas as pd
from collections import defaultdict
import pm4py
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

# --------------------------
# GPU setup
# --------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {device}")

# --------------------------
# Load event log
# --------------------------
def import_xes(file_path):
    """Read an XES event log and return it sorted by resource, then timestamp.

    Only the case id, activity, resource and timestamp columns are retained,
    and the index is reset after sorting.
    """
    columns_to_keep = ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
    event_frame = pm4py.convert_to_dataframe(pm4py.read_xes(file_path))
    event_frame = event_frame[columns_to_keep]
    event_frame = event_frame.sort_values(by=['org:resource', 'time:timestamp'])
    event_frame = event_frame.reset_index(drop=True)
    return event_frame

# Load the event log once at module level into the global `df`.
df = import_xes("BPI_Challenge_2019.xes")

# --------------------------
# Create sequences
# --------------------------
def create_activity_sequences(df, prefix_length):
    """Turn each sufficiently long resource history into one prefix/target row.

    A resource needs at least ``prefix_length + 1`` events to produce a row:
    its first ``prefix_length`` activities become the features and the
    activity right after them becomes 'next_activity'.

    Returns:
        DataFrame with columns activity_1..activity_<prefix_length>,
        'next_activity' and 'org:resource'.
    """
    triples = []
    for resource, history in df.groupby('org:resource'):
        activity_array = history['concept:name'].values
        if len(activity_array) >= prefix_length + 1:
            triples.append((activity_array[:prefix_length], activity_array[prefix_length], resource))

    # Transpose the list of triples into three parallel lists.
    if triples:
        prefixes, targets, owners = map(list, zip(*triples))
    else:
        prefixes, targets, owners = [], [], []

    labels = [f"activity_{idx + 1}" for idx in range(prefix_length)]
    sequences_df = pd.DataFrame(prefixes, columns=labels)
    sequences_df['next_activity'] = targets
    sequences_df['org:resource'] = owners
    return sequences_df

# --------------------------
# Transition + Repeat features
# --------------------------
def create_transition_and_repeat_features(sequences_df):
    """Append transition-count and run-length features to each sequence row.

    Every column other than ``next_activity`` and ``org:resource`` is treated
    as one position of the activity sequence. Missing entries (``None`` or
    NaN) are ignored, both when building the activity vocabulary and when
    reading a row.

    Added columns:
        * One integer-named column per ordered activity pair ``(a, b)``,
          enumerated over the sorted vocabulary with ``a`` as the outer loop,
          holding how often ``b`` directly follows ``a`` in the row.
        * ``avg_run_length``: mean length of the maximal runs of identical
          consecutive activities (0.0 for a row with no activities).
        * ``num_runs``: number of such runs (0 for a row with no activities).

    Args:
        sequences_df: DataFrame containing the activity columns plus
            ``next_activity`` and ``org:resource``.

    Returns:
        A new DataFrame: ``sequences_df`` (index reset) followed by the
        transition columns and the two run features.
    """
    # Select the activity columns once instead of dropping labels per row.
    feature_frame = sequences_df.drop(columns=["next_activity", "org:resource"])
    values = feature_frame.to_numpy()
    present = feature_frame.notna().to_numpy()
    row_activities = [row[mask].tolist() for row, mask in zip(values, present)]

    # Build the vocabulary from non-missing entries only: NaN padding must not
    # become an "activity" (it also breaks sorting next to string labels).
    unique_activities = sorted({act for acts in row_activities for act in acts})
    n_activities = len(unique_activities)
    position = {act: idx for idx, act in enumerate(unique_activities)}

    # Column layout is a * n_activities + b, i.e. the same order as iterating
    # all (a, b) pairs with `a` in the outer loop.
    transition_counts = np.zeros(
        (len(row_activities), n_activities * n_activities), dtype=np.int64
    )
    repeat_pattern_features = []

    for row_idx, activities in enumerate(row_activities):
        run_lengths = []
        if activities:
            current_run = 1
            for prev, cur in zip(activities, activities[1:]):
                transition_counts[row_idx, position[prev] * n_activities + position[cur]] += 1
                if cur == prev:
                    current_run += 1
                else:
                    run_lengths.append(current_run)
                    current_run = 1
            run_lengths.append(current_run)

        # An empty row has no runs; avoid np.mean([]) which yields NaN.
        avg_run = np.mean(run_lengths) if run_lengths else 0.0
        repeat_pattern_features.append([avg_run, len(run_lengths)])

    transitions_df = pd.DataFrame(transition_counts)
    repeat_df = pd.DataFrame(repeat_pattern_features, columns=["avg_run_length", "num_runs"])
    return pd.concat([sequences_df.reset_index(drop=True), transitions_df, repeat_df], axis=1)

# --------------------------
# Oversample training set
# --------------------------
def oversample_proportional(X, y):
    """Oversample every class by repeating its samples as a whole.

    Each class is tiled ``ceil(max_count / class_size)`` times, where
    ``max_count`` is the size of the most frequent class. Because whole
    copies are stacked, a class may end up somewhat larger than
    ``max_count``. Classes are emitted in ``value_counts`` order (most
    frequent first) and the output is not shuffled.

    Args:
        X: 2-D feature array, one row per sample.
        y: 1-D label array aligned with ``X``.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` of stacked arrays.
    """
    class_sizes = pd.Series(y).value_counts()
    largest = class_sizes.max()

    def replicate(label):
        # Repeat all samples of one class enough times to reach `largest`.
        selector = (y == label)
        labels_of_class = y[selector]
        copies = int(np.ceil(largest / len(labels_of_class)))
        return np.tile(X[selector], (copies, 1)), np.tile(labels_of_class, copies)

    pieces = [replicate(label) for label in class_sizes.index]
    return (
        np.vstack([features for features, _ in pieces]),
        np.hstack([labels for _, labels in pieces]),
    )

# --------------------------
# PyTorch Transformer
# --------------------------
class ActivityTransformer(nn.Module):
    """Transformer-encoder classifier over a flat feature vector.

    The input features are projected to ``d_model`` dimensions, given a
    sequence axis of length one, passed through a stack of Transformer
    encoder layers, averaged over that sequence axis and mapped to class
    logits.

    Args:
        num_features: Size of each input feature vector.
        num_classes: Number of output logits.
        d_model: Width of the embedding / encoder layers.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate inside each encoder layer.
    """

    def __init__(self, num_features, num_classes, d_model=64, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(num_features, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True
            ),
            num_layers=num_layers,
        )
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits for ``x`` (2-D ``(batch, num_features)`` as used by run_experiment)."""
        embedded = self.embedding(x)
        # Insert a sequence axis at position 1 so the encoder sees one token per sample.
        as_sequence = torch.unsqueeze(embedded, 1)
        encoded = self.transformer(as_sequence)
        pooled = torch.mean(encoded, dim=1)
        return self.fc(pooled)

# --------------------------
# Run experiment
# --------------------------
def _train_full_batch(model, X_train, y_train, epochs, patience, lr=1e-3):
    """Train ``model`` with full-batch Adam steps and restore its best weights.

    One optimizer step is taken per epoch on the whole training set. Training
    stops once the training loss has failed to improve for ``patience``
    consecutive epochs. Before returning, the weights as they stood at the end
    of the lowest-loss epoch are loaded back into ``model``.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    def snapshot():
        # state_dict() hands out live references to the parameters; clone them
        # so later optimizer steps cannot overwrite the saved checkpoint.
        return {name: t.detach().clone() for name, t in model.state_dict().items()}

    # Start with a valid checkpoint so a NaN first loss cannot leave it undefined.
    best_state = snapshot()
    best_loss, stale_epochs = float('inf'), 0

    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_train), y_train)
        loss.backward()
        optimizer.step()

        epoch_loss = loss.item()
        if epoch_loss < best_loss:
            best_loss, stale_epochs = epoch_loss, 0
            best_state = snapshot()
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                break

    model.load_state_dict(best_state)
    return model


def _predict_classes(model, X):
    """Return the arg-max class index per row of ``X`` as a NumPy array."""
    model.eval()
    with torch.no_grad():
        return torch.argmax(model(X), dim=1).cpu().numpy()


def run_experiment(prefix_length, epochs=50, patience=5):
    """Train, select and save a next-activity classifier for one prefix length.

    Pipeline: build per-resource prefixes from the global ``df``, label-encode
    the activities, add transition/repeat features, split 80/20, oversample
    the training part, grid-search a small set of Transformer sizes, then
    report weighted metrics for the best configuration.

    Side effects (files written):
        * ``data/bpic2019_s2gr_test_seq<prefix_length>.csv`` - test features.
        * ``models/activity_label_encoder_seq<prefix_length>.pkl`` - fitted
          ``LabelEncoder``.
        * ``models/bpic2019_transformer_s2gr_seq<prefix_length>.pt`` - the
          selected model's ``state_dict``.

    Args:
        prefix_length: Number of leading activities per resource used as input.
        epochs: Maximum number of full-batch training steps per configuration.
        patience: Early-stopping patience on the training loss.

    Returns:
        Dict with ``"metrics"`` (accuracy, precision, recall, f1_score) and
        ``"best_params"`` (the selected hyperparameters).
    """
    print(f"\nüöÄ Running experiment: sequence length = {prefix_length}")

    # 1. Create sequences
    sequences_df = create_activity_sequences(df, prefix_length)

    # 2. Encode activities (inputs and target share one vocabulary)
    label_encoder = LabelEncoder()
    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    encoded_cols = activity_cols + ['next_activity']
    label_encoder.fit(sequences_df[encoded_cols].values.flatten())
    for col in encoded_cols:
        sequences_df[col] = label_encoder.transform(sequences_df[col])

    # 3. Transition + repeat features
    sequences_df = create_transition_and_repeat_features(sequences_df)
    feature_df = sequences_df.drop(columns=['next_activity', 'org:resource'])
    X = feature_df.values.astype(np.float32)
    y = sequences_df['next_activity'].values.astype(np.int64)

    # 4. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42, shuffle=True)

    # Save test set for SHAP
    os.makedirs("data", exist_ok=True)
    X_test_df = pd.DataFrame(X_test, columns=feature_df.columns)
    X_test_df.to_csv(f"data/bpic2019_s2gr_test_seq{prefix_length}.csv", index=False)
    print(f"üíæ Saved test set to data/bpic2019_s2gr_test_seq{prefix_length}.csv")

    # Save LabelEncoder for SHAP
    os.makedirs("models", exist_ok=True)
    with open(f"models/activity_label_encoder_seq{prefix_length}.pkl", "wb") as f:
        pickle.dump(label_encoder, f)
    print(f"üíæ Saved LabelEncoder to models/activity_label_encoder_seq{prefix_length}.pkl")

    # 5. Oversample training set only, so the test distribution stays untouched
    X_train, y_train = oversample_proportional(X_train, y_train)

    # 6. Convert to tensors (labels for scoring stay on the CPU as NumPy)
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.long).to(device)

    # 7. Hyperparameter tuning
    # NOTE(review): configurations are ranked by accuracy on the *test* split,
    # so the metrics reported below are optimistically biased. Carving a
    # validation split out of the training data would fix this - confirm the
    # intended protocol before changing it.
    # Start below any reachable accuracy so a model is always selected.
    best_model, best_params, best_val_acc = None, None, -1.0
    for d_model in [32, 64]:
        for num_heads in [2, 4]:
            for num_layers in [1, 2]:
                model = ActivityTransformer(X_train.shape[1], len(label_encoder.classes_),
                                            d_model=d_model, num_heads=num_heads, num_layers=num_layers).to(device)
                _train_full_batch(model, X_train, y_train, epochs, patience)

                acc = accuracy_score(y_test, _predict_classes(model, X_test_tensor))
                if acc > best_val_acc:
                    best_val_acc = acc
                    best_model = model
                    best_params = {'d_model': d_model, 'num_heads': num_heads, 'num_layers': num_layers}

    # 8. Final evaluation
    y_pred = _predict_classes(best_model, X_test_tensor)
    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, average="weighted", zero_division=0)),
        "f1_score": float(f1_score(y_test, y_pred, average="weighted", zero_division=0))
    }

    print(f"\nüìä Metrics for sequence length {prefix_length}: {metrics}")
    print(f"üõ†Ô∏è Best hyperparameters: {best_params}")

    # Save model for SHAP
    torch.save(best_model.state_dict(), f"models/bpic2019_transformer_s2gr_seq{prefix_length}.pt")
    print(f"üíæ Saved model to models/bpic2019_transformer_s2gr_seq{prefix_length}.pt")

    return {"metrics": metrics, "best_params": best_params}


# --------------------------
# Run multiple sequence lengths
# --------------------------
# Prefix lengths (number of leading events per resource) to evaluate. Each
# call builds its own dataset and writes its own test CSV, label encoder and
# model file, keyed by the prefix length.
sequence_lengths = [100, 150, 200, 300, 400, 500, 600, 700, 800]
for seq_len in sequence_lengths:
    run_experiment(seq_len)


‚úÖ Using device: cuda


parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]


üöÄ Running experiment: sequence length = 100
üíæ Saved test set to data/bpic2019_s2gr_test_seq100.csv
üíæ Saved LabelEncoder to models/activity_label_encoder_seq100.pkl

üìä Metrics for sequence length 100: {'accuracy': 0.8307692307692308, 'precision': 0.7894586894586894, 'recall': 0.8307692307692308, 'f1_score': 0.7948914348063284}
üõ†Ô∏è Best hyperparameters: {'d_model': 64, 'num_heads': 4, 'num_layers': 1}
üíæ Saved model to models/bpic2019_transformer_s2gr_seq100.pt

üöÄ Running experiment: sequence length = 150
üíæ Saved test set to data/bpic2019_s2gr_test_seq150.csv
üíæ Saved LabelEncoder to models/activity_label_encoder_seq150.pkl

üìä Metrics for sequence length 150: {'accuracy': 0.9137931034482759, 'precision': 0.934437386569873, 'recall': 0.9137931034482759, 'f1_score': 0.9201086270051788}
üõ†Ô∏è Best hyperparameters: {'d_model': 64, 'num_heads': 4, 'num_layers': 1}
üíæ Saved model to models/bpic2019_transformer_s2gr_seq150.pt

üöÄ Running experiment: sequence 