In [None]:
import os
import time
import json
import numpy as np
import pandas as pd
import pm4py
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import optuna
import warnings

# Silence UserWarning-category warnings for the rest of the run.
warnings.filterwarnings("ignore", category=UserWarning)

# Fix the PyTorch and NumPy seeds and request deterministic cuDNN behaviour so
# repeated runs are as comparable as possible.
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {device}")


def import_xes(file_path):
    """Read the XES file at ``file_path`` with pm4py and return the result of
    ``pm4py.convert_to_dataframe`` applied to it."""
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# Load the event log from the working directory.
event_log = import_xes("BPI_Challenge_2019.xes")
# Keep only the case id, activity name, resource and timestamp columns.
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
# Order events by resource and then by time, so that a per-resource groupby
# yields each resource's activities in timestamp order.
df = df.sort_values(by=['org:resource', 'time:timestamp']).reset_index(drop=True)
print(f"‚úÖ Log loaded: {len(df)} events, {df['org:resource'].nunique()} resources.")

def create_activity_sequences(df, prefix_length):
    """Build one (prefix, next activity) sample per resource.

    For every ``org:resource`` group with more than ``prefix_length`` events,
    the first ``prefix_length`` values of ``concept:name`` become the prefix
    and the value right after them becomes the target.

    Args:
        df: Event DataFrame with ``org:resource`` and ``concept:name`` columns,
            already in the desired per-resource order.
        prefix_length: Number of leading activities per sample.

    Returns:
        DataFrame with columns ``activity_1..activity_<prefix_length>``,
        ``next_activity`` and ``org:resource``; a completely empty DataFrame
        when no resource has enough events.
    """
    kept = [
        (owner, trace)
        for owner, trace in (
            (name, events['concept:name'].values)
            for name, events in df.groupby('org:resource')
        )
        if len(trace) > prefix_length
    ]
    if len(kept) == 0:
        return pd.DataFrame()

    column_names = [f"activity_{pos}" for pos in range(1, prefix_length + 1)]
    seq_df = pd.DataFrame(
        [trace[:prefix_length] for _, trace in kept],
        columns=column_names,
    )
    seq_df['next_activity'] = [trace[prefix_length] for _, trace in kept]
    seq_df['org:resource'] = [owner for owner, _ in kept]
    return seq_df

def proportional_sampling(X, y):
    """Oversample the valid classes by whole-class repetition and shuffle.

    Rows labelled ``-1`` (the ignore marker) are dropped. Every remaining
    class is tiled ``ceil(max_count / count)`` times, where ``max_count`` is
    the size of the largest *valid* class, so each class ends up with at
    least ``max_count`` rows.

    Args:
        X: NumPy feature array with one row per sample (expected 2-D).
        y: 1-D NumPy array of integer labels aligned with ``X``.

    Returns:
        Tuple ``(X_bal, y_bal)`` in a random order drawn from NumPy's global
        RNG. Both arrays have zero rows when ``y`` holds no valid label.
    """
    # Drop ignored rows first so they cannot influence the repetition factor.
    valid = y != -1
    X_valid, y_valid = X[valid], y[valid]
    classes, counts = np.unique(y_valid, return_counts=True)
    if classes.size == 0:
        # np.vstack([]) would raise; hand back the (empty) filtered arrays.
        return X_valid, y_valid

    max_count = counts.max()
    X_parts, y_parts = [], []
    for cls, count in zip(classes, counts):
        cls_mask = y_valid == cls
        n_repeat = int(np.ceil(max_count / count))
        X_parts.append(np.tile(X_valid[cls_mask], (n_repeat, 1)))
        y_parts.append(np.tile(y_valid[cls_mask], n_repeat))

    X_bal = np.vstack(X_parts)
    y_bal = np.hstack(y_parts)
    idx = np.random.permutation(len(y_bal))
    return X_bal[idx], y_bal[idx]

class LSTMModel(nn.Module):
    """Two stacked batch-first LSTM layers followed by a linear head.

    The second LSTM uses ``hidden_size // 2`` hidden units; only its output at
    the last time step is passed to the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        narrow_size = hidden_size // 2
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_size, hidden_size=narrow_size, batch_first=True)
        self.fc = nn.Linear(narrow_size, num_classes)

    def forward(self, x):
        """Return one row of ``num_classes`` scores per batch element."""
        first_pass = self.lstm1(x)[0]
        second_pass = self.lstm2(first_pass)[0]
        last_step = second_pass[:, -1, :]
        return self.fc(last_step)

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=15, patience=3, trial_num=None, target_device=None):
    """Train ``model`` with early stopping and restore its best weights.

    After each epoch the validation loss (mean over evaluated batches) is
    computed. Training stops once it has failed to improve for ``patience``
    consecutive epochs, and the weights of the best epoch are loaded back
    into the model. Batches whose labels are all negative are skipped, as are
    training batches whose loss is NaN.

    Args:
        model: ``nn.Module`` to train; it is moved to the target device.
        train_loader: Iterable of ``(X_batch, y_batch)`` training tensors.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation tensors.
        optimizer: Optimizer whose ``step()`` updates the model.
        criterion: Loss callable invoked as ``criterion(output, y_batch)``.
        epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.
        trial_num: Label shown in the progress messages only.
        target_device: Device to train on; defaults to the module-level
            ``device`` when omitted.

    Returns:
        The same ``model`` object with the best-epoch weights loaded.
    """
    import copy  # local import keeps the block self-contained

    run_device = device if target_device is None else target_device
    model.to(run_device)
    best_val_loss = float("inf")
    patience_counter = 0
    # state_dict() returns references to the live parameter tensors, so the
    # snapshot must be deep-copied; otherwise later optimizer steps silently
    # overwrite the "best" weights and the final restore does nothing.
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        model.train()
        total_loss, steps = 0.0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
            if not (y_batch >= 0).any():
                continue  # every label in the batch is an ignore marker
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            if torch.isnan(loss):
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()
            steps += 1
        # Report the mean per-batch loss so it is comparable to the val loss.
        total_loss /= max(steps, 1)

        # Validation
        model.eval()
        val_loss, val_count = 0.0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
                if not (y_batch >= 0).any():
                    continue
                output = model(X_batch)
                val_loss += criterion(output, y_batch).item()
                val_count += 1
        val_loss /= max(val_count, 1)

        print(f"Trial {trial_num} | Epoch {epoch+1}/{epochs} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}.")
                break

    model.load_state_dict(best_model_wts)
    return model


def run_experiment(prefix_length):
    """Tune, train and evaluate the baseline LSTM for one prefix length.

    Steps: build one sample per resource from the module-level ``df``,
    label-encode prefix and target activities with one shared vocabulary,
    mark targets occurring fewer than twice as ``-1`` (ignored), hold out 20%
    for validation, oversample the training part, run a 15-trial Optuna
    search, retrain with the best parameters and write the metrics to
    ``results/BPIC2019/LSTM model/Baseline encoding/lstm_seq_<prefix_length>.json``.

    Args:
        prefix_length: Number of leading activities used as model input.

    Returns:
        None. The experiment is skipped (with a message) when there is no
        usable data or when no Optuna trial completes.
    """
    print(f"\n Running experiment: sequence length = {prefix_length}")
    start_time = time.time()

    sequences_df = create_activity_sequences(df, prefix_length)
    if sequences_df.empty:
        print("‚ö†Ô∏è Not enough sequences, skipping.")
        return

    # The encoder is only a name -> id vocabulary, fitted over every prefix
    # and target column at once so an activity has the same id everywhere.
    feature_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    raw = sequences_df[feature_cols + ['next_activity']].values
    global_le = LabelEncoder()
    codes = global_le.fit_transform(raw.ravel()).reshape(raw.shape)

    X = codes[:, :-1].astype(np.float32)
    y = codes[:, -1].astype(np.int64)

    unique, counts = np.unique(y, return_counts=True)
    rare_labels = unique[counts < 2]
    y = np.where(np.isin(y, rare_labels), -1, y)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    if not (y_train >= 0).any():
        # Without a single valid training label there is nothing to fit.
        print("‚ö†Ô∏è Not enough sequences, skipping.")
        return

    X_train_res, y_train_res = proportional_sampling(X_train, y_train)
    X_train_res = np.expand_dims(X_train_res, axis=2)
    X_val = np.expand_dims(X_val, axis=2)

    # Re-index the classes seen in training to 0..num_classes-1; validation
    # targets outside that set become -1 and are ignored by loss and metrics.
    valid_classes = np.unique(y_train_res[y_train_res >= 0])
    num_classes = len(valid_classes)
    label_map = {old: new for new, old in enumerate(valid_classes)}
    y_train_res = np.array([label_map[v] if v >= 0 else -1 for v in y_train_res], dtype=np.int64)
    y_val = np.array([label_map[v] if v in label_map else -1 for v in y_val], dtype=np.int64)
    valid_mask = y_val >= 0

    def fit_and_predict(hidden_size, lr, optimizer_name, epochs, patience, trial_num):
        # Train a fresh model and return its predictions on the validation set.
        model = LSTMModel(X_train_res.shape[2], hidden_size, num_classes).to(device)
        if optimizer_name == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "rmsprop":
            optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        train_loader = DataLoader(TensorDataset(torch.tensor(X_train_res), torch.tensor(y_train_res)), batch_size=32, shuffle=True)
        val_loader = DataLoader(TensorDataset(torch.tensor(X_val), torch.tensor(y_val)), batch_size=32)
        model = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=epochs, patience=patience, trial_num=trial_num)

        model.eval()
        preds = []
        with torch.no_grad():
            for X_batch, _ in val_loader:
                output = model(X_batch.to(device))
                preds.extend(torch.argmax(output, dim=1).cpu().numpy())
        return np.array(preds)

    def objective(trial):
        hidden_size = trial.suggest_int("hidden_size", 32, 192, step=32)
        lr = trial.suggest_categorical("lr", [1e-4, 5e-4, 1e-3])
        optimizer_name = trial.suggest_categorical("optimizer", ["adam", "rmsprop", "sgd"])
        trial_num = trial.number + 1

        print(f"\nüî• Starting Trial {trial_num} | hidden={hidden_size}, lr={lr}, opt={optimizer_name}")

        y_pred = fit_and_predict(hidden_size, lr, optimizer_name, epochs=15, patience=3, trial_num=trial_num)
        if valid_mask.sum() == 0:
            return 0.0
        return accuracy_score(y_val[valid_mask], y_pred[valid_mask])

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=15, catch=(Exception,))

    # study.best_trial raises ValueError (it never returns None) when no trial
    # completed, so check the trial states explicitly.
    if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
        print("‚ö†Ô∏è No successful trials. Skipping saving metrics.")
        return

    best_params = study.best_params
    print(f"üèÜ Best hyperparameters: {best_params}")

    y_pred = fit_and_predict(
        best_params["hidden_size"],
        best_params["lr"],
        best_params["optimizer"],
        epochs=50,
        patience=5,
        trial_num="final",
    )
    y_true_valid = y_val[valid_mask]
    y_pred_valid = y_pred[valid_mask]

    metrics = {
        "accuracy": accuracy_score(y_true_valid, y_pred_valid),
        "precision": precision_score(y_true_valid, y_pred_valid, average="weighted", zero_division=0),
        "recall": recall_score(y_true_valid, y_pred_valid, average="weighted", zero_division=0),
        "f1_score": f1_score(y_true_valid, y_pred_valid, average="weighted", zero_division=0),
        "num_samples": len(y_train_res),
        "sequence_length": prefix_length
    }

    results = {
        "sequence_length": prefix_length,
        "best_hyperparameters": best_params,
        "metrics": metrics,
        "runtime_seconds": round(time.time() - start_time, 2)
    }

    os.makedirs("results/BPIC2019/LSTM model/Baseline encoding", exist_ok=True)
    out_path = f"results/BPIC2019/LSTM model/Baseline encoding/lstm_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=4)

    print(f"\nüìä Evaluation Results for seq={prefix_length}:")
    for k, v in metrics.items():
        print(f"   {k}: {v:.4f}")
    print(f"üíæ Results saved to {out_path}")
    print(f"‚úÖ Experiment completed in {results['runtime_seconds']}s.\n")

# Prefix lengths to evaluate; each one triggers a full run_experiment call.
sequence_lengths = [100, 150, 200, 300, 400, 500, 600, 700, 800]
for seq_len in sequence_lengths:
    run_experiment(seq_len)

print("All experiments completed! Results saved in /results/BPIC2019/LSTM model/Baseline encoding/")


In [None]:
#SCap
import os
import time
import json
import numpy as np
import pandas as pd
import pm4py
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import optuna
import warnings

# Silence UserWarning-category warnings for the rest of the run.
warnings.filterwarnings("ignore", category=UserWarning)

# Fix the PyTorch and NumPy seeds and request deterministic cuDNN behaviour.
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def import_xes(file_path):
    """Load an XES log via pm4py and pass it through
    ``pm4py.convert_to_dataframe`` before returning it."""
    parsed_log = pm4py.read_xes(file_path)
    as_frame = pm4py.convert_to_dataframe(parsed_log)
    return as_frame

# Load the event log from the working directory.
event_log = import_xes("BPI Challenge 2018.xes")
# Keep only the case id, activity name, resource and timestamp columns.
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
# Order events by resource and then by time, so that a per-resource groupby
# yields each resource's activities in timestamp order.
df = df.sort_values(by=['org:resource', 'time:timestamp']).reset_index(drop=True)

def create_diversity_matrix(log):
    """Count how many events each resource has per activity.

    Args:
        log: Event DataFrame with ``org:resource`` and ``concept:name`` columns.

    Returns:
        DataFrame whose first column is ``org:resource`` and whose remaining
        columns hold, per activity name, the number of matching rows
        (0 where a resource never performed the activity).
    """
    counts_by_resource = log.pivot_table(
        index='org:resource',
        columns='concept:name',
        aggfunc='size',
        fill_value=0,
    )
    return counts_by_resource.reset_index()

# Resource x activity count matrix built from the complete event log.
# NOTE(review): the counts cover every event of a resource, including the
# ones after the prefix used as model input -- confirm this is intended.
ra_div_matrix = create_diversity_matrix(event_log)
# Binary version: 1 where the resource performed the activity at least once.
ra_div_binary = ra_div_matrix.copy()
ra_div_binary.iloc[:, 1:] = (ra_div_binary.iloc[:, 1:] > 0).astype(int)
# Activity column names (every column after 'org:resource').
activities = ra_div_matrix.columns[1:].tolist()

def create_activity_sequences(df, prefix_length):
    """Build one (prefix, next activity) sample per resource.

    Resources with at most ``prefix_length`` events are left out. For the
    others, the first ``prefix_length`` ``concept:name`` values form the
    prefix and the following value is the target.

    Returns:
        DataFrame with columns ``activity_1..activity_<prefix_length>``,
        ``next_activity`` and ``org:resource`` (zero rows when no resource
        qualifies).
    """
    prefixes, targets, owners = [], [], []
    for owner, events in df.groupby('org:resource'):
        trace = events['concept:name'].values
        if len(trace) <= prefix_length:
            continue
        prefixes.append(trace[:prefix_length])
        targets.append(trace[prefix_length])
        owners.append(owner)

    column_names = [f"activity_{pos}" for pos in range(1, prefix_length + 1)]
    sequences_df = pd.DataFrame(prefixes, columns=column_names)
    sequences_df['next_activity'] = targets
    sequences_df['org:resource'] = owners
    return sequences_df

def proportional_sampling(X, y):
    """Oversample the valid classes by whole-class repetition and shuffle.

    Rows labelled ``-1`` (the ignore marker) are dropped. Every remaining
    class is tiled ``ceil(max_count / count)`` times, where ``max_count`` is
    the size of the largest *valid* class.

    Args:
        X: NumPy feature array with one row per sample (expected 2-D).
        y: 1-D NumPy array of integer labels aligned with ``X``.

    Returns:
        Tuple ``(X_bal, y_bal)`` shuffled with NumPy's global RNG. Both
        arrays have zero rows when ``y`` contains no valid label.
    """
    # Remove ignored rows up front so their count cannot drive the
    # repetition factor of the real classes.
    keep = y != -1
    X_kept, y_kept = X[keep], y[keep]
    unique_classes, counts = np.unique(y_kept, return_counts=True)
    if unique_classes.size == 0:
        # np.vstack([]) would raise; return the (empty) filtered arrays.
        return X_kept, y_kept

    max_count = counts.max()
    X_resampled, y_resampled = [], []
    for cls, count in zip(unique_classes, counts):
        cls_mask = y_kept == cls
        n_repeat = int(np.ceil(max_count / count))
        X_resampled.append(np.tile(X_kept[cls_mask], (n_repeat, 1)))
        y_resampled.append(np.tile(y_kept[cls_mask], n_repeat))

    X_bal = np.vstack(X_resampled)
    y_bal = np.hstack(y_resampled)
    idx = np.random.permutation(len(y_bal))
    return X_bal[idx], y_bal[idx]

class LSTMModel(nn.Module):
    """Stack of two batch-first LSTMs (the second half as wide as the first)
    whose last-step output feeds a linear layer with ``num_classes`` outputs."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        half_width = hidden_size // 2
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_size, hidden_size=half_width, batch_first=True)
        self.fc = nn.Linear(half_width, num_classes)

    def forward(self, x):
        """Run both LSTMs over ``x`` and score the final time step."""
        hidden_seq, _ = self.lstm1(x)
        hidden_seq, _ = self.lstm2(hidden_seq)
        return self.fc(hidden_seq[:, -1, :])

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=15, patience=3, target_device=None):
    """Train ``model`` with early stopping and restore its best weights.

    The validation loss (mean over evaluated batches) is computed after every
    epoch. Training stops after ``patience`` consecutive epochs without
    improvement and the best epoch's weights are loaded back. Batches whose
    labels are all negative are skipped, as are training batches with a NaN
    loss.

    Args:
        model: ``nn.Module`` to train; it is moved to the target device.
        train_loader: Iterable of ``(X_batch, y_batch)`` training tensors.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation tensors.
        optimizer: Optimizer whose ``step()`` updates the model.
        criterion: Loss callable invoked as ``criterion(output, y_batch)``.
        epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.
        target_device: Device to train on; defaults to the module-level
            ``device`` when omitted.

    Returns:
        The same ``model`` object with the best-epoch weights loaded.
    """
    import copy  # local import keeps the block self-contained

    run_device = device if target_device is None else target_device
    model.to(run_device)
    best_val_loss = float("inf")
    patience_counter = 0
    # state_dict() exposes the live tensors; without a deep copy the stored
    # "best" weights would track every later optimizer step.
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
            if not (y_batch >= 0).any():
                continue  # skip batch if all labels are -1
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            if torch.isnan(loss):
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        model.eval()
        val_loss = 0.0
        val_count = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
                if not (y_batch >= 0).any():
                    continue
                output = model(X_batch)
                val_loss += criterion(output, y_batch).item()
                val_count += 1
        val_loss = val_loss / max(val_count, 1)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    model.load_state_dict(best_model_wts)
    return model

def run_experiment(prefix_length):
    """Tune, train and evaluate the capability-augmented LSTM for one prefix length.

    Steps: build one sample per resource from the module-level ``df``, append
    that resource's binary activity indicators from ``ra_div_binary``,
    label-encode the activities, mark targets occurring fewer than twice as
    ``-1``, split 80/20, oversample the training part only, run a 15-trial
    Optuna search, retrain with the best parameters and write the metrics to
    ``results/BPIC2018/LSTM model/SCap_check/lstm_seq_<prefix_length>.json``.

    Args:
        prefix_length: Number of leading activities used as model input.

    Returns:
        None. The experiment is skipped (with a message) when there is no
        usable data or when no Optuna trial completes.
    """
    print(f"\nüöÄ Running experiment: sequence length = {prefix_length}")
    start_time = time.time()

    sequences_df = create_activity_sequences(df, prefix_length)
    if sequences_df.empty:
        print(f"‚ö†Ô∏è Skipping sequence length {prefix_length}: not enough data")
        return

    # One shared vocabulary for every prefix column and the target column.
    feature_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    raw = sequences_df[feature_cols + ['next_activity']].values
    label_encoder = LabelEncoder()
    codes = label_encoder.fit_transform(raw.ravel()).reshape(raw.shape)

    # Look the capability rows up by resource id instead of relying on both
    # tables happening to be in the same row order.
    capability = ra_div_binary.set_index('org:resource')
    cap_values = capability.loc[sequences_df['org:resource'].tolist(), activities].values

    X = np.hstack([codes[:, :-1], cap_values]).astype(np.float32)
    y = codes[:, -1].astype(np.int64)

    unique, counts = np.unique(y, return_counts=True)
    rare_labels = unique[counts < 2]
    y = np.where(np.isin(y, rare_labels), -1, y)

    # Split BEFORE oversampling: duplicating rows first would put copies of
    # the same sample into both partitions and inflate the validation scores.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    if not (y_train >= 0).any():
        print(f"‚ö†Ô∏è Skipping sequence length {prefix_length}: not enough data")
        return

    X_train, y_train = proportional_sampling(X_train, y_train)
    X_train = np.expand_dims(X_train, axis=2)
    X_val = np.expand_dims(X_val, axis=2)

    # Re-index the classes seen in training to 0..num_classes-1; validation
    # targets outside that set become -1 and are ignored by loss and metrics.
    valid_classes = np.unique(y_train[y_train >= 0])
    num_classes = len(valid_classes)
    label_map = {old: new for new, old in enumerate(valid_classes)}
    y_train = np.array([label_map[v] if v >= 0 else -1 for v in y_train], dtype=np.int64)
    y_val = np.array([label_map[v] if v in label_map else -1 for v in y_val], dtype=np.int64)
    valid_mask = y_val >= 0

    def fit_and_predict(hidden_size, lr, optimizer_name, epochs, patience):
        # Train a fresh model and return its predictions on the validation set.
        model = LSTMModel(X_train.shape[2], hidden_size, num_classes).to(device)
        if optimizer_name == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "rmsprop":
            optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        train_loader = DataLoader(TensorDataset(torch.tensor(X_train), torch.tensor(y_train)), batch_size=32, shuffle=True)
        val_loader = DataLoader(TensorDataset(torch.tensor(X_val), torch.tensor(y_val)), batch_size=32)

        model = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=epochs, patience=patience)

        model.eval()
        preds = []
        with torch.no_grad():
            for X_batch, _ in val_loader:
                output = model(X_batch.to(device))
                preds.extend(torch.argmax(output, dim=1).cpu().numpy())
        return np.array(preds)

    def objective(trial):
        hidden_size = trial.suggest_int("hidden_size", 32, 192, step=32)
        lr = trial.suggest_categorical("lr", [1e-4, 5e-4, 1e-3])
        optimizer_name = trial.suggest_categorical("optimizer", ["adam", "rmsprop", "sgd"])

        y_pred = fit_and_predict(hidden_size, lr, optimizer_name, epochs=15, patience=3)
        if valid_mask.sum() == 0:
            return 0.0
        return accuracy_score(y_val[valid_mask], y_pred[valid_mask])

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=15, catch=(Exception,))

    # study.best_trial raises ValueError (it never returns None) when no trial
    # completed, so check the trial states explicitly.
    if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
        print("‚ö†Ô∏è No successful trials. Skipping saving metrics.")
        return

    best_params = study.best_params
    print(f"üèÜ Best hyperparameters for seq {prefix_length}: {best_params}")

    y_pred = fit_and_predict(
        best_params["hidden_size"],
        best_params["lr"],
        best_params["optimizer"],
        epochs=50,
        patience=5,
    )
    y_true_valid = y_val[valid_mask]
    y_pred_valid = y_pred[valid_mask]

    # Metrics
    metrics = {
        "accuracy": accuracy_score(y_true_valid, y_pred_valid),
        "precision": precision_score(y_true_valid, y_pred_valid, average="weighted", zero_division=0),
        "recall": recall_score(y_true_valid, y_pred_valid, average="weighted", zero_division=0),
        "f1_score": f1_score(y_true_valid, y_pred_valid, average="weighted", zero_division=0),
        "num_samples": len(y_train),
        "sequence_length": prefix_length
    }

    results = {
        "sequence_length": prefix_length,
        "best_hyperparameters": best_params,
        "metrics": metrics,
        "runtime_seconds": round(time.time() - start_time, 2)
    }

    os.makedirs("results/BPIC2018/LSTM model/SCap_check", exist_ok=True)
    out_path = f"results/BPIC2018/LSTM model/SCap_check/lstm_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=4)

    print(f"\nüíæ Results saved to {out_path}")
    print(f"‚úÖ Experiment for sequence_length={prefix_length} completed in {results['runtime_seconds']}s.\n")

# Prefix lengths to evaluate; each one triggers a full run_experiment call.
sequence_lengths = [2500]
for seq_len in sequence_lengths:
    run_experiment(seq_len)

# Name the directory run_experiment actually writes its JSON files to.
print("\nüéØ All experiments completed! Results saved in results/BPIC2018/LSTM model/SCap_check/")


In [None]:
# S2g
import os
import time
import json
import numpy as np
import pandas as pd
import pm4py
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import optuna
import warnings

# Silence UserWarning-category warnings for the rest of the run.
warnings.filterwarnings("ignore", category=UserWarning)

# Fix the PyTorch and NumPy seeds and request deterministic cuDNN behaviour.
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Using device: {device}")

def import_xes(file_path):
    """Parse the XES file at ``file_path`` with pm4py and return the output
    of ``pm4py.convert_to_dataframe`` for it."""
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# Load the event log from the working directory.
event_log = import_xes("BPI Challenge 2018.xes")
# Keep only the case id, activity name, resource and timestamp columns.
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
# Order events by resource and then by time, so that a per-resource groupby
# yields each resource's activities in timestamp order.
df = df.sort_values(by=['org:resource', 'time:timestamp']).reset_index(drop=True)
print(f"‚úÖ Log loaded: {len(df)} events, {df['org:resource'].nunique()} resources.")

def create_activity_sequences(df, prefix_length):
    """Build one (prefix, next activity) sample per resource.

    Each ``org:resource`` group with more than ``prefix_length`` events
    contributes its first ``prefix_length`` ``concept:name`` values as the
    prefix and the value right after them as the target.

    Returns:
        DataFrame with columns ``activity_1..activity_<prefix_length>``,
        ``next_activity`` and ``org:resource``; a completely empty DataFrame
        when no resource has enough events.
    """
    per_resource = (
        (owner, events['concept:name'].tolist())
        for owner, events in df.groupby('org:resource')
    )
    kept = [(owner, trace) for owner, trace in per_resource if len(trace) > prefix_length]
    if len(kept) == 0:
        return pd.DataFrame()

    column_names = [f"activity_{pos}" for pos in range(1, prefix_length + 1)]
    seq_df = pd.DataFrame([trace[:prefix_length] for _, trace in kept], columns=column_names)
    seq_df["next_activity"] = [trace[prefix_length] for _, trace in kept]
    seq_df["org:resource"] = [owner for owner, _ in kept]
    return seq_df

def create_transition_features(sequences_df):
    """Append directly-follows transition counts to every sequence row.

    For each row, consecutive pairs of prefix activities are counted. One
    column named ``"<a>-><b>"`` is added for every ordered pair of distinct
    activity values found in the prefix columns, holding that row's count
    (0 when the pair never occurs).

    Args:
        sequences_df: DataFrame of prefix columns plus ``next_activity`` and
            ``org:resource``.

    Returns:
        New DataFrame: the input (index reset) followed by the count columns.
    """
    prefix_only = sequences_df.drop(columns=['next_activity', 'org:resource'])
    vocabulary = sorted(set(prefix_only.values.flatten()))
    pairs = [(src, dst) for src in vocabulary for dst in vocabulary]

    records = []
    for _, row in prefix_only.iterrows():
        trace = row.dropna().tolist()
        tally = defaultdict(int)
        for src, dst in zip(trace, trace[1:]):
            tally[(src, dst)] += 1
        records.append({f"{src}->{dst}": tally.get((src, dst), 0) for src, dst in pairs})

    counts_df = pd.DataFrame(records)
    return pd.concat([sequences_df.reset_index(drop=True), counts_df], axis=1)

def proportional_sampling(X, y):
    """Oversample every class by whole-class repetition and shuffle.

    Each class is tiled ``ceil(max_count / count)`` times, ``max_count``
    being the size of the largest class, and the stacked result is permuted
    with NumPy's global RNG.

    Args:
        X: NumPy feature array with one row per sample (expected 2-D).
        y: 1-D NumPy label array aligned with ``X``.

    Returns:
        Tuple ``(X_bal, y_bal)`` of the resampled, shuffled arrays.
    """
    labels, sizes = np.unique(y, return_counts=True)
    largest = sizes.max()

    feature_blocks = []
    label_blocks = []
    for label, size in zip(labels, sizes):
        members = y == label
        copies = int(np.ceil(largest / size))
        feature_blocks.append(np.tile(X[members], (copies, 1)))
        label_blocks.append(np.tile(y[members], copies))

    X_bal = np.vstack(feature_blocks)
    y_bal = np.hstack(label_blocks)
    order = np.random.permutation(len(y_bal))
    return X_bal[order], y_bal[order]

class LSTMModel(nn.Module):
    """Two batch-first LSTM layers (widths ``hidden_size`` and
    ``hidden_size // 2``) followed by a linear layer over the last time step."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        reduced = hidden_size // 2
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_size, hidden_size=reduced, batch_first=True)
        self.fc = nn.Linear(reduced, num_classes)

    def forward(self, x):
        """Return ``num_classes`` scores for every element of the batch."""
        layer_one_out = self.lstm1(x)[0]
        layer_two_out = self.lstm2(layer_one_out)[0]
        final_step = layer_two_out[:, -1, :]
        return self.fc(final_step)

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=15, patience=3, trial_num=None, target_device=None):
    """Train ``model`` with early stopping and restore its best weights.

    The validation loss (mean over batches) is computed after every epoch.
    Training stops after ``patience`` consecutive epochs without improvement
    and the best epoch's weights are loaded back. Training batches with a NaN
    loss are skipped.

    Args:
        model: ``nn.Module`` to train; it is moved to the target device.
        train_loader: Iterable of ``(X_batch, y_batch)`` training tensors.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation tensors.
        optimizer: Optimizer whose ``step()`` updates the model.
        criterion: Loss callable invoked as ``criterion(output, y_batch)``.
        epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.
        trial_num: Label shown in the progress messages only.
        target_device: Device to train on; defaults to the module-level
            ``device`` when omitted.

    Returns:
        The same ``model`` object with the best-epoch weights loaded.
    """
    import copy  # local import keeps the block self-contained

    run_device = device if target_device is None else target_device
    model.to(run_device)
    best_val_loss = float("inf")
    patience_counter = 0
    # state_dict() exposes the live tensors; without a deep copy the stored
    # "best" weights would track every later optimizer step.
    best_wts = copy.deepcopy(model.state_dict())
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            if torch.isnan(loss):
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)

        # Validation
        model.eval()
        val_loss = 0.0
        count = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
                output = model(X_batch)
                val_loss += criterion(output, y_batch).item()
                count += 1
        val_loss /= max(count, 1)

        print(f"üåÄ Trial {trial_num} | Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_wts = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"‚ö†Ô∏è Early stopping at epoch {epoch+1} (no improvement).")
                break

    model.load_state_dict(best_wts)
    return model

def run_experiment(prefix_length):
    """Tune, train and evaluate the transition-count LSTM for one prefix length.

    Steps: build one sample per resource from the module-level ``df``, add
    transition-count columns, encode the prefix activities with one shared
    vocabulary, split 80/20, encode targets from the training labels only,
    drop validation rows with unseen targets, oversample the training part,
    run a 15-trial Optuna search, retrain with the best parameters and write
    the metrics to
    ``results/BPIC2018/LSTM model/S2g/lstm_seq_<prefix_length>.json``.

    Args:
        prefix_length: Number of leading activities used as model input.

    Returns:
        None. The experiment is skipped (with a message) when there is no
        usable data or when no Optuna trial completes.
    """
    print(f"\nüöÄ Running experiment for prefix_length = {prefix_length}")
    start_time = time.time()
    sequences_df = create_activity_sequences(df, prefix_length)
    if sequences_df.empty:
        print(f"‚ö†Ô∏è Skipping prefix_length={prefix_length}: not enough data.")
        return
    sequences_df = create_transition_features(sequences_df)

    # Encode all prefix positions with ONE vocabulary: a separate encoder per
    # column would give the same activity different codes at different
    # positions.
    activity_cols = [f"activity_{i+1}" for i in range(prefix_length)]
    raw = sequences_df[activity_cols].values.astype(str)
    activity_le = LabelEncoder()
    codes = activity_le.fit_transform(raw.ravel()).reshape(raw.shape)
    transition_counts = sequences_df.drop(columns=activity_cols + ['next_activity', 'org:resource']).values

    # Split features/labels
    X = np.hstack([codes, transition_counts]).astype(np.float32)
    y = sequences_df['next_activity'].values.astype(str)  # keep as string for LabelEncoder
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    # Fit LabelEncoder on training labels only
    le_target = LabelEncoder()
    y_train_enc = le_target.fit_transform(y_train)

    # Filter validation to only seen classes
    valid_mask = np.isin(y_val, le_target.classes_)
    X_val = X_val[valid_mask]
    y_val = y_val[valid_mask]
    if y_val.size == 0:
        # Nothing left to evaluate on.
        print(f"‚ö†Ô∏è Skipping prefix_length={prefix_length}: not enough data.")
        return
    y_val_enc = le_target.transform(y_val)

    # Oversample training only, then add the length-1 sequence axis.
    X_train_res, y_train_res = proportional_sampling(X_train, y_train_enc)
    X_train_res = np.expand_dims(X_train_res, axis=1)
    X_val = np.expand_dims(X_val, axis=1)

    num_classes = len(le_target.classes_)

    def fit_and_predict(hidden_size, lr, optimizer_name, epochs, patience, trial_num):
        # Train a fresh model and return its predictions on the validation set.
        model = LSTMModel(X_train_res.shape[2], hidden_size, num_classes).to(device)
        if optimizer_name == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "rmsprop":
            optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

        criterion = nn.CrossEntropyLoss()
        train_loader = DataLoader(TensorDataset(torch.tensor(X_train_res), torch.tensor(y_train_res)), batch_size=32, shuffle=True)
        val_loader = DataLoader(TensorDataset(torch.tensor(X_val), torch.tensor(y_val_enc)), batch_size=32)
        model = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=epochs, patience=patience, trial_num=trial_num)

        model.eval()
        preds = []
        with torch.no_grad():
            for X_batch, _ in val_loader:
                output = model(X_batch.to(device))
                preds.extend(torch.argmax(output, dim=1).cpu().numpy())
        return preds

    # Optuna Hyperparameter Tuning
    def objective(trial):
        trial_num = trial.number + 1
        hidden_size = trial.suggest_int("hidden_size", 32, 192, step=32)
        lr = trial.suggest_categorical("lr", [1e-4, 5e-4, 1e-3])
        optimizer_name = trial.suggest_categorical("optimizer", ["adam", "rmsprop", "sgd"])

        print(f"\nüî• Starting Trial {trial_num} | hidden={hidden_size}, lr={lr}, opt={optimizer_name}")

        y_pred = fit_and_predict(hidden_size, lr, optimizer_name, epochs=15, patience=3, trial_num=trial_num)
        acc = accuracy_score(y_val_enc, y_pred)
        print(f"‚úÖ Trial {trial_num} finished with Accuracy: {acc:.4f}")
        return acc

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=15, catch=(Exception,))
    # study.best_trial raises ValueError (it never returns None) when no trial
    # completed, so check the trial states explicitly.
    if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
        print("‚ö†Ô∏è No successful trials. Skipping saving metrics.")
        return

    best_params = study.best_params
    print(f"\nüèÜ Best Hyperparameters for seq {prefix_length}: {best_params}")

    # Final Training with Best Params
    y_pred = fit_and_predict(
        best_params["hidden_size"],
        best_params["lr"],
        best_params["optimizer"],
        epochs=50,
        patience=5,
        trial_num="final",
    )

    metrics = {
        "accuracy": accuracy_score(y_val_enc, y_pred),
        "precision": precision_score(y_val_enc, y_pred, average="weighted", zero_division=0),
        "recall": recall_score(y_val_enc, y_pred, average="weighted", zero_division=0),
        "f1_score": f1_score(y_val_enc, y_pred, average="weighted", zero_division=0),
        "num_samples": len(y_train_res),
        "sequence_length": prefix_length
    }

    results = {
        "sequence_length": prefix_length,
        "best_hyperparameters": best_params,
        "metrics": metrics,
        "runtime_seconds": round(time.time() - start_time, 2)
    }

    os.makedirs("results/BPIC2018/LSTM model/S2g", exist_ok=True)
    out_path = f"results/BPIC2018/LSTM model/S2g/lstm_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=4)

    print("\nüìä Evaluation Results:")
    for k, v in metrics.items():
        print(f"   {k}: {v:.4f}")
    print(f"üíæ Results saved to {out_path}")
    print(f"‚úÖ Experiment for sequence_length={prefix_length} completed in {results['runtime_seconds']}s.\n")

# Prefix lengths to evaluate; each one triggers a full run_experiment call.
sequence_lengths = [100, 150, 200, 400, 600, 800, 1000, 1200, 1400, 1500, 2000, 2500]
for seq_len in sequence_lengths:
    run_experiment(seq_len)

# Name the directory run_experiment actually writes its JSON files to.
print("\nüéØ All experiments completed! Results saved in results/BPIC2018/LSTM model/S2g/")


In [None]:
# S2gR

import os
import time
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import optuna
import pm4py
import warnings

# Silence UserWarning noise from the libraries used in this cell.
warnings.filterwarnings("ignore", category=UserWarning)

# Reproducibility: fixed seeds for NumPy and PyTorch, and cuDNN configured
# for deterministic kernels with auto-tuning disabled.
np.random.seed(42)
torch.manual_seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Train on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"‚úÖ Using device: {device}")

def import_xes(file_path):
    """Read the XES event log at ``file_path`` with pm4py and return it as a DataFrame."""
    return pm4py.convert_to_dataframe(pm4py.read_xes(file_path))

# Load the raw log, keep only the columns used below, and order the events
# per resource by timestamp so each resource's activity stream is chronological.
event_log = import_xes("BPI_Challenge_2013_incidents.xes")
df = (
    event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]
    .sort_values(by=['org:resource', 'time:timestamp'])
    .reset_index(drop=True)
)
print(f"‚úÖ Log loaded: {len(df)} events, {df['org:resource'].nunique()} resources.")

def create_activity_sequences(df, prefix_length):
    """Build one (prefix, next activity) sample per resource.

    For every resource with more than ``prefix_length`` events, the first
    ``prefix_length`` activities (in the row order of ``df``) form the prefix
    and the activity right after them is the prediction target.

    Args:
        df: Event table with 'org:resource' and 'concept:name' columns.
        prefix_length: Number of leading activities used as input features.

    Returns:
        DataFrame with columns ``activity_1 .. activity_<prefix_length>``,
        ``next_activity`` and ``org:resource``; an empty DataFrame when no
        resource has enough events.
    """
    samples = []
    for resource, group in df.groupby('org:resource'):
        acts = group['concept:name'].tolist()
        # Need prefix_length inputs plus one more event to serve as the label.
        if len(acts) <= prefix_length:
            continue
        samples.append((acts[:prefix_length], acts[prefix_length], resource))

    if not samples:
        return pd.DataFrame()

    column_names = [f"activity_{pos}" for pos in range(1, prefix_length + 1)]
    seq_df = pd.DataFrame([prefix for prefix, _, _ in samples], columns=column_names)
    seq_df["next_activity"] = [target for _, target, _ in samples]
    seq_df["org:resource"] = [owner for _, _, owner in samples]
    return seq_df

def create_transition_and_repeat_features(sequences_df):
    """Append transition-count and run-length features to each prefix row.

    Args:
        sequences_df: Output of ``create_activity_sequences`` (activity
            columns plus 'next_activity' and 'org:resource').

    Returns:
        ``sequences_df`` (index reset) with extra columns:
        one ``"a->b"`` count column for every ordered pair of activities that
        occur anywhere in the prefixes, plus ``avg_run_length`` and
        ``num_runs`` describing runs of consecutive identical activities.
    """
    meta_cols = ['next_activity', 'org:resource']
    unique_activities = sorted(set(sequences_df.drop(columns=meta_cols).values.flatten()))
    # Full ordered pair grid, so every row gets the same feature columns.
    all_transitions = [(src, dst) for src in unique_activities for dst in unique_activities]

    transition_rows = []
    repeat_rows = []

    for _, row in sequences_df.iterrows():
        acts = row.drop(labels=meta_cols).dropna().tolist()

        pair_counts = defaultdict(int)
        # A prefix always counts as at least one run (of length 1).
        run_lengths = [1]
        for prev_act, next_act in zip(acts, acts[1:]):
            pair_counts[(prev_act, next_act)] += 1
            if next_act == prev_act:
                run_lengths[-1] += 1
            else:
                run_lengths.append(1)

        transition_rows.append(
            {f"{src}->{dst}": pair_counts.get((src, dst), 0) for src, dst in all_transitions}
        )
        repeat_rows.append({
            "avg_run_length": np.mean(run_lengths),
            "num_runs": len(run_lengths)
        })

    return pd.concat(
        [
            sequences_df.reset_index(drop=True),
            pd.DataFrame(transition_rows),
            pd.DataFrame(repeat_rows),
        ],
        axis=1,
    )

def proportional_sampling(X, y):
    """Oversample every class by whole-copy repetition, then shuffle.

    Each class is repeated ``ceil(max_count / class_count)`` times, so minority
    classes end up with at least as many samples as the largest class (they
    may overshoot it). The shuffle uses NumPy's global random state.

    Args:
        X: 2-D feature array, one row per sample.
        y: 1-D label array aligned with ``X``.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` in a random order.
    """
    labels, freqs = np.unique(y, return_counts=True)
    largest = freqs.max()

    feature_parts = []
    label_parts = []
    for label, freq in zip(labels, freqs):
        members = (y == label)
        copies = int(np.ceil(largest / freq))
        feature_parts.append(np.tile(X[members], (copies, 1)))
        label_parts.append(np.tile(y[members], copies))

    X_bal = np.vstack(feature_parts)
    y_bal = np.hstack(label_parts)
    order = np.random.permutation(len(y_bal))
    return X_bal[order], y_bal[order]


class LSTMModel(nn.Module):
    """Two stacked LSTM layers followed by a linear classification head.

    The second LSTM halves the hidden width (``hidden_size // 2``); the head
    maps the last time step's output to ``num_classes`` logits. Inputs are
    batch-first: ``(batch, seq_len, input_size)``.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        reduced_size = hidden_size // 2
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_size, hidden_size=reduced_size, batch_first=True)
        self.fc = nn.Linear(reduced_size, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        hidden_seq, _ = self.lstm1(x)
        hidden_seq, _ = self.lstm2(hidden_seq)
        # Only the final time step feeds the classifier.
        last_step = hidden_seq[:, -1, :]
        return self.fc(last_step)

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=15, patience=3, trial_num=None):
    """Train ``model`` with early stopping and return it with the best weights loaded.

    After every epoch the mean validation loss is computed; whenever it
    improves, a copy of the weights is stored. Training stops once the
    validation loss has failed to improve for ``patience`` consecutive epochs,
    and the stored best weights are restored before returning.

    Args:
        model: Network to train; moved to the module-level ``device``.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation batches.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss function called as ``criterion(output, y_batch)``.
        epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        trial_num: Label used only in the progress output.

    Returns:
        The same ``model`` instance, carrying the weights from the epoch with
        the lowest validation loss.
    """
    def snapshot_weights():
        # state_dict() hands back references to the live parameter tensors, so
        # they must be cloned; otherwise the "best" snapshot silently follows
        # every later optimizer step and restoring it is a no-op.
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    model.to(device)
    best_val_loss = float("inf")
    patience_counter = 0
    best_wts = snapshot_weights()
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        used_batches = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            # Skip batches whose loss is NaN instead of poisoning the weights.
            if torch.isnan(loss):
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()
            used_batches += 1

        # Average over the batches that actually contributed (skipped NaN
        # batches are excluded); also safe when the loader yields nothing.
        avg_train_loss = total_loss / max(used_batches, 1)

        # Validation
        model.eval()
        val_loss = 0.0
        count = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                output = model(X_batch)
                val_loss += criterion(output, y_batch).item()
                count += 1
        val_loss /= max(count, 1)

        print(f"üåÄ Trial {trial_num} | Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_wts = snapshot_weights()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"‚ö†Ô∏è Early stopping at epoch {epoch+1} (no improvement).")
                break

    model.load_state_dict(best_wts)
    return model

def run_experiment(prefix_length):
    """Run the full tune/train/evaluate pipeline for one prefix length.

    Steps: build per-resource prefixes from the module-level ``df``, add
    transition and run-length features, label-encode, split 80/20, oversample
    the training split, tune hyperparameters with Optuna (18 trials), retrain
    with the best configuration, and write metrics to
    ``results/BPIC2013/LSTM model/S2gR/lstm_seq_<prefix_length>.json``.

    The function returns early (writing nothing) when no resource has enough
    events for ``prefix_length`` or when no Optuna trial completes.

    Args:
        prefix_length: Number of leading activities per resource used as input.

    Returns:
        None. Results are printed and saved to disk.
    """
    print(f"Running Experiment 4 for prefix_length = {prefix_length}")
    start_time = time.time()
    sequences_df = create_activity_sequences(df, prefix_length)
    if sequences_df.empty:
        print(f"‚ö†Ô∏è Skipping prefix_length={prefix_length}: not enough data.")
        return
    sequences_df = create_transition_and_repeat_features(sequences_df)

    # Encode object features except target.
    # NOTE(review): each column gets its own LabelEncoder, so the same activity
    # can map to different integers in different prefix positions — confirm
    # this is intended before comparing against other encodings.
    for col in sequences_df.columns:
        if sequences_df[col].dtype == 'object' and col != 'next_activity':
            le = LabelEncoder()
            sequences_df[col] = le.fit_transform(sequences_df[col].astype(str))

    # Separate features and raw labels
    X = sequences_df.drop(columns=['next_activity', 'org:resource']).values.astype(np.float32)
    y_raw = sequences_df['next_activity'].astype(str).values

    # Split before encoding y
    X_train, X_val, y_train_raw, y_val_raw = train_test_split(X, y_raw, test_size=0.2, random_state=42)

    # Fit the label encoder on training labels only; validation rows whose
    # label never occurs in training cannot be scored and are dropped.
    le_y = LabelEncoder()
    y_train = le_y.fit_transform(y_train_raw)
    known = np.isin(y_val_raw, le_y.classes_)
    X_val = X_val[known]
    y_val = le_y.transform(y_val_raw[known])

    # Balance only the training data
    X_train, y_train = proportional_sampling(X_train, y_train)
    # Add a length-1 sequence axis: (samples, 1, features) for the batch-first LSTM.
    X_train = np.expand_dims(X_train, axis=1)
    X_val = np.expand_dims(X_val, axis=1)
    num_classes = len(le_y.classes_)
    input_size = X_train.shape[2]

    def build_optimizer(name, params, lr):
        # Any name other than "adam"/"rmsprop" falls back to SGD with momentum.
        if name == "adam":
            return torch.optim.Adam(params, lr=lr)
        if name == "rmsprop":
            return torch.optim.RMSprop(params, lr=lr)
        return torch.optim.SGD(params, lr=lr, momentum=0.9)

    def make_loaders():
        train_loader = DataLoader(
            TensorDataset(torch.tensor(X_train), torch.tensor(y_train)), batch_size=32, shuffle=True
        )
        val_loader = DataLoader(TensorDataset(torch.tensor(X_val), torch.tensor(y_val)), batch_size=32)
        return train_loader, val_loader

    def predict(model, loader):
        # Arg-max class index for every sample in ``loader``, in loader order.
        model.eval()
        predictions = []
        with torch.no_grad():
            for X_batch, _ in loader:
                output = model(X_batch.to(device))
                predictions.extend(torch.argmax(output, dim=1).cpu().numpy())
        return predictions

    def objective(trial):
        trial_num = trial.number + 1
        hidden_size = trial.suggest_int("hidden_size", 32, 192, step=32)
        lr = trial.suggest_categorical("lr", [1e-4, 5e-4, 1e-3])
        optimizer_name = trial.suggest_categorical("optimizer", ["adam", "rmsprop", "sgd"])

        print(f"\nüî• Starting Trial {trial_num} | hidden={hidden_size}, lr={lr}, opt={optimizer_name}")

        model = LSTMModel(input_size, hidden_size, num_classes).to(device)
        optimizer = build_optimizer(optimizer_name, model.parameters(), lr)
        criterion = nn.CrossEntropyLoss()
        train_loader, val_loader = make_loaders()
        model = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=15, patience=3, trial_num=trial_num)

        # Evaluate accuracy
        acc = accuracy_score(y_val, predict(model, val_loader))
        print(f"‚úÖ Trial {trial_num} finished with Accuracy: {acc:.4f}")
        return acc

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=18, catch=(Exception,))
    # study.best_trial raises ValueError (it never returns None) when no trial
    # completed, which can happen because failures are caught above — so test
    # for completed trials explicitly.
    has_completed_trial = any(
        t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
    )
    if not has_completed_trial:
        print("‚ö†Ô∏è No successful trials. Skipping saving metrics.")
        return

    best_params = study.best_params
    print(f"Best Hyperparameters for seq {prefix_length}: {best_params}")

    # Retrain from scratch with the winning configuration and a longer budget.
    model = LSTMModel(input_size, best_params["hidden_size"], num_classes).to(device)
    optimizer = build_optimizer(best_params["optimizer"], model.parameters(), best_params["lr"])
    criterion = nn.CrossEntropyLoss()
    train_loader, val_loader = make_loaders()
    model = train_model(model, train_loader, val_loader, optimizer, criterion, epochs=50, patience=5, trial_num="final")

    # NOTE(review): the same validation split drives tuning, early stopping and
    # the reported metrics, so these scores are optimistic — confirm acceptable.
    y_pred = predict(model, val_loader)

    metrics = {
        "accuracy": float(accuracy_score(y_val, y_pred)),
        "precision": float(precision_score(y_val, y_pred, average="weighted", zero_division=0)),
        "recall": float(recall_score(y_val, y_pred, average="weighted", zero_division=0)),
        "f1_score": float(f1_score(y_val, y_pred, average="weighted", zero_division=0)),
        # Size of the training split after oversampling.
        "num_samples": len(y_train),
        "sequence_length": prefix_length
    }

    results = {
        "sequence_length": prefix_length,
        "best_hyperparameters": best_params,
        "metrics": metrics,
        "runtime_seconds": round(time.time() - start_time, 2)
    }

    os.makedirs("results/BPIC2013/LSTM model/S2gR", exist_ok=True)
    out_path = f"results/BPIC2013/LSTM model/S2gR/lstm_seq_{prefix_length}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=4)

    print("\nüìä Evaluation Results:")
    for k, v in metrics.items():
        print(f"   {k}: {v:.4f}")
    print(f"üíæ Results saved to {out_path}")
    print(f"‚úÖ Experiment for sequence_length={prefix_length} completed in {results['runtime_seconds']}s.\n")

# Prefix lengths to evaluate; run_experiment is called once per value, in order.
# A length is skipped inside run_experiment when no resource has enough events.
sequence_lengths = [10, 20, 30, 40, 50, 75, 100, 125, 150]
for seq_len in sequence_lengths:
    run_experiment(seq_len)

# Printed once the loop finishes, regardless of how many lengths were skipped.
print("\nüéâ All experiments completed successfully for Experiment 4!")
