In [3]:
import pm4py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [43]:
def import_xes(file_path):
    log = pm4py.read_xes(file_path)
    event_log = pm4py.convert_to_dataframe(log)

    return event_log

event_log = import_xes("/Users/6706363/Downloads/BPI_Challenge_2019.xes")

parsing log, completed traces :: 100%|██████████| 251734/251734 [00:52<00:00, 4773.62it/s]


In [74]:
df = event_log[['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']]

df = df.sort_values(by=['org:resource', 'time:timestamp'])

df.head(n=20)

Unnamed: 0,case:concept:name,concept:name,org:resource,time:timestamp
118143,4507004931_00010,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118144,4507004931_00010,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118153,4507004931_00020,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118154,4507004931_00020,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118163,4507004931_00030,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118164,4507004931_00030,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118173,4507004931_00040,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118174,4507004931_00040,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00
118183,4507004931_00050,Vendor creates invoice,NONE,1948-01-26 22:59:00+00:00
118184,4507004931_00050,Vendor creates debit memo,NONE,1948-01-26 22:59:00+00:00


In [95]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

prefix_length = 700  

# Function to create activity sequences
def create_activity_sequences(df, prefix_length):
    sequences, next_activities, resources = [], [], []

    for resource, resource_df in df.groupby('org:resource'):
        activities = resource_df['concept:name'].values  
        if len(activities) >= prefix_length + 1:
            prefix = activities[:prefix_length]
            next_activity = activities[prefix_length]
            sequences.append(prefix)
            next_activities.append(next_activity)
            resources.append(resource)

    sequences_df = pd.DataFrame(sequences, columns=[f"activity_{i+1}" for i in range(prefix_length)])
    sequences_df['next_activity'] = next_activities
    sequences_df['org:resource'] = resources  

    return sequences_df

# Create sequences
sequences_df = create_activity_sequences(df, prefix_length)

In [96]:
label_encoder = LabelEncoder()
all_activities = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']].values.flatten()
label_encoder.fit(all_activities)

# Apply encoding
for col in [f"activity_{i+1}" for i in range(prefix_length)] + ['next_activity']:
    sequences_df[col] = label_encoder.transform(sequences_df[col])

# Store mapping
activity_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
inverse_activity_mapping = {v: k for k, v in activity_mapping.items()}

In [53]:
## Experiment 1
X = sequences_df[[f"activity_{i+1}" for i in range(prefix_length)]]
y = sequences_df['next_activity']

X.columns = [col.replace(":", "_") for col in X.columns]

rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()

if rare_classes:
    if len(rare_classes) > 1:
        # Replace multiple rare classes with the last valid label
        new_label = len(y.unique()) - 1  
        y = y.replace(rare_classes, new_label)
    else:
        # Duplicate the single rare class in both X and y
        rare_indices = y[y.isin(rare_classes)].index  # Get indices of rare classes
        
        # Duplicate entries in X and y using the rare indices
        X = pd.concat([X, X.loc[rare_indices]], ignore_index=True)  # Concatenate rows for X
        y = pd.concat([y, y.iloc[rare_indices]], ignore_index=True)  # Concatenate labels for y


# Convert to PyTorch tensors
X_tensor = torch.tensor(X.values, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=y)

# DataLoader
batch_size = 20
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class ActivityTransformer(nn.Module):
    def __init__(self, num_activities, d_model=128, num_heads=4, num_layers=2, dropout=0.1):
        super(ActivityTransformer, self).__init__()
        self.embedding = nn.Embedding(num_activities, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, prefix_length, d_model))

        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.fc = nn.Linear(d_model, num_activities)  # ✅ Fix: Output size matches num_activities

    def forward(self, x):
        x = self.embedding(x) + self.pos_embedding  # Add positional encoding
        x = self.transformer(x)  
        x = x.mean(dim=1)  # Pooling
        x = self.fc(x)  # Fully connected layer
        return x

# Define Model
num_activities = len(activity_mapping)  # ✅ Ensure correct number of activities
model = ActivityTransformer(num_activities)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ✅ Early Stopping
best_loss = float('inf')
patience, patience_counter = 20, 0  # Stop if no improvement after 20 epochs

# ✅ Training Loop with Early Stopping
epochs = 200
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # Early Stopping Logic
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0  # Reset patience
        torch.save(model.state_dict(), "best_model.pth")  # Save best model
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break  # Stop training

# ✅ Load Best Model
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

# ✅ Evaluation (Accuracy, Precision, Recall, F1, SD)
all_y_true, all_y_pred = [], []

batch_acc, batch_prec, batch_rec, batch_f1 = [], [], [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        output = model(X_batch)
        _, predicted = torch.max(output, 1)

        batch_true = y_batch.cpu().numpy()
        batch_pred = predicted.cpu().numpy()

        # Store all predictions
        all_y_true.extend(batch_true)
        all_y_pred.extend(batch_pred)

        # Compute metrics for the batch
        batch_acc.append(accuracy_score(batch_true, batch_pred))
        batch_prec.append(precision_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_rec.append(recall_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_f1.append(f1_score(batch_true, batch_pred, average='weighted', zero_division=0))

# Convert to NumPy arrays
all_y_true = np.array(all_y_true)
all_y_pred = np.array(all_y_pred)

# Compute Overall Metrics
accuracy = accuracy_score(all_y_true, all_y_pred)
precision = precision_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
recall = recall_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
f1 = f1_score(all_y_true, all_y_pred, average='weighted')

# Compute Standard Deviation (SD) for each metric
acc_std = np.std(batch_acc)
prec_std = np.std(batch_prec)
rec_std = np.std(batch_rec)
f1_std = np.std(batch_f1)

# ✅ Print Metrics with SD
print("\n📊 Model Performance:")
print(f"✅ Accuracy: {accuracy:.4f} (±{acc_std:.4f})")
print(f"✅ Precision: {precision:.4f} (±{prec_std:.4f})")
print(f"✅ Recall: {recall:.4f} (±{rec_std:.4f})")
print(f"✅ F1-score: {f1:.4f} (±{f1_std:.4f})")




Epoch 1, Loss: 1.7706
Epoch 2, Loss: 0.8308
Epoch 3, Loss: 0.6547
Epoch 4, Loss: 0.6236
Epoch 5, Loss: 0.5113
Epoch 6, Loss: 0.4686
Epoch 7, Loss: 0.4556
Epoch 8, Loss: 0.4343
Epoch 9, Loss: 0.3968
Epoch 10, Loss: 0.3979
Epoch 11, Loss: 0.3960
Epoch 12, Loss: 0.3973
Epoch 13, Loss: 0.3869
Epoch 14, Loss: 0.3898
Epoch 15, Loss: 0.3874
Epoch 16, Loss: 0.3676
Epoch 17, Loss: 0.3729
Epoch 18, Loss: 0.3823
Epoch 19, Loss: 0.3802
Epoch 20, Loss: 0.3588
Epoch 21, Loss: 0.3732
Epoch 22, Loss: 0.3589
Epoch 23, Loss: 0.3632
Epoch 24, Loss: 0.4212
Epoch 25, Loss: 0.3634
Epoch 26, Loss: 0.3509
Epoch 27, Loss: 0.3573
Epoch 28, Loss: 0.3493
Epoch 29, Loss: 0.3692
Epoch 30, Loss: 0.3509
Epoch 31, Loss: 0.3559
Epoch 32, Loss: 0.3553
Epoch 33, Loss: 0.3534
Epoch 34, Loss: 0.3450
Epoch 35, Loss: 0.3456
Epoch 36, Loss: 0.3260
Epoch 37, Loss: 0.3285
Epoch 38, Loss: 0.3329
Epoch 39, Loss: 0.3340
Epoch 40, Loss: 0.3507
Epoch 41, Loss: 0.3426
Epoch 42, Loss: 0.3375
Epoch 43, Loss: 0.3302
Epoch 44, Loss: 0.32

In [66]:
## Experiment 2: Next Activity Prediction with activity information
import binary_classifier

ra_diversity_matrix = binary_classifier.create_diversity_matrix(event_log)
ra_diversity_matrix_binary = ra_diversity_matrix.copy()
# Apply a binary transformation: any count > 0 becomes 1 (yes), else 0 (no)
ra_diversity_matrix_binary.iloc[:, 1:] = (ra_diversity_matrix_binary.iloc[:, 1:] > 0).astype(int)

activities = ra_diversity_matrix.columns[1:].tolist()  # Convert to a list of activities
print(activities)
binary_activities = ra_diversity_matrix_binary.iloc[:, :]


['Block Purchase Order Item', 'Cancel Goods Receipt', 'Cancel Invoice Receipt', 'Cancel Subsequent Invoice', 'Change Approval for Purchase Order', 'Change Currency', 'Change Delivery Indicator', 'Change Final Invoice Indicator', 'Change Price', 'Change Quantity', 'Change Rejection Indicator', 'Change Storage Location', 'Change payment term', 'Clear Invoice', 'Create Purchase Order Item', 'Create Purchase Requisition Item', 'Delete Purchase Order Item', 'Reactivate Purchase Order Item', 'Receive Order Confirmation', 'Record Goods Receipt', 'Record Invoice Receipt', 'Record Service Entry Sheet', 'Record Subsequent Invoice', 'Release Purchase Order', 'Release Purchase Requisition', 'Remove Payment Block', 'SRM: Awaiting Approval', 'SRM: Change was Transmitted', 'SRM: Complete', 'SRM: Created', 'SRM: Deleted', 'SRM: Document Completed', 'SRM: Held', 'SRM: In Transfer to Execution Syst.', 'SRM: Incomplete', 'SRM: Ordered', 'SRM: Transaction Completed', 'SRM: Transfer Failed (E.Sys.)', 'Set 

In [67]:
# Keep only resources that are in sequences_df
filtered_binary_activities = binary_activities[binary_activities['org:resource'].isin(sequences_df['org:resource'])]

# Reset index to ensure proper alignment
filtered_binary_activities = filtered_binary_activities.reset_index(drop=True)
sequences_df = sequences_df.reset_index(drop=True)

# Merge again
merged_df = pd.concat([sequences_df, filtered_binary_activities], axis=1)

In [68]:

X = merged_df[[f"activity_{i+1}" for i in range(prefix_length)] + activities]
y = merged_df['next_activity']

X.columns = [col.replace(":", "_") for col in X.columns]

rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()

if rare_classes:
    if len(rare_classes) > 1:
        # Replace multiple rare classes with the last valid label
        new_label = len(y.unique()) - 1  
        y = y.replace(rare_classes, new_label)
    else:
        # Duplicate the single rare class in both X and y
        rare_indices = y[y.isin(rare_classes)].index  # Get indices of rare classes
        
        # Duplicate entries in X and y using the rare indices
        X = pd.concat([X, X.loc[rare_indices]], ignore_index=True)  # Concatenate rows for X
        y = pd.concat([y, y.iloc[rare_indices]], ignore_index=True)  # Concatenate labels for y


# Convert to PyTorch tensors
X_tensor = torch.tensor(X.values, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=y)

# DataLoader
batch_size = 20
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class ActivityTransformer(nn.Module):
    def __init__(self, num_activities, d_model=128, num_heads=4, num_layers=2, dropout=0.1):
        super(ActivityTransformer, self).__init__()
        self.embedding = nn.Embedding(num_activities, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, prefix_length + len(activities), d_model))

        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.fc = nn.Linear(d_model, num_activities)  # ✅ Fix: Output size matches num_activities

    def forward(self, x):
        x = self.embedding(x) + self.pos_embedding  # Add positional encoding
        x = self.transformer(x)  
        x = x.mean(dim=1)  # Pooling
        x = self.fc(x)  # Fully connected layer
        return x

# Define Model
num_activities = len(activity_mapping)  # ✅ Ensure correct number of activities
model = ActivityTransformer(num_activities)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ✅ Early Stopping
best_loss = float('inf')
patience, patience_counter = 20, 0  # Stop if no improvement after 20 epochs

# ✅ Training Loop with Early Stopping
epochs = 200
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # Early Stopping Logic
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0  # Reset patience
        torch.save(model.state_dict(), "best_model.pth")  # Save best model
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break  # Stop training

# ✅ Load Best Model
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

# ✅ Evaluation (Accuracy, Precision, Recall, F1, SD)
all_y_true, all_y_pred = [], []

batch_acc, batch_prec, batch_rec, batch_f1 = [], [], [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        output = model(X_batch)
        _, predicted = torch.max(output, 1)

        batch_true = y_batch.cpu().numpy()
        batch_pred = predicted.cpu().numpy()

        # Store all predictions
        all_y_true.extend(batch_true)
        all_y_pred.extend(batch_pred)

        # Compute metrics for the batch
        batch_acc.append(accuracy_score(batch_true, batch_pred))
        batch_prec.append(precision_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_rec.append(recall_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_f1.append(f1_score(batch_true, batch_pred, average='weighted', zero_division=0))

# Convert to NumPy arrays
all_y_true = np.array(all_y_true)
all_y_pred = np.array(all_y_pred)

# Compute Overall Metrics
accuracy = accuracy_score(all_y_true, all_y_pred)
precision = precision_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
recall = recall_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
f1 = f1_score(all_y_true, all_y_pred, average='weighted')

# Compute Standard Deviation (SD) for each metric
acc_std = np.std(batch_acc)
prec_std = np.std(batch_prec)
rec_std = np.std(batch_rec)
f1_std = np.std(batch_f1)

# ✅ Print Metrics with SD
print("\n📊 Model Performance:")
print(f"✅ Accuracy: {accuracy:.4f} (±{acc_std:.4f})")
print(f"✅ Precision: {precision:.4f} (±{prec_std:.4f})")
print(f"✅ Recall: {recall:.4f} (±{rec_std:.4f})")
print(f"✅ F1-score: {f1:.4f} (±{f1_std:.4f})")




Epoch 1, Loss: 1.5453
Epoch 2, Loss: 0.7273
Epoch 3, Loss: 0.6359
Epoch 4, Loss: 0.5269
Epoch 5, Loss: 0.5098
Epoch 6, Loss: 0.4727
Epoch 7, Loss: 0.4638
Epoch 8, Loss: 0.4247
Epoch 9, Loss: 0.4339
Epoch 10, Loss: 0.4153
Epoch 11, Loss: 0.4141
Epoch 12, Loss: 0.3834
Epoch 13, Loss: 0.3914
Epoch 14, Loss: 0.4090
Epoch 15, Loss: 0.4058
Epoch 16, Loss: 0.3855
Epoch 17, Loss: 0.4040
Epoch 18, Loss: 0.3676
Epoch 19, Loss: 0.3871
Epoch 20, Loss: 0.3638
Epoch 21, Loss: 0.3795
Epoch 22, Loss: 0.3696
Epoch 23, Loss: 0.3647
Epoch 24, Loss: 0.3583
Epoch 25, Loss: 0.3460
Epoch 26, Loss: 0.3541
Epoch 27, Loss: 0.3597
Epoch 28, Loss: 0.3523
Epoch 29, Loss: 0.3449
Epoch 30, Loss: 0.3358
Epoch 31, Loss: 0.3494
Epoch 32, Loss: 0.3415
Epoch 33, Loss: 0.3466
Epoch 34, Loss: 0.3617
Epoch 35, Loss: 0.3376
Epoch 36, Loss: 0.3514
Epoch 37, Loss: 0.3379
Epoch 38, Loss: 0.3415
Epoch 39, Loss: 0.3541
Epoch 40, Loss: 0.3282
Epoch 41, Loss: 0.3273
Epoch 42, Loss: 0.3301
Epoch 43, Loss: 0.3216
Epoch 44, Loss: 0.32

In [97]:
sequences_df = sequences_df.drop(columns=['org:resource'])

In [86]:
from collections import defaultdict
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Get unique activities from the dataset
unique_activities = sorted(set(sequences_df.values.flatten()))

# Generate all possible transitions
all_possible_transitions = {(a, b) for a in unique_activities for b in unique_activities}

# Create a list to store transition count dictionaries
transition_counts = []

# Iterate through each row to count transitions
for _, row in sequences_df.iterrows():
    transitions = defaultdict(int)
    activities = row.dropna().values  # Extract non-null activities

    # Count actual transitions
    for i in range(len(activities) - 1):
        transition = (activities[i], activities[i + 1])
        transitions[transition] += 1

    # Ensure every possible transition exists (fill with 0 if not present)
    row_counts = {t: transitions.get(t, 0) for t in all_possible_transitions}
    transition_counts.append(row_counts)

# Convert list of transition count dictionaries to a DataFrame
transitions_df = pd.DataFrame(transition_counts)

# Rename columns to string format (e.g., '0->0', '0->1', etc.)
transitions_df.columns = [f"{a}->{b}" for a, b in transitions_df.columns]

# Merge with original DataFrame
result_df = pd.concat([sequences_df, transitions_df], axis=1)

X = result_df.drop(columns=['next_activity'])
y = result_df['next_activity']

X.columns = [col.replace(":", "_") for col in X.columns]

rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()

if rare_classes:
    if len(rare_classes) > 1:
        # Replace multiple rare classes with the last valid label
        new_label = len(y.unique()) - 1  
        y = y.replace(rare_classes, new_label)
    else:
        # Duplicate the single rare class in both X and y
        rare_indices = y[y.isin(rare_classes)].index  # Get indices of rare classes
        
        # Duplicate entries in X and y using the rare indices
        X = pd.concat([X, X.loc[rare_indices]], ignore_index=True)  # Concatenate rows for X
        y = pd.concat([y, y.iloc[rare_indices]], ignore_index=True)  # Concatenate labels for y

# Feature selection (now after handling rare classes)
X_selected = SelectKBest(mutual_info_classif, k=20).fit_transform(X, y)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_selected, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=y)

print(f"Min in X_train: {X_train.min()}, Max in X_train: {X_train.max()}")
print(f"Min in X_test: {X_test.min()}, Max in X_test: {X_test.max()}")

# DataLoader
batch_size = 20
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class ActivityTransformer(nn.Module):
    def __init__(self, num_activities, d_model=128, num_heads=4, num_layers=2, dropout=0.1):
        super(ActivityTransformer, self).__init__()
        self.embedding = nn.Embedding(num_activities, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, X_batch.size(1), d_model))  # Use X_batch.size(1)

        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.fc = nn.Linear(d_model, num_activities)  # ✅ Fix: Output size matches num_activities

    def forward(self, x):
        x = self.embedding(x) + self.pos_embedding  # Add positional encoding
        x = self.transformer(x)  
        x = x.mean(dim=1)  # Pooling
        x = self.fc(x)  # Fully connected layer
        return x

# Define Model
num_activities = X_tensor.max().item() + 1  
model = ActivityTransformer(num_activities)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ✅ Early Stopping
best_loss = float('inf')
patience, patience_counter = 20, 0  # Stop if no improvement after 20 epochs

# ✅ Training Loop with Early Stopping
epochs = 200
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # Early Stopping Logic
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0  # Reset patience
        torch.save(model.state_dict(), "best_model.pth")  # Save best model
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break  # Stop training

# ✅ Load Best Model
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

# ✅ Evaluation (Accuracy, Precision, Recall, F1, SD)
all_y_true, all_y_pred = [], []

batch_acc, batch_prec, batch_rec, batch_f1 = [], [], [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        output = model(X_batch)
        _, predicted = torch.max(output, 1)

        batch_true = y_batch.cpu().numpy()
        batch_pred = predicted.cpu().numpy()

        # Store all predictions
        all_y_true.extend(batch_true)
        all_y_pred.extend(batch_pred)

        # Compute metrics for the batch
        batch_acc.append(accuracy_score(batch_true, batch_pred))
        batch_prec.append(precision_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_rec.append(recall_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_f1.append(f1_score(batch_true, batch_pred, average='weighted', zero_division=0))

# Convert to NumPy arrays
all_y_true = np.array(all_y_true)
all_y_pred = np.array(all_y_pred)

# Compute Overall Metrics
accuracy = accuracy_score(all_y_true, all_y_pred)
precision = precision_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
recall = recall_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
f1 = f1_score(all_y_true, all_y_pred, average='weighted')

# Compute Standard Deviation (SD) for each metric
acc_std = np.std(batch_acc)
prec_std = np.std(batch_prec)
rec_std = np.std(batch_rec)
f1_std = np.std(batch_f1)

# ✅ Print Metrics with SD
print("\n📊 Model Performance:")
print(f"✅ Accuracy: {accuracy:.4f} (±{acc_std:.4f})")
print(f"✅ Precision: {precision:.4f} (±{prec_std:.4f})")
print(f"✅ Recall: {recall:.4f} (±{rec_std:.4f})")
print(f"✅ F1-score: {f1:.4f} (±{f1_std:.4f})")

Min in X_train: 1, Max in X_train: 31
Min in X_test: 1, Max in X_test: 20




Epoch 1, Loss: 1.4990
Epoch 2, Loss: 0.6353
Epoch 3, Loss: 0.4478
Epoch 4, Loss: 0.3649
Epoch 5, Loss: 0.3027
Epoch 6, Loss: 0.2720
Epoch 7, Loss: 0.2622
Epoch 8, Loss: 0.2681
Epoch 9, Loss: 0.2435
Epoch 10, Loss: 0.2452
Epoch 11, Loss: 0.2320
Epoch 12, Loss: 0.2205
Epoch 13, Loss: 0.2088
Epoch 14, Loss: 0.2110
Epoch 15, Loss: 0.2020
Epoch 16, Loss: 0.2083
Epoch 17, Loss: 0.1868
Epoch 18, Loss: 0.2026
Epoch 19, Loss: 0.1853
Epoch 20, Loss: 0.1897
Epoch 21, Loss: 0.1837
Epoch 22, Loss: 0.1860
Epoch 23, Loss: 0.1780
Epoch 24, Loss: 0.1806
Epoch 25, Loss: 0.1812
Epoch 26, Loss: 0.1689
Epoch 27, Loss: 0.1588
Epoch 28, Loss: 0.1596
Epoch 29, Loss: 0.1954
Epoch 30, Loss: 0.1663
Epoch 31, Loss: 0.1651
Epoch 32, Loss: 0.1538
Epoch 33, Loss: 0.1603
Epoch 34, Loss: 0.1453
Epoch 35, Loss: 0.1522
Epoch 36, Loss: 0.1463
Epoch 37, Loss: 0.1427
Epoch 38, Loss: 0.1383
Epoch 39, Loss: 0.1445
Epoch 40, Loss: 0.1528
Epoch 41, Loss: 0.1299
Epoch 42, Loss: 0.1305
Epoch 43, Loss: 0.1257
Epoch 44, Loss: 0.13

In [98]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Get unique activities from the dataset
unique_activities = sorted(set(sequences_df.values.flatten()))

# Generate all possible transitions
all_possible_transitions = {(a, b) for a in unique_activities for b in unique_activities}

# Create a list to store transition count dictionaries
transition_counts = []
repeat_pattern_features = []

# Iterate through each row to count transitions and compute repeat features
for _, row in sequences_df.iterrows():
    transitions = defaultdict(int)
    activities = row.dropna().values  # Non-null activities
    
    # --- Transition Counting ---
    for i in range(len(activities) - 1):
        transition = (activities[i], activities[i + 1])
        transitions[transition] += 1
    row_counts = {t: transitions.get(t, 0) for t in all_possible_transitions}
    transition_counts.append(row_counts)
    
    # --- Repeat Pattern Features ---
    max_run = 1
    current_run = 1
    run_lengths = []
    repetitive_activities = set()
    
    for i in range(1, len(activities)):
        if activities[i] == activities[i - 1]:
            current_run += 1
            repetitive_activities.add(activities[i])
        else:
            run_lengths.append(current_run)
            current_run = 1
    run_lengths.append(current_run)  # Add final run
    
    max_run_length = max(run_lengths)
    avg_run_length = np.mean(run_lengths)
    num_runs = len(run_lengths)
    num_repetitive_activities = len(repetitive_activities)

    repeat_pattern_features.append({
        'max_run_length': max_run_length,
        'avg_run_length': avg_run_length,
        'num_runs': num_runs,
        'num_repetitive_activities': num_repetitive_activities
    })

# Convert to DataFrames
transitions_df = pd.DataFrame(transition_counts)
transitions_df.columns = [f"{a}->{b}" for a, b in transitions_df.columns]

repeat_df = pd.DataFrame(repeat_pattern_features)

# Merge everything
result_df = pd.concat([sequences_df, transitions_df, repeat_df], axis=1)

# Compute mutual information scores for repeat pattern features
mi_scores = mutual_info_classif(repeat_df, result_df['next_activity'], discrete_features=True)
feature_scores = dict(zip(repeat_df.columns, mi_scores))
sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)

print("\nMutual Information Scores for Repeat Pattern Features:")
for feature, score in sorted_features:
    print(f"{feature}: {score:.4f}")

# Remove the least important features based on MI scores (i.e., num_repetitive_activities, max_run_length)
repeat_df = repeat_df.drop(columns=['num_repetitive_activities', 'max_run_length'])

# Merge updated repeat_df with result_df
result_df = pd.concat([sequences_df, transitions_df, repeat_df], axis=1)

# Prepare features and labels
X = result_df.drop(columns=['next_activity'])
y = result_df['next_activity']

X.columns = [col.replace(":", "_") for col in X.columns]

rare_classes = y.value_counts()[y.value_counts() == 1].index.tolist()

if rare_classes:
    if len(rare_classes) > 1:
        # Replace multiple rare classes with the last valid label
        new_label = len(y.unique()) - 1  
        y = y.replace(rare_classes, new_label)
    else:
        # Duplicate the single rare class in both X and y
        rare_indices = y[y.isin(rare_classes)].index  # Get indices of rare classes
        
        # Duplicate entries in X and y using the rare indices
        X = pd.concat([X, X.loc[rare_indices]], ignore_index=True)  # Concatenate rows for X
        y = pd.concat([y, y.iloc[rare_indices]], ignore_index=True)  # Concatenate labels for y

# Feature selection (now after handling rare classes)
X_selected = SelectKBest(mutual_info_classif, k=20).fit_transform(X, y)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_selected, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=y)

# DataLoader
batch_size = 20
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class ActivityTransformer(nn.Module):
    def __init__(self, num_activities, d_model=128, num_heads=4, num_layers=2, dropout=0.1):
        super(ActivityTransformer, self).__init__()
        self.embedding = nn.Embedding(num_activities, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, 1, d_model))  # Initialize with dummy sequence length

        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.fc = nn.Linear(d_model, num_activities)  # Output size matches num_activities

    def forward(self, x):
        # Get sequence length dynamically during the forward pass
        seq_length = x.size(1)
        
        # Adjust the positional embedding to match the sequence length
        pos_embedding = self.pos_embedding[:, :seq_length, :].expand(x.size(0), seq_length, -1)

        # Add embedding and positional encoding
        x = self.embedding(x) + pos_embedding

        # Transformer layers
        x = self.transformer(x)

        # Pooling
        x = x.mean(dim=1)  # Pooling over sequence length (average)

        # Fully connected layer
        x = self.fc(x)  # Output

        return x

# Define Model
num_activities = X_tensor.max().item() + 1  
model = ActivityTransformer(num_activities)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early Stopping
best_loss = float('inf')
patience, patience_counter = 20, 0  # Stop if no improvement after 20 epochs

# Training Loop with Early Stopping
epochs = 200
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # Early Stopping Logic
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0  # Reset patience
        torch.save(model.state_dict(), "best_model.pth")  # Save best model
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break  # Stop training

# Load Best Model
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

# Evaluation (Accuracy, Precision, Recall, F1, SD)
all_y_true, all_y_pred = [], []

batch_acc, batch_prec, batch_rec, batch_f1 = [], [], [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        output = model(X_batch)
        _, predicted = torch.max(output, 1)

        batch_true = y_batch.cpu().numpy()
        batch_pred = predicted.cpu().numpy()

        # Store all predictions
        all_y_true.extend(batch_true)
        all_y_pred.extend(batch_pred)

        # Compute metrics for the batch
        batch_acc.append(accuracy_score(batch_true, batch_pred))
        batch_prec.append(precision_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_rec.append(recall_score(batch_true, batch_pred, average='weighted', zero_division=0))
        batch_f1.append(f1_score(batch_true, batch_pred, average='weighted', zero_division=0))

# Convert to NumPy arrays
all_y_true = np.array(all_y_true)
all_y_pred = np.array(all_y_pred)

# Compute Overall Metrics
accuracy = accuracy_score(all_y_true, all_y_pred)
precision = precision_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
recall = recall_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
f1 = f1_score(all_y_true, all_y_pred, average='weighted')

# Compute Standard Deviation (SD) for each metric
acc_std = np.std(batch_acc)
prec_std = np.std(batch_prec)
rec_std = np.std(batch_rec)
f1_std = np.std(batch_f1)

# Print Metrics with SD
print("\n📊 Model Performance:")
print(f"✅ Accuracy: {accuracy:.4f} (±{acc_std:.4f})")
print(f"✅ Precision: {precision:.4f} (±{prec_std:.4f})")
print(f"✅ Recall: {recall:.4f} (±{rec_std:.4f})")
print(f"✅ F1-score: {f1:.4f} (±{f1_std:.4f})")





Mutual Information Scores for Repeat Pattern Features:
max_run_length: 1.2873
num_runs: 1.1414
avg_run_length: 1.1414
num_repetitive_activities: 0.4906




Epoch 1, Loss: 1.4757
Epoch 2, Loss: 0.5421
Epoch 3, Loss: 0.3876
Epoch 4, Loss: 0.3224
Epoch 5, Loss: 0.2909
Epoch 6, Loss: 0.2919
Epoch 7, Loss: 0.3091
Epoch 8, Loss: 0.2674
Epoch 9, Loss: 0.2797
Epoch 10, Loss: 0.2577
Epoch 11, Loss: 0.2644
Epoch 12, Loss: 0.2536
Epoch 13, Loss: 0.2643
Epoch 14, Loss: 0.2551
Epoch 15, Loss: 0.2490
Epoch 16, Loss: 0.2482
Epoch 17, Loss: 0.2588
Epoch 18, Loss: 0.2509
Epoch 19, Loss: 0.2413
Epoch 20, Loss: 0.2395
Epoch 21, Loss: 0.2373
Epoch 22, Loss: 0.2439
Epoch 23, Loss: 0.2453
Epoch 24, Loss: 0.2378
Epoch 25, Loss: 0.2462
Epoch 26, Loss: 0.2489
Epoch 27, Loss: 0.2488
Epoch 28, Loss: 0.2611
Epoch 29, Loss: 0.2460
Epoch 30, Loss: 0.2401
Epoch 31, Loss: 0.2367
Epoch 32, Loss: 0.2370
Epoch 33, Loss: 0.2447
Epoch 34, Loss: 0.2282
Epoch 35, Loss: 0.2407
Epoch 36, Loss: 0.2362
Epoch 37, Loss: 0.2359
Epoch 38, Loss: 0.2294
Epoch 39, Loss: 0.2288
Epoch 40, Loss: 0.2377
Epoch 41, Loss: 0.2319
Epoch 42, Loss: 0.2316
Epoch 43, Loss: 0.2249
Epoch 44, Loss: 0.22